X-Git-Url: https://git.8kb.co.uk/?p=dataflex%2Fdf32func;a=blobdiff_plain;f=src%2Fc%2Fgnuregex.c;fp=src%2Fc%2Fgnuregex.c;h=f0f2dea668b76d7960c85bef8c81b931eb99de56;hp=0000000000000000000000000000000000000000;hb=21b727fd491be6f9953f1675b18385296cab0955;hpb=0342737c4763de343d9d87c0cb25a8e31f0211e7 diff --git a/src/c/gnuregex.c b/src/c/gnuregex.c new file mode 100644 index 0000000..f0f2dea --- /dev/null +++ b/src/c/gnuregex.c @@ -0,0 +1,464 @@ +/*------------------------------------------------------------------------- + * gnuregex.c + * posix regex extensions + * + * Copyright (c) 2007-2015, glyn@8kb.co.uk + * Author: Glyn Astill + * + *------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include "memman.h" + +#define MAX_ERROR_MSG 0x1000 + +/* + * Return a properly escaped / quoted string + */ +static char * quote_output(char *str) { + char *result; + char *result_return; + int len; + int do_quote = 0; + char *ptr; + + len = strlen(str); + + /* Check for characters that need quoting */ + for (ptr = str; *ptr; ptr++) { + char ch = *ptr; + if (ch == '\"' || ch =='\\' || ch == '\{' || ch == ',') { + do_quote = 1; + break; + } + } + + /* If we find no characters that need quoting just return the input */ + if (do_quote != 1) + return str; + + /* Do the quoting, here the allocation is wasteful */ + result = (char *) wmalloc((len * 2 + 3) * sizeof(char)); + result_return = result; + + /* + * Starting address of result is incremented as we modify it's contents here + * with result_return keeping the starting address + */ + *result++ = '"'; + while (len-- > 0) { + /* Escape double quotes and backslash with backslash */ + if (*str == '"') { + *result++ = '\\'; + } + if (*str == '\\') { + *result++ = '\\'; + } + *result++ = *str++; + } + *result++ = '"'; + *result++ = '\0'; + + return result_return; +} + +/* + * Count open parenthesis to evaluate the number of subexpressions in the regex + */ +static int count_subexpressions(const char *str){ + int result = 0; + int last_was_backslash = 0; + const char *ptr; + + for(ptr=str; *ptr; ptr++){ + if (*ptr == '\\' && !last_was_backslash){ + last_was_backslash = 1; + continue; + } + if (*ptr == ')' && !last_was_backslash) + result++; + last_was_backslash = 0; + } + return result; +} + +/* + * Check to see if string contains any escape chars + * these could of course just be escaped backslashes + * themselvs. + */ +static int has_escapes(const char *str){ + const char *ptr; + + for(ptr=str; *ptr; ptr++){ + if (*ptr == '\\') + return 1; + } + return 0; +} + +/* + * Compile the regex pattern + */ +static int compile_regex(regex_t *re, const char *pattern, const char *flags, int errors) +{ + int status; + int cflags = REG_EXTENDED; + + if (strchr(flags, 'i')) { + cflags = cflags|REG_ICASE; + } + if (strchr(flags, 'n')) { + cflags = cflags|REG_NEWLINE; + } + + status = regcomp(re, pattern, cflags); + if (status != REG_NOERROR) { + if (errors == 1) { + char error_message[MAX_ERROR_MSG]; + regerror (status, re, error_message, MAX_ERROR_MSG); + fprintf (stderr, "Regex error compiling '%s': %s\n", pattern, error_message); + } + } + return status; +} + +/* + * Returns a pointer to a malloced array of regmatch_t containing match offsets + * in the input string. (As opposed to offests from each match) + * + * The regmatch struct info: + * regmatch_t.rm_so (regoff_t) = byte offset from start of string to start of substring + * regmatch_t.rm_eo (regoff_t) = byte offset from start of string to first character after the end of substring + */ +static int find_regex_matches(regex_t *re, const char *str, const int nsub, const char *flags, regmatch_t **result) +{ + /* Each individual match and it's subexpression matches stored in m */ + regmatch_t m[nsub+1]; + + /* A pointer into the string at the end of the previous match */ + const char *prev_match_eo = str; + + /* + * We return a count of matches and pass back an array of regmatch_t in + * matches containing match offsets in the original string + */ + int array_len = strchr(flags, 'g') ? 256 : 32; + int match_count = 0; + regmatch_t *matches; + + matches = (regmatch_t *) wmalloc(sizeof(regmatch_t) * array_len); + + while (!regexec(re, prev_match_eo, nsub+1, m, 0)) { + int i = 0; + + /* resize the matches array; when more space is required double current size */ + while (match_count + (nsub * 2) > array_len) { + array_len *= 2; + matches = (regmatch_t *) wrealloc(matches, sizeof(regmatch_t) * array_len); + } + + /* when we have subexpressions, we're only interested in their match offsets */ + if (nsub > 0) { + for (i = 1; i <= nsub; i++) { + if (m[i].rm_so < 0 || m[i].rm_eo < 0) { + matches[match_count].rm_so = -1; + matches[match_count++].rm_eo = -1; + } + else { + matches[match_count].rm_so = (prev_match_eo - str) + m[i].rm_so; + matches[match_count++].rm_eo = (prev_match_eo - str) + m[i].rm_eo; + } + } + } + /* else we want the original match offsets*/ + else { + matches[match_count].rm_so = (prev_match_eo - str) + m[0].rm_so; + matches[match_count++].rm_eo = (prev_match_eo - str) + m[0].rm_eo; + } + + /* + * If we have matched on a blank expression or we were + * not flagged to do greedy matching then break + */ + if (!m[0].rm_eo || !strchr(flags, 'g')) + break; + + /* + * Advance the search position to the end of the current match + * If the match happens to be zero length, advance search position + * by one? + */ + if (m[0].rm_eo == m[0].rm_so) + prev_match_eo++; + else + prev_match_eo += m[0].rm_eo; + } + *result = matches; + + return match_count; +} + +/* + * Takes regmatch_t array returned by find_regex_matches and returns a malloced + * string representing the captured substrings. + */ +static char * regex_matches_to_string(const char *str, int nsub, int match_count, regmatch_t *matches) { + int j; + int i; + char *unquoted = NULL; + char *quoted = NULL; + int quoted_len; + char *result; + + int str_len = strlen(str); + int allocated_sz = str_len+1; + result = wmalloc(allocated_sz * sizeof(char)); + int result_sz = 0; + + j = 0; + while (j < match_count) { + + if (j > 0) { + result_sz += 2; + result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len); + result[result_sz-2] = ','; + result[result_sz-1] = '{'; + } + else { + result_sz++; + result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len); + result[result_sz-1] = '{'; + } + + for (i = 0; i <= nsub; i++) { + if ((nsub > 0) && (i == 0)) + continue; + + if (i > 1) { + result_sz++; + result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len); + result[result_sz-1] = ','; + } + + int so = matches[j].rm_so; + int eo = matches[j].rm_eo; + + if (so == -1 || eo == -1) { + result = reallocate_block(result, &allocated_sz, (result_sz+4) * sizeof(char), str_len); + strncpy(result+result_sz, "NULL", 4); + result_sz += 4; + } + else { + unquoted = wmalloc((eo-so)+1 * sizeof(char)); + strncpy(unquoted, str+so, eo-so); + unquoted[eo-so] = '\0'; + quoted = quote_output(unquoted); + quoted_len = strlen(quoted); + + result = reallocate_block(result, &allocated_sz, (result_sz+quoted_len) * sizeof(char), str_len); + strncpy(result+result_sz, quoted, quoted_len); + result_sz += quoted_len; + + if (quoted != unquoted) + wfree(unquoted); + wfree(quoted); + } + j++; + } + + result_sz++; + result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len); + result[result_sz-1] = '}'; + } + + result_sz++; + result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len); + result[result_sz-1] = '\0'; + + return result; +} + +/* + * Purely check for a match in the regex + */ +int regexp_match(const char *str, const char *pattern, const char *flags, int errors) +{ + regex_t re; + int result; + int status; + + status = compile_regex(&re, pattern, flags, errors); + if (status == REG_NOERROR) { + result = regexec(&re, str, (size_t) 0, NULL, 0); + regfree(&re); + + if (!result) /* match */ + return 1; + else /* no match */ + return 0; + } + else /* error condition, but still: no match */ + return 0; +} + +/* + * Return all matches in the regex as a string by first calling find_regex_matches + * and then regex_matches_to_string. Arguably this could all be one function + * however separation will make future multiple output formats easier. + */ +char * regexp_matches(const char *str, const char *pattern, const char *flags, int errors) +{ + regex_t re; + regmatch_t *matches_p = NULL; + int nsub; + int match_count; + int status; + char *result = NULL; + + /* Compile the regex */ + status = compile_regex(&re, pattern, flags, errors); + if (status == REG_NOERROR) { + /* Count our subexpressions to size our regmatch_t array */ + nsub = count_subexpressions(pattern); + /* Find all the matches relative to the input string */ + match_count = find_regex_matches(&re, str, nsub, flags, &matches_p); + /* Turn the matches into an output string */ + result = regex_matches_to_string(str, nsub, match_count, matches_p); + /* Free up the regmatch_t malloced by find_regex_matches */ + wfree(matches_p); + regfree(&re); + } + + return result; +} + +/* + * Substitutes matches with the regex pattern in the string with the replacement + * pattern/string. + */ +char * regexp_replace(const char *str, const char *pattern, const char *replacement, const char *flags, int errors) +{ + regex_t re; + int nsub; + char *result = NULL; + char *match_str; + int status; + const char *prev_match_eo = str; + int str_len = strlen(str); + int replacement_len = strlen(replacement); + int allocated_sz = str_len+1; + int result_sz = 0; + + status = compile_regex(&re, pattern, flags, errors); + if (status == REG_NOERROR) { + + result = wmalloc(allocated_sz * sizeof(char)); + + /* Count our subexpressions to size our regmatch_t array */ + nsub = count_subexpressions(pattern); + regmatch_t m[nsub+1]; + + while (!regexec(&re, prev_match_eo, nsub+1, m, 0)) { + + /* Copy everything to the left of the first match */ + if (m[0].rm_so > 0) { + result = reallocate_block(result, &allocated_sz, (result_sz+m[0].rm_so) * sizeof(char), str_len); + strncpy(result+result_sz, prev_match_eo, m[0].rm_so); + result_sz += m[0].rm_so; + } + + /* If there are no backreferences in the replacement, copy in the replacement */ + if (!has_escapes(replacement)) { + result = reallocate_block(result, &allocated_sz, (result_sz+replacement_len) * sizeof(char), str_len); + strncpy(result+result_sz, replacement, replacement_len); + result_sz += replacement_len; + } + /* Otherwise process the backreferences and copy in subcaptures */ + else { + /* find the next escape char */ + const char *start = replacement; + const char *ptr; + + for(ptr = replacement; *ptr; ptr++) { + if (*ptr != '\\') + continue; + + /* append everything to the left of the current escape */ + result = reallocate_block(result, &allocated_sz, (result_sz+(ptr-start)) * sizeof(char), str_len); + strncpy(result+result_sz, start, (ptr-start)); + result_sz += (ptr-start); + + ptr++; + + if ((*ptr >= '1' && *ptr <= '9') || (*ptr == '&')) + { + /* Use the back reference of regexp. */ + int sub; + if (*ptr == '&') + sub = 0; + else + sub = *ptr - '0'; + + if (m[sub].rm_so != -1 && m[sub].rm_eo != -1 && sub <= nsub) { + result = reallocate_block(result, &allocated_sz, (result_sz+(m[sub].rm_eo-m[sub].rm_so)) * sizeof(char), str_len); + strncpy(result+result_sz, prev_match_eo+m[sub].rm_so, (m[sub].rm_eo-m[sub].rm_so)); + result_sz += (m[sub].rm_eo-m[sub].rm_so); + } + ptr++; + } + else if (*ptr == '\\') + { + /* append backsalsh */ + result_sz++; + result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len); + result[result_sz-1] = '\\'; + ptr++; + } + else { + /* append backsalsh */ + result_sz++; + result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len); + result[result_sz-1] = '\\'; + } + start = ptr; + } + /* + * Append right trailing replacement, except in the instance + * when it starts with character zero, which can happen when the + * last part of the replace string is escaped. + */ + if (*start) { + result = reallocate_block(result, &allocated_sz, (result_sz+(ptr-start)) * sizeof(char), str_len); + strncpy(result+result_sz, start, (ptr-start)); + result_sz += (ptr-start); + } + + } + prev_match_eo += m[0].rm_eo; + + /* + * If we have matched on a blank expression or we were + * not flagged to do greedy matching then break + */ + if (!m[0].rm_eo || !strchr(flags, 'g')) + break; + } + + /* Copy everything to the right of the last match */ + result = reallocate_block(result, &allocated_sz, (result_sz+(str_len-(prev_match_eo-str))) * sizeof(char), str_len); + strncpy(result+result_sz, prev_match_eo, str_len-(prev_match_eo-str)); + result_sz += str_len-(prev_match_eo-str); + + regfree(&re); + + result_sz++; + result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len); + result[result_sz-1] = '\0'; + } + return result; +}