+/*-------------------------------------------------------------------------\r
+ * gnuregex.c\r
+ * posix regex extensions\r
+ *\r
+ * Copyright (c) 2007-2015, glyn@8kb.co.uk\r
+ * Author: Glyn Astill <glyn@8kb.co.uk>\r
+ *\r
+ *-------------------------------------------------------------------------\r
+ */\r
+\r
+#include <stdio.h>\r
+#include <stdlib.h>\r
+#include <string.h>\r
+#include <regex.h>\r
+#include "memman.h"\r
+\r
+#define MAX_ERROR_MSG 0x1000\r
+\r
/*
 * Return str in a form that is safe to embed as one element of the
 * brace-delimited, comma-separated match list.
 *
 * If str contains none of the characters  "  \  {  }  ,  the input pointer
 * itself is returned and nothing is allocated.  Otherwise a new buffer
 * obtained from wmalloc is returned holding str wrapped in double quotes,
 * with every double quote and backslash preceded by a backslash.
 *
 * Callers must compare the returned pointer with str to find out whether a
 * new buffer was handed back.
 */
static char * quote_output(char *str) {
    size_t len = strlen(str);
    const char *in;
    char *result;
    char *out;

    /* No character that needs quoting: just return the input */
    if (strpbrk(str, "\"\\{},") == NULL)
        return str;

    /*
     * Sized for the worst case, where every input byte needs an escape:
     * two bytes per input byte, plus the two surrounding quotes and the NUL.
     */
    result = (char *) wmalloc((len * 2 + 3) * sizeof(char));
    out = result;

    *out++ = '"';
    for (in = str; *in; in++) {
        /* Double quotes and backslashes are escaped with a backslash */
        if (*in == '"' || *in == '\\')
            *out++ = '\\';
        *out++ = *in;
    }
    *out++ = '"';
    *out = '\0';

    return result;
}
+\r
/*
 * Count the parenthesised subexpressions in an extended regular expression
 * by counting its unescaped open parentheses.
 *
 * A backslash removes the special meaning of the character after it, and
 * everything inside a bracket expression ("[...]") is literal, so neither
 * "\(" nor "[(]" opens a group.  Inside a bracket expression a ']' placed
 * first (optionally after '^') is literal, and "[:class:]", "[.sym.]" and
 * "[=equiv=]" are skipped as a unit so that their ']' does not end the
 * bracket expression early.
 */
static int count_subexpressions(const char *str){
    int result = 0;
    const char *ptr = str;

    while (*ptr) {
        if (*ptr == '\\') {
            /* Skip the backslash and the character it escapes */
            ptr++;
            if (*ptr)
                ptr++;
            continue;
        }

        if (*ptr == '[') {
            ptr++;
            if (*ptr == '^')
                ptr++;
            if (*ptr == ']')
                ptr++;
            while (*ptr && *ptr != ']') {
                if (ptr[0] == '[' && (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=')) {
                    const char kind = ptr[1];
                    const char *close = ptr + 2;

                    while (*close && !(close[0] == kind && close[1] == ']'))
                        close++;
                    if (*close) {
                        ptr = close + 2;
                        continue;
                    }
                    /* Unterminated class: fall through and treat '[' as literal */
                }
                ptr++;
            }
            if (*ptr == ']')
                ptr++;
            continue;
        }

        if (*ptr == '(')
            result++;
        ptr++;
    }
    return result;
}
+\r
/*
 * Report whether str contains a backslash anywhere.
 *
 * Returns 1 if it does and 0 otherwise.  A hit only means an escape may be
 * present; the backslash could itself be an escaped backslash.
 */
static int has_escapes(const char *str){
    return strchr(str, '\\') != NULL;
}
+\r
+/*\r
+ * Compile the regex pattern\r
+ */\r
+static int compile_regex(regex_t *re, const char *pattern, const char *flags, int errors)\r
+{\r
+ int status;\r
+ int cflags = REG_EXTENDED;\r
+\r
+ if (strchr(flags, 'i')) {\r
+ cflags = cflags|REG_ICASE;\r
+ }\r
+ if (strchr(flags, 'n')) {\r
+ cflags = cflags|REG_NEWLINE;\r
+ }\r
+\r
+ status = regcomp(re, pattern, cflags);\r
+ if (status != REG_NOERROR) {\r
+ if (errors == 1) {\r
+ char error_message[MAX_ERROR_MSG];\r
+ regerror (status, re, error_message, MAX_ERROR_MSG);\r
+ fprintf (stderr, "Regex error compiling '%s': %s\n", pattern, error_message);\r
+ }\r
+ }\r
+ return status;\r
+}\r
+\r
/*
 * Run the compiled regex re over str and collect match offsets.
 *
 * On return *result points to an array obtained from wmalloc/wrealloc and
 * the function's return value is the number of entries stored in it.  All
 * offsets are relative to the start of str (as opposed to the start of each
 * individual search).
 *
 * When nsub > 0 every successful search contributes nsub entries, one per
 * subexpression, and the overall match is not stored; a subexpression that
 * did not participate is stored as rm_so = rm_eo = -1.  When nsub is 0
 * every successful search contributes one entry for the overall match.
 *
 * Without 'g' in flags only the first match is collected.
 *
 * The regmatch struct info:
 * regmatch_t.rm_so (regoff_t) = byte offset from start of string to start of substring
 * regmatch_t.rm_eo (regoff_t) = byte offset from start of string to first character after the end of substring
 */
static int find_regex_matches(regex_t *re, const char *str, const int nsub, const char *flags, regmatch_t **result)
{
    /* Each individual match and its subexpression matches are stored in m */
    regmatch_t m[nsub+1];

    /* Where the next search starts: just past the previous match */
    const char *search_from = str;

    const int global = strchr(flags, 'g') != NULL;

    /* Number of entries written to the output array per successful search */
    const int per_match = nsub > 0 ? nsub : 1;

    int array_len = global ? 256 : 32;
    int match_count = 0;
    regmatch_t *matches;

    matches = (regmatch_t *) wmalloc(sizeof(regmatch_t) * array_len);

    while (regexec(re, search_from, nsub+1, m, 0) == 0) {
        /* Offset of this search's starting point within str */
        const regoff_t base = (regoff_t) (search_from - str);
        int i;

        /* Grow (by doubling) until the entries for this match are sure to fit */
        while (match_count + per_match > array_len) {
            array_len *= 2;
            matches = (regmatch_t *) wrealloc(matches, sizeof(regmatch_t) * array_len);
        }

        /* With subexpressions we are only interested in their offsets */
        if (nsub > 0) {
            for (i = 1; i <= nsub; i++) {
                if (m[i].rm_so < 0 || m[i].rm_eo < 0) {
                    matches[match_count].rm_so = -1;
                    matches[match_count].rm_eo = -1;
                }
                else {
                    matches[match_count].rm_so = base + m[i].rm_so;
                    matches[match_count].rm_eo = base + m[i].rm_eo;
                }
                match_count++;
            }
        }
        /* Otherwise we want the offsets of the overall match */
        else {
            matches[match_count].rm_so = base + m[0].rm_so;
            matches[match_count].rm_eo = base + m[0].rm_eo;
            match_count++;
        }

        /*
         * Stop after the first match unless 'g' was given, and stop when
         * the match is an empty one located at the search position itself.
         */
        if (!global || m[0].rm_eo == 0)
            break;

        /* Resume the search just past the end of this match */
        search_from += m[0].rm_eo;

        /*
         * An empty match further along leaves us parked on it; step over
         * one character so it is not reported again, or stop if that was
         * the end of the string.
         */
        if (m[0].rm_so == m[0].rm_eo) {
            if (*search_from == '\0')
                break;
            search_from++;
        }
    }
    *result = matches;

    return match_count;
}
+\r
/*
 * Render the offsets collected by find_regex_matches as text.
 *
 * matches holds match_count entries that index into str.  They are emitted
 * in groups, one group per regex match: nsub entries per group when
 * nsub > 0, otherwise one.  Each group is written as "{e1,e2,...}" and
 * groups are separated by commas.  An entry whose offsets are -1 is written
 * as NULL; any other entry is the corresponding substring of str passed
 * through quote_output.
 *
 * Returns a NUL-terminated buffer that the caller owns.  It is created with
 * wmalloc and grown through reallocate_block.
 */
static char * regex_matches_to_string(const char *str, int nsub, int match_count, regmatch_t *matches) {
    const int str_len = strlen(str);
    const int per_group = nsub > 0 ? nsub : 1;
    int allocated_sz = str_len+1;
    int result_sz = 0;
    char *result = wmalloc(allocated_sz * sizeof(char));
    int j = 0;

    while (j < match_count) {
        int i;

        /* Every group after the first is preceded by a separating comma */
        if (j > 0) {
            result_sz++;
            result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
            result[result_sz-1] = ',';
        }
        result_sz++;
        result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
        result[result_sz-1] = '{';

        for (i = 0; i < per_group && j < match_count; i++, j++) {
            const int so = matches[j].rm_so;
            const int eo = matches[j].rm_eo;

            if (i > 0) {
                result_sz++;
                result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
                result[result_sz-1] = ',';
            }

            if (so == -1 || eo == -1) {
                result = reallocate_block(result, &allocated_sz, (result_sz+4) * sizeof(char), str_len);
                memcpy(result+result_sz, "NULL", 4);
                result_sz += 4;
            }
            else {
                const int span = eo - so;
                char *unquoted = wmalloc((span + 1) * sizeof(char));
                char *quoted;
                int quoted_len;

                memcpy(unquoted, str+so, span);
                unquoted[span] = '\0';

                quoted = quote_output(unquoted);
                quoted_len = strlen(quoted);

                result = reallocate_block(result, &allocated_sz, (result_sz+quoted_len) * sizeof(char), str_len);
                memcpy(result+result_sz, quoted, quoted_len);
                result_sz += quoted_len;

                /* quote_output returns its argument when no quoting was needed */
                if (quoted != unquoted)
                    wfree(quoted);
                wfree(unquoted);
            }
        }

        result_sz++;
        result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
        result[result_sz-1] = '}';
    }

    result_sz++;
    result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
    result[result_sz-1] = '\0';

    return result;
}
+\r
/*
 * Purely check whether pattern matches anywhere in str.
 *
 * flags and errors are passed through to compile_regex.  Returns 1 on a
 * match and 0 otherwise; a pattern that fails to compile also yields 0.
 */
int regexp_match(const char *str, const char *pattern, const char *flags, int errors)
{
    regex_t re;
    int matched;

    /* compile_regex hands back the regcomp status, which is 0 on success */
    if (compile_regex(&re, pattern, flags, errors) != 0)
        return 0;

    matched = (regexec(&re, str, (size_t) 0, NULL, 0) == 0);
    regfree(&re);

    return matched;
}
+\r
/*
 * Return all matches of pattern in str as a string, by first calling
 * find_regex_matches and then regex_matches_to_string.  Arguably this could
 * all be one function, however the separation will make future multiple
 * output formats easier.
 *
 * flags and errors are passed through to compile_regex; flags is also
 * consulted by find_regex_matches.  Returns NULL when the pattern does not
 * compile, otherwise the buffer produced by regex_matches_to_string, which
 * the caller owns.
 */
char * regexp_matches(const char *str, const char *pattern, const char *flags, int errors)
{
    regex_t re;
    regmatch_t *matches_p = NULL;
    char *result;
    int nsub;
    int match_count;

    /* compile_regex hands back the regcomp status, which is 0 on success */
    if (compile_regex(&re, pattern, flags, errors) != 0)
        return NULL;

    /* Count our subexpressions to size the regmatch_t arrays */
    nsub = count_subexpressions(pattern);
    /* Find all the matches relative to the input string */
    match_count = find_regex_matches(&re, str, nsub, flags, &matches_p);
    /* Turn the matches into an output string */
    result = regex_matches_to_string(str, nsub, match_count, matches_p);

    /* Release the array handed back by find_regex_matches */
    wfree(matches_p);
    regfree(&re);

    return result;
}
+\r
/*
 * Append n bytes starting at src to the growing output buffer used by
 * regexp_replace and return the (possibly moved) buffer.  Does nothing when
 * n is not positive.  str_len is forwarded unchanged as the last argument of
 * reallocate_block, exactly as regexp_replace always passed it.
 */
static char * replace_append(char *result, int *allocated_sz, int *result_sz, const char *src, int n, int str_len)
{
    if (n <= 0)
        return result;

    result = reallocate_block(result, allocated_sz, (*result_sz + n) * sizeof(char), str_len);
    memcpy(result + *result_sz, src, n);
    *result_sz += n;

    return result;
}

/*
 * Substitute matches of the regex pattern in str with the replacement
 * pattern/string.
 *
 * Escapes recognised in replacement:
 *   \1 .. \9   text captured by that subexpression (nothing is inserted if
 *              the subexpression does not exist or did not participate)
 *   \&         the whole match
 *   \\         a single backslash
 * A backslash followed by anything else, or a backslash at the very end, is
 * copied literally.
 *
 * Only the first match is replaced unless flags contains 'g'.  An empty
 * match is replaced once and then ends the scan, since searching again from
 * the same position would only find it again.
 *
 * flags and errors are passed through to compile_regex.  Returns NULL when
 * the pattern does not compile, otherwise a NUL-terminated buffer that the
 * caller owns (created with wmalloc and grown through reallocate_block).
 */
char * regexp_replace(const char *str, const char *pattern, const char *replacement, const char *flags, int errors)
{
    regex_t re;
    char *result = NULL;
    const char *prev_match_eo = str;
    int str_len = strlen(str);
    int replacement_len = strlen(replacement);
    int allocated_sz = str_len+1;
    int result_sz = 0;

    /* compile_regex hands back the regcomp status, which is 0 on success */
    if (compile_regex(&re, pattern, flags, errors) != 0)
        return NULL;

    result = wmalloc(allocated_sz * sizeof(char));

    /* Count our subexpressions to size our regmatch_t array */
    const int nsub = count_subexpressions(pattern);
    const int global = strchr(flags, 'g') != NULL;
    const int escapes = has_escapes(replacement);
    regmatch_t m[nsub+1];

    while (regexec(&re, prev_match_eo, nsub+1, m, 0) == 0) {

        /* Copy the unmatched text between the previous match and this one */
        result = replace_append(result, &allocated_sz, &result_sz, prev_match_eo, m[0].rm_so, str_len);

        if (!escapes) {
            /* No backslash in the replacement: copy it in verbatim */
            result = replace_append(result, &allocated_sz, &result_sz, replacement, replacement_len, str_len);
        }
        else {
            /* literal marks the start of replacement text not yet copied */
            const char *literal = replacement;
            const char *ptr = replacement;

            while (*ptr) {
                if (*ptr != '\\') {
                    ptr++;
                    continue;
                }

                /* Flush everything to the left of this backslash */
                result = replace_append(result, &allocated_sz, &result_sz, literal, (int) (ptr - literal), str_len);

                /* Look at the character being escaped (may be the NUL) */
                ptr++;

                if ((*ptr >= '1' && *ptr <= '9') || (*ptr == '&')) {
                    const int sub = (*ptr == '&') ? 0 : *ptr - '0';

                    /* Range-check sub before touching m[sub] */
                    if (sub <= nsub && m[sub].rm_so != -1 && m[sub].rm_eo != -1) {
                        result = replace_append(result, &allocated_sz, &result_sz,
                                                prev_match_eo + m[sub].rm_so,
                                                (int) (m[sub].rm_eo - m[sub].rm_so), str_len);
                    }
                    ptr++;
                }
                else {
                    /*
                     * Either an escaped backslash, which collapses to one
                     * backslash, or an unrecognised/trailing escape, whose
                     * backslash is kept and whose following character (if
                     * any) is left to be copied as ordinary text.
                     */
                    result = replace_append(result, &allocated_sz, &result_sz, "\\", 1, str_len);
                    if (*ptr == '\\')
                        ptr++;
                }
                literal = ptr;
            }

            /* Whatever follows the last escape */
            result = replace_append(result, &allocated_sz, &result_sz, literal, (int) (ptr - literal), str_len);
        }

        prev_match_eo += m[0].rm_eo;

        /*
         * If we have matched on a blank expression or we were not flagged
         * to replace globally then stop.
         */
        if (m[0].rm_eo == m[0].rm_so || !global)
            break;
    }

    /* Copy everything to the right of the last match */
    result = replace_append(result, &allocated_sz, &result_sz, prev_match_eo,
                            (int) (str_len - (prev_match_eo - str)), str_len);

    regfree(&re);

    result_sz++;
    result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
    result[result_sz-1] = '\0';

    return result;
}