1 /*-------------------------------------------------------------------------
\r
3 * posix regex extensions
\r
5 * Copyright (c) 2007-2015, glyn@8kb.co.uk
\r
6 * Author: Glyn Astill <glyn@8kb.co.uk>
\r
8 *-------------------------------------------------------------------------
\r
17 #define MAX_ERROR_MSG 0x1000
\r
20 * Return a properly escaped / quoted string
\r
22 static char * quote_output(char *str) {
\r
24 char *result_return;
\r
31 /* Check for characters that need quoting */
\r
32 for (ptr = str; *ptr; ptr++) {
\r
34 if (ch == '"' || ch =='\\' || ch == '{' || ch == ',') {
\r
40 /* If we find no characters that need quoting just return the input */
\r
44 /* Do the quoting, here the allocation is wasteful */
\r
45 result = (char *) wmalloc((len * 2 + 3) * sizeof(char));
\r
46 result_return = result;
\r
49 * Starting address of result is incremented as we modify its contents here
\r
50 * with result_return keeping the starting address
\r
54 /* Escape double quotes and backslash with backslash */
\r
66 return result_return;
\r
70 * Count open parenthesis to evaluate the number of subexpressions in the regex
\r
72 static int count_subexpressions(const char *str){
\r
74 int last_was_backslash = 0;
\r
77 for(ptr=str; *ptr; ptr++){
\r
78 if (*ptr == '\\' && !last_was_backslash){
\r
79 last_was_backslash = 1;
\r
82 if (*ptr == ')' && !last_was_backslash)
\r
84 last_was_backslash = 0;
\r
90 * Check to see if string contains any escape chars
\r
91 * these could of course just be escaped backslashes
\r
94 static int has_escapes(const char *str){
\r
97 for(ptr=str; *ptr; ptr++){
\r
105 * Compile the regex pattern
\r
107 static int compile_regex(regex_t *re, const char *pattern, const char *flags, int errors)
\r
110 int cflags = REG_EXTENDED;
\r
112 if (strchr(flags, 'i')) {
\r
113 cflags = cflags|REG_ICASE;
\r
115 if (strchr(flags, 'n')) {
\r
116 cflags = cflags|REG_NEWLINE;
\r
119 status = regcomp(re, pattern, cflags);
\r
120 if (status != REG_NOERROR) {
\r
122 char error_message[MAX_ERROR_MSG];
\r
123 regerror (status, re, error_message, MAX_ERROR_MSG);
\r
124 fprintf (stderr, "Regex error compiling '%s': %s\n", pattern, error_message);
\r
131 * Returns a pointer to a malloced array of regmatch_t containing match offsets
\r
132 * in the input string. (As opposed to offsets from each match)
\r
134 * The regmatch struct info:
\r
135 * regmatch_t.rm_so (regoff_t) = byte offset from start of string to start of substring
\r
136 * regmatch_t.rm_eo (regoff_t) = byte offset from start of string to first character after the end of substring
\r
138 static int find_regex_matches(regex_t *re, const char *str, const int nsub, const char *flags, regmatch_t **result)
\r
140 /* Each individual match and its subexpression matches are stored in m */
\r
141 regmatch_t m[nsub+1];
\r
143 /* A pointer into the string at the end of the previous match */
\r
144 const char *prev_match_eo = str;
\r
147 * We return a count of matches and pass back an array of regmatch_t in
\r
148 * matches containing match offsets in the original string
\r
150 int array_len = strchr(flags, 'g') ? 256 : 32;
\r
151 int match_count = 0;
\r
152 regmatch_t *matches;
\r
154 matches = (regmatch_t *) wmalloc(sizeof(regmatch_t) * array_len);
\r
156 while (!regexec(re, prev_match_eo, nsub+1, m, 0)) {
\r
159 /* resize the matches array; when more space is required double current size */
\r
160 while (match_count + (nsub * 2) > array_len) {
\r
162 matches = (regmatch_t *) wrealloc(matches, sizeof(regmatch_t) * array_len);
\r
165 /* when we have subexpressions, we're only interested in their match offsets */
\r
167 for (i = 1; i <= nsub; i++) {
\r
168 if (m[i].rm_so < 0 || m[i].rm_eo < 0) {
\r
169 matches[match_count].rm_so = -1;
\r
170 matches[match_count++].rm_eo = -1;
\r
173 matches[match_count].rm_so = (prev_match_eo - str) + m[i].rm_so;
\r
174 matches[match_count++].rm_eo = (prev_match_eo - str) + m[i].rm_eo;
\r
178 /* else we want the original match offsets*/
\r
180 matches[match_count].rm_so = (prev_match_eo - str) + m[0].rm_so;
\r
181 matches[match_count++].rm_eo = (prev_match_eo - str) + m[0].rm_eo;
\r
185 * If we have matched on a blank expression or we were
\r
186 * not flagged to do greedy matching then break
\r
188 if (!m[0].rm_eo || !strchr(flags, 'g'))
\r
192 * Advance the search position to the end of the current match
\r
193 * If the match happens to be zero length, advance search position
\r
196 if (m[0].rm_eo == m[0].rm_so)
\r
199 prev_match_eo += m[0].rm_eo;
\r
203 return match_count;
\r
207 * Takes regmatch_t array returned by find_regex_matches and returns a malloced
\r
208 * string representing the captured substrings.
\r
210 static char * regex_matches_to_string(const char *str, int nsub, int match_count, regmatch_t *matches) {
\r
213 char *unquoted = NULL;
\r
214 char *quoted = NULL;
\r
218 int str_len = strlen(str);
\r
219 int allocated_sz = str_len+1;
\r
220 result = wmalloc(allocated_sz * sizeof(char));
\r
224 while (j < match_count) {
\r
228 result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
\r
229 result[result_sz-2] = ',';
\r
230 result[result_sz-1] = '{';
\r
234 result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
\r
235 result[result_sz-1] = '{';
\r
238 for (i = 0; i <= nsub; i++) {
\r
239 if ((nsub > 0) && (i == 0))
\r
244 result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
\r
245 result[result_sz-1] = ',';
\r
248 int so = matches[j].rm_so;
\r
249 int eo = matches[j].rm_eo;
\r
251 if (so == -1 || eo == -1) {
\r
252 result = reallocate_block(result, &allocated_sz, (result_sz+4) * sizeof(char), str_len);
\r
253 strncpy(result+result_sz, "NULL", 4);
\r
257 unquoted = wmalloc((eo-so)+1 * sizeof(char));
\r
258 strncpy(unquoted, str+so, eo-so);
\r
259 unquoted[eo-so] = '\0';
\r
260 quoted = quote_output(unquoted);
\r
261 quoted_len = strlen(quoted);
\r
263 result = reallocate_block(result, &allocated_sz, (result_sz+quoted_len) * sizeof(char), str_len);
\r
264 strncpy(result+result_sz, quoted, quoted_len);
\r
265 result_sz += quoted_len;
\r
267 if (quoted != unquoted)
\r
275 result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
\r
276 result[result_sz-1] = '}';
\r
280 result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
\r
281 result[result_sz-1] = '\0';
\r
287 * Purely check for a match in the regex
\r
289 int regexp_match(const char *str, const char *pattern, const char *flags, int errors)
\r
295 status = compile_regex(&re, pattern, flags, errors);
\r
296 if (status == REG_NOERROR) {
\r
297 result = regexec(&re, str, (size_t) 0, NULL, 0);
\r
300 if (!result) /* match */
\r
302 else /* no match */
\r
305 else /* error condition, but still: no match */
\r
310 * Return all matches in the regex as a string by first calling find_regex_matches
\r
311 * and then regex_matches_to_string. Arguably this could all be one function
\r
312 * however separation will make future multiple output formats easier.
\r
314 char * regexp_matches(const char *str, const char *pattern, const char *flags, int errors)
\r
317 regmatch_t *matches_p = NULL;
\r
321 char *result = NULL;
\r
323 /* Compile the regex */
\r
324 status = compile_regex(&re, pattern, flags, errors);
\r
325 if (status == REG_NOERROR) {
\r
326 /* Count our subexpressions to size our regmatch_t array */
\r
327 nsub = count_subexpressions(pattern);
\r
328 /* Find all the matches relative to the input string */
\r
329 match_count = find_regex_matches(&re, str, nsub, flags, &matches_p);
\r
330 /* Turn the matches into an output string */
\r
331 result = regex_matches_to_string(str, nsub, match_count, matches_p);
\r
332 /* Free up the regmatch_t malloced by find_regex_matches */
\r
341 * Substitutes matches with the regex pattern in the string with the replacement
\r
344 char * regexp_replace(const char *str, const char *pattern, const char *replacement, const char *flags, int errors)
\r
348 char *result = NULL;
\r
351 const char *prev_match_eo = str;
\r
352 int str_len = strlen(str);
\r
353 int replacement_len = strlen(replacement);
\r
354 int allocated_sz = str_len+1;
\r
357 status = compile_regex(&re, pattern, flags, errors);
\r
358 if (status == REG_NOERROR) {
\r
360 result = wmalloc(allocated_sz * sizeof(char));
\r
362 /* Count our subexpressions to size our regmatch_t array */
\r
363 nsub = count_subexpressions(pattern);
\r
364 regmatch_t m[nsub+1];
\r
366 while (!regexec(&re, prev_match_eo, nsub+1, m, 0)) {
\r
368 /* Copy everything to the left of the first match */
\r
369 if (m[0].rm_so > 0) {
\r
370 result = reallocate_block(result, &allocated_sz, (result_sz+m[0].rm_so) * sizeof(char), str_len);
\r
371 strncpy(result+result_sz, prev_match_eo, m[0].rm_so);
\r
372 result_sz += m[0].rm_so;
\r
375 /* If there are no backreferences in the replacement, copy in the replacement */
\r
376 if (!has_escapes(replacement)) {
\r
377 result = reallocate_block(result, &allocated_sz, (result_sz+replacement_len) * sizeof(char), str_len);
\r
378 strncpy(result+result_sz, replacement, replacement_len);
\r
379 result_sz += replacement_len;
\r
381 /* Otherwise process the backreferences and copy in subcaptures */
\r
383 /* find the next escape char */
\r
384 const char *start = replacement;
\r
387 for(ptr = replacement; *ptr; ptr++) {
\r
391 /* append everything to the left of the current escape */
\r
392 result = reallocate_block(result, &allocated_sz, (result_sz+(ptr-start)) * sizeof(char), str_len);
\r
393 strncpy(result+result_sz, start, (ptr-start));
\r
394 result_sz += (ptr-start);
\r
398 if ((*ptr >= '1' && *ptr <= '9') || (*ptr == '&'))
\r
400 /* Use the back reference of regexp. */
\r
407 if (m[sub].rm_so != -1 && m[sub].rm_eo != -1 && sub <= nsub) {
\r
408 result = reallocate_block(result, &allocated_sz, (result_sz+(m[sub].rm_eo-m[sub].rm_so)) * sizeof(char), str_len);
\r
409 strncpy(result+result_sz, prev_match_eo+m[sub].rm_so, (m[sub].rm_eo-m[sub].rm_so));
\r
410 result_sz += (m[sub].rm_eo-m[sub].rm_so);
\r
414 else if (*ptr == '\\')
\r
416 /* append backslash */
\r
418 result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
\r
419 result[result_sz-1] = '\\';
\r
423 /* append backslash */
\r
425 result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
\r
426 result[result_sz-1] = '\\';
\r
431 * Append right trailing replacement, except in the instance
\r
432 * when it starts with character zero, which can happen when the
\r
433 * last part of the replace string is escaped.
\r
436 result = reallocate_block(result, &allocated_sz, (result_sz+(ptr-start)) * sizeof(char), str_len);
\r
437 strncpy(result+result_sz, start, (ptr-start));
\r
438 result_sz += (ptr-start);
\r
442 prev_match_eo += m[0].rm_eo;
\r
445 * If we have matched on a blank expression or we were
\r
446 * not flagged to do greedy matching then break
\r
448 if (!m[0].rm_eo || !strchr(flags, 'g'))
\r
452 /* Copy everything to the right of the last match */
\r
453 result = reallocate_block(result, &allocated_sz, (result_sz+(str_len-(prev_match_eo-str))) * sizeof(char), str_len);
\r
454 strncpy(result+result_sz, prev_match_eo, str_len-(prev_match_eo-str));
\r
455 result_sz += str_len-(prev_match_eo-str);
\r
460 result = reallocate_block(result, &allocated_sz, result_sz * sizeof(char), str_len);
\r
461 result[result_sz-1] = '\0';
\r