1 /*
2 tre-parse.c - Regexp parser
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7 */
8
9 /*
10 This parser is just a simple recursive descent parser for POSIX.2
11 regexps. The parser supports both the obsolete default syntax and
12 the "extended" syntax, and some nonstandard extensions.
13 */
14
15
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <string.h>
20 // #include <assert.h>
21 #include <limits.h>
22
23 #include "xmalloc.h"
24 #include "tre-mem.h"
25 #include "tre-ast.h"
26 #include "tre-stack.h"
27 #include "tre-parse.h"
28
29 #define assert(a) R_assert(a)
30
31 /* Characters with special meanings in regexp syntax. */
32 #define CHAR_PIPE L'|'
33 #define CHAR_LPAREN L'('
34 #define CHAR_RPAREN L')'
35 #define CHAR_LBRACE L'{'
36 #define CHAR_RBRACE L'}'
37 #define CHAR_LBRACKET L'['
38 #define CHAR_RBRACKET L']'
39 #define CHAR_MINUS L'-'
40 #define CHAR_STAR L'*'
41 #define CHAR_QUESTIONMARK L'?'
42 #define CHAR_PLUS L'+'
43 #define CHAR_PERIOD L'.'
44 #define CHAR_COLON L':'
45 #define CHAR_EQUAL L'='
46 #define CHAR_COMMA L','
47 #define CHAR_CARET L'^'
48 #define CHAR_DOLLAR L'$'
49 #define CHAR_BACKSLASH L'\\'
50 #define CHAR_HASH L'#'
51 #define CHAR_TILDE L'~'
52
53
54 /* Some macros for expanding \w, \s, etc. */
55 static const struct tre_macro_struct {
56 const char c;
57 const char *expansion;
58 } tre_macros[] =
59 { {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
60 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
61 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
62 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
63 { 0, NULL }
64 };
65
66
67 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
68 must have at least `len' items. Sets buf[0] to zero if the there
69 is no match in `tre_macros'. */
70 static void
tre_expand_macro(const tre_char_t * regex,const tre_char_t * regex_end,tre_char_t * buf,size_t buf_len)71 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
72 tre_char_t *buf, size_t buf_len)
73 {
74 int i;
75
76 buf[0] = 0;
77 if (regex >= regex_end)
78 return;
79
80 for (i = 0; tre_macros[i].expansion; i++)
81 {
82 if (tre_macros[i].c == *regex)
83 {
84 unsigned int j;
85 DPRINT(("Expanding macro '%c' => '%s'\n",
86 tre_macros[i].c, tre_macros[i].expansion));
87 // R_change: - 1 is r62537
88 for (j = 0; tre_macros[i].expansion[j] && j < buf_len - 1; j++)
89 buf[j] = tre_macros[i].expansion[j];
90 buf[j] = 0;
91 break;
92 }
93 }
94 }
95
96 static reg_errcode_t
tre_new_item(tre_mem_t mem,int min,int max,int * i,int * max_i,tre_ast_node_t *** items)97 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
98 tre_ast_node_t ***items)
99 {
100 reg_errcode_t status;
101 tre_ast_node_t **array = *items;
102 /* Allocate more space if necessary. */
103 if (*i >= *max_i)
104 {
105 tre_ast_node_t **new_items;
106 DPRINT(("out of array space, i = %d\n", *i));
107 /* If the array is already 1024 items large, give up -- there's
108 probably an error in the regexp (e.g. not a '\0' terminated
109 string and missing ']') */
110 if (*max_i > 1024)
111 return REG_ESPACE;
112 *max_i *= 2;
113 new_items = xrealloc(array, sizeof(*items) * *max_i);
114 if (new_items == NULL)
115 return REG_ESPACE;
116 *items = array = new_items;
117 }
118 array[*i] = tre_ast_new_literal(mem, min, max, -1);
119 status = array[*i] == NULL ? REG_ESPACE : REG_OK;
120 (*i)++;
121 return status;
122 }
123
124
125 /* Expands a character class to character ranges. */
126 static reg_errcode_t
tre_expand_ctype(tre_mem_t mem,tre_ctype_t class,tre_ast_node_t *** items,int * i,int * max_i,int cflags)127 tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items,
128 int *i, int *max_i, int cflags)
129 {
130 reg_errcode_t status = REG_OK;
131 tre_cint_t c;
132 int j, min = -1, max = 0;
133 //assert(TRE_MB_CUR_MAX == 1); It is the ctx->cur_max that matters
134
135 DPRINT((" expanding class to character ranges\n"));
136 for (j = 0; (j < 256) && (status == REG_OK); j++)
137 {
138 c = (tre_cint_t) j;
139 if (tre_isctype(c, class)
140 || ((cflags & REG_ICASE)
141 && (tre_isctype(tre_tolower(c), class)
142 || tre_isctype(tre_toupper(c), class))))
143 {
144 if (min < 0)
145 min = c;
146 max = c;
147 }
148 else if (min >= 0)
149 {
150 DPRINT((" range %c (%d) to %c (%d)\n", min, min, max, max));
151 status = tre_new_item(mem, min, max, i, max_i, items);
152 min = -1;
153 }
154 }
155 if (min >= 0 && status == REG_OK)
156 status = tre_new_item(mem, min, max, i, max_i, items);
157 return status;
158 }
159
160
161 static int
tre_compare_items(const void * a,const void * b)162 tre_compare_items(const void *a, const void *b)
163 {
164 const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
165 const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
166 tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
167 long a_min = l_a->code_min, b_min = l_b->code_min;
168
169 if (a_min < b_min)
170 return -1;
171 else if (a_min > b_min)
172 return 1;
173 else
174 return 0;
175 }
176
177 #ifndef TRE_USE_SYSTEM_WCTYPE
178
179 /* isalnum() and the rest may be macros, so wrap them to functions. */
tre_isalnum_func(tre_cint_t c)180 int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
tre_isalpha_func(tre_cint_t c)181 int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
182
183 #ifdef tre_isascii
tre_isascii_func(tre_cint_t c)184 int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
185 #else /* !tre_isascii */
tre_isascii_func(tre_cint_t c)186 int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
187 #endif /* !tre_isascii */
188
189 #ifdef tre_isblank
tre_isblank_func(tre_cint_t c)190 int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
191 #else /* !tre_isblank */
tre_isblank_func(tre_cint_t c)192 int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
193 #endif /* !tre_isblank */
194
tre_iscntrl_func(tre_cint_t c)195 int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
tre_isdigit_func(tre_cint_t c)196 int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
tre_isgraph_func(tre_cint_t c)197 int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
tre_islower_func(tre_cint_t c)198 int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
tre_isprint_func(tre_cint_t c)199 int tre_isprint_func(tre_cint_t c)
200 {
201 #ifdef TRE_WCHAR
202 /* Windows has \t as printable via iswprint in all locales. By POSIX
203 and ?regex, we need \t to be non-printable in the C locale, so we
204 cannot use iswprint. By C99, iswprint(L'\t') should be the same as
205 isprint('\t'). In Windows, in C locale, isprint('\t') is false,
206 hence this workaround. */
207 if (c == L'\t') return isprint('\t');
208 #endif
209 return tre_isprint(c); /* Windows has \t as printable */
210 }
tre_ispunct_func(tre_cint_t c)211 int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
tre_isspace_func(tre_cint_t c)212 int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
tre_isupper_func(tre_cint_t c)213 int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
tre_isxdigit_func(tre_cint_t c)214 int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
215
216 struct {
217 char *name;
218 int (*func)(tre_cint_t);
219 } tre_ctype_map[] = {
220 { "alnum", &tre_isalnum_func },
221 { "alpha", &tre_isalpha_func },
222 #ifdef tre_isascii
223 { "ascii", &tre_isascii_func },
224 #endif /* tre_isascii */
225 #ifdef tre_isblank
226 { "blank", &tre_isblank_func },
227 #endif /* tre_isblank */
228 { "cntrl", &tre_iscntrl_func },
229 { "digit", &tre_isdigit_func },
230 { "graph", &tre_isgraph_func },
231 { "lower", &tre_islower_func },
232 { "print", &tre_isprint_func },
233 { "punct", &tre_ispunct_func },
234 { "space", &tre_isspace_func },
235 { "upper", &tre_isupper_func },
236 { "xdigit", &tre_isxdigit_func },
237 { NULL, NULL}
238 };
239
tre_ctype(const char * name)240 tre_ctype_t tre_ctype(const char *name)
241 {
242 int i;
243 for (i = 0; tre_ctype_map[i].name != NULL; i++)
244 {
245 if (strcmp(name, tre_ctype_map[i].name) == 0)
246 return tre_ctype_map[i].func;
247 }
248 return (tre_ctype_t)0;
249 }
250 #endif /* !TRE_USE_SYSTEM_WCTYPE */
251
252 /* Maximum number of character classes that can occur in a negated bracket
253 expression. */
254 #define MAX_NEG_CLASSES 64
255
256 /* Maximum length of character class names. */
257 #define MAX_CLASS_NAME
258
259 #define REST(re) (int)(ctx->re_end - (re)), (re)
260
261 static reg_errcode_t
tre_parse_bracket_items(tre_parse_ctx_t * ctx,int negate,tre_ctype_t neg_classes[],int * num_neg_classes,tre_ast_node_t *** items,int * num_items,int * items_size)262 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
263 tre_ctype_t neg_classes[], int *num_neg_classes,
264 tre_ast_node_t ***items, int *num_items,
265 int *items_size)
266 {
267 const tre_char_t *re = ctx->re;
268 reg_errcode_t status = REG_OK;
269 tre_ctype_t class = (tre_ctype_t)0;
270 int i = *num_items;
271 int max_i = *items_size;
272 int skip;
273
274 /* Build an array of the items in the bracket expression. */
275 while (status == REG_OK)
276 {
277 skip = 0;
278 if (re == ctx->re_end)
279 {
280 status = REG_EBRACK;
281 }
282 else if (*re == CHAR_RBRACKET && re > ctx->re)
283 {
284 DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re)));
285 re++;
286 break;
287 }
288 else
289 {
290 tre_cint_t min = 0, max = 0;
291
292 class = (tre_ctype_t)0;
293 if (re + 2 < ctx->re_end
294 && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET)
295 {
296 DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re)));
297 min = *re;
298 max = *(re + 2);
299 re += 3;
300 /* XXX - Should use collation order instead of encoding values
301 in character ranges. */
302 if (min > max)
303 status = REG_ERANGE;
304 }
305 else if (re + 1 < ctx->re_end
306 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
307 status = REG_ECOLLATE;
308 else if (re + 1 < ctx->re_end
309 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
310 status = REG_ECOLLATE;
311 else if (re + 1 < ctx->re_end
312 && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
313 {
314 char tmp_str[64];
315 const tre_char_t *endptr = re + 2;
316 size_t len;
317 DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re)));
318 while (endptr < ctx->re_end && *endptr != CHAR_COLON)
319 endptr++;
320 if (endptr != ctx->re_end)
321 {
322 len = MIN(endptr - re - 2, 63);
323 #ifdef TRE_WCHAR
324 {
325 tre_char_t tmp_wcs[64];
326 wcsncpy(tmp_wcs, re + 2, len);
327 tmp_wcs[len] = L'\0';
328 #if defined HAVE_WCSRTOMBS
329 {
330 mbstate_t state;
331 const tre_char_t *src = tmp_wcs;
332 memset(&state, '\0', sizeof(state));
333 len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
334 }
335 #elif defined HAVE_WCSTOMBS
336 len = wcstombs(tmp_str, tmp_wcs, 63);
337 #endif /* defined HAVE_WCSTOMBS */
338 }
339 #else /* !TRE_WCHAR */
340 strncpy(tmp_str, (const char*)re + 2, len);
341 #endif /* !TRE_WCHAR */
342 tmp_str[len] = '\0';
343 DPRINT((" class name: %s\n", tmp_str));
344 class = tre_ctype(tmp_str);
345 if (!class)
346 status = REG_ECTYPE;
347 /* Optimize character classes for 8 bit character sets. */
348 if (status == REG_OK && ctx->cur_max == 1)
349 {
350 status = tre_expand_ctype(ctx->mem, class, items,
351 &i, &max_i, ctx->cflags);
352 class = (tre_ctype_t)0;
353 skip = 1;
354 }
355 re = endptr + 2;
356 }
357 else
358 status = REG_ECTYPE;
359 min = 0;
360 max = TRE_CHAR_MAX;
361 }
362 else
363 {
364 DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
365 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
366 && ctx->re != re)
367 /* Two ranges are not allowed to share and endpoint. */
368 status = REG_ERANGE;
369 min = max = *re++;
370 }
371
372 if (status != REG_OK)
373 break;
374
375 if (class && negate)
376 if (*num_neg_classes >= MAX_NEG_CLASSES)
377 status = REG_ESPACE;
378 else
379 neg_classes[(*num_neg_classes)++] = class;
380 else if (!skip)
381 {
382 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
383 if (status != REG_OK)
384 break;
385 ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
386 }
387
388 /* Add opposite-case counterpoints if REG_ICASE is present.
389 This is broken if there are more than two "same" characters. */
390 if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
391 {
392 tre_cint_t cmin, ccurr;
393
394 DPRINT(("adding opposite-case counterpoints\n"));
395 while (min <= max)
396 {
397 if (tre_islower(min))
398 {
399 cmin = ccurr = tre_toupper(min++);
400 while (tre_islower(min) && tre_toupper(min) == ccurr + 1
401 && min <= max)
402 ccurr = tre_toupper(min++);
403 status = tre_new_item(ctx->mem, cmin, ccurr,
404 &i, &max_i, items);
405 }
406 else if (tre_isupper(min))
407 {
408 cmin = ccurr = tre_tolower(min++);
409 while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
410 && min <= max)
411 ccurr = tre_tolower(min++);
412 status = tre_new_item(ctx->mem, cmin, ccurr,
413 &i, &max_i, items);
414 }
415 else min++;
416 if (status != REG_OK)
417 break;
418 }
419 if (status != REG_OK)
420 break;
421 }
422 }
423 }
424 *num_items = i;
425 *items_size = max_i;
426 ctx->re = re;
427 return status;
428 }
429
430 static reg_errcode_t
tre_parse_bracket(tre_parse_ctx_t * ctx,tre_ast_node_t ** result)431 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
432 {
433 tre_ast_node_t *node = NULL;
434 int negate = 0;
435 reg_errcode_t status = REG_OK;
436 tre_ast_node_t **items, *u, *n;
437 int i = 0, j, max_i = 32, curr_max, curr_min;
438 tre_ctype_t neg_classes[MAX_NEG_CLASSES];
439 int num_neg_classes = 0;
440
441 /* Start off with an array of `max_i' elements. */
442 items = xmalloc(sizeof(*items) * max_i);
443 if (items == NULL)
444 return REG_ESPACE;
445
446 if (*ctx->re == CHAR_CARET)
447 {
448 DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
449 negate = 1;
450 ctx->re++;
451 }
452
453 status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
454 &items, &i, &max_i);
455
456 if (status != REG_OK)
457 goto parse_bracket_done;
458
459 /* Sort the array if we need to negate it. */
460 if (negate)
461 qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
462
463 curr_max = curr_min = 0;
464 /* Build a union of the items in the array, negated if necessary. */
465 for (j = 0; j < i && status == REG_OK; j++)
466 {
467 int min, max;
468 tre_literal_t *l = items[j]->obj;
469 min = (int) l->code_min;
470 max = (int) l->code_max;
471
472 DPRINT(("item: %d - %d, class %p, curr_max = %d\n",
473 (int)l->code_min, (int)l->code_max, (void *)l->u.class, curr_max));
474
475 if (negate)
476 {
477 if (min < curr_max)
478 {
479 /* Overlap. */
480 curr_max = MAX(max + 1, curr_max);
481 DPRINT(("overlap, curr_max = %d\n", curr_max));
482 l = NULL;
483 }
484 else
485 {
486 /* No overlap. */
487 curr_max = min - 1;
488 if (curr_max >= curr_min)
489 {
490 DPRINT(("no overlap\n"));
491 l->code_min = curr_min;
492 l->code_max = curr_max;
493 }
494 else
495 {
496 DPRINT(("no overlap, zero room\n"));
497 l = NULL;
498 }
499 curr_min = curr_max = max + 1;
500 }
501 }
502
503 if (l != NULL)
504 {
505 int k;
506 DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max));
507 l->position = ctx->position;
508 if (num_neg_classes > 0)
509 {
510 l->neg_classes = tre_mem_alloc(ctx->mem,
511 (sizeof(l->neg_classes)
512 * (num_neg_classes + 1)));
513 if (l->neg_classes == NULL)
514 {
515 status = REG_ESPACE;
516 break;
517 }
518 for (k = 0; k < num_neg_classes; k++)
519 l->neg_classes[k] = neg_classes[k];
520 l->neg_classes[k] = (tre_ctype_t)0;
521 }
522 else
523 l->neg_classes = NULL;
524 if (node == NULL)
525 node = items[j];
526 else
527 {
528 u = tre_ast_new_union(ctx->mem, node, items[j]);
529 if (u == NULL)
530 status = REG_ESPACE;
531 node = u;
532 }
533 }
534 }
535
536 if (status != REG_OK)
537 goto parse_bracket_done;
538
539 if (negate)
540 {
541 int k;
542 DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX));
543 n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
544 if (n == NULL)
545 status = REG_ESPACE;
546 else
547 {
548 tre_literal_t *l = n->obj;
549 if (num_neg_classes > 0)
550 {
551 l->neg_classes = tre_mem_alloc(ctx->mem,
552 (sizeof(l->neg_classes)
553 * (num_neg_classes + 1)));
554 if (l->neg_classes == NULL)
555 {
556 status = REG_ESPACE;
557 goto parse_bracket_done;
558 }
559 for (k = 0; k < num_neg_classes; k++)
560 l->neg_classes[k] = neg_classes[k];
561 l->neg_classes[k] = (tre_ctype_t)0;
562 }
563 else
564 l->neg_classes = NULL;
565 if (node == NULL)
566 node = n;
567 else
568 {
569 u = tre_ast_new_union(ctx->mem, node, n);
570 if (u == NULL)
571 status = REG_ESPACE;
572 node = u;
573 }
574 }
575 }
576
577 if (status != REG_OK)
578 goto parse_bracket_done;
579
580 #ifdef TRE_DEBUG
581 tre_ast_print(node);
582 #endif /* TRE_DEBUG */
583
584 parse_bracket_done:
585 xfree(items);
586 ctx->position++;
587 *result = node;
588 return status;
589 }
590
591
592 // patch from https://github.com/laurikari/tre/issues/55
593 /* Parses a positive decimal integer. Returns -1 if the string does not
594 contain a valid number. */
595 static int
tre_parse_int(const tre_char_t ** regex,const tre_char_t * regex_end)596 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
597 {
598 int num = -1;
599 int overflow = 0;
600 const tre_char_t *r = *regex;
601 while (r < regex_end && *r >= L'0' && *r <= L'9')
602 {
603 if (num < 0)
604 num = 0;
605 if (num <= (INT_MAX - 9) / 10) {
606 num = num * 10 + *r - L'0';
607 } else {
608 /* This digit could cause an integer overflow. We do not return
609 * directly; instead, consume all remaining digits. */
610 overflow = 1;
611 }
612 r++;
613 }
614 *regex = r;
615 return overflow ? -1 : num;
616 }
617
618
619 static reg_errcode_t
tre_parse_bound(tre_parse_ctx_t * ctx,tre_ast_node_t ** result)620 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
621 {
622 int min, max, i;
623 int cost_ins, cost_del, cost_subst, cost_max;
624 int limit_ins, limit_del, limit_subst, limit_err;
625 const tre_char_t *r = ctx->re;
626 const tre_char_t *start;
627 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
628 int approx = 0;
629 int costs_set = 0;
630 int counts_set = 0;
631
632 cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
633 limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
634
635 /* Parse number (minimum repetition count). */
636 min = -1;
637 if (r < ctx->re_end && *r >= L'0' && *r <= L'9') {
638 DPRINT(("tre_parse: min count: '%.*" STRF "'\n", REST(r)));
639 min = tre_parse_int(&r, ctx->re_end);
640 }
641
642 /* Parse comma and second number (maximum repetition count). */
643 max = min;
644 if (r < ctx->re_end && *r == CHAR_COMMA)
645 {
646 r++;
647 DPRINT(("tre_parse: max count: '%.*" STRF "'\n", REST(r)));
648 max = tre_parse_int(&r, ctx->re_end);
649 }
650
651 /* Check that the repeat counts are sane. */
652 if ((max >= 0 && min > max) || max > RE_DUP_MAX)
653 return REG_BADBR;
654
655
656 /*
657 '{'
658 optionally followed immediately by a number == minimum repcount
659 optionally followed by , then a number == maximum repcount
660 + then a number == maximum insertion count
661 - then a number == maximum deletion count
662 # then a number == maximum substitution count
663 ~ then a number == maximum number of errors
664 Any of +, -, # or ~ without followed by a number means that
665 the maximum count/number of errors is infinite.
666
667 An equation of the form
668 Xi + Yd + Zs < C
669 can be specified to set costs and the cost limit to a value
670 different from the default value:
671 - X is the cost of an insertion
672 - Y is the cost of a deletion
673 - Z is the cost of a substitution
674 - C is the maximum cost
675
676 If no count limit or cost is set for an operation, the operation
677 is not allowed at all.
678 */
679
680
681 do {
682 int done;
683 start = r;
684
685 /* Parse count limit settings */
686 done = 0;
687 if (!counts_set)
688 while (r + 1 < ctx->re_end && !done)
689 {
690 switch (*r)
691 {
692 case CHAR_PLUS: /* Insert limit */
693 DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", REST(r)));
694 r++;
695 limit_ins = tre_parse_int(&r, ctx->re_end);
696 if (limit_ins < 0)
697 limit_ins = INT_MAX;
698 counts_set = 1;
699 break;
700 case CHAR_MINUS: /* Delete limit */
701 DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", REST(r)));
702 r++;
703 limit_del = tre_parse_int(&r, ctx->re_end);
704 if (limit_del < 0)
705 limit_del = INT_MAX;
706 counts_set = 1;
707 break;
708 case CHAR_HASH: /* Substitute limit */
709 DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
710 r++;
711 limit_subst = tre_parse_int(&r, ctx->re_end);
712 if (limit_subst < 0)
713 limit_subst = INT_MAX;
714 counts_set = 1;
715 break;
716 case CHAR_TILDE: /* Maximum number of changes */
717 DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
718 r++;
719 limit_err = tre_parse_int(&r, ctx->re_end);
720 if (limit_err < 0)
721 limit_err = INT_MAX;
722 approx = 1;
723 break;
724 case CHAR_COMMA:
725 r++;
726 break;
727 case L' ':
728 r++;
729 break;
730 case L'}':
731 done = 1;
732 break;
733 default:
734 done = 1;
735 break;
736 }
737 }
738
739 /* Parse cost restriction equation. */
740 done = 0;
741 if (!costs_set)
742 while (r + 1 < ctx->re_end && !done)
743 {
744 switch (*r)
745 {
746 case CHAR_PLUS:
747 case L' ':
748 r++;
749 break;
750 case L'<':
751 DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", REST(r)));
752 r++;
753 while (*r == L' ')
754 r++;
755 cost_max = tre_parse_int(&r, ctx->re_end);
756 if (cost_max < 0)
757 cost_max = INT_MAX;
758 else
759 cost_max--;
760 approx = 1;
761 break;
762 case CHAR_COMMA:
763 r++;
764 done = 1;
765 break;
766 default:
767 if (*r >= L'0' && *r <= L'9')
768 {
769 #ifdef TRE_DEBUG
770 const tre_char_t *sr = r;
771 #endif /* TRE_DEBUG */
772 int cost = tre_parse_int(&r, ctx->re_end);
773 /* XXX - make sure r is not past end. */
774 switch (*r)
775 {
776 case L'i': /* Insert cost */
777 DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n",
778 REST(sr)));
779 r++;
780 cost_ins = cost;
781 costs_set = 1;
782 break;
783 case L'd': /* Delete cost */
784 DPRINT(("tre_parse: del cost: '%.*" STRF "'\n",
785 REST(sr)));
786 r++;
787 cost_del = cost;
788 costs_set = 1;
789 break;
790 case L's': /* Substitute cost */
791 DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n",
792 REST(sr)));
793 r++;
794 cost_subst = cost;
795 costs_set = 1;
796 break;
797 default:
798 return REG_BADBR;
799 }
800 }
801 else
802 {
803 done = 1;
804 break;
805 }
806 }
807 }
808 } while (start != r);
809
810 /* Missing }. */
811 if (r >= ctx->re_end)
812 return REG_EBRACE;
813
814 /* Empty contents of {}. */
815 if (r == ctx->re)
816 return REG_BADBR;
817
818 /* Parse the ending '}' or '\}'.*/
819 if (ctx->cflags & REG_EXTENDED)
820 {
821 if (r >= ctx->re_end || *r != CHAR_RBRACE)
822 return REG_BADBR;
823 r++;
824 }
825 else
826 {
827 if (r + 1 >= ctx->re_end
828 || *r != CHAR_BACKSLASH
829 || *(r + 1) != CHAR_RBRACE)
830 return REG_BADBR;
831 r += 2;
832 }
833
834
835 /* Parse trailing '?' marking minimal repetition. */
836 if (r < ctx->re_end)
837 {
838 if (*r == CHAR_QUESTIONMARK)
839 {
840 minimal = !(ctx->cflags & REG_UNGREEDY);
841 r++;
842 }
843 else if (*r == CHAR_STAR || *r == CHAR_PLUS)
844 {
845 /* These are reserved for future extensions. */
846 return REG_BADRPT;
847 }
848 }
849
850 /* Create the AST node(s). */
851 if (min == 0 && max == 0)
852 {
853 *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
854 if (*result == NULL)
855 return REG_ESPACE;
856 }
857 else
858 {
859 if (min < 0 && max < 0)
860 /* Only approximate parameters set, no repetitions. */
861 min = max = 1;
862
863 *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
864 if (!*result)
865 return REG_ESPACE;
866
867 /* If approximate matching parameters are set, add them to the
868 iteration node. */
869 if (approx || costs_set || counts_set)
870 {
871 int *params;
872 tre_iteration_t *iter = (*result)->obj;
873
874 if (costs_set || counts_set)
875 {
876 if (limit_ins == TRE_PARAM_UNSET)
877 {
878 if (cost_ins == TRE_PARAM_UNSET)
879 limit_ins = 0;
880 else
881 limit_ins = INT_MAX;
882 }
883
884 if (limit_del == TRE_PARAM_UNSET)
885 {
886 if (cost_del == TRE_PARAM_UNSET)
887 limit_del = 0;
888 else
889 limit_del = INT_MAX;
890 }
891
892 if (limit_subst == TRE_PARAM_UNSET)
893 {
894 if (cost_subst == TRE_PARAM_UNSET)
895 limit_subst = 0;
896 else
897 limit_subst = INT_MAX;
898 }
899 }
900
901 if (cost_max == TRE_PARAM_UNSET)
902 cost_max = INT_MAX;
903 if (limit_err == TRE_PARAM_UNSET)
904 limit_err = INT_MAX;
905
906 ctx->have_approx = 1;
907 params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
908 if (!params)
909 return REG_ESPACE;
910 for (i = 0; i < TRE_PARAM_LAST; i++)
911 params[i] = TRE_PARAM_UNSET;
912 params[TRE_PARAM_COST_INS] = cost_ins;
913 params[TRE_PARAM_COST_DEL] = cost_del;
914 params[TRE_PARAM_COST_SUBST] = cost_subst;
915 params[TRE_PARAM_COST_MAX] = cost_max;
916 params[TRE_PARAM_MAX_INS] = limit_ins;
917 params[TRE_PARAM_MAX_DEL] = limit_del;
918 params[TRE_PARAM_MAX_SUBST] = limit_subst;
919 params[TRE_PARAM_MAX_ERR] = limit_err;
920 iter->params = params;
921 }
922 }
923
924 DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
925 "limits [%d,%d,%d, total %d]\n",
926 min, max, cost_ins, cost_del, cost_subst, cost_max,
927 limit_ins, limit_del, limit_subst, limit_err));
928
929
930 ctx->re = r;
931 return REG_OK;
932 }
933
934 typedef enum {
935 PARSE_RE = 0,
936 PARSE_ATOM,
937 PARSE_MARK_FOR_SUBMATCH,
938 PARSE_BRANCH,
939 PARSE_PIECE,
940 PARSE_CATENATION,
941 PARSE_POST_CATENATION,
942 PARSE_UNION,
943 PARSE_POST_UNION,
944 PARSE_POSTFIX,
945 PARSE_RESTORE_CFLAGS
946 } tre_parse_re_stack_symbol_t;
947
948
949 reg_errcode_t
tre_parse(tre_parse_ctx_t * ctx)950 tre_parse(tre_parse_ctx_t *ctx)
951 {
952 tre_ast_node_t *result = NULL;
953 tre_parse_re_stack_symbol_t symbol;
954 reg_errcode_t status = REG_OK;
955 tre_stack_t *stack = ctx->stack;
956 int bottom = tre_stack_num_objects(stack);
957 int depth = 0;
958 int temporary_cflags = 0;
959
960 DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n",
961 ctx->len, ctx->re, ctx->len));
962
963 if (!ctx->nofirstsub)
964 {
965 STACK_PUSH(stack, int, ctx->submatch_id);
966 STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
967 ctx->submatch_id++;
968 }
969 STACK_PUSH(stack, int, PARSE_RE);
970 ctx->re_start = ctx->re;
971 ctx->re_end = ctx->re + ctx->len;
972
973
974 /* The following is basically just a recursive descent parser. I use
975 an explicit stack instead of recursive functions mostly because of
976 two reasons: compatibility with systems which have an overflowable
977 call stack, and efficiency (both in lines of code and speed). */
978 while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
979 {
980 if (status != REG_OK)
981 break;
982 symbol = tre_stack_pop_int(stack);
983 switch (symbol)
984 {
985 case PARSE_RE:
986 /* Parse a full regexp. A regexp is one or more branches,
987 separated by the union operator `|'. */
988 #ifdef REG_LITERAL
989 if (!(ctx->cflags & REG_LITERAL)
990 && ctx->cflags & REG_EXTENDED)
991 #endif /* REG_LITERAL */
992 STACK_PUSHX(stack, int, PARSE_UNION);
993 STACK_PUSHX(stack, int, PARSE_BRANCH);
994 break;
995
996 case PARSE_BRANCH:
997 /* Parse a branch. A branch is one or more pieces, concatenated.
998 A piece is an atom possibly followed by a postfix operator. */
999 STACK_PUSHX(stack, int, PARSE_CATENATION);
1000 STACK_PUSHX(stack, int, PARSE_PIECE);
1001 break;
1002
1003 case PARSE_PIECE:
1004 /* Parse a piece. A piece is an atom possibly followed by one
1005 or more postfix operators. */
1006 #ifdef REG_LITERAL
1007 if (!(ctx->cflags & REG_LITERAL))
1008 #endif /* REG_LITERAL */
1009 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1010 STACK_PUSHX(stack, int, PARSE_ATOM);
1011 break;
1012
1013 case PARSE_CATENATION:
1014 /* If the expression has not ended, parse another piece. */
1015 {
1016 tre_char_t c;
1017 if (ctx->re >= ctx->re_end)
1018 break;
1019 c = *ctx->re;
1020 #ifdef REG_LITERAL
1021 if (!(ctx->cflags & REG_LITERAL))
1022 {
1023 #endif /* REG_LITERAL */
1024 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1025 break;
1026 if ((ctx->cflags & REG_EXTENDED
1027 && c == CHAR_RPAREN && depth > 0)
1028 || (!(ctx->cflags & REG_EXTENDED)
1029 && (c == CHAR_BACKSLASH
1030 && *(ctx->re + 1) == CHAR_RPAREN)))
1031 {
1032 if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1033 status = REG_EPAREN;
1034 DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
1035 REST(ctx->re)));
1036 depth--;
1037 if (!(ctx->cflags & REG_EXTENDED))
1038 ctx->re += 2;
1039 break;
1040 }
1041 #ifdef REG_LITERAL
1042 }
1043 #endif /* REG_LITERAL */
1044
1045 #ifdef REG_RIGHT_ASSOC
1046 if (ctx->cflags & REG_RIGHT_ASSOC)
1047 {
1048 /* Right associative concatenation. */
1049 STACK_PUSHX(stack, voidptr, result);
1050 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1051 STACK_PUSHX(stack, int, PARSE_CATENATION);
1052 STACK_PUSHX(stack, int, PARSE_PIECE);
1053 }
1054 else
1055 #endif /* REG_RIGHT_ASSOC */
1056 {
1057 /* Default case, left associative concatenation. */
1058 STACK_PUSHX(stack, int, PARSE_CATENATION);
1059 STACK_PUSHX(stack, voidptr, result);
1060 STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1061 STACK_PUSHX(stack, int, PARSE_PIECE);
1062 }
1063 break;
1064 }
1065
1066 case PARSE_POST_CATENATION:
1067 {
1068 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1069 tre_ast_node_t *tmp_node;
1070 tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1071 if (!tmp_node)
1072 return REG_ESPACE;
1073 result = tmp_node;
1074 break;
1075 }
1076
1077 case PARSE_UNION:
1078 if (ctx->re >= ctx->re_end)
1079 break;
1080 #ifdef REG_LITERAL
1081 if (ctx->cflags & REG_LITERAL)
1082 break;
1083 #endif /* REG_LITERAL */
1084 switch (*ctx->re)
1085 {
1086 case CHAR_PIPE:
1087 DPRINT(("tre_parse: union: '%.*" STRF "'\n",
1088 REST(ctx->re)));
1089 STACK_PUSHX(stack, int, PARSE_UNION);
1090 STACK_PUSHX(stack, voidptr, result);
1091 STACK_PUSHX(stack, int, PARSE_POST_UNION);
1092 STACK_PUSHX(stack, int, PARSE_BRANCH);
1093 ctx->re++;
1094 break;
1095
1096 case CHAR_RPAREN:
1097 ctx->re++;
1098 break;
1099
1100 default:
1101 break;
1102 }
1103 break;
1104
1105 case PARSE_POST_UNION:
1106 {
1107 tre_ast_node_t *tmp_node;
1108 tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1109 tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1110 if (!tmp_node)
1111 return REG_ESPACE;
1112 result = tmp_node;
1113 break;
1114 }
1115
1116 case PARSE_POSTFIX:
1117 /* Parse postfix operators. */
1118 if (ctx->re >= ctx->re_end)
1119 break;
1120 #ifdef REG_LITERAL
1121 if (ctx->cflags & REG_LITERAL)
1122 break;
1123 #endif /* REG_LITERAL */
1124 switch (*ctx->re)
1125 {
1126 case CHAR_PLUS:
1127 case CHAR_QUESTIONMARK:
1128 if (!(ctx->cflags & REG_EXTENDED))
1129 break;
1130 /*FALLTHROUGH*/
1131 case CHAR_STAR:
1132 {
1133 tre_ast_node_t *tmp_node;
1134 int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1135 int rep_min = 0;
1136 int rep_max = -1;
1137 #ifdef TRE_DEBUG
1138 const tre_char_t *tmp_re;
1139 #endif
1140
1141 if (*ctx->re == CHAR_PLUS)
1142 rep_min = 1;
1143 if (*ctx->re == CHAR_QUESTIONMARK)
1144 rep_max = 1;
1145 #ifdef TRE_DEBUG
1146 tmp_re = ctx->re;
1147 #endif
1148
1149 if (ctx->re + 1 < ctx->re_end)
1150 {
1151 if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1152 {
1153 minimal = !(ctx->cflags & REG_UNGREEDY);
1154 ctx->re++;
1155 }
1156 else if (*(ctx->re + 1) == CHAR_STAR
1157 || *(ctx->re + 1) == CHAR_PLUS)
1158 {
1159 /* These are reserved for future extensions. */
1160 return REG_BADRPT;
1161 }
1162 }
1163
1164 DPRINT(("tre_parse: %s star: '%.*" STRF "'\n",
1165 minimal ? " minimal" : "greedy", REST(tmp_re)));
1166 ctx->re++;
1167 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1168 minimal);
1169 if (tmp_node == NULL)
1170 return REG_ESPACE;
1171 result = tmp_node;
1172 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1173 }
1174 break;
1175
1176 case CHAR_BACKSLASH:
1177 /* "\{" is special without REG_EXTENDED */
1178 if (!(ctx->cflags & REG_EXTENDED)
1179 && ctx->re + 1 < ctx->re_end
1180 && *(ctx->re + 1) == CHAR_LBRACE)
1181 {
1182 ctx->re++;
1183 goto parse_brace;
1184 }
1185 else
1186 break;
1187
1188 case CHAR_LBRACE:
1189 /* "{" is literal without REG_EXTENDED */
1190 if (!(ctx->cflags & REG_EXTENDED))
1191 break;
1192
1193 parse_brace:
1194 DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
1195 REST(ctx->re)));
1196 ctx->re++;
1197
1198 status = tre_parse_bound(ctx, &result);
1199 if (status != REG_OK)
1200 return status;
1201 STACK_PUSHX(stack, int, PARSE_POSTFIX);
1202 break;
1203 }
1204 break;
1205
1206 case PARSE_ATOM:
1207 /* Parse an atom. An atom is a regular expression enclosed in `()',
1208 an empty set of `()', a bracket expression, `.', `^', `$',
1209 a `\' followed by a character, or a single character. */
1210
1211 /* End of regexp? (empty string). */
1212 if (ctx->re >= ctx->re_end)
1213 goto parse_literal;
1214
1215 #ifdef REG_LITERAL
1216 if (ctx->cflags & REG_LITERAL)
1217 goto parse_literal;
1218 #endif /* REG_LITERAL */
1219
1220 switch (*ctx->re)
1221 {
1222 case CHAR_LPAREN: /* parenthesized subexpression */
1223
1224 /* Handle "(?...)" extensions. They work in a way similar
1225 to Perls corresponding extensions. */
1226 if (ctx->cflags & REG_EXTENDED
1227 && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1228 {
1229 int new_cflags = ctx->cflags;
1230 int bit = 1;
1231 DPRINT(("tre_parse: extension: '%.*" STRF "\n",
1232 REST(ctx->re)));
1233 ctx->re += 2;
1234 while (/*CONSTCOND*/(void)1,1)
1235 {
1236 if (*ctx->re == L'i')
1237 {
1238 DPRINT(("tre_parse: icase: '%.*" STRF "\n",
1239 REST(ctx->re)));
1240 if (bit)
1241 new_cflags |= REG_ICASE;
1242 else
1243 new_cflags &= ~REG_ICASE;
1244 ctx->re++;
1245 }
1246 else if (*ctx->re == L'n')
1247 {
1248 DPRINT(("tre_parse: newline: '%.*" STRF "\n",
1249 REST(ctx->re)));
1250 if (bit)
1251 new_cflags |= REG_NEWLINE;
1252 else
1253 new_cflags &= ~REG_NEWLINE;
1254 ctx->re++;
1255 }
1256 #ifdef REG_RIGHT_ASSOC
1257 else if (*ctx->re == L'r')
1258 {
1259 DPRINT(("tre_parse: right assoc: '%.*" STRF "\n",
1260 REST(ctx->re)));
1261 if (bit)
1262 new_cflags |= REG_RIGHT_ASSOC;
1263 else
1264 new_cflags &= ~REG_RIGHT_ASSOC;
1265 ctx->re++;
1266 }
1267 #endif /* REG_RIGHT_ASSOC */
1268 #ifdef REG_UNGREEDY
1269 else if (*ctx->re == L'U')
1270 {
1271 DPRINT(("tre_parse: ungreedy: '%.*" STRF "\n",
1272 REST(ctx->re)));
1273 if (bit)
1274 new_cflags |= REG_UNGREEDY;
1275 else
1276 new_cflags &= ~REG_UNGREEDY;
1277 ctx->re++;
1278 }
1279 #endif /* REG_UNGREEDY */
1280 else if (*ctx->re == CHAR_MINUS)
1281 {
1282 DPRINT(("tre_parse: turn off: '%.*" STRF "\n",
1283 REST(ctx->re)));
1284 ctx->re++;
1285 bit = 0;
1286 }
1287 else if (*ctx->re == CHAR_COLON)
1288 {
1289 DPRINT(("tre_parse: no group: '%.*" STRF "\n",
1290 REST(ctx->re)));
1291 ctx->re++;
1292 depth++;
1293 break;
1294 }
1295 else if (*ctx->re == CHAR_HASH)
1296 {
1297 DPRINT(("tre_parse: comment: '%.*" STRF "\n",
1298 REST(ctx->re)));
1299 /* A comment can contain any character except a
1300 right parenthesis */
1301 while (*ctx->re != CHAR_RPAREN
1302 && ctx->re < ctx->re_end)
1303 ctx->re++;
1304 if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1305 {
1306 ctx->re++;
1307 break;
1308 }
1309 else
1310 return REG_BADPAT;
1311 }
1312 else if (*ctx->re == CHAR_RPAREN)
1313 {
1314 ctx->re++;
1315 break;
1316 }
1317 else
1318 return REG_BADPAT;
1319 }
1320
1321 /* Turn on the cflags changes for the rest of the
1322 enclosing group. */
1323 STACK_PUSHX(stack, int, ctx->cflags);
1324 STACK_PUSHX(stack, int, PARSE_RESTORE_CFLAGS);
1325 STACK_PUSHX(stack, int, PARSE_RE);
1326 ctx->cflags = new_cflags;
1327 break;
1328 }
1329
1330 if (ctx->cflags & REG_EXTENDED
1331 || (ctx->re > ctx->re_start
1332 && *(ctx->re - 1) == CHAR_BACKSLASH))
1333 {
1334 depth++;
1335 if (ctx->re + 2 < ctx->re_end
1336 && *(ctx->re + 1) == CHAR_QUESTIONMARK
1337 && *(ctx->re + 2) == CHAR_COLON)
1338 {
1339 DPRINT(("tre_parse: group begin: '%.*" STRF
1340 "', no submatch\n", REST(ctx->re)));
1341 /* Don't mark for submatching. */
1342 ctx->re += 3;
1343 STACK_PUSHX(stack, int, PARSE_RE);
1344 }
1345 else
1346 {
1347 DPRINT(("tre_parse: group begin: '%.*" STRF
1348 "', submatch %d\n", REST(ctx->re),
1349 ctx->submatch_id));
1350 ctx->re++;
1351 /* First parse a whole RE, then mark the resulting tree
1352 for submatching. */
1353 STACK_PUSHX(stack, int, ctx->submatch_id);
1354 STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1355 STACK_PUSHX(stack, int, PARSE_RE);
1356 ctx->submatch_id++;
1357 }
1358 }
1359 else
1360 goto parse_literal;
1361 break;
1362
1363 case CHAR_RPAREN: /* end of current subexpression */
1364 if ((ctx->cflags & REG_EXTENDED && depth > 0)
1365 || (!(ctx->cflags & REG_EXTENDED) && ctx->re > ctx->re_start
1366 && *(ctx->re - 1) == CHAR_BACKSLASH))
1367 {
1368 DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1369 REST(ctx->re)));
1370 /* We were expecting an atom, but instead the current
1371 subexpression was closed. POSIX leaves the meaning of
1372 this to be implementation-defined. We interpret this as
1373 an empty expression (which matches an empty string). */
1374 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1375 if (result == NULL)
1376 return REG_ESPACE;
1377 if (!(ctx->cflags & REG_EXTENDED))
1378 ctx->re--;
1379 }
1380 else
1381 goto parse_literal;
1382 break;
1383
1384 case CHAR_LBRACKET: /* bracket expression */
1385 DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
1386 REST(ctx->re)));
1387 ctx->re++;
1388 status = tre_parse_bracket(ctx, &result);
1389 if (status != REG_OK)
1390 return status;
1391 break;
1392
1393 case CHAR_BACKSLASH:
1394 /* If this is "\(" or "\)" chew off the backslash and
1395 try again. */
1396 if (!(ctx->cflags & REG_EXTENDED)
1397 && ctx->re + 1 < ctx->re_end
1398 && (*(ctx->re + 1) == CHAR_LPAREN
1399 || *(ctx->re + 1) == CHAR_RPAREN))
1400 {
1401 ctx->re++;
1402 STACK_PUSHX(stack, int, PARSE_ATOM);
1403 break;
1404 }
1405
1406 /* If a macro is used, parse the expanded macro recursively. */
1407 {
1408 tre_char_t buf[64];
1409 tre_expand_macro(ctx->re + 1, ctx->re_end,
1410 buf, elementsof(buf));
1411 if (buf[0] != 0)
1412 {
1413 tre_parse_ctx_t subctx;
1414 memcpy(&subctx, ctx, sizeof(subctx));
1415 subctx.re = buf;
1416 subctx.len = tre_strlen(buf);
1417 subctx.nofirstsub = 1;
1418 status = tre_parse(&subctx);
1419 if (status != REG_OK)
1420 return status;
1421 ctx->re += 2;
1422 ctx->position = subctx.position;
1423 result = subctx.result;
1424 break;
1425 }
1426 }
1427
1428 if (ctx->re + 1 >= ctx->re_end)
1429 /* Trailing backslash. */
1430 return REG_EESCAPE;
1431
1432 #ifdef REG_LITERAL
1433 if (*(ctx->re + 1) == L'Q')
1434 {
1435 DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1436 REST(ctx->re)));
1437 ctx->cflags |= REG_LITERAL;
1438 temporary_cflags |= REG_LITERAL;
1439 ctx->re += 2;
1440 STACK_PUSHX(stack, int, PARSE_ATOM);
1441 break;
1442 }
1443 #endif /* REG_LITERAL */
1444
1445 DPRINT(("tre_parse: bleep: '%.*" STRF "'\n", REST(ctx->re)));
1446 ctx->re++;
1447 switch (*ctx->re)
1448 {
1449 case L'b':
1450 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1451 ASSERT_AT_WB, -1);
1452 ctx->re++;
1453 break;
1454 case L'B':
1455 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1456 ASSERT_AT_WB_NEG, -1);
1457 ctx->re++;
1458 break;
1459 case L'<':
1460 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1461 ASSERT_AT_BOW, -1);
1462 ctx->re++;
1463 break;
1464 case L'>':
1465 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1466 ASSERT_AT_EOW, -1);
1467 ctx->re++;
1468 break;
1469 case L'x':
1470 ctx->re++;
1471 if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1472 {
1473 /* 8 bit hex char. */
1474 char tmp[3] = {0, 0, 0};
1475 long val;
1476 DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n",
1477 REST(ctx->re - 2)));
1478
1479 if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1480 {
1481 tmp[0] = (char)ctx->re[0];
1482 ctx->re++;
1483 }
1484 if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1485 {
1486 tmp[1] = (char)ctx->re[0];
1487 ctx->re++;
1488 }
1489 val = strtol(tmp, NULL, 16);
1490 result = tre_ast_new_literal(ctx->mem, (int)val,
1491 (int)val, ctx->position);
1492 ctx->position++;
1493 break;
1494 }
1495 else if (ctx->re < ctx->re_end)
1496 {
1497 /* Wide char. */
1498 char tmp[32];
1499 long val;
1500 int i = 0;
1501 ctx->re++;
1502 while (ctx->re_end - ctx->re >= 0)
1503 {
1504 if (ctx->re[0] == CHAR_RBRACE)
1505 break;
1506 if (tre_isxdigit(ctx->re[0]))
1507 {
1508 tmp[i] = (char)ctx->re[0];
1509 i++;
1510 ctx->re++;
1511 continue;
1512 }
1513 return REG_EBRACE;
1514 }
1515 ctx->re++;
1516 tmp[i] = 0;
1517 val = strtol(tmp, NULL, 16);
1518 result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1519 ctx->position);
1520 ctx->position++;
1521 break;
1522 }
1523 /*FALLTHROUGH*/
1524
1525 default:
1526 if (tre_isdigit(*ctx->re))
1527 {
1528 /* Back reference. */
1529 int val = *ctx->re - L'0';
1530 DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
1531 REST(ctx->re - 1)));
1532 result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1533 ctx->position);
1534 if (result == NULL)
1535 return REG_ESPACE;
1536 ctx->position++;
1537 ctx->max_backref = MAX(val, ctx->max_backref);
1538 ctx->re++;
1539 }
1540 else
1541 {
1542 /* Escaped character. */
1543 DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
1544 REST(ctx->re - 1)));
1545 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1546 ctx->position);
1547 ctx->position++;
1548 ctx->re++;
1549 }
1550 break;
1551 }
1552 if (result == NULL)
1553 return REG_ESPACE;
1554 break;
1555
1556 case CHAR_PERIOD: /* the any-symbol */
1557 DPRINT(("tre_parse: any: '%.*" STRF "'\n",
1558 REST(ctx->re)));
1559 if (ctx->cflags & REG_NEWLINE)
1560 {
1561 tre_ast_node_t *tmp1;
1562 tre_ast_node_t *tmp2;
1563 tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
1564 ctx->position);
1565 if (!tmp1)
1566 return REG_ESPACE;
1567 tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
1568 ctx->position + 1);
1569 if (!tmp2)
1570 return REG_ESPACE;
1571 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1572 if (!result)
1573 return REG_ESPACE;
1574 ctx->position += 2;
1575 }
1576 else
1577 {
1578 result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1579 ctx->position);
1580 if (!result)
1581 return REG_ESPACE;
1582 ctx->position++;
1583 }
1584 ctx->re++;
1585 break;
1586
1587 case CHAR_CARET: /* beginning of line assertion */
1588 /* '^' has a special meaning everywhere in EREs, and in the
1589 beginning of the RE and after \( is BREs. */
1590 if (ctx->cflags & REG_EXTENDED
1591 || (ctx->re - 2 >= ctx->re_start
1592 && *(ctx->re - 2) == CHAR_BACKSLASH
1593 && *(ctx->re - 1) == CHAR_LPAREN)
1594 || ctx->re == ctx->re_start)
1595 {
1596 DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
1597 REST(ctx->re)));
1598 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1599 ASSERT_AT_BOL, -1);
1600 if (result == NULL)
1601 return REG_ESPACE;
1602 ctx->re++;
1603 }
1604 else
1605 goto parse_literal;
1606 break;
1607
1608 case CHAR_DOLLAR: /* end of line assertion. */
1609 /* '$' is special everywhere in EREs, and in the end of the
1610 string and before \) is BREs. */
1611 if (ctx->cflags & REG_EXTENDED
1612 || (ctx->re + 2 < ctx->re_end
1613 && *(ctx->re + 1) == CHAR_BACKSLASH
1614 && *(ctx->re + 2) == CHAR_RPAREN)
1615 || ctx->re + 1 == ctx->re_end)
1616 {
1617 DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
1618 REST(ctx->re)));
1619 result = tre_ast_new_literal(ctx->mem, ASSERTION,
1620 ASSERT_AT_EOL, -1);
1621 if (result == NULL)
1622 return REG_ESPACE;
1623 ctx->re++;
1624 }
1625 else
1626 goto parse_literal;
1627 break;
1628
1629 default:
1630 parse_literal:
1631
1632 if (temporary_cflags && ctx->re + 1 < ctx->re_end
1633 && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
1634 {
1635 DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n",
1636 REST(ctx->re)));
1637 ctx->cflags &= ~temporary_cflags;
1638 temporary_cflags = 0;
1639 ctx->re += 2;
1640 STACK_PUSHX(stack, int, PARSE_PIECE);
1641 break;
1642 }
1643
1644
1645 /* We are expecting an atom. If the subexpression (or the whole
1646 regexp) ends here, we interpret it as an empty expression
1647 (which matches an empty string). */
1648 if (
1649 #ifdef REG_LITERAL
1650 !(ctx->cflags & REG_LITERAL) &&
1651 #endif /* REG_LITERAL */
1652 (ctx->re >= ctx->re_end
1653 || *ctx->re == CHAR_STAR
1654 || (ctx->cflags & REG_EXTENDED
1655 && (*ctx->re == CHAR_PIPE
1656 || *ctx->re == CHAR_LBRACE
1657 || *ctx->re == CHAR_PLUS
1658 || *ctx->re == CHAR_QUESTIONMARK))
1659 /* Test for "\)" in BRE mode. */
1660 || (!(ctx->cflags & REG_EXTENDED)
1661 && ctx->re + 1 < ctx->re_end
1662 && *ctx->re == CHAR_BACKSLASH
1663 && *(ctx->re + 1) == CHAR_LBRACE)))
1664 {
1665 DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1666 REST(ctx->re)));
1667 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1668 if (!result)
1669 return REG_ESPACE;
1670 break;
1671 }
1672
1673 /* R change */
1674 if ((ctx->cflags & REG_LITERAL) && !ctx->re[0]) {
1675 DPRINT(("tre_parse: literal empty: '%.*" STRF "'\n",
1676 REST(ctx->re)));
1677 result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1678 if (!result)
1679 return REG_ESPACE;
1680 break;
1681 }
1682 /* end R change */
1683
1684 DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
1685 REST(ctx->re)));
1686 /* Note that we can't use an tre_isalpha() test here, since there
1687 may be characters which are alphabetic but neither upper or
1688 lower case. */
1689 if (ctx->cflags & REG_ICASE
1690 && (tre_isupper(*ctx->re) || tre_islower(*ctx->re)))
1691 {
1692 tre_ast_node_t *tmp1;
1693 tre_ast_node_t *tmp2;
1694
1695 /* XXX - Can there be more than one opposite-case
1696 counterpoints for some character in some locale? Or
1697 more than two characters which all should be regarded
1698 the same character if case is ignored? If yes, there
1699 does not seem to be a portable way to detect it. I guess
1700 that at least for multi-character collating elements there
1701 could be several opposite-case counterpoints, but they
1702 cannot be supported portably anyway. */
1703 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re),
1704 tre_toupper(*ctx->re),
1705 ctx->position);
1706 if (!tmp1)
1707 return REG_ESPACE;
1708 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re),
1709 tre_tolower(*ctx->re),
1710 ctx->position);
1711 if (!tmp2)
1712 return REG_ESPACE;
1713 result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1714 if (!result)
1715 return REG_ESPACE;
1716 }
1717 else
1718 {
1719 result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1720 ctx->position);
1721 if (!result)
1722 return REG_ESPACE;
1723 }
1724 ctx->position++;
1725 ctx->re++;
1726 break;
1727 }
1728 break;
1729
1730 case PARSE_MARK_FOR_SUBMATCH:
1731 {
1732 int submatch_id = tre_stack_pop_int(stack);
1733
1734 if(result) {
1735 if (result->submatch_id >= 0)
1736 {
1737 tre_ast_node_t *n, *tmp_node;
1738 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1739 if (n == NULL)
1740 return REG_ESPACE;
1741 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1742 if (tmp_node == NULL)
1743 return REG_ESPACE;
1744 tmp_node->num_submatches = result->num_submatches;
1745 result = tmp_node;
1746 }
1747 result->submatch_id = submatch_id;
1748 result->num_submatches++;
1749 }
1750 break;
1751 }
1752
1753 case PARSE_RESTORE_CFLAGS:
1754 ctx->cflags = tre_stack_pop_int(stack);
1755 break;
1756
1757 default:
1758 assert(0);
1759 break;
1760 }
1761 }
1762
1763 /* Check for missing closing parentheses. */
1764 if (depth > 0)
1765 return REG_EPAREN;
1766
1767 if (status == REG_OK)
1768 ctx->result = result;
1769
1770 return status;
1771 }
1772
1773 /* EOF */
1774