1 /*
2   tre-parse.c - Regexp parser
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   This parser is just a simple recursive descent parser for POSIX.2
11   regexps.  The parser supports both the obsolete default syntax and
12   the "extended" syntax, and some nonstandard extensions.
13 */
14 
15 
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <string.h>
20 // #include <assert.h>
21 #include <limits.h>
22 
23 #include "xmalloc.h"
24 #include "tre-mem.h"
25 #include "tre-ast.h"
26 #include "tre-stack.h"
27 #include "tre-parse.h"
28 
29 #define assert(a) R_assert(a)
30 
31 /* Characters with special meanings in regexp syntax. */
32 #define CHAR_PIPE	   L'|'
33 #define CHAR_LPAREN	   L'('
34 #define CHAR_RPAREN	   L')'
35 #define CHAR_LBRACE	   L'{'
36 #define CHAR_RBRACE	   L'}'
37 #define CHAR_LBRACKET	   L'['
38 #define CHAR_RBRACKET	   L']'
39 #define CHAR_MINUS	   L'-'
40 #define CHAR_STAR	   L'*'
41 #define CHAR_QUESTIONMARK  L'?'
42 #define CHAR_PLUS	   L'+'
43 #define CHAR_PERIOD	   L'.'
44 #define CHAR_COLON	   L':'
45 #define CHAR_EQUAL	   L'='
46 #define CHAR_COMMA	   L','
47 #define CHAR_CARET	   L'^'
48 #define CHAR_DOLLAR	   L'$'
49 #define CHAR_BACKSLASH	   L'\\'
50 #define CHAR_HASH	   L'#'
51 #define CHAR_TILDE	   L'~'
52 
53 
54 /* Some macros for expanding \w, \s, etc. */
55 static const struct tre_macro_struct {
56   const char c;
57   const char *expansion;
58 } tre_macros[] =
59   { {'t', "\t"},	   {'n', "\n"},		   {'r', "\r"},
60     {'f', "\f"},	   {'a', "\a"},		   {'e', "\033"},
61     {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
62     {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"},   {'D', "[^[:digit:]]"},
63     { 0, NULL }
64   };
65 
66 
67 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
68    must have at least `len' items.  Sets buf[0] to zero if the there
69    is no match in `tre_macros'. */
70 static void
tre_expand_macro(const tre_char_t * regex,const tre_char_t * regex_end,tre_char_t * buf,size_t buf_len)71 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
72 		 tre_char_t *buf, size_t buf_len)
73 {
74   int i;
75 
76   buf[0] = 0;
77   if (regex >= regex_end)
78     return;
79 
80   for (i = 0; tre_macros[i].expansion; i++)
81     {
82       if (tre_macros[i].c == *regex)
83 	{
84 	  unsigned int j;
85 	  DPRINT(("Expanding macro '%c' => '%s'\n",
86 		  tre_macros[i].c, tre_macros[i].expansion));
87 	  // R_change:  - 1 is r62537
88 	  for (j = 0; tre_macros[i].expansion[j] && j < buf_len - 1; j++)
89 	    buf[j] = tre_macros[i].expansion[j];
90 	  buf[j] = 0;
91 	  break;
92 	}
93     }
94 }
95 
96 static reg_errcode_t
tre_new_item(tre_mem_t mem,int min,int max,int * i,int * max_i,tre_ast_node_t *** items)97 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
98 	 tre_ast_node_t ***items)
99 {
100   reg_errcode_t status;
101   tre_ast_node_t **array = *items;
102   /* Allocate more space if necessary. */
103   if (*i >= *max_i)
104     {
105       tre_ast_node_t **new_items;
106       DPRINT(("out of array space, i = %d\n", *i));
107       /* If the array is already 1024 items large, give up -- there's
108 	 probably an error in the regexp (e.g. not a '\0' terminated
109 	 string and missing ']') */
110       if (*max_i > 1024)
111 	return REG_ESPACE;
112       *max_i *= 2;
113       new_items = xrealloc(array, sizeof(*items) * *max_i);
114       if (new_items == NULL)
115 	return REG_ESPACE;
116       *items = array = new_items;
117     }
118   array[*i] = tre_ast_new_literal(mem, min, max, -1);
119   status = array[*i] == NULL ? REG_ESPACE : REG_OK;
120   (*i)++;
121   return status;
122 }
123 
124 
125 /* Expands a character class to character ranges. */
126 static reg_errcode_t
tre_expand_ctype(tre_mem_t mem,tre_ctype_t class,tre_ast_node_t *** items,int * i,int * max_i,int cflags)127 tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items,
128 		 int *i, int *max_i, int cflags)
129 {
130   reg_errcode_t status = REG_OK;
131   tre_cint_t c;
132   int j, min = -1, max = 0;
133   //assert(TRE_MB_CUR_MAX == 1); It is the ctx->cur_max that matters
134 
135   DPRINT(("  expanding class to character ranges\n"));
136   for (j = 0; (j < 256) && (status == REG_OK); j++)
137     {
138       c = (tre_cint_t) j;
139       if (tre_isctype(c, class)
140 	  || ((cflags & REG_ICASE)
141 	      && (tre_isctype(tre_tolower(c), class)
142 		  || tre_isctype(tre_toupper(c), class))))
143 {
144 	  if (min < 0)
145 	    min = c;
146 	  max = c;
147 	}
148       else if (min >= 0)
149 	{
150 	  DPRINT(("  range %c (%d) to %c (%d)\n", min, min, max, max));
151 	  status = tre_new_item(mem, min, max, i, max_i, items);
152 	  min = -1;
153 	}
154     }
155   if (min >= 0 && status == REG_OK)
156     status = tre_new_item(mem, min, max, i, max_i, items);
157   return status;
158 }
159 
160 
161 static int
tre_compare_items(const void * a,const void * b)162 tre_compare_items(const void *a, const void *b)
163 {
164   const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
165   const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
166   tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
167   long a_min = l_a->code_min, b_min = l_b->code_min;
168 
169   if (a_min < b_min)
170     return -1;
171   else if (a_min > b_min)
172     return 1;
173   else
174     return 0;
175 }
176 
177 #ifndef TRE_USE_SYSTEM_WCTYPE
178 
179 /* isalnum() and the rest may be macros, so wrap them to functions. */
tre_isalnum_func(tre_cint_t c)180 int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
tre_isalpha_func(tre_cint_t c)181 int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
182 
183 #ifdef tre_isascii
tre_isascii_func(tre_cint_t c)184 int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
185 #else /* !tre_isascii */
tre_isascii_func(tre_cint_t c)186 int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
187 #endif /* !tre_isascii */
188 
189 #ifdef tre_isblank
tre_isblank_func(tre_cint_t c)190 int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
191 #else /* !tre_isblank */
tre_isblank_func(tre_cint_t c)192 int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
193 #endif /* !tre_isblank */
194 
tre_iscntrl_func(tre_cint_t c)195 int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
tre_isdigit_func(tre_cint_t c)196 int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
tre_isgraph_func(tre_cint_t c)197 int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
tre_islower_func(tre_cint_t c)198 int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
tre_isprint_func(tre_cint_t c)199 int tre_isprint_func(tre_cint_t c)
200 {
201 #ifdef TRE_WCHAR
202   /* Windows has \t as printable via iswprint in all locales. By POSIX
203      and ?regex, we need \t to be non-printable in the C locale, so we
204      cannot use iswprint. By C99, iswprint(L'\t') should be the same as
205      isprint('\t'). In Windows, in C locale, isprint('\t') is false,
206      hence this workaround. */
207     if (c == L'\t') return isprint('\t');
208 #endif
209   return tre_isprint(c); /* Windows has \t as printable */
210 }
tre_ispunct_func(tre_cint_t c)211 int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
tre_isspace_func(tre_cint_t c)212 int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
tre_isupper_func(tre_cint_t c)213 int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
tre_isxdigit_func(tre_cint_t c)214 int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
215 
216 struct {
217   char *name;
218   int (*func)(tre_cint_t);
219 } tre_ctype_map[] = {
220   { "alnum", &tre_isalnum_func },
221   { "alpha", &tre_isalpha_func },
222 #ifdef tre_isascii
223   { "ascii", &tre_isascii_func },
224 #endif /* tre_isascii */
225 #ifdef tre_isblank
226   { "blank", &tre_isblank_func },
227 #endif /* tre_isblank */
228   { "cntrl", &tre_iscntrl_func },
229   { "digit", &tre_isdigit_func },
230   { "graph", &tre_isgraph_func },
231   { "lower", &tre_islower_func },
232   { "print", &tre_isprint_func },
233   { "punct", &tre_ispunct_func },
234   { "space", &tre_isspace_func },
235   { "upper", &tre_isupper_func },
236   { "xdigit", &tre_isxdigit_func },
237   { NULL, NULL}
238 };
239 
tre_ctype(const char * name)240 tre_ctype_t tre_ctype(const char *name)
241 {
242   int i;
243   for (i = 0; tre_ctype_map[i].name != NULL; i++)
244     {
245       if (strcmp(name, tre_ctype_map[i].name) == 0)
246 	return tre_ctype_map[i].func;
247     }
248   return (tre_ctype_t)0;
249 }
250 #endif /* !TRE_USE_SYSTEM_WCTYPE */
251 
252 /* Maximum number of character classes that can occur in a negated bracket
253    expression.	*/
254 #define MAX_NEG_CLASSES 64
255 
256 /* Maximum length of character class names. */
257 #define MAX_CLASS_NAME
258 
259 #define REST(re) (int)(ctx->re_end - (re)), (re)
260 
261 static reg_errcode_t
tre_parse_bracket_items(tre_parse_ctx_t * ctx,int negate,tre_ctype_t neg_classes[],int * num_neg_classes,tre_ast_node_t *** items,int * num_items,int * items_size)262 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
263 			tre_ctype_t neg_classes[], int *num_neg_classes,
264 			tre_ast_node_t ***items, int *num_items,
265 			int *items_size)
266 {
267   const tre_char_t *re = ctx->re;
268   reg_errcode_t status = REG_OK;
269   tre_ctype_t class = (tre_ctype_t)0;
270   int i = *num_items;
271   int max_i = *items_size;
272   int skip;
273 
274   /* Build an array of the items in the bracket expression. */
275   while (status == REG_OK)
276     {
277       skip = 0;
278       if (re == ctx->re_end)
279 	{
280 	  status = REG_EBRACK;
281 	}
282       else if (*re == CHAR_RBRACKET && re > ctx->re)
283 	{
284 	  DPRINT(("tre_parse_bracket:	done: '%.*" STRF "'\n", REST(re)));
285 	  re++;
286 	  break;
287 	}
288       else
289 	{
290 	  tre_cint_t min = 0, max = 0;
291 
292 	  class = (tre_ctype_t)0;
293 	  if (re + 2 < ctx->re_end
294 	      && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET)
295 	    {
296 	      DPRINT(("tre_parse_bracket:  range: '%.*" STRF "'\n", REST(re)));
297 	      min = *re;
298 	      max = *(re + 2);
299 	      re += 3;
300 	      /* XXX - Should use collation order instead of encoding values
301 		 in character ranges. */
302 	      if (min > max)
303 		status = REG_ERANGE;
304 	    }
305 	  else if (re + 1 < ctx->re_end
306 		   && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
307 	    status = REG_ECOLLATE;
308 	  else if (re + 1 < ctx->re_end
309 		   && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
310 	    status = REG_ECOLLATE;
311 	  else if (re + 1 < ctx->re_end
312 		   && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
313 	    {
314 	      char tmp_str[64];
315 	      const tre_char_t *endptr = re + 2;
316 	      size_t len;
317 	      DPRINT(("tre_parse_bracket:  class: '%.*" STRF "'\n", REST(re)));
318 	      while (endptr < ctx->re_end && *endptr != CHAR_COLON)
319 		endptr++;
320 	      if (endptr != ctx->re_end)
321 		{
322 		  len = MIN(endptr - re - 2, 63);
323 #ifdef TRE_WCHAR
324 		  {
325 		    tre_char_t tmp_wcs[64];
326 		    wcsncpy(tmp_wcs, re + 2, len);
327 		    tmp_wcs[len] = L'\0';
328 #if defined HAVE_WCSRTOMBS
329 		    {
330 		      mbstate_t state;
331 		      const tre_char_t *src = tmp_wcs;
332 		      memset(&state, '\0', sizeof(state));
333 		      len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
334 		    }
335 #elif defined HAVE_WCSTOMBS
336 		    len = wcstombs(tmp_str, tmp_wcs, 63);
337 #endif /* defined HAVE_WCSTOMBS */
338 		  }
339 #else /* !TRE_WCHAR */
340 		  strncpy(tmp_str, (const char*)re + 2, len);
341 #endif /* !TRE_WCHAR */
342 		  tmp_str[len] = '\0';
343 		  DPRINT(("  class name: %s\n", tmp_str));
344 		  class = tre_ctype(tmp_str);
345 		  if (!class)
346 		    status = REG_ECTYPE;
347 		  /* Optimize character classes for 8 bit character sets. */
348 		  if (status == REG_OK && ctx->cur_max == 1)
349 		    {
350 		      status = tre_expand_ctype(ctx->mem, class, items,
351 						&i, &max_i, ctx->cflags);
352 		      class = (tre_ctype_t)0;
353 		      skip = 1;
354 		    }
355 		  re = endptr + 2;
356 		}
357 	      else
358 		status = REG_ECTYPE;
359 	      min = 0;
360 	      max = TRE_CHAR_MAX;
361 	    }
362 	  else
363 	    {
364 	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
365 	      if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
366 		  && ctx->re != re)
367 		/* Two ranges are not allowed to share and endpoint. */
368 		status = REG_ERANGE;
369 	      min = max = *re++;
370 	    }
371 
372 	  if (status != REG_OK)
373 	    break;
374 
375 	  if (class && negate)
376 	    if (*num_neg_classes >= MAX_NEG_CLASSES)
377 	      status = REG_ESPACE;
378 	    else
379 	      neg_classes[(*num_neg_classes)++] = class;
380 	  else if (!skip)
381 	    {
382 	      status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
383 	      if (status != REG_OK)
384 		break;
385 	      ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
386 	    }
387 
388 	  /* Add opposite-case counterpoints if REG_ICASE is present.
389 	     This is broken if there are more than two "same" characters. */
390 	  if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
391 	    {
392 	      tre_cint_t cmin, ccurr;
393 
394 	      DPRINT(("adding opposite-case counterpoints\n"));
395 	      while (min <= max)
396 		{
397 		  if (tre_islower(min))
398 		    {
399 		      cmin = ccurr = tre_toupper(min++);
400 		      while (tre_islower(min) && tre_toupper(min) == ccurr + 1
401 			     && min <= max)
402 			ccurr = tre_toupper(min++);
403 		      status = tre_new_item(ctx->mem, cmin, ccurr,
404 					    &i, &max_i, items);
405 		    }
406 		  else if (tre_isupper(min))
407 		    {
408 		      cmin = ccurr = tre_tolower(min++);
409 		      while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
410 			     && min <= max)
411 			ccurr = tre_tolower(min++);
412 		      status = tre_new_item(ctx->mem, cmin, ccurr,
413 					    &i, &max_i, items);
414 		    }
415 		  else min++;
416 		  if (status != REG_OK)
417 		    break;
418 		}
419 	      if (status != REG_OK)
420 		break;
421 	    }
422 	}
423     }
424   *num_items = i;
425   *items_size = max_i;
426   ctx->re = re;
427   return status;
428 }
429 
430 static reg_errcode_t
tre_parse_bracket(tre_parse_ctx_t * ctx,tre_ast_node_t ** result)431 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
432 {
433   tre_ast_node_t *node = NULL;
434   int negate = 0;
435   reg_errcode_t status = REG_OK;
436   tre_ast_node_t **items, *u, *n;
437   int i = 0, j, max_i = 32, curr_max, curr_min;
438   tre_ctype_t neg_classes[MAX_NEG_CLASSES];
439   int num_neg_classes = 0;
440 
441   /* Start off with an array of `max_i' elements. */
442   items = xmalloc(sizeof(*items) * max_i);
443   if (items == NULL)
444     return REG_ESPACE;
445 
446   if (*ctx->re == CHAR_CARET)
447     {
448       DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
449       negate = 1;
450       ctx->re++;
451     }
452 
453   status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
454 				   &items, &i, &max_i);
455 
456   if (status != REG_OK)
457     goto parse_bracket_done;
458 
459   /* Sort the array if we need to negate it. */
460   if (negate)
461     qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
462 
463   curr_max = curr_min = 0;
464   /* Build a union of the items in the array, negated if necessary. */
465   for (j = 0; j < i && status == REG_OK; j++)
466     {
467       int min, max;
468       tre_literal_t *l = items[j]->obj;
469       min = (int) l->code_min;
470       max = (int) l->code_max;
471 
472       DPRINT(("item: %d - %d, class %p, curr_max = %d\n",
473 	      (int)l->code_min, (int)l->code_max, (void *)l->u.class, curr_max));
474 
475       if (negate)
476 	{
477 	  if (min < curr_max)
478 	    {
479 	      /* Overlap. */
480 	      curr_max = MAX(max + 1, curr_max);
481 	      DPRINT(("overlap, curr_max = %d\n", curr_max));
482 	      l = NULL;
483 	    }
484 	  else
485 	    {
486 	      /* No overlap. */
487 	      curr_max = min - 1;
488 	      if (curr_max >= curr_min)
489 		{
490 		  DPRINT(("no overlap\n"));
491 		  l->code_min = curr_min;
492 		  l->code_max = curr_max;
493 		}
494 	      else
495 		{
496 		  DPRINT(("no overlap, zero room\n"));
497 		  l = NULL;
498 		}
499 	      curr_min = curr_max = max + 1;
500 	    }
501 	}
502 
503       if (l != NULL)
504 	{
505 	  int k;
506 	  DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max));
507 	  l->position = ctx->position;
508 	  if (num_neg_classes > 0)
509 	    {
510 	      l->neg_classes = tre_mem_alloc(ctx->mem,
511 					     (sizeof(l->neg_classes)
512 					      * (num_neg_classes + 1)));
513 	      if (l->neg_classes == NULL)
514 		{
515 		  status = REG_ESPACE;
516 		  break;
517 		}
518 	      for (k = 0; k < num_neg_classes; k++)
519 		l->neg_classes[k] = neg_classes[k];
520 	      l->neg_classes[k] = (tre_ctype_t)0;
521 	    }
522 	  else
523 	    l->neg_classes = NULL;
524 	  if (node == NULL)
525 	    node = items[j];
526 	  else
527 	    {
528 	      u = tre_ast_new_union(ctx->mem, node, items[j]);
529 	      if (u == NULL)
530 		status = REG_ESPACE;
531 	      node = u;
532 	    }
533 	}
534     }
535 
536   if (status != REG_OK)
537     goto parse_bracket_done;
538 
539   if (negate)
540     {
541       int k;
542       DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX));
543       n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
544       if (n == NULL)
545 	status = REG_ESPACE;
546       else
547 	{
548 	  tre_literal_t *l = n->obj;
549 	  if (num_neg_classes > 0)
550 	    {
551 	      l->neg_classes = tre_mem_alloc(ctx->mem,
552 					     (sizeof(l->neg_classes)
553 					      * (num_neg_classes + 1)));
554 	      if (l->neg_classes == NULL)
555 		{
556 		  status = REG_ESPACE;
557 		  goto parse_bracket_done;
558 		}
559 	      for (k = 0; k < num_neg_classes; k++)
560 		l->neg_classes[k] = neg_classes[k];
561 	      l->neg_classes[k] = (tre_ctype_t)0;
562 	    }
563 	  else
564 	    l->neg_classes = NULL;
565 	  if (node == NULL)
566 	    node = n;
567 	  else
568 	    {
569 	      u = tre_ast_new_union(ctx->mem, node, n);
570 	      if (u == NULL)
571 		status = REG_ESPACE;
572 	      node = u;
573 	    }
574 	}
575     }
576 
577   if (status != REG_OK)
578     goto parse_bracket_done;
579 
580 #ifdef TRE_DEBUG
581   tre_ast_print(node);
582 #endif /* TRE_DEBUG */
583 
584  parse_bracket_done:
585   xfree(items);
586   ctx->position++;
587   *result = node;
588   return status;
589 }
590 
591 
592 // patch from https://github.com/laurikari/tre/issues/55
593 /* Parses a positive decimal integer.  Returns -1 if the string does not
594    contain a valid number. */
595 static int
tre_parse_int(const tre_char_t ** regex,const tre_char_t * regex_end)596 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
597 {
598   int num = -1;
599   int overflow = 0;
600   const tre_char_t *r = *regex;
601   while (r < regex_end && *r >= L'0' && *r <= L'9')
602     {
603       if (num < 0)
604 	num = 0;
605       if (num <= (INT_MAX - 9) / 10) {
606          num = num * 10 + *r - L'0';
607       } else {
608           /* This digit could cause an integer overflow. We do not return
609            * directly; instead, consume all remaining digits. */
610           overflow = 1;
611       }
612       r++;
613     }
614   *regex = r;
615   return overflow ? -1 : num;
616 }
617 
618 
619 static reg_errcode_t
tre_parse_bound(tre_parse_ctx_t * ctx,tre_ast_node_t ** result)620 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
621 {
622   int min, max, i;
623   int cost_ins, cost_del, cost_subst, cost_max;
624   int limit_ins, limit_del, limit_subst, limit_err;
625   const tre_char_t *r = ctx->re;
626   const tre_char_t *start;
627   int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
628   int approx = 0;
629   int costs_set = 0;
630   int counts_set = 0;
631 
632   cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
633   limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
634 
635   /* Parse number (minimum repetition count). */
636   min = -1;
637   if (r < ctx->re_end && *r >= L'0' && *r <= L'9') {
638     DPRINT(("tre_parse:	  min count: '%.*" STRF "'\n", REST(r)));
639     min = tre_parse_int(&r, ctx->re_end);
640   }
641 
642   /* Parse comma and second number (maximum repetition count). */
643   max = min;
644   if (r < ctx->re_end && *r == CHAR_COMMA)
645     {
646       r++;
647       DPRINT(("tre_parse:   max count: '%.*" STRF "'\n", REST(r)));
648       max = tre_parse_int(&r, ctx->re_end);
649     }
650 
651   /* Check that the repeat counts are sane. */
652   if ((max >= 0 && min > max) || max > RE_DUP_MAX)
653     return REG_BADBR;
654 
655 
656   /*
657    '{'
658      optionally followed immediately by a number == minimum repcount
659      optionally followed by , then a number == maximum repcount
660       + then a number == maximum insertion count
661       - then a number == maximum deletion count
662       # then a number == maximum substitution count
663       ~ then a number == maximum number of errors
664       Any of +, -, # or ~ without followed by a number means that
665       the maximum count/number of errors is infinite.
666 
667       An equation of the form
668 	Xi + Yd + Zs < C
669       can be specified to set costs and the cost limit to a value
670       different from the default value:
671 	- X is the cost of an insertion
672 	- Y is the cost of a deletion
673 	- Z is the cost of a substitution
674 	- C is the maximum cost
675 
676       If no count limit or cost is set for an operation, the operation
677       is not allowed at all.
678   */
679 
680 
681   do {
682     int done;
683     start = r;
684 
685     /* Parse count limit settings */
686     done = 0;
687     if (!counts_set)
688       while (r + 1 < ctx->re_end && !done)
689 	{
690 	  switch (*r)
691 	    {
692 	    case CHAR_PLUS:  /* Insert limit */
693 	      DPRINT(("tre_parse:   ins limit: '%.*" STRF "'\n", REST(r)));
694 	      r++;
695 	      limit_ins = tre_parse_int(&r, ctx->re_end);
696 	      if (limit_ins < 0)
697 		limit_ins = INT_MAX;
698 	      counts_set = 1;
699 	      break;
700 	    case CHAR_MINUS: /* Delete limit */
701 	      DPRINT(("tre_parse:   del limit: '%.*" STRF "'\n", REST(r)));
702 	      r++;
703 	      limit_del = tre_parse_int(&r, ctx->re_end);
704 	      if (limit_del < 0)
705 		limit_del = INT_MAX;
706 	      counts_set = 1;
707 	      break;
708 	    case CHAR_HASH:  /* Substitute limit */
709 	      DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
710 	      r++;
711 	      limit_subst = tre_parse_int(&r, ctx->re_end);
712 	      if (limit_subst < 0)
713 		limit_subst = INT_MAX;
714 	      counts_set = 1;
715 	      break;
716 	    case CHAR_TILDE: /* Maximum number of changes */
717 	      DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
718 	      r++;
719 	      limit_err = tre_parse_int(&r, ctx->re_end);
720 	      if (limit_err < 0)
721 		limit_err = INT_MAX;
722 	      approx = 1;
723 	      break;
724 	    case CHAR_COMMA:
725 	      r++;
726 	      break;
727 	    case L' ':
728 	      r++;
729 	      break;
730 	    case L'}':
731 	      done = 1;
732 	      break;
733 	    default:
734 	      done = 1;
735 	      break;
736 	    }
737 	}
738 
739     /* Parse cost restriction equation. */
740     done = 0;
741     if (!costs_set)
742       while (r + 1 < ctx->re_end && !done)
743 	{
744 	  switch (*r)
745 	    {
746 	    case CHAR_PLUS:
747 	    case L' ':
748 	      r++;
749 	      break;
750 	    case L'<':
751 	      DPRINT(("tre_parse:    max cost: '%.*" STRF "'\n", REST(r)));
752 	      r++;
753 	      while (*r == L' ')
754 		r++;
755 	      cost_max = tre_parse_int(&r, ctx->re_end);
756 	      if (cost_max < 0)
757 		cost_max = INT_MAX;
758 	      else
759 		cost_max--;
760 	      approx = 1;
761 	      break;
762 	    case CHAR_COMMA:
763 	      r++;
764 	      done = 1;
765 	      break;
766 	    default:
767 	      if (*r >= L'0' && *r <= L'9')
768 		{
769 #ifdef TRE_DEBUG
770 		  const tre_char_t *sr = r;
771 #endif /* TRE_DEBUG */
772 		  int cost = tre_parse_int(&r, ctx->re_end);
773 		  /* XXX - make sure r is not past end. */
774 		  switch (*r)
775 		    {
776 		    case L'i':	/* Insert cost */
777 		      DPRINT(("tre_parse:    ins cost: '%.*" STRF "'\n",
778 			      REST(sr)));
779 		      r++;
780 		      cost_ins = cost;
781 		      costs_set = 1;
782 		      break;
783 		    case L'd':	/* Delete cost */
784 		      DPRINT(("tre_parse:    del cost: '%.*" STRF "'\n",
785 			      REST(sr)));
786 		      r++;
787 		      cost_del = cost;
788 		      costs_set = 1;
789 		      break;
790 		    case L's':	/* Substitute cost */
791 		      DPRINT(("tre_parse:  subst cost: '%.*" STRF "'\n",
792 			      REST(sr)));
793 		      r++;
794 		      cost_subst = cost;
795 		      costs_set = 1;
796 		      break;
797 		    default:
798 		      return REG_BADBR;
799 		    }
800 		}
801 	      else
802 		{
803 		  done = 1;
804 		  break;
805 		}
806 	    }
807 	}
808   } while (start != r);
809 
810   /* Missing }. */
811   if (r >= ctx->re_end)
812     return REG_EBRACE;
813 
814   /* Empty contents of {}. */
815   if (r == ctx->re)
816     return REG_BADBR;
817 
818   /* Parse the ending '}' or '\}'.*/
819   if (ctx->cflags & REG_EXTENDED)
820     {
821       if (r >= ctx->re_end || *r != CHAR_RBRACE)
822 	return REG_BADBR;
823       r++;
824     }
825   else
826     {
827       if (r + 1 >= ctx->re_end
828 	  || *r != CHAR_BACKSLASH
829 	  || *(r + 1) != CHAR_RBRACE)
830 	return REG_BADBR;
831       r += 2;
832     }
833 
834 
835   /* Parse trailing '?' marking minimal repetition. */
836   if (r < ctx->re_end)
837     {
838       if (*r == CHAR_QUESTIONMARK)
839 	{
840 	  minimal = !(ctx->cflags & REG_UNGREEDY);
841 	  r++;
842 	}
843       else if (*r == CHAR_STAR || *r == CHAR_PLUS)
844 	{
845 	  /* These are reserved for future extensions. */
846 	  return REG_BADRPT;
847 	}
848     }
849 
850   /* Create the AST node(s). */
851   if (min == 0 && max == 0)
852     {
853       *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
854       if (*result == NULL)
855 	return REG_ESPACE;
856     }
857   else
858     {
859       if (min < 0 && max < 0)
860 	/* Only approximate parameters set, no repetitions. */
861 	min = max = 1;
862 
863       *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
864       if (!*result)
865 	return REG_ESPACE;
866 
867       /* If approximate matching parameters are set, add them to the
868 	 iteration node. */
869       if (approx || costs_set || counts_set)
870 	{
871 	  int *params;
872 	  tre_iteration_t *iter = (*result)->obj;
873 
874 	  if (costs_set || counts_set)
875 	    {
876 	      if (limit_ins == TRE_PARAM_UNSET)
877 		{
878 		  if (cost_ins == TRE_PARAM_UNSET)
879 		    limit_ins = 0;
880 		  else
881 		    limit_ins = INT_MAX;
882 		}
883 
884 	      if (limit_del == TRE_PARAM_UNSET)
885 		{
886 		  if (cost_del == TRE_PARAM_UNSET)
887 		    limit_del = 0;
888 		  else
889 		    limit_del = INT_MAX;
890 		}
891 
892 	      if (limit_subst == TRE_PARAM_UNSET)
893 		{
894 		  if (cost_subst == TRE_PARAM_UNSET)
895 		    limit_subst = 0;
896 		  else
897 		    limit_subst = INT_MAX;
898 		}
899 	    }
900 
901 	  if (cost_max == TRE_PARAM_UNSET)
902 	    cost_max = INT_MAX;
903 	  if (limit_err == TRE_PARAM_UNSET)
904 	    limit_err = INT_MAX;
905 
906 	  ctx->have_approx = 1;
907 	  params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
908 	  if (!params)
909 	    return REG_ESPACE;
910 	  for (i = 0; i < TRE_PARAM_LAST; i++)
911 	    params[i] = TRE_PARAM_UNSET;
912 	  params[TRE_PARAM_COST_INS] = cost_ins;
913 	  params[TRE_PARAM_COST_DEL] = cost_del;
914 	  params[TRE_PARAM_COST_SUBST] = cost_subst;
915 	  params[TRE_PARAM_COST_MAX] = cost_max;
916 	  params[TRE_PARAM_MAX_INS] = limit_ins;
917 	  params[TRE_PARAM_MAX_DEL] = limit_del;
918 	  params[TRE_PARAM_MAX_SUBST] = limit_subst;
919 	  params[TRE_PARAM_MAX_ERR] = limit_err;
920 	  iter->params = params;
921 	}
922     }
923 
924   DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
925 	  "limits [%d,%d,%d, total %d]\n",
926 	  min, max, cost_ins, cost_del, cost_subst, cost_max,
927 	  limit_ins, limit_del, limit_subst, limit_err));
928 
929 
930   ctx->re = r;
931   return REG_OK;
932 }
933 
934 typedef enum {
935   PARSE_RE = 0,
936   PARSE_ATOM,
937   PARSE_MARK_FOR_SUBMATCH,
938   PARSE_BRANCH,
939   PARSE_PIECE,
940   PARSE_CATENATION,
941   PARSE_POST_CATENATION,
942   PARSE_UNION,
943   PARSE_POST_UNION,
944   PARSE_POSTFIX,
945   PARSE_RESTORE_CFLAGS
946 } tre_parse_re_stack_symbol_t;
947 
948 
949 reg_errcode_t
tre_parse(tre_parse_ctx_t * ctx)950 tre_parse(tre_parse_ctx_t *ctx)
951 {
952   tre_ast_node_t *result = NULL;
953   tre_parse_re_stack_symbol_t symbol;
954   reg_errcode_t status = REG_OK;
955   tre_stack_t *stack = ctx->stack;
956   int bottom = tre_stack_num_objects(stack);
957   int depth = 0;
958   int temporary_cflags = 0;
959 
960   DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n",
961 	  ctx->len, ctx->re, ctx->len));
962 
963   if (!ctx->nofirstsub)
964     {
965       STACK_PUSH(stack, int, ctx->submatch_id);
966       STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
967       ctx->submatch_id++;
968     }
969   STACK_PUSH(stack, int, PARSE_RE);
970   ctx->re_start = ctx->re;
971   ctx->re_end = ctx->re + ctx->len;
972 
973 
974   /* The following is basically just a recursive descent parser.  I use
975      an explicit stack instead of recursive functions mostly because of
976      two reasons: compatibility with systems which have an overflowable
977      call stack, and efficiency (both in lines of code and speed).  */
978   while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
979     {
980       if (status != REG_OK)
981 	break;
982       symbol = tre_stack_pop_int(stack);
983       switch (symbol)
984 	{
985 	case PARSE_RE:
986 	  /* Parse a full regexp.  A regexp is one or more branches,
987 	     separated by the union operator `|'. */
988 #ifdef REG_LITERAL
989 	  if (!(ctx->cflags & REG_LITERAL)
990 	      && ctx->cflags & REG_EXTENDED)
991 #endif /* REG_LITERAL */
992 	    STACK_PUSHX(stack, int, PARSE_UNION);
993 	  STACK_PUSHX(stack, int, PARSE_BRANCH);
994 	  break;
995 
996 	case PARSE_BRANCH:
997 	  /* Parse a branch.  A branch is one or more pieces, concatenated.
998 	     A piece is an atom possibly followed by a postfix operator. */
999 	  STACK_PUSHX(stack, int, PARSE_CATENATION);
1000 	  STACK_PUSHX(stack, int, PARSE_PIECE);
1001 	  break;
1002 
1003 	case PARSE_PIECE:
1004 	  /* Parse a piece.  A piece is an atom possibly followed by one
1005 	     or more postfix operators. */
1006 #ifdef REG_LITERAL
1007 	  if (!(ctx->cflags & REG_LITERAL))
1008 #endif /* REG_LITERAL */
1009 	    STACK_PUSHX(stack, int, PARSE_POSTFIX);
1010 	  STACK_PUSHX(stack, int, PARSE_ATOM);
1011 	  break;
1012 
1013 	case PARSE_CATENATION:
1014 	  /* If the expression has not ended, parse another piece. */
1015 	  {
1016 	    tre_char_t c;
1017 	    if (ctx->re >= ctx->re_end)
1018 	      break;
1019 	    c = *ctx->re;
1020 #ifdef REG_LITERAL
1021 	    if (!(ctx->cflags & REG_LITERAL))
1022 	      {
1023 #endif /* REG_LITERAL */
1024 		if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1025 		  break;
1026 		if ((ctx->cflags & REG_EXTENDED
1027 		     && c == CHAR_RPAREN && depth > 0)
1028 		    || (!(ctx->cflags & REG_EXTENDED)
1029 			&& (c == CHAR_BACKSLASH
1030 			    && *(ctx->re + 1) == CHAR_RPAREN)))
1031 		  {
1032 		    if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1033 		      status = REG_EPAREN;
1034 		    DPRINT(("tre_parse:	  group end: '%.*" STRF "'\n",
1035 			    REST(ctx->re)));
1036 		    depth--;
1037 		    if (!(ctx->cflags & REG_EXTENDED))
1038 		      ctx->re += 2;
1039 		    break;
1040 		  }
1041 #ifdef REG_LITERAL
1042 	      }
1043 #endif /* REG_LITERAL */
1044 
1045 #ifdef REG_RIGHT_ASSOC
1046 	    if (ctx->cflags & REG_RIGHT_ASSOC)
1047 	      {
1048 		/* Right associative concatenation. */
1049 		STACK_PUSHX(stack, voidptr, result);
1050 		STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1051 		STACK_PUSHX(stack, int, PARSE_CATENATION);
1052 		STACK_PUSHX(stack, int, PARSE_PIECE);
1053 	      }
1054 	    else
1055 #endif /* REG_RIGHT_ASSOC */
1056 	      {
1057 		/* Default case, left associative concatenation. */
1058 		STACK_PUSHX(stack, int, PARSE_CATENATION);
1059 		STACK_PUSHX(stack, voidptr, result);
1060 		STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1061 		STACK_PUSHX(stack, int, PARSE_PIECE);
1062 	      }
1063 	    break;
1064 	  }
1065 
1066 	case PARSE_POST_CATENATION:
1067 	  {
1068 	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1069 	    tre_ast_node_t *tmp_node;
1070 	    tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1071 	    if (!tmp_node)
1072 	      return REG_ESPACE;
1073 	    result = tmp_node;
1074 	    break;
1075 	  }
1076 
1077 	case PARSE_UNION:
1078 	  if (ctx->re >= ctx->re_end)
1079 	    break;
1080 #ifdef REG_LITERAL
1081 	  if (ctx->cflags & REG_LITERAL)
1082 	    break;
1083 #endif /* REG_LITERAL */
1084 	  switch (*ctx->re)
1085 	    {
1086 	    case CHAR_PIPE:
1087 	      DPRINT(("tre_parse:	union: '%.*" STRF "'\n",
1088 		      REST(ctx->re)));
1089 	      STACK_PUSHX(stack, int, PARSE_UNION);
1090 	      STACK_PUSHX(stack, voidptr, result);
1091 	      STACK_PUSHX(stack, int, PARSE_POST_UNION);
1092 	      STACK_PUSHX(stack, int, PARSE_BRANCH);
1093 	      ctx->re++;
1094 	      break;
1095 
1096 	    case CHAR_RPAREN:
1097 	      ctx->re++;
1098 	      break;
1099 
1100 	    default:
1101 	      break;
1102 	    }
1103 	  break;
1104 
1105 	case PARSE_POST_UNION:
1106 	  {
1107 	    tre_ast_node_t *tmp_node;
1108 	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1109 	    tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1110 	    if (!tmp_node)
1111 	      return REG_ESPACE;
1112 	    result = tmp_node;
1113 	    break;
1114 	  }
1115 
1116 	case PARSE_POSTFIX:
1117 	  /* Parse postfix operators. */
1118 	  if (ctx->re >= ctx->re_end)
1119 	    break;
1120 #ifdef REG_LITERAL
1121 	  if (ctx->cflags & REG_LITERAL)
1122 	    break;
1123 #endif /* REG_LITERAL */
1124 	  switch (*ctx->re)
1125 	    {
1126 	    case CHAR_PLUS:
1127 	    case CHAR_QUESTIONMARK:
1128 	      if (!(ctx->cflags & REG_EXTENDED))
1129 		break;
1130 		/*FALLTHROUGH*/
1131 	    case CHAR_STAR:
1132 	      {
1133 		tre_ast_node_t *tmp_node;
1134 		int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1135 		int rep_min = 0;
1136 		int rep_max = -1;
1137 #ifdef TRE_DEBUG
1138 		const tre_char_t *tmp_re;
1139 #endif
1140 
1141 		if (*ctx->re == CHAR_PLUS)
1142 		  rep_min = 1;
1143 		if (*ctx->re == CHAR_QUESTIONMARK)
1144 		  rep_max = 1;
1145 #ifdef TRE_DEBUG
1146 		tmp_re = ctx->re;
1147 #endif
1148 
1149 		if (ctx->re + 1 < ctx->re_end)
1150 		  {
1151 		    if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1152 		      {
1153 			minimal = !(ctx->cflags & REG_UNGREEDY);
1154 			ctx->re++;
1155 		      }
1156 		    else if (*(ctx->re + 1) == CHAR_STAR
1157 			     || *(ctx->re + 1) == CHAR_PLUS)
1158 		      {
1159 			/* These are reserved for future extensions. */
1160 			return REG_BADRPT;
1161 		      }
1162 		  }
1163 
1164 		DPRINT(("tre_parse: %s star: '%.*" STRF "'\n",
1165 			minimal ? "  minimal" : "greedy", REST(tmp_re)));
1166 		ctx->re++;
1167 		tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1168 					    minimal);
1169 		if (tmp_node == NULL)
1170 		  return REG_ESPACE;
1171 		result = tmp_node;
1172 		STACK_PUSHX(stack, int, PARSE_POSTFIX);
1173 	      }
1174 	      break;
1175 
1176 	    case CHAR_BACKSLASH:
1177 	      /* "\{" is special without REG_EXTENDED */
1178 	      if (!(ctx->cflags & REG_EXTENDED)
1179 		  && ctx->re + 1 < ctx->re_end
1180 		  && *(ctx->re + 1) == CHAR_LBRACE)
1181 		{
1182 		  ctx->re++;
1183 		  goto parse_brace;
1184 		}
1185 	      else
1186 		break;
1187 
1188 	    case CHAR_LBRACE:
1189 	      /* "{" is literal without REG_EXTENDED */
1190 	      if (!(ctx->cflags & REG_EXTENDED))
1191 		break;
1192 
1193 	    parse_brace:
1194 	      DPRINT(("tre_parse:	bound: '%.*" STRF "'\n",
1195 		      REST(ctx->re)));
1196 	      ctx->re++;
1197 
1198 	      status = tre_parse_bound(ctx, &result);
1199 	      if (status != REG_OK)
1200 		return status;
1201 	      STACK_PUSHX(stack, int, PARSE_POSTFIX);
1202 	      break;
1203 	    }
1204 	  break;
1205 
1206 	case PARSE_ATOM:
1207 	  /* Parse an atom.  An atom is a regular expression enclosed in `()',
1208 	     an empty set of `()', a bracket expression, `.', `^', `$',
1209 	     a `\' followed by a character, or a single character. */
1210 
1211 	  /* End of regexp? (empty string). */
1212 	  if (ctx->re >= ctx->re_end)
1213 	    goto parse_literal;
1214 
1215 #ifdef REG_LITERAL
1216 	  if (ctx->cflags & REG_LITERAL)
1217 	    goto parse_literal;
1218 #endif /* REG_LITERAL */
1219 
1220 	  switch (*ctx->re)
1221 	    {
1222 	    case CHAR_LPAREN:  /* parenthesized subexpression */
1223 
1224 	      /* Handle "(?...)" extensions.  They work in a way similar
1225 		 to Perls corresponding extensions. */
1226 	      if (ctx->cflags & REG_EXTENDED
1227 		  && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1228 		{
1229 		  int new_cflags = ctx->cflags;
1230 		  int bit = 1;
1231 		  DPRINT(("tre_parse:	extension: '%.*" STRF "\n",
1232 			  REST(ctx->re)));
1233 		  ctx->re += 2;
1234 		  while (/*CONSTCOND*/(void)1,1)
1235 		    {
1236 		      if (*ctx->re == L'i')
1237 			{
1238 			  DPRINT(("tre_parse:	    icase: '%.*" STRF "\n",
1239 				  REST(ctx->re)));
1240 			  if (bit)
1241 			    new_cflags |= REG_ICASE;
1242 			  else
1243 			    new_cflags &= ~REG_ICASE;
1244 			  ctx->re++;
1245 			}
1246 		      else if (*ctx->re == L'n')
1247 			{
1248 			  DPRINT(("tre_parse:	  newline: '%.*" STRF "\n",
1249 				  REST(ctx->re)));
1250 			  if (bit)
1251 			    new_cflags |= REG_NEWLINE;
1252 			  else
1253 			    new_cflags &= ~REG_NEWLINE;
1254 			  ctx->re++;
1255 			}
1256 #ifdef REG_RIGHT_ASSOC
1257 		      else if (*ctx->re == L'r')
1258 			{
1259 			  DPRINT(("tre_parse: right assoc: '%.*" STRF "\n",
1260 				  REST(ctx->re)));
1261 			  if (bit)
1262 			    new_cflags |= REG_RIGHT_ASSOC;
1263 			  else
1264 			    new_cflags &= ~REG_RIGHT_ASSOC;
1265 			  ctx->re++;
1266 			}
1267 #endif /* REG_RIGHT_ASSOC */
1268 #ifdef REG_UNGREEDY
1269 		      else if (*ctx->re == L'U')
1270 			{
1271 			  DPRINT(("tre_parse:    ungreedy: '%.*" STRF "\n",
1272 				  REST(ctx->re)));
1273 			  if (bit)
1274 			    new_cflags |= REG_UNGREEDY;
1275 			  else
1276 			    new_cflags &= ~REG_UNGREEDY;
1277 			  ctx->re++;
1278 			}
1279 #endif /* REG_UNGREEDY */
1280 		      else if (*ctx->re == CHAR_MINUS)
1281 			{
1282 			  DPRINT(("tre_parse:	 turn off: '%.*" STRF "\n",
1283 				  REST(ctx->re)));
1284 			  ctx->re++;
1285 			  bit = 0;
1286 			}
1287 		      else if (*ctx->re == CHAR_COLON)
1288 			{
1289 			  DPRINT(("tre_parse:	 no group: '%.*" STRF "\n",
1290 				  REST(ctx->re)));
1291 			  ctx->re++;
1292 			  depth++;
1293 			  break;
1294 			}
1295 		      else if (*ctx->re == CHAR_HASH)
1296 			{
1297 			  DPRINT(("tre_parse:    comment: '%.*" STRF "\n",
1298 				  REST(ctx->re)));
1299 			  /* A comment can contain any character except a
1300 			     right parenthesis */
1301 			  while (*ctx->re != CHAR_RPAREN
1302 				 && ctx->re < ctx->re_end)
1303 			    ctx->re++;
1304 			  if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1305 			    {
1306 			      ctx->re++;
1307 			      break;
1308 			    }
1309 			  else
1310 			    return REG_BADPAT;
1311 			}
1312 		      else if (*ctx->re == CHAR_RPAREN)
1313 			{
1314 			  ctx->re++;
1315 			  break;
1316 			}
1317 		      else
1318 			return REG_BADPAT;
1319 		    }
1320 
1321 		  /* Turn on the cflags changes for the rest of the
1322 		     enclosing group. */
1323 		  STACK_PUSHX(stack, int, ctx->cflags);
1324 		  STACK_PUSHX(stack, int, PARSE_RESTORE_CFLAGS);
1325 		  STACK_PUSHX(stack, int, PARSE_RE);
1326 		  ctx->cflags = new_cflags;
1327 		  break;
1328 		}
1329 
1330 	      if (ctx->cflags & REG_EXTENDED
1331 		  || (ctx->re > ctx->re_start
1332 		      && *(ctx->re - 1) == CHAR_BACKSLASH))
1333 		{
1334 		  depth++;
1335 		  if (ctx->re + 2 < ctx->re_end
1336 		      && *(ctx->re + 1) == CHAR_QUESTIONMARK
1337 		      && *(ctx->re + 2) == CHAR_COLON)
1338 		    {
1339 		      DPRINT(("tre_parse: group begin: '%.*" STRF
1340 			      "', no submatch\n", REST(ctx->re)));
1341 		      /* Don't mark for submatching. */
1342 		      ctx->re += 3;
1343 		      STACK_PUSHX(stack, int, PARSE_RE);
1344 		    }
1345 		  else
1346 		    {
1347 		      DPRINT(("tre_parse: group begin: '%.*" STRF
1348 			      "', submatch %d\n", REST(ctx->re),
1349 			      ctx->submatch_id));
1350 		      ctx->re++;
1351 		      /* First parse a whole RE, then mark the resulting tree
1352 			 for submatching. */
1353 		      STACK_PUSHX(stack, int, ctx->submatch_id);
1354 		      STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1355 		      STACK_PUSHX(stack, int, PARSE_RE);
1356 		      ctx->submatch_id++;
1357 		    }
1358 		}
1359 	      else
1360 		goto parse_literal;
1361 	      break;
1362 
1363 	    case CHAR_RPAREN:  /* end of current subexpression */
1364 	      if ((ctx->cflags & REG_EXTENDED && depth > 0)
1365 		  || (!(ctx->cflags & REG_EXTENDED) && ctx->re > ctx->re_start
1366 		      && *(ctx->re - 1) == CHAR_BACKSLASH))
1367 		{
1368 		  DPRINT(("tre_parse:	    empty: '%.*" STRF "'\n",
1369 			  REST(ctx->re)));
1370 		  /* We were expecting an atom, but instead the current
1371 		     subexpression was closed.	POSIX leaves the meaning of
1372 		     this to be implementation-defined.	 We interpret this as
1373 		     an empty expression (which matches an empty string).  */
1374 		  result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1375 		  if (result == NULL)
1376 		    return REG_ESPACE;
1377 		  if (!(ctx->cflags & REG_EXTENDED))
1378 		    ctx->re--;
1379 		}
1380 	      else
1381 		goto parse_literal;
1382 	      break;
1383 
1384 	    case CHAR_LBRACKET: /* bracket expression */
1385 	      DPRINT(("tre_parse:     bracket: '%.*" STRF "'\n",
1386 		      REST(ctx->re)));
1387 	      ctx->re++;
1388 	      status = tre_parse_bracket(ctx, &result);
1389 	      if (status != REG_OK)
1390 		return status;
1391 	      break;
1392 
1393 	    case CHAR_BACKSLASH:
1394 	      /* If this is "\(" or "\)" chew off the backslash and
1395 		 try again. */
1396 	      if (!(ctx->cflags & REG_EXTENDED)
1397 		  && ctx->re + 1 < ctx->re_end
1398 		  && (*(ctx->re + 1) == CHAR_LPAREN
1399 		      || *(ctx->re + 1) == CHAR_RPAREN))
1400 		{
1401 		  ctx->re++;
1402 		  STACK_PUSHX(stack, int, PARSE_ATOM);
1403 		  break;
1404 		}
1405 
1406 	      /* If a macro is used, parse the expanded macro recursively. */
1407 	      {
1408 		tre_char_t buf[64];
1409 		tre_expand_macro(ctx->re + 1, ctx->re_end,
1410 				 buf, elementsof(buf));
1411 		if (buf[0] != 0)
1412 		  {
1413 		    tre_parse_ctx_t subctx;
1414 		    memcpy(&subctx, ctx, sizeof(subctx));
1415 		    subctx.re = buf;
1416 		    subctx.len = tre_strlen(buf);
1417 		    subctx.nofirstsub = 1;
1418 		    status = tre_parse(&subctx);
1419 		    if (status != REG_OK)
1420 		      return status;
1421 		    ctx->re += 2;
1422 		    ctx->position = subctx.position;
1423 		    result = subctx.result;
1424 		    break;
1425 		  }
1426 	      }
1427 
1428 	      if (ctx->re + 1 >= ctx->re_end)
1429 		/* Trailing backslash. */
1430 		return REG_EESCAPE;
1431 
1432 #ifdef REG_LITERAL
1433 	      if (*(ctx->re + 1) == L'Q')
1434 		{
1435 		  DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1436 			  REST(ctx->re)));
1437 		  ctx->cflags |= REG_LITERAL;
1438 		  temporary_cflags |= REG_LITERAL;
1439 		  ctx->re += 2;
1440 		  STACK_PUSHX(stack, int, PARSE_ATOM);
1441 		  break;
1442 		}
1443 #endif /* REG_LITERAL */
1444 
1445 	      DPRINT(("tre_parse:  bleep: '%.*" STRF "'\n", REST(ctx->re)));
1446 	      ctx->re++;
1447 	      switch (*ctx->re)
1448 		{
1449 		case L'b':
1450 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1451 					       ASSERT_AT_WB, -1);
1452 		  ctx->re++;
1453 		  break;
1454 		case L'B':
1455 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1456 					       ASSERT_AT_WB_NEG, -1);
1457 		  ctx->re++;
1458 		  break;
1459 		case L'<':
1460 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1461 					       ASSERT_AT_BOW, -1);
1462 		  ctx->re++;
1463 		  break;
1464 		case L'>':
1465 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1466 					       ASSERT_AT_EOW, -1);
1467 		  ctx->re++;
1468 		  break;
1469 		case L'x':
1470 		  ctx->re++;
1471 		  if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1472 		    {
1473 		      /* 8 bit hex char. */
1474 		      char tmp[3] = {0, 0, 0};
1475 		      long val;
1476 		      DPRINT(("tre_parse:  8 bit hex: '%.*" STRF "'\n",
1477 			      REST(ctx->re - 2)));
1478 
1479 		      if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1480 			{
1481 			  tmp[0] = (char)ctx->re[0];
1482 			  ctx->re++;
1483 			}
1484 		      if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1485 			{
1486 			  tmp[1] = (char)ctx->re[0];
1487 			  ctx->re++;
1488 			}
1489 		      val = strtol(tmp, NULL, 16);
1490 		      result = tre_ast_new_literal(ctx->mem, (int)val,
1491 						   (int)val, ctx->position);
1492 		      ctx->position++;
1493 		      break;
1494 		    }
1495 		  else if (ctx->re < ctx->re_end)
1496 		    {
1497 		      /* Wide char. */
1498 		      char tmp[32];
1499 		      long val;
1500 		      int i = 0;
1501 		      ctx->re++;
1502 		      while (ctx->re_end - ctx->re >= 0)
1503 			{
1504 			  if (ctx->re[0] == CHAR_RBRACE)
1505 			    break;
1506 			  if (tre_isxdigit(ctx->re[0]))
1507 			    {
1508 			      tmp[i] = (char)ctx->re[0];
1509 			      i++;
1510 			      ctx->re++;
1511 			      continue;
1512 			    }
1513 			  return REG_EBRACE;
1514 			}
1515 		      ctx->re++;
1516 		      tmp[i] = 0;
1517 		      val = strtol(tmp, NULL, 16);
1518 		      result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1519 						   ctx->position);
1520 		      ctx->position++;
1521 		      break;
1522 		    }
1523 		  /*FALLTHROUGH*/
1524 
1525 		default:
1526 		  if (tre_isdigit(*ctx->re))
1527 		    {
1528 		      /* Back reference. */
1529 		      int val = *ctx->re - L'0';
1530 		      DPRINT(("tre_parse:     backref: '%.*" STRF "'\n",
1531 			      REST(ctx->re - 1)));
1532 		      result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1533 						   ctx->position);
1534 		      if (result == NULL)
1535 			return REG_ESPACE;
1536 		      ctx->position++;
1537 		      ctx->max_backref = MAX(val, ctx->max_backref);
1538 		      ctx->re++;
1539 		    }
1540 		  else
1541 		    {
1542 		      /* Escaped character. */
1543 		      DPRINT(("tre_parse:     escaped: '%.*" STRF "'\n",
1544 			      REST(ctx->re - 1)));
1545 		      result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1546 						   ctx->position);
1547 		      ctx->position++;
1548 		      ctx->re++;
1549 		    }
1550 		  break;
1551 		}
1552 	      if (result == NULL)
1553 		return REG_ESPACE;
1554 	      break;
1555 
1556 	    case CHAR_PERIOD:	 /* the any-symbol */
1557 	      DPRINT(("tre_parse:	  any: '%.*" STRF "'\n",
1558 		      REST(ctx->re)));
1559 	      if (ctx->cflags & REG_NEWLINE)
1560 		{
1561 		  tre_ast_node_t *tmp1;
1562 		  tre_ast_node_t *tmp2;
1563 		  tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
1564 					     ctx->position);
1565 		  if (!tmp1)
1566 		    return REG_ESPACE;
1567 		  tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
1568 					     ctx->position + 1);
1569 		  if (!tmp2)
1570 		    return REG_ESPACE;
1571 		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1572 		  if (!result)
1573 		    return REG_ESPACE;
1574 		  ctx->position += 2;
1575 		}
1576 	      else
1577 		{
1578 		  result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1579 					       ctx->position);
1580 		  if (!result)
1581 		    return REG_ESPACE;
1582 		  ctx->position++;
1583 		}
1584 	      ctx->re++;
1585 	      break;
1586 
1587 	    case CHAR_CARET:	 /* beginning of line assertion */
1588 	      /* '^' has a special meaning everywhere in EREs, and in the
1589 		 beginning of the RE and after \( is BREs. */
1590 	      if (ctx->cflags & REG_EXTENDED
1591 		  || (ctx->re - 2 >= ctx->re_start
1592 		      && *(ctx->re - 2) == CHAR_BACKSLASH
1593 		      && *(ctx->re - 1) == CHAR_LPAREN)
1594 		  || ctx->re == ctx->re_start)
1595 		{
1596 		  DPRINT(("tre_parse:	      BOL: '%.*" STRF "'\n",
1597 			  REST(ctx->re)));
1598 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1599 					       ASSERT_AT_BOL, -1);
1600 		  if (result == NULL)
1601 		    return REG_ESPACE;
1602 		  ctx->re++;
1603 		}
1604 	      else
1605 		goto parse_literal;
1606 	      break;
1607 
1608 	    case CHAR_DOLLAR:	 /* end of line assertion. */
1609 	      /* '$' is special everywhere in EREs, and in the end of the
1610 		 string and before \) is BREs. */
1611 	      if (ctx->cflags & REG_EXTENDED
1612 		  || (ctx->re + 2 < ctx->re_end
1613 		      && *(ctx->re + 1) == CHAR_BACKSLASH
1614 		      && *(ctx->re + 2) == CHAR_RPAREN)
1615 		  || ctx->re + 1 == ctx->re_end)
1616 		{
1617 		  DPRINT(("tre_parse:	      EOL: '%.*" STRF "'\n",
1618 			  REST(ctx->re)));
1619 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
1620 					       ASSERT_AT_EOL, -1);
1621 		  if (result == NULL)
1622 		    return REG_ESPACE;
1623 		  ctx->re++;
1624 		}
1625 	      else
1626 		goto parse_literal;
1627 	      break;
1628 
1629 	    default:
1630 	    parse_literal:
1631 
1632 	      if (temporary_cflags && ctx->re + 1 < ctx->re_end
1633 		  && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
1634 		{
1635 		  DPRINT(("tre_parse:	 end tmps: '%.*" STRF "'\n",
1636 			  REST(ctx->re)));
1637 		  ctx->cflags &= ~temporary_cflags;
1638 		  temporary_cflags = 0;
1639 		  ctx->re += 2;
1640 		  STACK_PUSHX(stack, int, PARSE_PIECE);
1641 		  break;
1642 		}
1643 
1644 
1645 	      /* We are expecting an atom.  If the subexpression (or the whole
1646 		 regexp) ends here, we interpret it as an empty expression
1647 		 (which matches an empty string).  */
1648 	      if (
1649 #ifdef REG_LITERAL
1650 		  !(ctx->cflags & REG_LITERAL) &&
1651 #endif /* REG_LITERAL */
1652 		  (ctx->re >= ctx->re_end
1653 		   || *ctx->re == CHAR_STAR
1654 		   || (ctx->cflags & REG_EXTENDED
1655 		       && (*ctx->re == CHAR_PIPE
1656 			   || *ctx->re == CHAR_LBRACE
1657 			   || *ctx->re == CHAR_PLUS
1658 			   || *ctx->re == CHAR_QUESTIONMARK))
1659 		   /* Test for "\)" in BRE mode. */
1660 		   || (!(ctx->cflags & REG_EXTENDED)
1661 		       && ctx->re + 1 < ctx->re_end
1662 		       && *ctx->re == CHAR_BACKSLASH
1663 		       && *(ctx->re + 1) == CHAR_LBRACE)))
1664 		{
1665 		  DPRINT(("tre_parse:	    empty: '%.*" STRF "'\n",
1666 			  REST(ctx->re)));
1667 		  result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1668 		  if (!result)
1669 		    return REG_ESPACE;
1670 		  break;
1671 		}
1672 
1673 /* R change */
1674 	      if ((ctx->cflags & REG_LITERAL) && !ctx->re[0]) {
1675 		  DPRINT(("tre_parse:	    literal empty: '%.*" STRF "'\n",
1676 			  REST(ctx->re)));
1677 		  result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1678 		  if (!result)
1679 		    return REG_ESPACE;
1680 		  break;
1681 	      }
1682 /* end R change */
1683 
1684 	      DPRINT(("tre_parse:     literal: '%.*" STRF "'\n",
1685 		      REST(ctx->re)));
1686 	      /* Note that we can't use an tre_isalpha() test here, since there
1687 		 may be characters which are alphabetic but neither upper or
1688 		 lower case. */
1689 	      if (ctx->cflags & REG_ICASE
1690 		  && (tre_isupper(*ctx->re) || tre_islower(*ctx->re)))
1691 		{
1692 		  tre_ast_node_t *tmp1;
1693 		  tre_ast_node_t *tmp2;
1694 
1695 		  /* XXX - Can there be more than one opposite-case
1696 		     counterpoints for some character in some locale?  Or
1697 		     more than two characters which all should be regarded
1698 		     the same character if case is ignored?  If yes, there
1699 		     does not seem to be a portable way to detect it.  I guess
1700 		     that at least for multi-character collating elements there
1701 		     could be several opposite-case counterpoints, but they
1702 		     cannot be supported portably anyway. */
1703 		  tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re),
1704 					     tre_toupper(*ctx->re),
1705 					     ctx->position);
1706 		  if (!tmp1)
1707 		    return REG_ESPACE;
1708 		  tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re),
1709 					     tre_tolower(*ctx->re),
1710 					     ctx->position);
1711 		  if (!tmp2)
1712 		    return REG_ESPACE;
1713 		  result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1714 		  if (!result)
1715 		    return REG_ESPACE;
1716 		}
1717 	      else
1718 		{
1719 		  result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1720 					       ctx->position);
1721 		  if (!result)
1722 		    return REG_ESPACE;
1723 		}
1724 	      ctx->position++;
1725 	      ctx->re++;
1726 	      break;
1727 	    }
1728 	  break;
1729 
1730 	case PARSE_MARK_FOR_SUBMATCH:
1731 	  {
1732 	    int submatch_id = tre_stack_pop_int(stack);
1733 
1734             if(result) {
1735 	      if (result->submatch_id >= 0)
1736 	        {
1737 		  tre_ast_node_t *n, *tmp_node;
1738 		  n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1739 		  if (n == NULL)
1740 		    return REG_ESPACE;
1741 		  tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1742 		  if (tmp_node == NULL)
1743 		    return REG_ESPACE;
1744 		  tmp_node->num_submatches = result->num_submatches;
1745 		  result = tmp_node;
1746 	        }
1747 	      result->submatch_id = submatch_id;
1748 	      result->num_submatches++;
1749 	    }
1750 	    break;
1751 	  }
1752 
1753 	case PARSE_RESTORE_CFLAGS:
1754 	  ctx->cflags = tre_stack_pop_int(stack);
1755 	  break;
1756 
1757 	default:
1758 	  assert(0);
1759 	  break;
1760 	}
1761     }
1762 
1763   /* Check for missing closing parentheses. */
1764   if (depth > 0)
1765     return REG_EPAREN;
1766 
1767   if (status == REG_OK)
1768     ctx->result = result;
1769 
1770   return status;
1771 }
1772 
1773 /* EOF */
1774