xref: /dragonfly/contrib/tre/lib/tre-parse.c (revision d9f85b33)
1 /*
2   tre-parse.c - Regexp parser
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   This parser is just a simple recursive descent parser for POSIX.2
11   regexps.  The parser supports both the obsolete default syntax and
12   the "extended" syntax, and some nonstandard extensions.
13 */
14 
15 
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <string.h>
20 #include <assert.h>
21 #include <limits.h>
22 #include <stddef.h>
23 
24 #include "xmalloc.h"
25 #include "tre-mem.h"
26 #include "tre-ast.h"
27 #include "tre-stack.h"
28 #include "tre-parse.h"
29 
30 #include "xlocale_private.h"
31 #include "collate.h"
32 
33 /* BSD compatibility:
34      Before looking up a collating symbol, check if the name matches in
35      the character names (cnames) array; if so, use the corresponding
36      character.
37 
38      Also set ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND, which will preserve
39      the implementation choice that for ERE, a non-numeric character following
40      a left brace that would normally be a bound, causes the left brace to be
41      literal. */
42 #define BSD_COMPATIBILITY
43 #ifdef BSD_COMPATIBILITY
44 #include "cname.h"
45 #define ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
46 #endif /* BSD_COMPATIBILITY */
47 
48 /* Characters with special meanings in regexp syntax. */
49 #define CHAR_PIPE	   L'|'
50 #define CHAR_LPAREN	   L'('
51 #define CHAR_RPAREN	   L')'
52 #define CHAR_LBRACE	   L'{'
53 #define CHAR_RBRACE	   L'}'
54 #define CHAR_LBRACKET	   L'['
55 #define CHAR_RBRACKET	   L']'
56 #define CHAR_MINUS	   L'-'
57 #define CHAR_STAR	   L'*'
58 #define CHAR_QUESTIONMARK  L'?'
59 #define CHAR_PLUS	   L'+'
60 #define CHAR_PERIOD	   L'.'
61 #define CHAR_COLON	   L':'
62 #define CHAR_EQUAL	   L'='
63 #define CHAR_COMMA	   L','
64 #define CHAR_CARET	   L'^'
65 #define CHAR_DOLLAR	   L'$'
66 #define CHAR_BACKSLASH	   L'\\'
67 #define CHAR_HASH	   L'#'
68 #define CHAR_TILDE	   L'~'
69 
70 
71 /* Some macros for expanding \w, \s, etc. */
72 static const struct tre_macro_struct {
73   const char c;
74   const char *expansion;
75 } tre_macros[] =
76   { {'t', "\t"},	   {'n', "\n"},		   {'r', "\r"},
77     {'f', "\f"},	   {'a', "\a"},		   {'e', "\033"},
78     {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
79     {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"},   {'D', "[^[:digit:]]"},
80     { 0, NULL }
81   };
82 
83 
84 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
85    must have at least `len' items.  Sets buf[0] to zero if the there
86    is no match in `tre_macros'. */
87 static void
88 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
89 		 tre_char_t *buf, size_t buf_len)
90 {
91   int i;
92 
93   buf[0] = 0;
94   if (regex >= regex_end)
95     return;
96 
97   for (i = 0; tre_macros[i].expansion; i++)
98     {
99       if (tre_macros[i].c == *regex)
100 	{
101 	  unsigned int j;
102 	  DPRINT(("Expanding macro '%c' => '%s'\n",
103 		  tre_macros[i].c, tre_macros[i].expansion));
104 	  for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
105 	    buf[j] = tre_macros[i].expansion[j];
106 	  buf[j] = 0;
107 	  break;
108 	}
109     }
110 }
111 
112 static reg_errcode_t
113 tre_new_item(tre_mem_t mem, int type, int val, int *max_i,
114 	 tre_bracket_match_list_t **items)
115 {
116   reg_errcode_t status = REG_OK;
117   tre_bracket_match_list_t *array = *items;
118   int i = array->num_bracket_matches;
119   /* Allocate more space if necessary. */
120   if (i >= *max_i)
121     {
122       tre_bracket_match_list_t *new_items;
123       DPRINT(("out of tre_bracket_match_list_t array space (%d)\n", i));
124       /* If the array is already 1024 items large, give up -- there's
125 	 probably an error in the regexp (e.g. not a '\0' terminated
126 	 string and missing ']') */
127       if (*max_i >= 1024)
128 	return REG_ESPACE;
129       *max_i *= 2;
130       new_items = xrealloc(array, SIZEOF_BRACKET_MATCH_LIST_N(*max_i));
131       if (new_items == NULL)
132 	return REG_ESPACE;
133       *items = array = new_items;
134     }
135   array->bracket_matches[i].type = type;
136   array->bracket_matches[i].value = val;
137   array->num_bracket_matches++;
138   return status;
139 }
140 
141 #ifndef TRE_USE_SYSTEM_WCTYPE
142 
143 /* isalnum() and the rest may be macros, so wrap them to functions. */
144 int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
145 int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
146 
147 #ifdef tre_isascii
148 int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
149 #else /* !tre_isascii */
150 int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
151 #endif /* !tre_isascii */
152 
153 #ifdef tre_isblank
154 int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
155 #else /* !tre_isblank */
156 int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
157 #endif /* !tre_isblank */
158 
159 int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
160 int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
161 int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
162 int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
163 int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
164 int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
165 int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
166 int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
167 int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
168 
169 struct {
170   char *name;
171   int (*func)(tre_cint_t);
172 } tre_ctype_map[] = {
173   { "alnum", &tre_isalnum_func },
174   { "alpha", &tre_isalpha_func },
175 #ifdef tre_isascii
176   { "ascii", &tre_isascii_func },
177 #endif /* tre_isascii */
178 #ifdef tre_isblank
179   { "blank", &tre_isblank_func },
180 #endif /* tre_isblank */
181   { "cntrl", &tre_iscntrl_func },
182   { "digit", &tre_isdigit_func },
183   { "graph", &tre_isgraph_func },
184   { "lower", &tre_islower_func },
185   { "print", &tre_isprint_func },
186   { "punct", &tre_ispunct_func },
187   { "space", &tre_isspace_func },
188   { "upper", &tre_isupper_func },
189   { "xdigit", &tre_isxdigit_func },
190   { NULL, NULL}
191 };
192 
193 tre_ctype_t tre_ctype(const char *name)
194 {
195   int i;
196   for (i = 0; tre_ctype_map[i].name != NULL; i++)
197     {
198       if (strcmp(name, tre_ctype_map[i].name) == 0)
199 	return tre_ctype_map[i].func;
200     }
201   return (tre_ctype_t)0;
202 }
203 #endif /* !TRE_USE_SYSTEM_WCTYPE */
204 
205 #define REST(re) (int)(ctx->re_end - (re)), (re)
206 
207 #define START_COLLATING_SYMBOLS		16
208 #define MAX_COLLATING_SYMBOL_LEN	4
209 
210 typedef struct {
211   const tre_char_t *start;
212   int len;
213 } tre_collating_symbol;
214 
215 #ifdef BSD_COMPATIBILITY
216 static wchar_t
217 tre_search_cnames(const wchar_t *name, size_t len)
218 {
219   size_t low = 0;
220   size_t high = NCNAMES - 1;
221   size_t cur;
222   int cmp;
223 
224   while(low <= high)
225     {
226       cur = (low + high) / 2;
227       cmp = wcsncmp(name, cnames[cur].name, len);
228       if (cmp == 0 && cnames[cur].name[len] == 0) return cnames[cur].code;
229       if (cmp > 0) low = cur + 1;
230       else high = cur - 1;
231     }
232   return (wchar_t)-1;
233 }
234 #endif /* BSD_COMPATIBILITY */
235 
236 /* Scan the contents of a bracket expression, and create a
237  * tre_bracket_match_list_t encoding the bracket expression.  If during
238  * the scan, multi-character collating symbols are detected, switch
239  * into a mode to collect those MCCSs into a tre_collating_symbol
240  * list and pass them back.  tre_parse_bracket will use that to
241  * create a new string composed of a union of the bracket expression
242  * without the MCCSs and the MCCSs (e.g., [x[.ch.]] => [x]|ch), and
243  * call tre_parse (recursive) to parse that new string (which will
244  * call tre_parse_bracket and tre_parse_bracket_items again. */
245 static reg_errcode_t
246 tre_parse_bracket_items(tre_parse_ctx_t *ctx, tre_bracket_match_list_t **items,
247 			int *items_size, tre_collating_symbol **result)
248 {
249   const tre_char_t *re = ctx->re;
250   const tre_char_t *re_end = ctx->re_end;
251   tre_collating_symbol *col_syms = NULL;
252   tre_collating_symbol *cp = NULL;
253   int n_col_syms = 0;
254   reg_errcode_t status;
255   int max_i = *items_size;
256   int other = 0;  /* contains content other than multi-character collating
257 		   * symbols */
258   int range = -1; /* -1 unset, 0 begin range set, +1 end range expected */
259   tre_cint_t min, c;
260   int invert = ((*items)->flags & TRE_BRACKET_MATCH_FLAG_NEGATE);
261   int collect_MCCS = 0;
262   const tre_char_t *start;
263 
264   for ( ;re < re_end; re++)
265     {
266       switch (*re)
267 	{
268 	case CHAR_MINUS:
269 	  /* A first hyphen */
270 	  if (re == ctx->re)
271 	    {
272 	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
273 	      min = CHAR_MINUS;
274 	      other++;
275 	      range = 0;
276 	      break;
277 	    }
278 	  /* The hyphen is the end range */
279 	  if (range > 0)
280 	    {
281 	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
282 	      c = CHAR_MINUS;
283 	      goto process_end_range;
284 	    }
285 	  if (re + 1 >= re_end)
286 	    {
287 	      status = REG_EBRACK;
288 	      goto error;
289 	    }
290 	  /* The hyphen is at the end */
291 	  if (re[1] == CHAR_RBRACKET)
292 	    {
293 	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
294 	      c = CHAR_MINUS;
295 	      goto process_begin_range;
296 	    }
297 	  /* Two ranges are not allowed to share an endpoint, or begin
298 	   * range is illegal. */
299 	  if (range < 0)
300 	    {
301 	      status = REG_ERANGE;
302 	      goto error;
303 	    }
304 	  range = 1; /* Expect end range */
305 	  DPRINT(("tre_parse_bracket:   range: '%.*" STRF "'\n", REST(re)));
306 	  break;
307 
308 	case CHAR_LBRACKET:
309 	  if (re + 1 >= re_end)
310 	    {
311 	      status = REG_EBRACK;
312 	      goto error;
313 	    }
314 	  switch (re[1])
315 	    {
316 	    case CHAR_PERIOD:
317 	      {
318 		re += 2;
319 		start = re;
320 		for (;; re++)
321 		  {
322 		    if (re >= re_end)
323 		      {
324 			status = REG_ECOLLATE;
325 			goto error;
326 		      }
327 		    if (*re == CHAR_PERIOD)
328 		      {
329 			if (re + 1 >= re_end)
330 			  {
331 			    status = REG_ECOLLATE;
332 			    goto error;
333 			  }
334 			/* Found end */
335 			if (re[1] == CHAR_RBRACKET)
336 			  {
337 			    DPRINT(("tre_parse_bracket:   collating "
338 				    "symbol: '%.*" STRF "'\n",
339 				    REST(start - 2)));
340 			    /* Empty name */
341 			    if (re == start)
342 			      {
343 				status = REG_ECOLLATE;
344 				goto error;
345 			      }
346 #ifdef BSD_COMPATIBILITY
347 			    /* Check if the name is in cnames; if so, use
348 			       the corresponding code */
349 			    c = tre_search_cnames(start, re - start);
350 			    if (c != (wchar_t)-1)
351 			      {
352 				re++;
353 				goto process_single_character;
354 			      }
355 #endif /* BSD_COMPATIBILITY */
356 			    /* Verify this is a known sequence */
357 			    if (__collate_equiv_value(ctx->loc, start,
358 							  re - start) <= 0)
359 			      {
360 				status = REG_ECOLLATE;
361 				goto error;
362 			      }
363 			    /* Process single character collating symbols */
364 			    if (re - start == 1)
365 			      {
366 				c = *start;
367 				re++;
368 				goto process_single_character;
369 			      }
370 			    /* Inverted MCCSs are undefined */
371 			    if (invert)
372 			      {
373 				status = REG_ECOLLATE;
374 				goto error;
375 			      }
376 			    /* Can't have MCCSs as an endpoint to a range */
377 			    if (range > 0)
378 			      {
379 				status = REG_ERANGE;
380 				goto error;
381 			      }
382 			    range = -1;
383 			    /* Switch into MCCS collection mode (if not
384 			     * already there */
385 #if TRE_DEBUG
386 			    if (!collect_MCCS)
387 			      {
388 				collect_MCCS = 1;
389 				DPRINT(("tre_parse_bracket: Detected MCCS\n"));
390 			      }
391 #else /* !TRE_DEBUG */
392 			    collect_MCCS = 1;
393 #endif /* !TRE_DEBUG */
394 			    /* Allocate a memory block the first time */
395 			    if (!cp)
396 			      {
397 				if ((col_syms = xmalloc(sizeof(*col_syms) *
398 					    (START_COLLATING_SYMBOLS + 2)))
399 					    == NULL)
400 				  return REG_ESPACE;
401 				cp = col_syms + 1;
402 				n_col_syms = START_COLLATING_SYMBOLS;
403 			      }
404 			    /* Enlarge the memory block is more is needed */
405 			    if ((cp - col_syms) - 1 >= n_col_syms)
406 			      {
407 				int i = n_col_syms;
408 				tre_collating_symbol *tmp =
409 				    xrealloc(col_syms, sizeof(*col_syms) *
410 					     ((n_col_syms *= 2) + 2));
411 				if (tmp == NULL)
412 				  {
413 				    xfree(col_syms);
414 				    return REG_ESPACE;
415 				  }
416 				DPRINT(("tre_list_collating_symbols: "
417 					"Enlarging col_syms to %d\n",
418 					n_col_syms));
419 				col_syms = tmp;
420 				cp = col_syms + i + 1;
421 			      }
422 			    cp->start = start;
423 			    cp->len = re - start;
424 			    cp++;
425 			    re++;
426 			    break;
427 			  }
428 		      }
429 		  }
430 		break;
431 	      }
432 
433 	    case CHAR_EQUAL:
434 	    case CHAR_COLON:
435 	      {
436 		/* Process equivalence and character classes */
437 		tre_char_t kind = re[1];
438 
439 		/* Can't have a class as an endpoint to a range */
440 		if (range > 0)
441 		  {
442 		    status = REG_ERANGE;
443 		    goto error;
444 		  }
445 		if (!collect_MCCS && range == 0)
446 		  {
447 		    status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
448 					  min, &max_i, items);
449 		    if (status != REG_OK)
450 		      goto error;
451 		  }
452 		range = -1;
453 		re += 2;
454 		start = re;
455 		for (;; re++)
456 		  {
457 		    if (re >= re_end)
458 		      {
459 			status = kind == CHAR_EQUAL ? REG_ECOLLATE : REG_ECTYPE;
460 			goto error;
461 		      }
462 		    if (*re == kind)
463 		      {
464 			if (re + 1 >= re_end)
465 			  {
466 			    status = kind == CHAR_EQUAL ? REG_ECOLLATE :
467 							  REG_ECTYPE;
468 			    goto error;
469 			  }
470 			/* Found end */
471 			if (re[1] == CHAR_RBRACKET)
472 			  {
473 			    if (re == start)
474 			      {
475 				/* Empty class name */
476 				status = kind == CHAR_EQUAL ? REG_ECOLLATE :
477 							      REG_ECTYPE;
478 				goto error;
479 			      }
480 			    /* Process equivalence class */
481 			    if (kind == CHAR_EQUAL)
482 			      {
483 				int equiv;
484 
485 				DPRINT(("tre_parse_bracket:   equivalence: '%.*"
486 					STRF "'\n", REST(start - 2)));
487 
488 				/* While we find the collation value even for
489 				   multi-character collating elements , we
490 				   don't (yet) match any collation values
491 				   against multi-character sequences.  We'd have
492 				   to enumerate those multi-character sequences
493 				   and like multi-character collating symbols,
494 				   create a union of those sequences with the
495 				   rest of the bracket expression.  While
496 				   doable, a bracket expression matching
497 				   multiple characters, that doesn't explicitly
498 				   contain multi-character sequences, might
499 				   be unexpected, so we punt for now. */
500 				if ((equiv = __collate_equiv_value(ctx->loc,
501 					     start, re - start)) <= 0)
502 				  {
503 				    /* The standard says that if no collating
504 				       element if found, we use the collating
505 				       symbol itself.  But __collate_equiv_value
506 				       doesn't make a distinction between
507 				       an element that is in a equvalence
508 				       class with others, or is the only member,
509 				       so we already know there is no collating
510 				       symbol.  (Note that in the case of a
511 				       collating element whose collation value
512 				       is unique, matching against the
513 				       collating element itself, or against
514 				       its collation value, is equivalent.) */
515 #ifdef BSD_COMPATIBILITY
516 				    /* Check if the name is in cnames; if so,
517 				       use the corresponding code */
518 				    c = tre_search_cnames(start, re - start);
519 				    if (c != (wchar_t)-1)
520 				      {
521 					re++;
522 					goto process_single_character;
523 				      }
524 #endif /* BSD_COMPATIBILITY */
525 				    status = REG_ECOLLATE;
526 				    goto error;
527 				  }
528 				if (!collect_MCCS)
529 				  {
530 				    status = tre_new_item(ctx->mem,
531 					     TRE_BRACKET_MATCH_TYPE_EQUIVALENCE,
532 					     equiv, &max_i, items);
533 				    if (status != REG_OK)
534 				      goto error;
535 				  }
536 			      }
537 			    else
538 			      {
539 				/* Process character class */
540 				DPRINT(("tre_parse_bracket:  class: '%.*" STRF
541 					"'\n", REST(start - 2)));
542 				if (!collect_MCCS)
543 				  {
544 				    char tmp_str[64];
545 				    tre_ctype_t class;
546 				    int len = MIN(re - start, 63);
547 #ifdef TRE_WCHAR
548 				    {
549 				      tre_char_t tmp_wcs[64];
550 				      wcsncpy(tmp_wcs, start, (size_t)len);
551 				      tmp_wcs[len] = L'\0';
552 #if defined HAVE_WCSRTOMBS
553 				      {
554 					mbstate_t state;
555 					const tre_char_t *src = tmp_wcs;
556 					memset(&state, '\0', sizeof(state));
557 					len = wcsrtombs_l(tmp_str, &src,
558 						      sizeof(tmp_str), &state,
559 						      ctx->loc);
560 				      }
561 #elif defined HAVE_WCSTOMBS
562 				      len = wcstombs(tmp_str, tmp_wcs, 63);
563 #endif /* defined HAVE_WCSTOMBS */
564 				    }
565 #else /* !TRE_WCHAR */
566 				    strncpy(tmp_str, (const char*)start, len);
567 #endif /* !TRE_WCHAR */
568 				    tmp_str[len] = '\0';
569 				    DPRINT(("  class name: %s\n", tmp_str));
570 				    class = tre_ctype_l(tmp_str, ctx->loc);
571 				    if (!class)
572 				      {
573 					status = REG_ECTYPE;
574 					goto error;
575 				      }
576 				    status = tre_new_item(ctx->mem,
577 					     TRE_BRACKET_MATCH_TYPE_CLASS,
578 					     class, &max_i, items);
579 				    if (status != REG_OK)
580 				      goto error;
581 				  }
582 			      }
583 			    re++;
584 			    break;
585 			  }
586 		      }
587 		  }
588 		other++;
589 		break;
590 	      }
591 
592 	    default:
593 	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
594 	      c = CHAR_LBRACKET;
595 	      goto process_single_character;
596 	      break;
597 	    }
598 	  break;
599 
600 	case CHAR_RBRACKET:
601 	  /* A first right bracket */
602 	  if (re == ctx->re)
603 	    {
604 	      DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
605 	      min = CHAR_RBRACKET;
606 	      range = 0;
607 	      other++;
608 	      break;
609 	    }
610 	  /* Done */
611 	  if (collect_MCCS)
612 	    {
613 	      DPRINT(("tre_parse_bracket:	done: '%.*" STRF "'\n",
614 		      REST(re)));
615 	      if (col_syms)
616 		{
617 		  /* Mark the character following the right bracket.  Set len
618 		   * to whether there are other things besides the
619 		   * multi-character collating symbols */
620 		  col_syms->start = re + 1;
621 		  col_syms->len = other;
622 		  /* Mark the end of the list */
623 		  cp->start = NULL;
624 		}
625 	      *result = col_syms;
626 	      return REG_OK;
627 	    }
628 	  /* range > 0 is not possible, since we did a lookahead after the
629 	   * hyphen */
630 	  if (range == 0)
631 	    {
632 	      status = tre_new_item(ctx->mem, TRE_BRACKET_MATCH_TYPE_CHAR,
633 				    min, &max_i, items);
634 	      if (status != REG_OK)
635 		goto error;
636 	    }
637 	  DPRINT(("tre_parse_bracket:	done: '%.*" STRF "'\n", REST(re)));
638 	  *items_size = max_i;
639 	  ctx->re = re + 1;
640 	  return REG_OK;
641 
642 	default:
643 	  DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
644 	  c = *re;
645 process_single_character:
646 	  /* Process single character */
647 	  if (range > 0)
648 	    {
649 	      int mine, maxe;
650 
651 process_end_range:
652 	      /* Get collation equivalence values */
653 	      mine = __collate_equiv_value(ctx->loc, &min, 1);
654 	      maxe = __collate_equiv_value(ctx->loc, &c, 1);
655 	      if (maxe < mine)
656 		{
657 		  status = REG_ERANGE;
658 		  goto error;
659 		}
660 	      if (!collect_MCCS)
661 		{
662 		  status = tre_new_item(ctx->mem,
663 					TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN,
664 					mine, &max_i, items);
665 		  if (status != REG_OK)
666 		    goto error;
667 		  status = tre_new_item(ctx->mem,
668 					TRE_BRACKET_MATCH_TYPE_RANGE_END,
669 					maxe, &max_i, items);
670 		  if (status != REG_OK)
671 		    goto error;
672 		}
673 	      range = -1;
674 	    }
675 	  else
676 	    {
677 process_begin_range:
678 	      if (!collect_MCCS)
679 		{
680 		  if (range == 0)
681 		    {
682 		      status = tre_new_item(ctx->mem,
683 					    TRE_BRACKET_MATCH_TYPE_CHAR,
684 					    min, &max_i, items);
685 		      if (status != REG_OK)
686 			goto error;
687 		    }
688 		  min = c;
689 		}
690 	      range = 0;
691 	    }
692 	  other++;
693 	  break;
694 	}
695     }
696   status = REG_EBRACK;
697 error:
698   DPRINT(("tre_parse_bracket:	error: '%.*" STRF "', status=%d\n",
699 	  REST(re), status));
700   if (col_syms)
701     xfree(col_syms);
702   return status;
703 }
704 
705 #ifdef TRE_DEBUG
706 static const char *bracket_match_type_str[] = {
707   "unused",
708   "char",
709   "range begin",
710   "range end",
711   "class",
712   "equivalence value",
713 };
714 #endif /* TRE_DEBUG */
715 
716 static reg_errcode_t
717 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
718 {
719   tre_ast_node_t *node;
720   reg_errcode_t status = REG_OK;
721   tre_bracket_match_list_t *items;
722   int max_i = 32;
723   tre_collating_symbol *col_syms = NULL;
724 
725   /* Handle special cases [[:<:]] and [[:>:]] */
726   if (ctx->re_end - ctx->re >= 6 && ctx->re[0] == CHAR_LBRACKET
727       && ctx->re[1] == CHAR_COLON && (ctx->re[2] == L'<' || ctx->re[2] == L'>')
728       && ctx->re[3] == CHAR_COLON && ctx->re[4] == CHAR_RBRACKET
729       && ctx->re[5] == CHAR_RBRACKET)
730     {
731       *result = tre_ast_new_literal(ctx->mem, ASSERTION,
732 		      (ctx->re[2] == L'<') ? ASSERT_AT_BOW : ASSERT_AT_EOW,
733 		      -1);
734       DPRINT(("tre_parse_bracket: special case %s\n", (ctx->re[2] == L'<') ?
735 	      "[[:<:]]" : "[[:>:]]"));
736       ctx->re += 6;
737       return *result ? REG_OK : REG_ESPACE;
738     }
739 
740   /* Start off with an array of `max_i' elements. */
741   items = xcalloc(1, SIZEOF_BRACKET_MATCH_LIST_N(max_i));
742   if (items == NULL)
743     return REG_ESPACE;
744 
745   if (*ctx->re == CHAR_CARET)
746     {
747       DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
748       items->flags |= TRE_BRACKET_MATCH_FLAG_NEGATE;
749       ctx->re++;
750     }
751 
752   status = tre_parse_bracket_items(ctx, &items, &max_i, &col_syms);
753 
754   if (status != REG_OK)
755     goto parse_bracket_done;
756 
757   /* If there are collating symbols, split off the multi-character ones
758    * into a union of the bracket expression (without the collating symbols)
759    * and the multiple-character sequences.  We create an equivalent input
760    * string and run tre_parse() recursively */
761   if (col_syms)
762     {
763       tre_char_t *str, *sp;
764       tre_collating_symbol *cp;
765       tre_parse_ctx_t subctx;
766 
767       /* Allocate a new string.  We start with the size of the original
768        * bracket expression (minus 1) and add 2 (for a leading "[" and
769        * a trailing nil; don't need a "^", since it is illegal to have
770        * inverted MCCSs).  Since a multi-character collating symbols
771        * will be converted from "[.xx.]" to "|xx" (n+4 to n+1), we don't
772        * need to worry about the new string getting too long. */
773       xfree(items);
774       str = xmalloc(sizeof(*str) * ((col_syms->start - ctx->re) + 2));
775       if (str == NULL)
776 	{
777 	  xfree(col_syms);
778 	  return REG_ESPACE;
779 	}
780       sp = str;
781       if (col_syms->len > 0)
782 	{
783 	  /* There are other items in the bracket expression besides the
784 	   * multi-character collating symbols, so create a new bracket
785 	   * expression with only those other itmes. */
786 	  const tre_char_t *re;
787 	  ptrdiff_t i;
788 
789 	  *sp++ = '[';
790 	  re = ctx->re;
791 	  for (cp = col_syms + 1; cp->start; cp++)
792 	    {
793 	      /* The "- 2" is to account for the "[." */
794 	      if ((i = ((cp->start - re) - 2)) > 0)
795 		{
796 		  memcpy(sp, re, sizeof(*sp) * i);
797 		  sp += i;
798 		}
799 	      /* The "+ 2" is to account for the ".]" */
800 	      re = cp->start + cp->len + 2;
801 	    }
802 	    i = col_syms->start - re; /* Includes the trailing right bracket */
803 	    memcpy(sp, re, sizeof(*sp) * i);
804 	    sp += i;
805 	    *sp++ = '|';
806 	}
807       for (cp = col_syms + 1; cp->start; cp++)
808 	{
809 	  memcpy(sp, cp->start, sizeof(*sp) * cp->len);
810 	  sp += cp->len;
811 	  if (cp[1].start)
812 	    *sp++ = '|';
813 	}
814       *sp = 0;
815       DPRINT(("tre_parse_bracket: Reparsing bracket expression with '%ls'\n",
816 	      str));
817 
818       memcpy(&subctx, ctx, sizeof(subctx));
819       subctx.re = str;
820       subctx.len = sp - str;
821       subctx.nofirstsub = 1;
822       subctx.cflags |= REG_EXTENDED; /* Force extended mode for parsing */
823       status = tre_parse(&subctx);
824       xfree(str);
825       if (status != REG_OK)
826 	{
827 	  xfree(col_syms);
828 	  return status;
829 	}
830       ctx->re = col_syms->start;
831       ctx->position = subctx.position;
832       xfree(col_syms);
833       *result = subctx.result;
834       DPRINT(("tre_parse_bracket: Returning to original string\n"));
835       return REG_OK;
836     }
837 
838   DPRINT(("tre_parse_bracket: creating bracket expression literal\n"));
839   node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position);
840   if (node == NULL)
841     {
842       status = REG_ESPACE;
843       goto parse_bracket_done;
844     }
845   else
846     {
847       tre_literal_t *l = node->obj;
848       l->u.bracket_match_list = tre_mem_alloc(ctx->mem,
849 					 SIZEOF_BRACKET_MATCH_LIST(items));
850       if (l->u.bracket_match_list == NULL)
851 	{
852 	  status = REG_ESPACE;
853 	  goto parse_bracket_done;
854 	}
855       memcpy(l->u.bracket_match_list, items, SIZEOF_BRACKET_MATCH_LIST(items));
856     }
857 
858 #ifdef TRE_DEBUG
859   {
860     int i;
861     tre_bracket_match_t *b;
862     DPRINT(("tre_parse_bracket: %d bracket match items, flags 0x%x\n",
863 	    items->num_bracket_matches, items->flags));
864     for (i = 0, b = items->bracket_matches;
865 	 i < items->num_bracket_matches; i++, b++)
866       {
867 	DPRINT(("   %d: %s %d\n", i, bracket_match_type_str[b->type],
868 		b->value));
869       }
870   }
871 #endif /* TRE_DEBUG */
872 
873  parse_bracket_done:
874   xfree(items);
875   ctx->position++;
876   *result = node;
877   return status;
878 }
879 
880 
881 /* Parses a positive decimal integer.  Returns -1 if the string does not
882    contain a valid number. */
883 static int
884 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
885 {
886   int num = -1;
887   const tre_char_t *r = *regex;
888   while (r < regex_end && *r >= L'0' && *r <= L'9')
889     {
890       if (num < 0)
891 	num = 0;
892       num = num * 10 + *r - L'0';
893       r++;
894     }
895   *regex = r;
896   return num;
897 }
898 
899 
900 static reg_errcode_t
901 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
902 {
903   int min, max;
904 #ifdef TRE_APPROX
905   int i;
906   int cost_ins, cost_del, cost_subst, cost_max;
907   int limit_ins, limit_del, limit_subst, limit_err;
908   const tre_char_t *start;
909 #endif /* TRE_APPROX */
910   const tre_char_t *r = ctx->re;
911   int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
912 #ifdef TRE_APPROX
913   int approx = 0;
914   int costs_set = 0;
915   int counts_set = 0;
916 
917   cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
918   limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
919 #endif /* TRE_APPROX */
920 
921   /* Parse number (minimum repetition count). */
922   min = -1;
923   if (r >= ctx->re_end)
924 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
925     return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_EBRACE;
926 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
927     return REG_EBRACE;
928 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
929   if (*r >= L'0' && *r <= L'9') {
930     DPRINT(("tre_parse:	  min count: '%.*" STRF "'\n", REST(r)));
931     min = tre_parse_int(&r, ctx->re_end);
932   }
933 #ifndef TRE_APPROX
934   else
935 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
936       /* For ERE, return REG_NOMATCH to signal that the lbrace should
937          be treated as a literal */
938       return (ctx->cflags & REG_EXTENDED) ? REG_NOMATCH : REG_BADBR;
939 #else /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
940       return REG_BADBR;
941 #endif /* !ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
942 #endif /* !TRE_APPROX */
943 
944   /* Parse comma and second number (maximum repetition count). */
945   max = min;
946   if (r < ctx->re_end && *r == CHAR_COMMA)
947     {
948       r++;
949       DPRINT(("tre_parse:   max count: '%.*" STRF "'\n", REST(r)));
950       max = tre_parse_int(&r, ctx->re_end);
951     }
952 
953   /* Check that the repeat counts are sane. */
954   if ((max >= 0 && min > max) || min > RE_DUP_MAX || max > RE_DUP_MAX)
955     return REG_BADBR;
956 
957 
958 #ifdef TRE_APPROX
959   /*
960    '{'
961      optionally followed immediately by a number == minimum repcount
962      optionally followed by , then a number == maximum repcount
963       + then a number == maximum insertion count
964       - then a number == maximum deletion count
965       # then a number == maximum substitution count
966       ~ then a number == maximum number of errors
967       Any of +, -, # or ~ without followed by a number means that
968       the maximum count/number of errors is infinite.
969 
970       An equation of the form
971 	Xi + Yd + Zs < C
972       can be specified to set costs and the cost limit to a value
973       different from the default value:
974 	- X is the cost of an insertion
975 	- Y is the cost of a deletion
976 	- Z is the cost of a substitution
977 	- C is the maximum cost
978 
979       If no count limit or cost is set for an operation, the operation
980       is not allowed at all.
981   */
982 
983 
984   do {
985     int done;
986     start = r;
987 
988     /* Parse count limit settings */
989     done = 0;
990     if (!counts_set)
991       while (r + 1 < ctx->re_end && !done)
992 	{
993 	  switch (*r)
994 	    {
995 	    case CHAR_PLUS:  /* Insert limit */
996 	      DPRINT(("tre_parse:   ins limit: '%.*" STRF "'\n", REST(r)));
997 	      r++;
998 	      limit_ins = tre_parse_int(&r, ctx->re_end);
999 	      if (limit_ins < 0)
1000 		limit_ins = INT_MAX;
1001 	      counts_set = 1;
1002 	      break;
1003 	    case CHAR_MINUS: /* Delete limit */
1004 	      DPRINT(("tre_parse:   del limit: '%.*" STRF "'\n", REST(r)));
1005 	      r++;
1006 	      limit_del = tre_parse_int(&r, ctx->re_end);
1007 	      if (limit_del < 0)
1008 		limit_del = INT_MAX;
1009 	      counts_set = 1;
1010 	      break;
1011 	    case CHAR_HASH:  /* Substitute limit */
1012 	      DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
1013 	      r++;
1014 	      limit_subst = tre_parse_int(&r, ctx->re_end);
1015 	      if (limit_subst < 0)
1016 		limit_subst = INT_MAX;
1017 	      counts_set = 1;
1018 	      break;
1019 	    case CHAR_TILDE: /* Maximum number of changes */
1020 	      DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
1021 	      r++;
1022 	      limit_err = tre_parse_int(&r, ctx->re_end);
1023 	      if (limit_err < 0)
1024 		limit_err = INT_MAX;
1025 	      approx = 1;
1026 	      break;
1027 	    case CHAR_COMMA:
1028 	      r++;
1029 	      break;
1030 	    case L' ':
1031 	      r++;
1032 	      break;
1033 	    case L'}':
1034 	      done = 1;
1035 	      break;
1036 	    default:
1037 	      done = 1;
1038 	      break;
1039 	    }
1040 	}
1041 
1042     /* Parse cost restriction equation. */
1043     done = 0;
1044     if (!costs_set)
1045       while (r + 1 < ctx->re_end && !done)
1046 	{
1047 	  switch (*r)
1048 	    {
1049 	    case CHAR_PLUS:
1050 	    case L' ':
1051 	      r++;
1052 	      break;
1053 	    case L'<':
1054 	      DPRINT(("tre_parse:    max cost: '%.*" STRF "'\n", REST(r)));
1055 	      r++;
1056 	      while (*r == L' ')
1057 		r++;
1058 	      cost_max = tre_parse_int(&r, ctx->re_end);
1059 	      if (cost_max < 0)
1060 		cost_max = INT_MAX;
1061 	      else
1062 		cost_max--;
1063 	      approx = 1;
1064 	      break;
1065 	    case CHAR_COMMA:
1066 	      r++;
1067 	      done = 1;
1068 	      break;
1069 	    default:
1070 	      if (*r >= L'0' && *r <= L'9')
1071 		{
1072 #ifdef TRE_DEBUG
1073 		  const tre_char_t *sr = r;
1074 #endif /* TRE_DEBUG */
1075 		  int cost = tre_parse_int(&r, ctx->re_end);
1076 		  /* XXX - make sure r is not past end. */
1077 		  switch (*r)
1078 		    {
1079 		    case L'i':	/* Insert cost */
1080 		      DPRINT(("tre_parse:    ins cost: '%.*" STRF "'\n",
1081 			      REST(sr)));
1082 		      r++;
1083 		      cost_ins = cost;
1084 		      costs_set = 1;
1085 		      break;
1086 		    case L'd':	/* Delete cost */
1087 		      DPRINT(("tre_parse:    del cost: '%.*" STRF "'\n",
1088 			      REST(sr)));
1089 		      r++;
1090 		      cost_del = cost;
1091 		      costs_set = 1;
1092 		      break;
1093 		    case L's':	/* Substitute cost */
1094 		      DPRINT(("tre_parse:  subst cost: '%.*" STRF "'\n",
1095 			      REST(sr)));
1096 		      r++;
1097 		      cost_subst = cost;
1098 		      costs_set = 1;
1099 		      break;
1100 		    default:
1101 		      return REG_BADBR;
1102 		    }
1103 		}
1104 	      else
1105 		{
1106 		  done = 1;
1107 		  break;
1108 		}
1109 	    }
1110 	}
1111   } while (start != r);
1112 #endif /* TRE_APPROX */
1113 
1114   /*{*//* Missing }. */
1115   if (r >= ctx->re_end)
1116     return REG_EBRACE;
1117 
1118   /* Empty contents of {}. */
1119   if (r == ctx->re)
1120     return REG_BADBR;
1121 
1122   /* Parse the ending '}' or '\}'.*/
1123   if (ctx->cflags & REG_EXTENDED)
1124     {
1125       if (r >= ctx->re_end || *r != CHAR_RBRACE)
1126 	return REG_BADBR;
1127       r++;
1128       /* Parse trailing '?' marking minimal repetition. */
1129       if (r < ctx->re_end)
1130 	{
1131 	  if (*r == CHAR_QUESTIONMARK)
1132 	    {
1133 	      /* Process the question mark only in enhanced mode.
1134 		 Otherwise, the question mark is an error in ERE
1135 		 or a literal in BRE */
1136 	      if (ctx->cflags & REG_ENHANCED)
1137 		{
1138 		  minimal = !(ctx->cflags & REG_UNGREEDY);
1139 		  r++;
1140 		}
1141 	      else return REG_BADRPT;
1142 	    }
1143 	  else if (*r == CHAR_STAR || *r == CHAR_PLUS)
1144 	    {
1145 	      /* These are reserved for future extensions. */
1146 	      return REG_BADRPT;
1147 	    }
1148 	}
1149     }
1150   else
1151     {
1152       if (r + 1 >= ctx->re_end
1153 	  || *r != CHAR_BACKSLASH
1154 	  || *(r + 1) != CHAR_RBRACE)
1155 	return REG_BADBR;
1156       r += 2;
1157       if (r < ctx->re_end && *r == CHAR_STAR)
1158 	{
1159 	  /* This is reserved for future extensions. */
1160 	  return REG_BADRPT;
1161 	}
1162     }
1163 
1164   if (minimal)
1165     ctx->num_reorder_tags++;
1166 
1167   if (!result) goto parse_bound_exit;
1168   /* Create the AST node(s). */
1169   /* Originally, if min == 0 && max == 0, we immediately replace the whole
1170      iteration with EMPTY.  This unfortunately drops any submatches, and
1171      messes up setting the pmatch values (we can get tags of -1, and
1172      tag values in the billions).  So we leave it and process this case as
1173      usual, and wait until tre_expand_ast() to replace with EMPTY */
1174 #ifdef TRE_APPROX
1175   if (min < 0 && max < 0)
1176     /* Only approximate parameters set, no repetitions. */
1177     min = max = 1;
1178 #endif /* TRE_APPROX */
1179 
1180   *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
1181   if (!*result)
1182     return REG_ESPACE;
1183 
1184 #ifdef TRE_APPROX
1185   /* If approximate matching parameters are set, add them to the
1186      iteration node. */
1187   if (approx || costs_set || counts_set)
1188     {
1189       int *params;
1190       tre_iteration_t *iter = (*result)->obj;
1191 
1192       if (costs_set || counts_set)
1193 	{
1194 	  if (limit_ins == TRE_PARAM_UNSET)
1195 	    {
1196 	      if (cost_ins == TRE_PARAM_UNSET)
1197 		limit_ins = 0;
1198 	      else
1199 		limit_ins = INT_MAX;
1200 	    }
1201 
1202 	  if (limit_del == TRE_PARAM_UNSET)
1203 	    {
1204 	      if (cost_del == TRE_PARAM_UNSET)
1205 		limit_del = 0;
1206 	      else
1207 		limit_del = INT_MAX;
1208 	    }
1209 
1210 	  if (limit_subst == TRE_PARAM_UNSET)
1211 	    {
1212 	      if (cost_subst == TRE_PARAM_UNSET)
1213 		limit_subst = 0;
1214 	      else
1215 		limit_subst = INT_MAX;
1216 	    }
1217 	}
1218 
1219       if (cost_max == TRE_PARAM_UNSET)
1220 	cost_max = INT_MAX;
1221       if (limit_err == TRE_PARAM_UNSET)
1222 	limit_err = INT_MAX;
1223 
1224       ctx->have_approx = 1;
1225       params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
1226       if (!params)
1227 	return REG_ESPACE;
1228       for (i = 0; i < TRE_PARAM_LAST; i++)
1229 	params[i] = TRE_PARAM_UNSET;
1230       params[TRE_PARAM_COST_INS] = cost_ins;
1231       params[TRE_PARAM_COST_DEL] = cost_del;
1232       params[TRE_PARAM_COST_SUBST] = cost_subst;
1233       params[TRE_PARAM_COST_MAX] = cost_max;
1234       params[TRE_PARAM_MAX_INS] = limit_ins;
1235       params[TRE_PARAM_MAX_DEL] = limit_del;
1236       params[TRE_PARAM_MAX_SUBST] = limit_subst;
1237       params[TRE_PARAM_MAX_ERR] = limit_err;
1238       iter->params = params;
1239     }
1240 #endif /* TRE_APPROX */
1241 
1242 parse_bound_exit:
1243 #ifdef TRE_APPROX
1244   DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
1245 	  "limits [%d,%d,%d, total %d]\n",
1246 	  min, max, cost_ins, cost_del, cost_subst, cost_max,
1247 	  limit_ins, limit_del, limit_subst, limit_err));
1248 #else /* !TRE_APPROX */
1249   DPRINT(("tre_parse_bound: min %d, max %d\n", min, max));
1250 #endif /* !TRE_APPROX */
1251 
1252 
1253   ctx->re = r;
1254   return REG_OK;
1255 }
1256 
1257 /* Previously, we had PARSE_RESTORE_CFLAGS restore the cflags, but for
1258    non-self-contained options, like (?i), this causes ((?i)fu)bar to be
1259    treated more like ((?i)fu(?-i)bar), so the pmatch value is incorrect.
1260    Because we now set up tags for even non-capturing parenthesized
1261    subexpressions, we always call PARSE_MARK_FOR_SUBMATCH.  So if we
1262    pass the unmodified version of cflags to PARSE_MARK_FOR_SUBMATCH and
1263    have it restore cflags after the subexpression, we don't need to have
1264    a separate PARSE_RESTORE_CFLAGS, and then after processing the
1265    non-self-contained option, we can call PARSE_ATOM instead of PARSE_RE.
1266    This has the side-benefit of now matching the perl behavior: the RE
1267    foo(?i)bar|zap is foo(?i)bar OR (?i)zap instead of TRE previous behavior
1268    of foo AND (?i) (bar OR zap). */
1269 typedef enum {
1270   PARSE_RE = 0,
1271   PARSE_ATOM,
1272   PARSE_MARK_FOR_SUBMATCH,
1273   PARSE_BRANCH,
1274   PARSE_PIECE,
1275   PARSE_CATENATION,
1276   PARSE_POST_CATENATION,
1277   PARSE_UNION,
1278   PARSE_POST_UNION,
1279   PARSE_POSTFIX,
1280 } tre_parse_re_stack_symbol_t;
1281 
1282 
1283 reg_errcode_t
1284 tre_parse(tre_parse_ctx_t *ctx)
1285 {
1286   tre_ast_node_t *result = NULL;
1287   tre_parse_re_stack_symbol_t symbol;
1288   reg_errcode_t status = REG_OK;
1289   tre_stack_t *stack = ctx->stack;
1290   int bottom = tre_stack_num_objects(stack);
1291   int depth = 0;
1292   int temporary_cflags = 0;
1293   int bre_branch_begin;
1294 #ifdef TRE_DEBUG
1295   const tre_char_t *tmp_re;
1296 #endif
1297 
1298   DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d cflags = 0%o\n",
1299 	  ctx->len, ctx->re, ctx->len, ctx->cflags));
1300 
1301   if (ctx->len <= 0) return REG_EMPTY;
1302   if (!ctx->nofirstsub)
1303     {
1304       STACK_PUSH(stack, int, ctx->cflags);
1305       STACK_PUSH(stack, int, ctx->submatch_id);
1306       STACK_PUSH(stack, int, PARSE_MARK_FOR_SUBMATCH);
1307       ctx->submatch_id++;
1308     }
1309   STACK_PUSH(stack, int, 0); // bre_branch_begin
1310   STACK_PUSH(stack, int, PARSE_RE);
1311   ctx->re_start = ctx->re;
1312   ctx->re_end = ctx->re + ctx->len;
1313 
1314 
1315   /* The following is basically just a recursive descent parser.  I use
1316      an explicit stack instead of recursive functions mostly because of
1317      two reasons: compatibility with systems which have an overflowable
1318      call stack, and efficiency (both in lines of code and speed).  */
1319   while (tre_stack_num_objects(stack) > bottom)
1320     {
1321       symbol = tre_stack_pop_int(stack);
1322       switch (symbol)
1323 	{
1324 	case PARSE_RE:
1325 	  /* Parse a full regexp.  A regexp is one or more branches,
1326 	     separated by the union operator `|'. */
1327 	  bre_branch_begin = tre_stack_pop_int(stack);
1328 	  if (
1329 #ifdef REG_LITERAL
1330 	      !(ctx->cflags & REG_LITERAL) &&
1331 #endif /* REG_LITERAL */
1332 	      ctx->cflags & (REG_EXTENDED | REG_ENHANCED))
1333 	    STACK_PUSHX(stack, int, PARSE_UNION);
1334 	  STACK_PUSHX(stack, int, bre_branch_begin);
1335 	  STACK_PUSHX(stack, int, PARSE_BRANCH);
1336 	  break;
1337 
1338 	case PARSE_BRANCH:
1339 	  /* Parse a branch.  A branch is one or more pieces, concatenated.
1340 	     A piece is an atom possibly followed by a postfix operator. */
1341 	  bre_branch_begin = tre_stack_pop_int(stack);
1342 	  STACK_PUSHX(stack, int, PARSE_CATENATION);
1343 	  STACK_PUSHX(stack, int, bre_branch_begin);
1344 	  STACK_PUSHX(stack, int, PARSE_PIECE);
1345 	  break;
1346 
1347 	case PARSE_PIECE:
1348 	  /* Parse a piece.  A piece is an atom possibly followed by one
1349 	     or more postfix operators. */
1350 	  bre_branch_begin = tre_stack_pop_int(stack);
1351 	  STACK_PUSHX(stack, int, PARSE_POSTFIX);
1352 	  STACK_PUSHX(stack, int, bre_branch_begin);
1353 	  STACK_PUSHX(stack, int, PARSE_ATOM);
1354 	  break;
1355 
1356 	case PARSE_CATENATION:
1357 	  /* If the expression has not ended, parse another piece. */
1358 	  {
1359 	    tre_char_t c;
1360 	    if (ctx->re >= ctx->re_end)
1361 	      break;
1362 	    c = *ctx->re;
1363 #ifdef REG_LITERAL
1364 	    if (!(ctx->cflags & REG_LITERAL))
1365 	      {
1366 #endif /* REG_LITERAL */
1367 		if ((ctx->cflags & REG_EXTENDED && c == CHAR_PIPE) ||
1368 		    ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED
1369 		    && ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH &&
1370 		    *(ctx->re + 1) == CHAR_PIPE))
1371 		  break;
1372 		if ((ctx->cflags & REG_EXTENDED
1373 		     && c == CHAR_RPAREN && depth > 0)
1374 		    || (!(ctx->cflags & REG_EXTENDED)
1375 			&& ctx->re + 1 < ctx->re_end && c == CHAR_BACKSLASH
1376 			    && *(ctx->re + 1) == CHAR_RPAREN))
1377 		  {
1378 		    if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1379 		      return REG_EPAREN;
1380 		    DPRINT(("tre_parse:	  group end: '%.*" STRF "'\n",
1381 			    REST(ctx->re)));
1382 		    depth--;
1383 		    if (!(ctx->cflags & (REG_EXTENDED | REG_ENHANCED)))
1384 		      ctx->re += 2;
1385 		    break;
1386 		  }
1387 #ifdef REG_LITERAL
1388 	      }
1389 #endif /* REG_LITERAL */
1390 
1391 #ifdef REG_LEFT_ASSOC
1392 	    if (ctx->cflags & REG_LEFT_ASSOC)
1393 	      {
1394 		/* Left associative concatenation. */
1395 		STACK_PUSHX(stack, int, PARSE_CATENATION);
1396 		STACK_PUSHX(stack, voidptr, result);
1397 		STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1398 		STACK_PUSHX(stack, int, 0); // bre_branch_begin
1399 		STACK_PUSHX(stack, int, PARSE_PIECE);
1400 	      }
1401 	    else
1402 #endif /* REG_LEFT_ASSOC */
1403 	      {
1404 		/* Default case, right associative concatenation. */
1405 		STACK_PUSHX(stack, voidptr, result);
1406 		STACK_PUSHX(stack, int, PARSE_POST_CATENATION);
1407 		STACK_PUSHX(stack, int, PARSE_CATENATION);
1408 		STACK_PUSHX(stack, int, 0); // bre_branch_begin
1409 		STACK_PUSHX(stack, int, PARSE_PIECE);
1410 	      }
1411 	    break;
1412 	  }
1413 
1414 	case PARSE_POST_CATENATION:
1415 	  {
1416 	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1417 	    tre_ast_node_t *tmp_node;
1418 	    tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1419 	    if (!tmp_node)
1420 	      return REG_ESPACE;
1421 	    result = tmp_node;
1422 	    break;
1423 	  }
1424 
1425 	case PARSE_UNION:
1426 	  if (ctx->re >= ctx->re_end)
1427 	    break;
1428 #ifdef REG_LITERAL
1429 	  if (ctx->cflags & REG_LITERAL)
1430 	    break;
1431 #endif /* REG_LITERAL */
1432 	  if (!(ctx->cflags & REG_EXTENDED))
1433 	    {
1434 	      if (*ctx->re != CHAR_BACKSLASH || ctx->re + 1 >= ctx->re_end)
1435 		break;
1436 	      ctx->re++;
1437 	    }
1438 	  switch (*ctx->re)
1439 	    {
1440 	    case CHAR_PIPE:
1441 	      DPRINT(("tre_parse:	union: '%.*" STRF "'\n",
1442 		      REST(ctx->re)));
1443 	      STACK_PUSHX(stack, int, PARSE_UNION);
1444 	      STACK_PUSHX(stack, voidptr, (void *)ctx->re);
1445 	      STACK_PUSHX(stack, voidptr, result);
1446 	      STACK_PUSHX(stack, int, PARSE_POST_UNION);
1447 	      /* We need to pass a boolean (eventually) to PARSE_ATOM to
1448 		 indicate if this is the beginning of a BRE extended branch. */
1449 	      STACK_PUSHX(stack, int, (ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) == REG_ENHANCED); // bre_branch_begin
1450 	      STACK_PUSHX(stack, int, PARSE_BRANCH);
1451 	      ctx->re++;
1452 	      break;
1453 
1454 	    case CHAR_RPAREN:
1455 	      ctx->re++;
1456 	      break;
1457 
1458 	    default:
1459 	      if (!(ctx->cflags & REG_EXTENDED))
1460 		ctx->re--;
1461 	      break;
1462 	    }
1463 	  break;
1464 
1465 	case PARSE_POST_UNION:
1466 	  {
1467 	    tre_ast_node_t *tmp_node;
1468 	    tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1469 	    const tre_char_t *pipechar = tre_stack_pop_voidptr(stack);
1470 	    /* error on empty expression at end of union */
1471 	    if (pipechar == ctx->re - 1)
1472 	      {
1473 		return REG_EMPTY;
1474 	      }
1475 	    tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1476 	    if (!tmp_node)
1477 	      return REG_ESPACE;
1478 	    result = tmp_node;
1479 	    break;
1480 	  }
1481 
1482 	case PARSE_POSTFIX:
1483 	  /* Parse postfix operators. */
1484 	  if (ctx->re >= ctx->re_end)
1485 	    break;
1486 #ifdef REG_LITERAL
1487 	  if (ctx->cflags & REG_LITERAL)
1488 	    break;
1489 #endif /* REG_LITERAL */
1490 	  int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1491 	  int rep_min = 0;
1492 	  int rep_max = -1;
1493 #ifdef TRE_DEBUG
1494 	  int lbrace_off;
1495 #endif
1496 	  switch (*ctx->re)
1497 	    {
1498 	    case CHAR_PLUS:
1499 	    case CHAR_QUESTIONMARK:
1500 	      if (!(ctx->cflags & REG_EXTENDED))
1501 		break;
1502 		/*FALLTHROUGH*/
1503 	    case CHAR_STAR:
1504 	      {
1505 		tre_ast_node_t *tmp_node;
1506 #ifdef TRE_DEBUG
1507 		const char *tstr = "star";
1508 		tmp_re = ctx->re;
1509 #endif
1510 
1511 	handle_plus_or_question:
1512 		/* error on iteration of raw assertion (not in subexpression) */
1513 		if (result->type == LITERAL && result->submatch_id < 0 &&
1514 		    IS_ASSERTION((tre_literal_t *)result->obj))
1515 		  {
1516 		    if (!(ctx->cflags & REG_EXTENDED)) break;
1517 		    return REG_BADRPT;
1518 		  }
1519 		if (*ctx->re == CHAR_PLUS)
1520 		  {
1521 		    rep_min = 1;
1522 #ifdef TRE_DEBUG
1523 		    tstr = "plus";
1524 #endif
1525 		  }
1526 		if (*ctx->re == CHAR_QUESTIONMARK)
1527 		  {
1528 		    rep_max = 1;
1529 #ifdef TRE_DEBUG
1530 		    tstr = "questionmark";
1531 #endif
1532 		  }
1533 
1534 		if (ctx->cflags & REG_EXTENDED)
1535 		  {
1536 		    if (ctx->re + 1 < ctx->re_end)
1537 		      {
1538 			if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1539 			  {
1540 			    /* Process the question mark only in enhanced mode.
1541 			       Otherwise, the question mark is an error in ERE */
1542 			    if (ctx->cflags & REG_ENHANCED)
1543 			      {
1544 				minimal = !(ctx->cflags & REG_UNGREEDY);
1545 				ctx->re++;
1546 			      }
1547 			    else return REG_BADRPT;
1548 			  }
1549 			else if (*(ctx->re + 1) == CHAR_STAR
1550 				 || *(ctx->re + 1) == CHAR_PLUS)
1551 			  {
1552 			    /* These are reserved for future extensions. */
1553 			    return REG_BADRPT;
1554 			  }
1555 		      }
1556 		  }
1557 		else
1558 		  {
1559 		    if (ctx->re + 1 < ctx->re_end && *(ctx->re + 1) == CHAR_STAR)
1560 		      {
1561 			/* This is reserved for future extensions. */
1562 			return REG_BADRPT;
1563 		      }
1564 		    if (ctx->re + 2 < ctx->re_end)
1565 		      {
1566 			if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_QUESTIONMARK)
1567 			  {
1568 			    /* Process the question mark only in enhanced mode.
1569 			       Otherwise, the question mark is a literal in BRE */
1570 			    if (ctx->cflags & REG_ENHANCED)
1571 			      {
1572 				minimal = !(ctx->cflags & REG_UNGREEDY);
1573 				ctx->re += 2;
1574 			      }
1575 			  }
1576 			else if (*(ctx->re + 1) == CHAR_BACKSLASH && *(ctx->re + 2) == CHAR_PLUS)
1577 			  {
1578 			    /* This is reserved for future extensions. */
1579 			    return REG_BADRPT;
1580 			  }
1581 		      }
1582 		  }
1583 
1584 		if (minimal)
1585 		  ctx->num_reorder_tags++;
1586 
1587 		DPRINT(("tre_parse: %s %s: '%.*" STRF "'\n",
1588 			minimal ? "  minimal" : "greedy", tstr, REST(tmp_re)));
1589 		if (result == NULL)
1590 		  {
1591 		    if (ctx->cflags & REG_EXTENDED) return REG_BADRPT;
1592 		    else goto parse_literal;
1593 		  }
1594 		ctx->re++;
1595 		tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1596 					    minimal);
1597 		if (tmp_node == NULL)
1598 		  return REG_ESPACE;
1599 		result = tmp_node;
1600 
1601 		/* Set the iterator with a submatch id in the invisible range
1602 		 * (which will be overridden if a real submatch is needed) */
1603 		result->submatch_id = ctx->submatch_id_invisible++;
1604 
1605 #if 0
1606 		/* We don't allow multiple postfixes, but this might be needed
1607 		   to support approximate matching */
1608 		STACK_PUSHX(stack, int, PARSE_POSTFIX);
1609 #endif
1610 	      }
1611 	      break;
1612 
1613 	    case CHAR_BACKSLASH:
1614 	      /* "\{" is special without REG_EXTENDED */
1615 	      /* "\+" and "\?" are special with REG_ENHANCED for BRE */
1616 	      if (!(ctx->cflags & REG_EXTENDED)
1617 		  && ctx->re + 1 < ctx->re_end)
1618 		{
1619 		  switch (*(ctx->re + 1))
1620 		    {
1621 		    case CHAR_LBRACE:
1622 		      ctx->re++;
1623 #ifdef TRE_DEBUG
1624 		      lbrace_off = 2;
1625 #endif
1626 		      goto parse_brace;
1627 		    case CHAR_PLUS:
1628 		    case CHAR_QUESTIONMARK:
1629 		      if (ctx->cflags & REG_ENHANCED)
1630 			{
1631 #ifdef TRE_DEBUG
1632 			  tmp_re = ctx->re;
1633 #endif
1634 			  ctx->re++;
1635 			  goto handle_plus_or_question;
1636 			}
1637 		      break;
1638 		    }
1639 		  break;
1640 		}
1641 	      else
1642 		break;
1643 
1644 	    case CHAR_LBRACE:
1645 	      {
1646 		int raw_assertion;
1647 
1648 		/* "{" is literal without REG_EXTENDED */
1649 		if (!(ctx->cflags & REG_EXTENDED))
1650 		  break;
1651 #ifdef TRE_DEBUG
1652 		lbrace_off = 1;
1653 #endif
1654 
1655 	    parse_brace:
1656 		/* error on iteration of raw assertion (not in subexpression),
1657 		   but wait until after parsing bounds */
1658 		raw_assertion = (result->type == LITERAL
1659 				 && result->submatch_id < 0
1660 				 && IS_ASSERTION((tre_literal_t *)result->obj));
1661 		ctx->re++;
1662 
1663 		status = tre_parse_bound(ctx, &result);
1664 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
1665 		/* For ERE, if status is REG_NOMATCH, this mean the lbrace
1666 		   is to be treated as a literal. */
1667 		if (status == REG_NOMATCH)
1668 		  {
1669 		    ctx->re--;
1670 		    break;
1671 		  }
1672 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
1673 		DPRINT(("tre_parse:	bound: '%.*" STRF "'\n",
1674 			REST(ctx->re - lbrace_off)));
1675 		if (status != REG_OK)
1676 		  return status;
1677 		if (raw_assertion) return REG_BADRPT;
1678 
1679 		/* Set the iterator with a submatch id in the invisible range
1680 		 * (which will be overridden if a real submatch is needed) */
1681 		if (result->type == ITERATION)
1682 		  result->submatch_id = ctx->submatch_id_invisible++;
1683 
1684 #if 0
1685 		/* We don't allow multiple postfixes, but this might be needed
1686 		   to support approximate matching */
1687 		STACK_PUSHX(stack, int, PARSE_POSTFIX);
1688 #endif
1689 		break;
1690 	      }
1691 	    }
1692 	  break;
1693 
1694 	case PARSE_ATOM:
1695 	  {
1696 	    /* Parse an atom.  An atom is a regular expression enclosed in `()',
1697 	       an empty set of `()', a bracket expression, `.', `^', `$',
1698 	       a `\' followed by a character, or a single character. */
1699 
1700 	    /* The stack contains a boolean value, whether PARSE_ATOM is
1701 	       being called just after the start of a group (left paren)
1702 	       in a BRE */
1703 	    bre_branch_begin = tre_stack_pop_int(stack);
1704 
1705 	    /* End of regexp? (empty string). */
1706 	    if (ctx->re >= ctx->re_end)
1707 	      goto parse_literal;
1708 
1709 #ifdef REG_LITERAL
1710 	    if (ctx->cflags & REG_LITERAL)
1711 	      goto parse_literal;
1712 #endif /* REG_LITERAL */
1713 
1714 	    switch (*ctx->re)
1715 	      {
1716 	      case CHAR_LPAREN:  /* parenthesized subexpression */
1717 
1718 		/* Handle "(?...)" extensions.  They work in a way similar
1719 		   to Perls corresponding extensions. */
1720 		if ((ctx->cflags & (REG_EXTENDED|REG_ENHANCED)) ==
1721 		    (REG_EXTENDED|REG_ENHANCED)
1722 		    && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1723 		  {
1724 		    int new_cflags = ctx->cflags;
1725 		    int bit = 1;
1726 		    int invisible_submatch = 0;
1727 		    DPRINT(("tre_parse:	extension: '%.*" STRF "'\n",
1728 			    REST(ctx->re)));
1729 		    ctx->re += 2;
1730 		    while (/*CONSTCOND*/1)
1731 		      {
1732 			if (*ctx->re == L'i')
1733 			  {
1734 			    DPRINT(("tre_parse:	    icase: '%.*" STRF "'\n",
1735 				    REST(ctx->re)));
1736 			    if (bit)
1737 			      new_cflags |= REG_ICASE;
1738 			    else
1739 			      new_cflags &= ~REG_ICASE;
1740 			    ctx->re++;
1741 			  }
1742 			else if (*ctx->re == L'n')
1743 			  {
1744 			    DPRINT(("tre_parse:	  newline: '%.*" STRF "'\n",
1745 				    REST(ctx->re)));
1746 			    if (bit)
1747 			      new_cflags |= REG_NEWLINE;
1748 			    else
1749 			      new_cflags &= ~REG_NEWLINE;
1750 			    ctx->re++;
1751 			  }
1752 #ifdef REG_LEFT_ASSOC
1753 			else if (*ctx->re == L'l')
1754 			  {
1755 			    DPRINT(("tre_parse: left assoc: '%.*" STRF "'\n",
1756 				    REST(ctx->re)));
1757 			    if (bit)
1758 			      new_cflags |= REG_LEFT_ASSOC;
1759 			    else
1760 			      new_cflags &= ~REG_LEFT_ASSOC;
1761 			    ctx->re++;
1762 			  }
1763 #endif /* REG_LEFT_ASSOC */
1764 #ifdef REG_UNGREEDY
1765 			else if (*ctx->re == L'U')
1766 			  {
1767 			    DPRINT(("tre_parse:    ungreedy: '%.*" STRF "'\n",
1768 				    REST(ctx->re)));
1769 			    if (bit)
1770 			      new_cflags |= REG_UNGREEDY;
1771 			    else
1772 			      new_cflags &= ~REG_UNGREEDY;
1773 			    ctx->re++;
1774 			  }
1775 #endif /* REG_UNGREEDY */
1776 			else if (*ctx->re == CHAR_MINUS)
1777 			  {
1778 			    DPRINT(("tre_parse:	 turn off: '%.*" STRF "'\n",
1779 				    REST(ctx->re)));
1780 			    ctx->re++;
1781 			    bit = 0;
1782 			  }
1783 			else if (*ctx->re == CHAR_COLON)
1784 			  {
1785 			    DPRINT(("tre_parse:	 no group: '%.*" STRF
1786 				    "', (invisible submatch %d)\n",
1787 				    REST(ctx->re), ctx->submatch_id_invisible));
1788 			    ctx->re++;
1789 			    depth++;
1790 			    invisible_submatch = 1;
1791 			    break;
1792 			  }
1793 			else if (*ctx->re == CHAR_HASH)
1794 			  {
1795 			    DPRINT(("tre_parse:    comment: '%.*" STRF "'\n",
1796 				    REST(ctx->re)));
1797 			    /* A comment can contain any character except a
1798 			       right parenthesis */
1799 			    while (*ctx->re != CHAR_RPAREN
1800 				   && ctx->re < ctx->re_end)
1801 			      ctx->re++;
1802 			    if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1803 			      {
1804 				ctx->re++;
1805 				break;
1806 			      }
1807 			    else
1808 			      return REG_BADPAT;
1809 			  }
1810 			else if (*ctx->re == CHAR_RPAREN)
1811 			  {
1812 			    ctx->re++;
1813 			    break;
1814 			  }
1815 			else
1816 			  return REG_BADRPT;
1817 		      }
1818 
1819 		    /* Turn on the cflags changes for the rest of the
1820 		       enclosing group. */
1821 		    if (invisible_submatch)
1822 		      {
1823 			STACK_PUSHX(stack, int, ctx->cflags);
1824 			STACK_PUSHX(stack, int, ctx->submatch_id_invisible);
1825 			STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1826 			ctx->submatch_id_invisible++;
1827 			STACK_PUSHX(stack, int, 0); // bre_branch_begin
1828 			STACK_PUSHX(stack, int, PARSE_RE);
1829 		      }
1830 		    else {
1831 			STACK_PUSHX(stack, int, 0); // bre_branch_begin
1832 			STACK_PUSHX(stack, int, PARSE_ATOM);
1833 		    }
1834 		    ctx->cflags = new_cflags;
1835 		    break;
1836 		  }
1837 
1838 		if (ctx->cflags & REG_EXTENDED)
1839 		  {
1840 		parse_bre_lparen:
1841 		    DPRINT(("tre_parse: group begin: '%.*" STRF
1842 			    "', submatch %d\n", REST(ctx->re),
1843 			    ctx->submatch_id));
1844 		    ctx->re++;
1845 		    /* First parse a whole RE, then mark the resulting tree
1846 		       for submatching. */
1847 		    STACK_PUSHX(stack, int, ctx->cflags);
1848 		    STACK_PUSHX(stack, int, ctx->submatch_id);
1849 		    STACK_PUSHX(stack, int, PARSE_MARK_FOR_SUBMATCH);
1850 		    /* We need to pass a boolean (eventually) to PARSE_ATOM to
1851 		       indicate if this is the beginning of a BRE group. */
1852 		    STACK_PUSHX(stack, int, !(ctx->cflags & REG_EXTENDED));
1853 		    STACK_PUSHX(stack, int, PARSE_RE);
1854 		    ctx->submatch_id++;
1855 		    depth++;
1856 		  }
1857 		else
1858 		  goto parse_literal;
1859 		break;
1860 
1861 	      case CHAR_RPAREN:  /* end of current subexpression */
1862 		if (ctx->cflags & REG_EXTENDED && depth > 0)
1863 		  {
1864 	      parse_bre_rparen_empty:
1865 		    if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1866 		      return REG_EPAREN;
1867 		    DPRINT(("tre_parse:	    empty: '%.*" STRF "'\n",
1868 			    REST(ctx->re)));
1869 		    /* We were expecting an atom, but instead the current
1870 		       subexpression was closed.  POSIX leaves the meaning of
1871 		       this to be implementation-defined.  We interpret this as
1872 		       an empty expression (which matches an empty string).  */
1873 		    result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1874 		    if (result == NULL)
1875 		      return REG_ESPACE;
1876 		    if (!(ctx->cflags & REG_EXTENDED))
1877 		      ctx->re--;
1878 		  }
1879 		else
1880 		  goto parse_literal;
1881 		break;
1882 
1883 	      case CHAR_LBRACKET: /* bracket expression */
1884 		DPRINT(("tre_parse:     bracket: '%.*" STRF "'\n",
1885 			REST(ctx->re)));
1886 		ctx->re++;
1887 		status = tre_parse_bracket(ctx, &result);
1888 		if (status != REG_OK)
1889 		  return status;
1890 		break;
1891 
1892 	      case CHAR_BACKSLASH:
1893 		/* Deal with "\(", "\)" or "\{" for BREs */
1894 		if (!(ctx->cflags & REG_EXTENDED)
1895 		    && ctx->re + 1 < ctx->re_end)
1896 		  {
1897 		    if (*(ctx->re + 1) == CHAR_LPAREN)
1898 		      {
1899 			ctx->re++;
1900 			goto parse_bre_lparen;
1901 		      }
1902 		    else if (*(ctx->re + 1) == CHAR_RPAREN)
1903 		      {
1904 			ctx->re++;
1905 			goto parse_bre_rparen_empty;
1906 		      }
1907 		    if (*(ctx->re + 1) == CHAR_LBRACE) goto parse_literal;
1908 		  }
1909 
1910 		if (ctx->re + 1 >= ctx->re_end)
1911 		  /* Trailing backslash. */
1912 		  return REG_EESCAPE;
1913 
1914 		if (!(ctx->cflags & REG_ENHANCED))
1915 		  {
1916 		    DPRINT(("tre_parse:  unenhanced bleep: '%.*" STRF "'\n", REST(ctx->re)));
1917 		    ctx->re++;
1918 		    goto unenhanced_backslash;
1919 		  }
1920 
1921 		/* If a macro is used, parse the expanded macro recursively. */
1922 		{
1923 		  tre_char_t buf[64];
1924 		  tre_expand_macro(ctx->re + 1, ctx->re_end,
1925 				   buf, elementsof(buf));
1926 		  if (buf[0] != 0)
1927 		    {
1928 		      tre_parse_ctx_t subctx;
1929 		      memcpy(&subctx, ctx, sizeof(subctx));
1930 		      subctx.re = buf;
1931 		      subctx.len = tre_strlen(buf);
1932 		      subctx.nofirstsub = 1;
1933 		      status = tre_parse(&subctx);
1934 		      if (status != REG_OK)
1935 			return status;
1936 		      ctx->re += 2;
1937 		      ctx->position = subctx.position;
1938 		      result = subctx.result;
1939 		      break;
1940 		    }
1941 		}
1942 
1943 #ifdef REG_LITERAL
1944 		if (*(ctx->re + 1) == L'Q')
1945 		  {
1946 		    DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1947 			    REST(ctx->re)));
1948 		    ctx->cflags |= REG_LITERAL;
1949 		    temporary_cflags |= REG_LITERAL;
1950 		    ctx->re += 2;
1951 		    STACK_PUSHX(stack, int, 0);
1952 		    STACK_PUSHX(stack, int, PARSE_ATOM);
1953 		    break;
1954 		  }
1955 #endif /* REG_LITERAL */
1956 
1957 		DPRINT(("tre_parse:  bleep: '%.*" STRF "'\n", REST(ctx->re)));
1958 		ctx->re++;
1959 		switch (*ctx->re)
1960 		  {
1961 		  case L'b':
1962 		    result = tre_ast_new_literal(ctx->mem, ASSERTION,
1963 						 ASSERT_AT_WB, -1);
1964 		    ctx->re++;
1965 		    break;
1966 		  case L'B':
1967 		    result = tre_ast_new_literal(ctx->mem, ASSERTION,
1968 						 ASSERT_AT_WB_NEG, -1);
1969 		    ctx->re++;
1970 		    break;
1971 		  case L'<':
1972 		    result = tre_ast_new_literal(ctx->mem, ASSERTION,
1973 						 ASSERT_AT_BOW, -1);
1974 		    ctx->re++;
1975 		    break;
1976 		  case L'>':
1977 		    result = tre_ast_new_literal(ctx->mem, ASSERTION,
1978 						 ASSERT_AT_EOW, -1);
1979 		    ctx->re++;
1980 		    break;
1981 		  case L'x':
1982 		    ctx->re++;
1983 		    if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1984 		      {
1985 			/* 8 bit hex char. */
1986 			char tmp[3] = {0, 0, 0};
1987 			long val;
1988 			DPRINT(("tre_parse:  8 bit hex: '%.*" STRF "'\n",
1989 				REST(ctx->re - 2)));
1990 
1991 			if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1992 			    ctx->re < ctx->re_end)
1993 			  {
1994 			    tmp[0] = (char)ctx->re[0];
1995 			    ctx->re++;
1996 			  }
1997 			if (tre_isxdigit_l(ctx->re[0], ctx->loc) &&
1998 			    ctx->re < ctx->re_end)
1999 			  {
2000 			    tmp[1] = (char)ctx->re[0];
2001 			    ctx->re++;
2002 			  }
2003 			val = strtol(tmp, NULL, 16);
2004 			result = tre_ast_new_literal(ctx->mem, (int)val,
2005 						     (int)val, ctx->position);
2006 			ctx->position++;
2007 			break;
2008 		      }
2009 		    else if (ctx->re < ctx->re_end)
2010 		      {
2011 			/* Wide char. */
2012 			char tmp[32];
2013 			long val;
2014 			int i = 0;
2015 			ctx->re++;
2016 			while (ctx->re_end - ctx->re >= 0)
2017 			  {
2018 			    if (ctx->re[0] == CHAR_RBRACE)
2019 			      break;
2020 			    if (tre_isxdigit_l(ctx->re[0], ctx->loc))
2021 			      {
2022 				tmp[i] = (char)ctx->re[0];
2023 				i++;
2024 				ctx->re++;
2025 				continue;
2026 			      }
2027 			    return REG_EBRACE;
2028 			  }
2029 			ctx->re++;
2030 			tmp[i] = 0;
2031 			val = strtol(tmp, NULL, 16);
2032 			result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
2033 						     ctx->position);
2034 			ctx->position++;
2035 			break;
2036 		      }
2037 		    /*FALLTHROUGH*/
2038 
2039 		  default:
2040 		  unenhanced_backslash:
2041 		    if ((ctx->cflags & (REG_EXTENDED | REG_ENHANCED)) !=
2042 			REG_EXTENDED &&
2043 			tre_isdigit_l(*ctx->re, ctx->loc) && *ctx->re != L'0')
2044 		      {
2045 			/* Back reference (only in BRE or enhanced). */
2046 			int val = *ctx->re - L'0';
2047 			DPRINT(("tre_parse:     backref: '%.*" STRF "'\n",
2048 				REST(ctx->re - 1)));
2049 			result = tre_ast_new_literal(ctx->mem, BACKREF, val,
2050 						     ctx->position);
2051 			if (result == NULL)
2052 			  return REG_ESPACE;
2053 
2054 			/* Set the backref with a submatch id in the invisible
2055 			 * range (which will be overridden if a real submatch
2056 			 * is needed) */
2057 			result->submatch_id = ctx->submatch_id_invisible++;
2058 
2059 			ctx->position++;
2060 			ctx->num_reorder_tags++;
2061 			ctx->max_backref = MAX(val, ctx->max_backref);
2062 			ctx->re++;
2063 		      }
2064 		    else
2065 		      {
2066 			/* Escaped character. */
2067 			DPRINT(("tre_parse:     escaped: '%.*" STRF "'\n",
2068 				REST(ctx->re - 1)));
2069 			result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2070 						     ctx->position);
2071 			ctx->position++;
2072 			ctx->re++;
2073 		      }
2074 		    break;
2075 		  }
2076 		if (result == NULL)
2077 		  return REG_ESPACE;
2078 		break;
2079 
2080 	      case CHAR_PERIOD:	 /* the any-symbol */
2081 		DPRINT(("tre_parse:	  any: '%.*" STRF "'\n",
2082 			REST(ctx->re)));
2083 		if (ctx->cflags & REG_NEWLINE)
2084 		  {
2085 		    tre_ast_node_t *tmp1;
2086 		    tre_ast_node_t *tmp2;
2087 		    tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
2088 					       ctx->position);
2089 		    if (!tmp1)
2090 		      return REG_ESPACE;
2091 		    tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
2092 					       ctx->position + 1);
2093 		    if (!tmp2)
2094 		      return REG_ESPACE;
2095 		    result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2096 		    if (!result)
2097 		      return REG_ESPACE;
2098 		    ctx->position += 2;
2099 		  }
2100 		else
2101 		  {
2102 		    result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
2103 						 ctx->position);
2104 		    if (!result)
2105 		      return REG_ESPACE;
2106 		    ctx->position++;
2107 		  }
2108 		ctx->re++;
2109 		break;
2110 
2111 	      case CHAR_CARET:	 /* beginning of line assertion */
2112 		/* '^' has a special meaning everywhere in EREs, at the
2113 		   beginning of the RE and after \( is BREs.  It is also
2114 		   special in enhanced BREs at the beginning of each branches
2115 		   of a union */
2116 		if (ctx->cflags & REG_EXTENDED
2117 		    || bre_branch_begin
2118 		    || ctx->re == ctx->re_start)
2119 		  {
2120 		    DPRINT(("tre_parse:	      BOL: '%.*" STRF "'\n",
2121 			    REST(ctx->re)));
2122 		    result = tre_ast_new_literal(ctx->mem, ASSERTION,
2123 						 ASSERT_AT_BOL, -1);
2124 		    if (result == NULL)
2125 		      return REG_ESPACE;
2126 		    ctx->re++;
2127 		  }
2128 		else
2129 		  goto parse_literal;
2130 		break;
2131 
2132 	      case CHAR_DOLLAR:	 /* end of line assertion. */
2133 		/* '$' is special everywhere in EREs, and in the end of the
2134 		   string and before \) is BREs. */
2135 		if (ctx->cflags & REG_EXTENDED
2136 		    || (ctx->re + 2 < ctx->re_end
2137 			&& *(ctx->re + 1) == CHAR_BACKSLASH
2138 			&& *(ctx->re + 2) == CHAR_RPAREN)
2139 		    || ctx->re + 1 == ctx->re_end)
2140 		  {
2141 		    DPRINT(("tre_parse:	      EOL: '%.*" STRF "'\n",
2142 			    REST(ctx->re)));
2143 		    result = tre_ast_new_literal(ctx->mem, ASSERTION,
2144 						 ASSERT_AT_EOL, -1);
2145 		    if (result == NULL)
2146 		      return REG_ESPACE;
2147 		    ctx->re++;
2148 		  }
2149 		else
2150 		  goto parse_literal;
2151 		break;
2152 
2153 	      default:
2154 	      parse_literal:
2155 
2156 		if (temporary_cflags && ctx->re + 1 < ctx->re_end
2157 		    && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
2158 		  {
2159 		    DPRINT(("tre_parse:	 end tmps: '%.*" STRF "'\n",
2160 			    REST(ctx->re)));
2161 		    ctx->cflags &= ~temporary_cflags;
2162 		    temporary_cflags = 0;
2163 		    ctx->re += 2;
2164 		    if (ctx->re < ctx->re_end)
2165 		      {
2166 			STACK_PUSHX(stack, int, 0);
2167 			STACK_PUSHX(stack, int, PARSE_ATOM);
2168 		      }
2169 		    else
2170 		      {
2171 			result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2172 			if (!result) return REG_ESPACE;
2173 		      }
2174 		    break;
2175 		  }
2176 
2177 
2178 		/* We are expecting an atom.  If the subexpression (or the whole
2179 		   regexp ends here, we interpret it as an empty expression
2180 		   (which matches an empty string), which is an error.
2181 		   Iterations of an empty expression is also an error. */
2182 #ifdef REG_LITERAL
2183 		if (!(ctx->cflags & REG_LITERAL))
2184 		  {
2185 #endif /* REG_LITERAL */
2186 		    /* error on end of string */
2187 		    if (ctx->re >= ctx->re_end) return depth > 0 ? REG_EPAREN
2188 						       : REG_EMPTY;
2189 		    /* error on unions and iterations of empty expressions */
2190 		    if (ctx->cflags & REG_EXTENDED)
2191 		      {
2192 			if (ctx->re < ctx->re_end)
2193 			  {
2194 			    if (*ctx->re == CHAR_PIPE) return REG_EMPTY;
2195 			    if (*ctx->re == CHAR_LBRACE)
2196 			      {
2197 				ctx->re++;
2198 		  empty_parse_bound:
2199 				/* We need to parse the bound first and return
2200 				   any error, before returning REG_BADRPT */
2201 				status = tre_parse_bound(ctx, NULL);
2202 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2203 				/* For ERE, if REG_NOMATCH is returned, we
2204 				   treat the lbrace as a literal. */
2205 				if (status == REG_NOMATCH)
2206 				  {
2207 				    ctx->re--;
2208 				    /* Drop down to literal-handling code */
2209 				  }
2210 				else
2211 				  {
2212 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2213 				    if (status != REG_OK)
2214 				      return status;
2215 				    return REG_BADRPT;
2216 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2217 				  }
2218 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2219 			      }
2220 #ifdef ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND
2221 			    else
2222 #endif /* ERE_LITERAL_LBRACE_ON_NON_NUMERIC_BOUND */
2223 			    if (*ctx->re == CHAR_STAR
2224 				|| *ctx->re == CHAR_PLUS
2225 				|| *ctx->re == CHAR_QUESTIONMARK)
2226 			      {
2227 				return REG_BADRPT;
2228 			      }
2229 			  }
2230 		      }
2231 		    else if (ctx->re + 1 < ctx->re_end
2232 			     && *ctx->re == CHAR_BACKSLASH
2233 			     && *(ctx->re + 1) == CHAR_LBRACE)
2234 		      {
2235 			ctx->re += 2;
2236 			goto empty_parse_bound;
2237 		      }
2238 #ifdef REG_LITERAL
2239 		  }
2240 #endif /* REG_LITERAL */
2241 
2242 		DPRINT(("tre_parse:     literal: '%.*" STRF "'\n",
2243 			REST(ctx->re)));
2244 		/* Note that we can't use an tre_isalpha() test here, since there
2245 		   may be characters which are alphabetic but neither upper or
2246 		   lower case. */
2247 		if (ctx->cflags & REG_ICASE
2248 		    && (tre_isupper_l(*ctx->re, ctx->loc) ||
2249 		    tre_islower_l(*ctx->re, ctx->loc)))
2250 		  {
2251 		    tre_ast_node_t *tmp1;
2252 		    tre_ast_node_t *tmp2;
2253 
2254 		    /* XXX - Can there be more than one opposite-case
2255 		       counterpoints for some character in some locale?  Or
2256 		       more than two characters which all should be regarded
2257 		       the same character if case is ignored?  If yes, there
2258 		       does not seem to be a portable way to detect it.  I guess
2259 		       that at least for multi-character collating elements there
2260 		       could be several opposite-case counterpoints, but they
2261 		       cannot be supported portably anyway. */
2262 		    tmp1 = tre_ast_new_literal(ctx->mem,
2263 					       tre_toupper_l(*ctx->re, ctx->loc),
2264 					       tre_toupper_l(*ctx->re, ctx->loc),
2265 					       ctx->position);
2266 		    if (!tmp1)
2267 		      return REG_ESPACE;
2268 		    tmp2 = tre_ast_new_literal(ctx->mem,
2269 					       tre_tolower_l(*ctx->re, ctx->loc),
2270 					       tre_tolower_l(*ctx->re, ctx->loc),
2271 					       ctx->position);
2272 		    if (!tmp2)
2273 		      return REG_ESPACE;
2274 		    result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
2275 		    if (!result)
2276 		      return REG_ESPACE;
2277 		  }
2278 		else
2279 		  {
2280 		    result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
2281 						 ctx->position);
2282 		    if (!result)
2283 		      return REG_ESPACE;
2284 		  }
2285 		ctx->position++;
2286 		ctx->re++;
2287 		break;
2288 	      }
2289 	    break;
2290 	  }
2291 
2292 	case PARSE_MARK_FOR_SUBMATCH:
2293 	  {
2294 	    int submatch_id = tre_stack_pop_int(stack);
2295 
2296 	    ctx->cflags = tre_stack_pop_int(stack); /* restore cflags */
2297 	    if (result->submatch_id >= 0 &&
2298 		result->submatch_id < SUBMATCH_ID_INVISIBLE_START)
2299 	      {
2300 		tre_ast_node_t *n, *tmp_node;
2301 		if (submatch_id >= SUBMATCH_ID_INVISIBLE_START)
2302 		  break;
2303 		n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
2304 		if (n == NULL)
2305 		  return REG_ESPACE;
2306 		tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
2307 		if (tmp_node == NULL)
2308 		  return REG_ESPACE;
2309 		tmp_node->num_submatches = result->num_submatches;
2310 		result = tmp_node;
2311 	      }
2312 	    result->submatch_id = submatch_id;
2313 	    if (submatch_id < SUBMATCH_ID_INVISIBLE_START)
2314 	      result->num_submatches++;
2315 	    break;
2316 	  }
2317 
2318 	default:
2319 	  assert(0);
2320 	  break;
2321 	}
2322     }
2323 
2324   /* Check for missing closing parentheses. */
2325   if (depth > 0)
2326     return REG_EPAREN;
2327 
2328   ctx->result = result;
2329 
2330   return REG_OK;
2331 }
2332 
2333 /* EOF */
2334