1 /*
2   regcomp.c - TRE POSIX compatible regex compilation functions.
3 
4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
5   All rights reserved.
6 
7   Redistribution and use in source and binary forms, with or without
8   modification, are permitted provided that the following conditions
9   are met:
10 
11     1. Redistributions of source code must retain the above copyright
12        notice, this list of conditions and the following disclaimer.
13 
14     2. Redistributions in binary form must reproduce the above copyright
15        notice, this list of conditions and the following disclaimer in the
16        documentation and/or other materials provided with the distribution.
17 
18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 */
31 
32 #include <string.h>
33 #include <stdlib.h>
34 #include <regex.h>
35 #include <limits.h>
36 #include <stdint.h>
37 #include <ctype.h>
38 
39 #include "tre.h"
40 
41 #include <assert.h>
42 
43 /***********************************************************************
44  from tre-compile.h
45 ***********************************************************************/
46 
47 typedef struct {
48   int position;
49   int code_min;
50   int code_max;
51   int *tags;
52   int assertions;
53   tre_ctype_t class;
54   tre_ctype_t *neg_classes;
55   int backref;
56 } tre_pos_and_tags_t;
57 
58 
59 /***********************************************************************
60  from tre-ast.c and tre-ast.h
61 ***********************************************************************/
62 
63 /* The different AST node types. */
64 typedef enum {
65   LITERAL,
66   CATENATION,
67   ITERATION,
68   UNION
69 } tre_ast_type_t;
70 
71 /* Special subtypes of TRE_LITERAL. */
72 #define EMPTY	  -1   /* Empty leaf (denotes empty string). */
73 #define ASSERTION -2   /* Assertion leaf. */
74 #define TAG	  -3   /* Tag leaf. */
75 #define BACKREF	  -4   /* Back reference leaf. */
76 
77 #define IS_SPECIAL(x)	((x)->code_min < 0)
78 #define IS_EMPTY(x)	((x)->code_min == EMPTY)
79 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
80 #define IS_TAG(x)	((x)->code_min == TAG)
81 #define IS_BACKREF(x)	((x)->code_min == BACKREF)
82 
83 
84 /* A generic AST node.  All AST nodes consist of this node on the top
85    level with `obj' pointing to the actual content. */
86 typedef struct {
87   tre_ast_type_t type;   /* Type of the node. */
88   void *obj;             /* Pointer to actual node. */
89   int nullable;
90   int submatch_id;
91   int num_submatches;
92   int num_tags;
93   tre_pos_and_tags_t *firstpos;
94   tre_pos_and_tags_t *lastpos;
95 } tre_ast_node_t;
96 
97 
98 /* A "literal" node.  These are created for assertions, back references,
99    tags, matching parameter settings, and all expressions that match one
100    character. */
101 typedef struct {
102   long code_min;
103   long code_max;
104   int position;
105   tre_ctype_t class;
106   tre_ctype_t *neg_classes;
107 } tre_literal_t;
108 
109 /* A "catenation" node.	 These are created when two regexps are concatenated.
110    If there are more than one subexpressions in sequence, the `left' part
111    holds all but the last, and `right' part holds the last subexpression
112    (catenation is left associative). */
113 typedef struct {
114   tre_ast_node_t *left;
115   tre_ast_node_t *right;
116 } tre_catenation_t;
117 
118 /* An "iteration" node.	 These are created for the "*", "+", "?", and "{m,n}"
119    operators. */
120 typedef struct {
121   /* Subexpression to match. */
122   tre_ast_node_t *arg;
123   /* Minimum number of consecutive matches. */
124   int min;
125   /* Maximum number of consecutive matches. */
126   int max;
127   /* If 0, match as many characters as possible, if 1 match as few as
128      possible.	Note that this does not always mean the same thing as
129      matching as many/few repetitions as possible. */
130   unsigned int minimal:1;
131 } tre_iteration_t;
132 
133 /* An "union" node.  These are created for the "|" operator. */
134 typedef struct {
135   tre_ast_node_t *left;
136   tre_ast_node_t *right;
137 } tre_union_t;
138 
139 
140 static tre_ast_node_t *
tre_ast_new_node(tre_mem_t mem,int type,void * obj)141 tre_ast_new_node(tre_mem_t mem, int type, void *obj)
142 {
143 	tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
144 	if (!node || !obj)
145 		return 0;
146 	node->obj = obj;
147 	node->type = type;
148 	node->nullable = -1;
149 	node->submatch_id = -1;
150 	return node;
151 }
152 
153 static tre_ast_node_t *
tre_ast_new_literal(tre_mem_t mem,int code_min,int code_max,int position)154 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
155 {
156 	tre_ast_node_t *node;
157 	tre_literal_t *lit;
158 
159 	lit = tre_mem_calloc(mem, sizeof *lit);
160 	node = tre_ast_new_node(mem, LITERAL, lit);
161 	if (!node)
162 		return 0;
163 	lit->code_min = code_min;
164 	lit->code_max = code_max;
165 	lit->position = position;
166 	return node;
167 }
168 
169 static tre_ast_node_t *
tre_ast_new_iter(tre_mem_t mem,tre_ast_node_t * arg,int min,int max,int minimal)170 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
171 {
172 	tre_ast_node_t *node;
173 	tre_iteration_t *iter;
174 
175 	iter = tre_mem_calloc(mem, sizeof *iter);
176 	node = tre_ast_new_node(mem, ITERATION, iter);
177 	if (!node)
178 		return 0;
179 	iter->arg = arg;
180 	iter->min = min;
181 	iter->max = max;
182 	iter->minimal = minimal;
183 	node->num_submatches = arg->num_submatches;
184 	return node;
185 }
186 
187 static tre_ast_node_t *
tre_ast_new_union(tre_mem_t mem,tre_ast_node_t * left,tre_ast_node_t * right)188 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
189 {
190 	tre_ast_node_t *node;
191 	tre_union_t *un;
192 
193 	if (!left)
194 		return right;
195 	un = tre_mem_calloc(mem, sizeof *un);
196 	node = tre_ast_new_node(mem, UNION, un);
197 	if (!node || !right)
198 		return 0;
199 	un->left = left;
200 	un->right = right;
201 	node->num_submatches = left->num_submatches + right->num_submatches;
202 	return node;
203 }
204 
205 static tre_ast_node_t *
tre_ast_new_catenation(tre_mem_t mem,tre_ast_node_t * left,tre_ast_node_t * right)206 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
207 {
208 	tre_ast_node_t *node;
209 	tre_catenation_t *cat;
210 
211 	if (!left)
212 		return right;
213 	cat = tre_mem_calloc(mem, sizeof *cat);
214 	node = tre_ast_new_node(mem, CATENATION, cat);
215 	if (!node)
216 		return 0;
217 	cat->left = left;
218 	cat->right = right;
219 	node->num_submatches = left->num_submatches + right->num_submatches;
220 	return node;
221 }
222 
223 
224 /***********************************************************************
225  from tre-stack.c and tre-stack.h
226 ***********************************************************************/
227 
228 typedef struct tre_stack_rec tre_stack_t;
229 
230 /* Creates a new stack object.	`size' is initial size in bytes, `max_size'
231    is maximum size, and `increment' specifies how much more space will be
232    allocated with realloc() if all space gets used up.	Returns the stack
233    object or NULL if out of memory. */
234 static tre_stack_t *
235 tre_stack_new(int size, int max_size, int increment);
236 
237 /* Frees the stack object. */
238 static void
239 tre_stack_destroy(tre_stack_t *s);
240 
241 /* Returns the current number of objects in the stack. */
242 static int
243 tre_stack_num_objects(tre_stack_t *s);
244 
245 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
246    `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
247    This tries to realloc() more space before failing if maximum size
248    has not yet been reached.  Returns REG_OK if successful. */
249 #define declare_pushf(typetag, type)					      \
250   static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
251 
252 declare_pushf(voidptr, void *);
253 declare_pushf(int, int);
254 
255 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
256    element off of stack `s' and returns it.  The stack must not be
257    empty. */
258 #define declare_popf(typetag, type)		  \
259   static type tre_stack_pop_ ## typetag(tre_stack_t *s)
260 
261 declare_popf(voidptr, void *);
262 declare_popf(int, int);
263 
264 /* Just to save some typing. */
265 #define STACK_PUSH(s, typetag, value)					      \
266   do									      \
267     {									      \
268       status = tre_stack_push_ ## typetag(s, value);			      \
269     }									      \
270   while (/*CONSTCOND*/0)
271 
272 #define STACK_PUSHX(s, typetag, value)					      \
273   {									      \
274     status = tre_stack_push_ ## typetag(s, value);			      \
275     if (status != REG_OK)						      \
276       break;								      \
277   }
278 
279 #define STACK_PUSHR(s, typetag, value)					      \
280   {									      \
281     reg_errcode_t _status;						      \
282     _status = tre_stack_push_ ## typetag(s, value);			      \
283     if (_status != REG_OK)						      \
284       return _status;							      \
285   }
286 
287 union tre_stack_item {
288   void *voidptr_value;
289   int int_value;
290 };
291 
292 struct tre_stack_rec {
293   int size;
294   int max_size;
295   int increment;
296   int ptr;
297   union tre_stack_item *stack;
298 };
299 
300 
301 static tre_stack_t *
tre_stack_new(int size,int max_size,int increment)302 tre_stack_new(int size, int max_size, int increment)
303 {
304   tre_stack_t *s;
305 
306   s = xmalloc(sizeof(*s));
307   if (s != NULL)
308     {
309       s->stack = xmalloc(sizeof(*s->stack) * size);
310       if (s->stack == NULL)
311 	{
312 	  xfree(s);
313 	  return NULL;
314 	}
315       s->size = size;
316       s->max_size = max_size;
317       s->increment = increment;
318       s->ptr = 0;
319     }
320   return s;
321 }
322 
323 static void
tre_stack_destroy(tre_stack_t * s)324 tre_stack_destroy(tre_stack_t *s)
325 {
326   xfree(s->stack);
327   xfree(s);
328 }
329 
330 static int
tre_stack_num_objects(tre_stack_t * s)331 tre_stack_num_objects(tre_stack_t *s)
332 {
333   return s->ptr;
334 }
335 
336 static reg_errcode_t
tre_stack_push(tre_stack_t * s,union tre_stack_item value)337 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
338 {
339   if (s->ptr < s->size)
340     {
341       s->stack[s->ptr] = value;
342       s->ptr++;
343     }
344   else
345     {
346       if (s->size >= s->max_size)
347 	{
348 	  return REG_ESPACE;
349 	}
350       else
351 	{
352 	  union tre_stack_item *new_buffer;
353 	  int new_size;
354 	  new_size = s->size + s->increment;
355 	  if (new_size > s->max_size)
356 	    new_size = s->max_size;
357 	  new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
358 	  if (new_buffer == NULL)
359 	    {
360 	      return REG_ESPACE;
361 	    }
362 	  assert(new_size > s->size);
363 	  s->size = new_size;
364 	  s->stack = new_buffer;
365 	  tre_stack_push(s, value);
366 	}
367     }
368   return REG_OK;
369 }
370 
371 #define define_pushf(typetag, type)  \
372   declare_pushf(typetag, type) {     \
373     union tre_stack_item item;	     \
374     item.typetag ## _value = value;  \
375     return tre_stack_push(s, item);  \
376 }
377 
378 define_pushf(int, int)
379 define_pushf(voidptr, void *)
380 
381 #define define_popf(typetag, type)		    \
382   declare_popf(typetag, type) {			    \
383     return s->stack[--s->ptr].typetag ## _value;    \
384   }
385 
386 define_popf(int, int)
387 define_popf(voidptr, void *)
388 
389 
390 /***********************************************************************
391  from tre-parse.c and tre-parse.h
392 ***********************************************************************/
393 
394 /* Parse context. */
395 typedef struct {
396 	/* Memory allocator. The AST is allocated using this. */
397 	tre_mem_t mem;
398 	/* Stack used for keeping track of regexp syntax. */
399 	tre_stack_t *stack;
400 	/* The parsed node after a parse function returns. */
401 	tre_ast_node_t *n;
402 	/* Position in the regexp pattern after a parse function returns. */
403 	const char *s;
404 	/* The first character of the last subexpression parsed. */
405 	const char *start;
406 	/* Current submatch ID. */
407 	int submatch_id;
408 	/* Current position (number of literal). */
409 	int position;
410 	/* The highest back reference or -1 if none seen so far. */
411 	int max_backref;
412 	/* Compilation flags. */
413 	int cflags;
414 } tre_parse_ctx_t;
415 
416 /* Some macros for expanding \w, \s, etc. */
417 static const struct {
418 	char c;
419 	const char *expansion;
420 } tre_macros[] = {
421 	{'t', "\t"}, {'n', "\n"}, {'r', "\r"},
422 	{'f', "\f"}, {'a', "\a"}, {'e', "\033"},
423 	{'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
424 	{'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
425 	{ 0, 0 }
426 };
427 
428 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
429    must have at least `len' items.  Sets buf[0] to zero if the there
430    is no match in `tre_macros'. */
tre_expand_macro(const char * s)431 static const char *tre_expand_macro(const char *s)
432 {
433 	int i;
434 	for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
435 	return tre_macros[i].expansion;
436 }
437 
438 static int
tre_compare_lit(const void * a,const void * b)439 tre_compare_lit(const void *a, const void *b)
440 {
441 	const tre_literal_t *const *la = a;
442 	const tre_literal_t *const *lb = b;
443 	/* assumes the range of valid code_min is < INT_MAX */
444 	return la[0]->code_min - lb[0]->code_min;
445 }
446 
447 struct literals {
448 	tre_mem_t mem;
449 	tre_literal_t **a;
450 	int len;
451 	int cap;
452 };
453 
tre_new_lit(struct literals * p)454 static tre_literal_t *tre_new_lit(struct literals *p)
455 {
456 	tre_literal_t **a;
457 	if (p->len >= p->cap) {
458 		if (p->cap >= 1<<15)
459 			return 0;
460 		p->cap *= 2;
461 		a = xrealloc(p->a, p->cap * sizeof *p->a);
462 		if (!a)
463 			return 0;
464 		p->a = a;
465 	}
466 	a = p->a + p->len++;
467 	*a = tre_mem_calloc(p->mem, sizeof **a);
468 	return *a;
469 }
470 
add_icase_literals(struct literals * ls,int min,int max)471 static int add_icase_literals(struct literals *ls, int min, int max)
472 {
473 	tre_literal_t *lit;
474 	int b, e, c;
475 	for (c=min; c<=max; ) {
476 		/* assumes islower(c) and isupper(c) are exclusive
477 		   and toupper(c)!=c if islower(c).
478 		   multiple opposite case characters are not supported */
479 		if (tre_islower(c)) {
480 			b = e = tre_toupper(c);
481 			for (c++, e++; c<=max; c++, e++)
482 				if (tre_toupper(c) != e) break;
483 		} else if (tre_isupper(c)) {
484 			b = e = tre_tolower(c);
485 			for (c++, e++; c<=max; c++, e++)
486 				if (tre_tolower(c) != e) break;
487 		} else {
488 			c++;
489 			continue;
490 		}
491 		lit = tre_new_lit(ls);
492 		if (!lit)
493 			return -1;
494 		lit->code_min = b;
495 		lit->code_max = e-1;
496 		lit->position = -1;
497 	}
498 	return 0;
499 }
500 
501 
502 /* Maximum number of character classes in a negated bracket expression. */
503 #define MAX_NEG_CLASSES 64
504 
505 struct neg {
506 	int negate;
507 	int len;
508 	tre_ctype_t a[MAX_NEG_CLASSES];
509 };
510 
511 // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
512 
513 /*
514 bracket grammar:
515 Bracket  =  '[' List ']'  |  '[^' List ']'
516 List     =  Term  |  List Term
517 Term     =  Char  |  Range  |  Chclass  |  Eqclass
518 Range    =  Char '-' Char  |  Char '-' '-'
519 Char     =  Coll  |  coll_single
520 Meta     =  ']'  |  '-'
521 Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
522 Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
523 Chclass  =  '[:' class ':]'
524 
525 coll_single is a single char collating element but it can be
526  '-' only at the beginning or end of a List and
527  ']' only at the beginning of a List and
528  '^' anywhere except after the openning '['
529 */
530 
parse_bracket_terms(tre_parse_ctx_t * ctx,const char * s,struct literals * ls,struct neg * neg)531 static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
532 {
533 	const char *start = s;
534 	tre_ctype_t class;
535 	int min, max;
536 	wchar_t wc;
537 	int len;
538 
539 	for (;;) {
540 		class = 0;
541 		len = mbtowc(&wc, s, -1);
542 		if (len <= 0)
543 			return *s ? REG_BADPAT : REG_EBRACK;
544 		if (*s == ']' && s != start) {
545 			ctx->s = s+1;
546 			return REG_OK;
547 		}
548 		if (*s == '-' && s != start && s[1] != ']' &&
549 		    /* extension: [a-z--@] is accepted as [a-z]|[--@] */
550 		    (s[1] != '-' || s[2] == ']'))
551 			return REG_ERANGE;
552 		if (*s == '[' && (s[1] == '.' || s[1] == '='))
553 			/* collating symbols and equivalence classes are not supported */
554 			return REG_ECOLLATE;
555 		if (*s == '[' && s[1] == ':') {
556 			char tmp[CHARCLASS_NAME_MAX+1];
557 			s += 2;
558 			for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
559 				if (s[len] == ':') {
560 					memcpy(tmp, s, len);
561 					tmp[len] = 0;
562 					class = tre_ctype(tmp);
563 					break;
564 				}
565 			}
566 			if (!class || s[len+1] != ']')
567 				return REG_ECTYPE;
568 			min = 0;
569 			max = TRE_CHAR_MAX;
570 			s += len+2;
571 		} else {
572 			min = max = wc;
573 			s += len;
574 			if (*s == '-' && s[1] != ']') {
575 				s++;
576 				len = mbtowc(&wc, s, -1);
577 				max = wc;
578 				/* XXX - Should use collation order instead of
579 				   encoding values in character ranges. */
580 				if (len <= 0 || min > max)
581 					return REG_ERANGE;
582 				s += len;
583 			}
584 		}
585 
586 		if (class && neg->negate) {
587 			if (neg->len >= MAX_NEG_CLASSES)
588 				return REG_ESPACE;
589 			neg->a[neg->len++] = class;
590 		} else  {
591 			tre_literal_t *lit = tre_new_lit(ls);
592 			if (!lit)
593 				return REG_ESPACE;
594 			lit->code_min = min;
595 			lit->code_max = max;
596 			lit->class = class;
597 			lit->position = -1;
598 
599 			/* Add opposite-case codepoints if REG_ICASE is present.
600 			   It seems that POSIX requires that bracket negation
601 			   should happen before case-folding, but most practical
602 			   implementations do it the other way around. Changing
603 			   the order would need efficient representation of
604 			   case-fold ranges and bracket range sets even with
605 			   simple patterns so this is ok for now. */
606 			if (ctx->cflags & REG_ICASE && !class)
607 				if (add_icase_literals(ls, min, max))
608 					return REG_ESPACE;
609 		}
610 	}
611 }
612 
parse_bracket(tre_parse_ctx_t * ctx,const char * s)613 static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
614 {
615 	int i, max, min, negmax, negmin;
616 	tre_ast_node_t *node = 0, *n;
617 	tre_ctype_t *nc = 0;
618 	tre_literal_t *lit;
619 	struct literals ls;
620 	struct neg neg;
621 	reg_errcode_t err;
622 
623 	ls.mem = ctx->mem;
624 	ls.len = 0;
625 	ls.cap = 32;
626 	ls.a = xmalloc(ls.cap * sizeof *ls.a);
627 	if (!ls.a)
628 		return REG_ESPACE;
629 	neg.len = 0;
630 	neg.negate = *s == '^';
631 	if (neg.negate)
632 		s++;
633 
634 	err = parse_bracket_terms(ctx, s, &ls, &neg);
635 	if (err != REG_OK)
636 		goto parse_bracket_done;
637 
638 	if (neg.negate) {
639 		/*
640 		 * With REG_NEWLINE, POSIX requires that newlines are not matched by
641 		 * any form of a non-matching list.
642 		 */
643 		if (ctx->cflags & REG_NEWLINE) {
644 			lit = tre_new_lit(&ls);
645 			if (!lit) {
646 				err = REG_ESPACE;
647 				goto parse_bracket_done;
648 			}
649 			lit->code_min = '\n';
650 			lit->code_max = '\n';
651 			lit->position = -1;
652 		}
653 		/* Sort the array if we need to negate it. */
654 		qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
655 		/* extra lit for the last negated range */
656 		lit = tre_new_lit(&ls);
657 		if (!lit) {
658 			err = REG_ESPACE;
659 			goto parse_bracket_done;
660 		}
661 		lit->code_min = TRE_CHAR_MAX+1;
662 		lit->code_max = TRE_CHAR_MAX+1;
663 		lit->position = -1;
664 		/* negated classes */
665 		if (neg.len) {
666 			nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
667 			if (!nc) {
668 				err = REG_ESPACE;
669 				goto parse_bracket_done;
670 			}
671 			memcpy(nc, neg.a, neg.len*sizeof *neg.a);
672 			nc[neg.len] = 0;
673 		}
674 	}
675 
676 	/* Build a union of the items in the array, negated if necessary. */
677 	negmax = negmin = 0;
678 	for (i = 0; i < ls.len; i++) {
679 		lit = ls.a[i];
680 		min = lit->code_min;
681 		max = lit->code_max;
682 		if (neg.negate) {
683 			if (min <= negmin) {
684 				/* Overlap. */
685 				negmin = MAX(max + 1, negmin);
686 				continue;
687 			}
688 			negmax = min - 1;
689 			lit->code_min = negmin;
690 			lit->code_max = negmax;
691 			negmin = max + 1;
692 		}
693 		lit->position = ctx->position;
694 		lit->neg_classes = nc;
695 		n = tre_ast_new_node(ctx->mem, LITERAL, lit);
696 		node = tre_ast_new_union(ctx->mem, node, n);
697 		if (!node) {
698 			err = REG_ESPACE;
699 			break;
700 		}
701 	}
702 
703 parse_bracket_done:
704 	xfree(ls.a);
705 	ctx->position++;
706 	ctx->n = node;
707 	return err;
708 }
709 
parse_dup_count(const char * s,int * n)710 static const char *parse_dup_count(const char *s, int *n)
711 {
712 	*n = -1;
713 	if (!isdigit(*s))
714 		return s;
715 	*n = 0;
716 	for (;;) {
717 		*n = 10 * *n + (*s - '0');
718 		s++;
719 		if (!isdigit(*s) || *n > RE_DUP_MAX)
720 			break;
721 	}
722 	return s;
723 }
724 
parse_dup(const char * s,int ere,int * pmin,int * pmax)725 static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
726 {
727 	int min, max;
728 
729 	s = parse_dup_count(s, &min);
730 	if (*s == ',')
731 		s = parse_dup_count(s+1, &max);
732 	else
733 		max = min;
734 
735 	if (
736 		(max < min && max >= 0) ||
737 		max > RE_DUP_MAX ||
738 		min > RE_DUP_MAX ||
739 		min < 0 ||
740 		(!ere && *s++ != '\\') ||
741 		*s++ != '}'
742 	)
743 		return 0;
744 	*pmin = min;
745 	*pmax = max;
746 	return s;
747 }
748 
hexval(unsigned c)749 static int hexval(unsigned c)
750 {
751 	if (c-'0'<10) return c-'0';
752 	c |= 32;
753 	if (c-'a'<6) return c-'a'+10;
754 	return -1;
755 }
756 
marksub(tre_parse_ctx_t * ctx,tre_ast_node_t * node,int subid)757 static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
758 {
759 	if (node->submatch_id >= 0) {
760 		tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
761 		if (!n)
762 			return REG_ESPACE;
763 		n = tre_ast_new_catenation(ctx->mem, n, node);
764 		if (!n)
765 			return REG_ESPACE;
766 		n->num_submatches = node->num_submatches;
767 		node = n;
768 	}
769 	node->submatch_id = subid;
770 	node->num_submatches++;
771 	ctx->n = node;
772 	return REG_OK;
773 }
774 
775 /*
776 BRE grammar:
777 Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
778 Branch =  Atom  |  Branch Atom
779 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
780 Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
781 
782 (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
783 
784 ERE grammar:
785 Regex  =  Branch  |  Regex '|' Branch
786 Branch =  Atom  |  Branch Atom
787 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
788 Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
789 
790 (a*+?, ^*, $+, \X, {, (|a) are unspecified)
791 */
792 
parse_atom(tre_parse_ctx_t * ctx,const char * s)793 static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
794 {
795 	int len, ere = ctx->cflags & REG_EXTENDED;
796 	const char *p;
797 	tre_ast_node_t *node;
798 	wchar_t wc;
799 	switch (*s) {
800 	case '[':
801 		return parse_bracket(ctx, s+1);
802 	case '\\':
803 		p = tre_expand_macro(s+1);
804 		if (p) {
805 			/* assume \X expansion is a single atom */
806 			reg_errcode_t err = parse_atom(ctx, p);
807 			ctx->s = s+2;
808 			return err;
809 		}
810 		/* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
811 		switch (*++s) {
812 		case 0:
813 			return REG_EESCAPE;
814 		case 'b':
815 			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
816 			break;
817 		case 'B':
818 			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
819 			break;
820 		case '<':
821 			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
822 			break;
823 		case '>':
824 			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
825 			break;
826 		case 'x':
827 			s++;
828 			int i, v = 0, c;
829 			len = 2;
830 			if (*s == '{') {
831 				len = 8;
832 				s++;
833 			}
834 			for (i=0; i<len && v<0x110000; i++) {
835 				c = hexval(s[i]);
836 				if (c < 0) break;
837 				v = 16*v + c;
838 			}
839 			s += i;
840 			if (len == 8) {
841 				if (*s != '}')
842 					return REG_EBRACE;
843 				s++;
844 			}
845 			node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
846 			s--;
847 			break;
848 		case '{':
849 		case '+':
850 		case '?':
851 			/* extension: treat \+, \? as repetitions in BRE */
852 			/* reject repetitions after empty expression in BRE */
853 			if (!ere)
854 				return REG_BADRPT;
855 		case '|':
856 			/* extension: treat \| as alternation in BRE */
857 			if (!ere) {
858 				node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
859 				s--;
860 				goto end;
861 			}
862 			/* fallthrough */
863 		default:
864 			if (!ere && (unsigned)*s-'1' < 9) {
865 				/* back reference */
866 				int val = *s - '0';
867 				node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
868 				ctx->max_backref = MAX(val, ctx->max_backref);
869 			} else {
870 				/* extension: accept unknown escaped char
871 				   as a literal */
872 				goto parse_literal;
873 			}
874 		}
875 		s++;
876 		break;
877 	case '.':
878 		if (ctx->cflags & REG_NEWLINE) {
879 			tre_ast_node_t *tmp1, *tmp2;
880 			tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
881 			tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
882 			if (tmp1 && tmp2)
883 				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
884 			else
885 				node = 0;
886 		} else {
887 			node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
888 		}
889 		s++;
890 		break;
891 	case '^':
892 		/* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
893 		if (!ere && s != ctx->start)
894 			goto parse_literal;
895 		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
896 		s++;
897 		break;
898 	case '$':
899 		/* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */
900 		if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|')))
901 			goto parse_literal;
902 		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
903 		s++;
904 		break;
905 	case '*':
906 	case '{':
907 	case '+':
908 	case '?':
909 		/* reject repetitions after empty expression in ERE */
910 		if (ere)
911 			return REG_BADRPT;
912 	case '|':
913 		if (!ere)
914 			goto parse_literal;
915 	case 0:
916 		node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
917 		break;
918 	default:
919 parse_literal:
920 		len = mbtowc(&wc, s, -1);
921 		if (len < 0)
922 			return REG_BADPAT;
923 		if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
924 			tre_ast_node_t *tmp1, *tmp2;
925 			/* multiple opposite case characters are not supported */
926 			tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
927 			tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
928 			if (tmp1 && tmp2)
929 				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
930 			else
931 				node = 0;
932 		} else {
933 			node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
934 		}
935 		ctx->position++;
936 		s += len;
937 		break;
938 	}
939 end:
940 	if (!node)
941 		return REG_ESPACE;
942 	ctx->n = node;
943 	ctx->s = s;
944 	return REG_OK;
945 }
946 
947 #define PUSHPTR(err, s, v) do { \
948 	if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
949 		return err; \
950 } while(0)
951 
952 #define PUSHINT(err, s, v) do { \
953 	if ((err = tre_stack_push_int(s, v)) != REG_OK) \
954 		return err; \
955 } while(0)
956 
tre_parse(tre_parse_ctx_t * ctx)957 static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
958 {
959 	tre_ast_node_t *nbranch=0, *nunion=0;
960 	int ere = ctx->cflags & REG_EXTENDED;
961 	const char *s = ctx->start;
962 	int subid = 0;
963 	int depth = 0;
964 	reg_errcode_t err;
965 	tre_stack_t *stack = ctx->stack;
966 
967 	PUSHINT(err, stack, subid++);
968 	for (;;) {
969 		if ((!ere && *s == '\\' && s[1] == '(') ||
970 		    (ere && *s == '(')) {
971 			PUSHPTR(err, stack, nunion);
972 			PUSHPTR(err, stack, nbranch);
973 			PUSHINT(err, stack, subid++);
974 			s++;
975 			if (!ere)
976 				s++;
977 			depth++;
978 			nbranch = nunion = 0;
979 			ctx->start = s;
980 			continue;
981 		}
982 		if ((!ere && *s == '\\' && s[1] == ')') ||
983 		    (ere && *s == ')' && depth)) {
984 			ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
985 			if (!ctx->n)
986 				return REG_ESPACE;
987 		} else {
988 			err = parse_atom(ctx, s);
989 			if (err != REG_OK)
990 				return err;
991 			s = ctx->s;
992 		}
993 
994 	parse_iter:
995 		for (;;) {
996 			int min, max;
997 
998 			if (*s!='\\' && *s!='*') {
999 				if (!ere)
1000 					break;
1001 				if (*s!='+' && *s!='?' && *s!='{')
1002 					break;
1003 			}
1004 			if (*s=='\\' && ere)
1005 				break;
1006 			/* extension: treat \+, \? as repetitions in BRE */
1007 			if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{')
1008 				break;
1009 			if (*s=='\\')
1010 				s++;
1011 
1012 			/* handle ^* at the start of a BRE. */
1013 			if (!ere && s==ctx->start+1 && s[-1]=='^')
1014 				break;
1015 
1016 			/* extension: multiple consecutive *+?{,} is unspecified,
1017 			   but (a+)+ has to be supported so accepting a++ makes
1018 			   sense, note however that the RE_DUP_MAX limit can be
1019 			   circumvented: (a{255}){255} uses a lot of memory.. */
1020 			if (*s=='{') {
1021 				s = parse_dup(s+1, ere, &min, &max);
1022 				if (!s)
1023 					return REG_BADBR;
1024 			} else {
1025 				min=0;
1026 				max=-1;
1027 				if (*s == '+')
1028 					min = 1;
1029 				if (*s == '?')
1030 					max = 1;
1031 				s++;
1032 			}
1033 			if (max == 0)
1034 				ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1035 			else
1036 				ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
1037 			if (!ctx->n)
1038 				return REG_ESPACE;
1039 		}
1040 
1041 		nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1042 		if ((ere && *s == '|') ||
1043 		    (ere && *s == ')' && depth) ||
1044 		    (!ere && *s == '\\' && s[1] == ')') ||
1045 		    /* extension: treat \| as alternation in BRE */
1046 		    (!ere && *s == '\\' && s[1] == '|') ||
1047 		    !*s) {
1048 			/* extension: empty branch is unspecified (), (|a), (a|)
1049 			   here they are not rejected but match on empty string */
1050 			int c = *s;
1051 			nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1052 			nbranch = 0;
1053 
1054 			if (c == '\\' && s[1] == '|') {
1055 				s+=2;
1056 				ctx->start = s;
1057 			} else if (c == '|') {
1058 				s++;
1059 				ctx->start = s;
1060 			} else {
1061 				if (c == '\\') {
1062 					if (!depth) return REG_EPAREN;
1063 					s+=2;
1064 				} else if (c == ')')
1065 					s++;
1066 				depth--;
1067 				err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1068 				if (err != REG_OK)
1069 					return err;
1070 				if (!c && depth<0) {
1071 					ctx->submatch_id = subid;
1072 					return REG_OK;
1073 				}
1074 				if (!c || depth<0)
1075 					return REG_EPAREN;
1076 				nbranch = tre_stack_pop_voidptr(stack);
1077 				nunion = tre_stack_pop_voidptr(stack);
1078 				goto parse_iter;
1079 			}
1080 		}
1081 	}
1082 }
1083 
1084 
1085 /***********************************************************************
1086  from tre-compile.c
1087 ***********************************************************************/
1088 
1089 
1090 /*
1091   TODO:
1092    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1093      function calls.
1094 */
1095 
1096 /*
1097   Algorithms to setup tags so that submatch addressing can be done.
1098 */
1099 
1100 
1101 /* Inserts a catenation node to the root of the tree given in `node'.
1102    As the left child a new tag with number `tag_id' to `node' is added,
1103    and the right child is the old root. */
1104 static reg_errcode_t
tre_add_tag_left(tre_mem_t mem,tre_ast_node_t * node,int tag_id)1105 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1106 {
1107   tre_catenation_t *c;
1108 
1109   c = tre_mem_alloc(mem, sizeof(*c));
1110   if (c == NULL)
1111     return REG_ESPACE;
1112   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1113   if (c->left == NULL)
1114     return REG_ESPACE;
1115   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1116   if (c->right == NULL)
1117     return REG_ESPACE;
1118 
1119   c->right->obj = node->obj;
1120   c->right->type = node->type;
1121   c->right->nullable = -1;
1122   c->right->submatch_id = -1;
1123   c->right->firstpos = NULL;
1124   c->right->lastpos = NULL;
1125   c->right->num_tags = 0;
1126   c->right->num_submatches = 0;
1127   node->obj = c;
1128   node->type = CATENATION;
1129   return REG_OK;
1130 }
1131 
1132 /* Inserts a catenation node to the root of the tree given in `node'.
1133    As the right child a new tag with number `tag_id' to `node' is added,
1134    and the left child is the old root. */
1135 static reg_errcode_t
tre_add_tag_right(tre_mem_t mem,tre_ast_node_t * node,int tag_id)1136 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1137 {
1138   tre_catenation_t *c;
1139 
1140   c = tre_mem_alloc(mem, sizeof(*c));
1141   if (c == NULL)
1142     return REG_ESPACE;
1143   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1144   if (c->right == NULL)
1145     return REG_ESPACE;
1146   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1147   if (c->left == NULL)
1148     return REG_ESPACE;
1149 
1150   c->left->obj = node->obj;
1151   c->left->type = node->type;
1152   c->left->nullable = -1;
1153   c->left->submatch_id = -1;
1154   c->left->firstpos = NULL;
1155   c->left->lastpos = NULL;
1156   c->left->num_tags = 0;
1157   c->left->num_submatches = 0;
1158   node->obj = c;
1159   node->type = CATENATION;
1160   return REG_OK;
1161 }
1162 
1163 typedef enum {
1164   ADDTAGS_RECURSE,
1165   ADDTAGS_AFTER_ITERATION,
1166   ADDTAGS_AFTER_UNION_LEFT,
1167   ADDTAGS_AFTER_UNION_RIGHT,
1168   ADDTAGS_AFTER_CAT_LEFT,
1169   ADDTAGS_AFTER_CAT_RIGHT,
1170   ADDTAGS_SET_SUBMATCH_END
1171 } tre_addtags_symbol_t;
1172 
1173 
1174 typedef struct {
1175   int tag;
1176   int next_tag;
1177 } tre_tag_states_t;
1178 
1179 
1180 /* Go through `regset' and set submatch data for submatches that are
1181    using this tag. */
1182 static void
tre_purge_regset(int * regset,tre_tnfa_t * tnfa,int tag)1183 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1184 {
1185   int i;
1186 
1187   for (i = 0; regset[i] >= 0; i++)
1188     {
1189       int id = regset[i] / 2;
1190       int start = !(regset[i] % 2);
1191       if (start)
1192 	tnfa->submatch_data[id].so_tag = tag;
1193       else
1194 	tnfa->submatch_data[id].eo_tag = tag;
1195     }
1196   regset[0] = -1;
1197 }
1198 
1199 
1200 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1201    subexpressions marked for submatch addressing can be traced. */
1202 static reg_errcode_t
tre_add_tags(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree,tre_tnfa_t * tnfa)1203 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1204 	     tre_tnfa_t *tnfa)
1205 {
1206   reg_errcode_t status = REG_OK;
1207   tre_addtags_symbol_t symbol;
1208   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1209   int bottom = tre_stack_num_objects(stack);
1210   /* True for first pass (counting number of needed tags) */
1211   int first_pass = (mem == NULL || tnfa == NULL);
1212   int *regset, *orig_regset;
1213   int num_tags = 0; /* Total number of tags. */
1214   int num_minimals = 0;	 /* Number of special minimal tags. */
1215   int tag = 0;	    /* The tag that is to be added next. */
1216   int next_tag = 1; /* Next tag to use after this one. */
1217   int *parents;	    /* Stack of submatches the current submatch is
1218 		       contained in. */
1219   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1220   tre_tag_states_t *saved_states;
1221 
1222   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1223   if (!first_pass)
1224     {
1225       tnfa->end_tag = 0;
1226       tnfa->minimal_tags[0] = -1;
1227     }
1228 
1229   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1230   if (regset == NULL)
1231     return REG_ESPACE;
1232   regset[0] = -1;
1233   orig_regset = regset;
1234 
1235   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1236   if (parents == NULL)
1237     {
1238       xfree(regset);
1239       return REG_ESPACE;
1240     }
1241   parents[0] = -1;
1242 
1243   saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1244   if (saved_states == NULL)
1245     {
1246       xfree(regset);
1247       xfree(parents);
1248       return REG_ESPACE;
1249     }
1250   else
1251     {
1252       unsigned int i;
1253       for (i = 0; i <= tnfa->num_submatches; i++)
1254 	saved_states[i].tag = -1;
1255     }
1256 
1257   STACK_PUSH(stack, voidptr, node);
1258   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1259 
1260   while (tre_stack_num_objects(stack) > bottom)
1261     {
1262       if (status != REG_OK)
1263 	break;
1264 
1265       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1266       switch (symbol)
1267 	{
1268 
1269 	case ADDTAGS_SET_SUBMATCH_END:
1270 	  {
1271 	    int id = tre_stack_pop_int(stack);
1272 	    int i;
1273 
1274 	    /* Add end of this submatch to regset. */
1275 	    for (i = 0; regset[i] >= 0; i++);
1276 	    regset[i] = id * 2 + 1;
1277 	    regset[i + 1] = -1;
1278 
1279 	    /* Pop this submatch from the parents stack. */
1280 	    for (i = 0; parents[i] >= 0; i++);
1281 	    parents[i - 1] = -1;
1282 	    break;
1283 	  }
1284 
1285 	case ADDTAGS_RECURSE:
1286 	  node = tre_stack_pop_voidptr(stack);
1287 
1288 	  if (node->submatch_id >= 0)
1289 	    {
1290 	      int id = node->submatch_id;
1291 	      int i;
1292 
1293 
1294 	      /* Add start of this submatch to regset. */
1295 	      for (i = 0; regset[i] >= 0; i++);
1296 	      regset[i] = id * 2;
1297 	      regset[i + 1] = -1;
1298 
1299 	      if (!first_pass)
1300 		{
1301 		  for (i = 0; parents[i] >= 0; i++);
1302 		  tnfa->submatch_data[id].parents = NULL;
1303 		  if (i > 0)
1304 		    {
1305 		      int *p = xmalloc(sizeof(*p) * (i + 1));
1306 		      if (p == NULL)
1307 			{
1308 			  status = REG_ESPACE;
1309 			  break;
1310 			}
1311 		      assert(tnfa->submatch_data[id].parents == NULL);
1312 		      tnfa->submatch_data[id].parents = p;
1313 		      for (i = 0; parents[i] >= 0; i++)
1314 			p[i] = parents[i];
1315 		      p[i] = -1;
1316 		    }
1317 		}
1318 
1319 	      /* Add end of this submatch to regset after processing this
1320 		 node. */
1321 	      STACK_PUSHX(stack, int, node->submatch_id);
1322 	      STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1323 	    }
1324 
1325 	  switch (node->type)
1326 	    {
1327 	    case LITERAL:
1328 	      {
1329 		tre_literal_t *lit = node->obj;
1330 
1331 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1332 		  {
1333 		    int i;
1334 		    if (regset[0] >= 0)
1335 		      {
1336 			/* Regset is not empty, so add a tag before the
1337 			   literal or backref. */
1338 			if (!first_pass)
1339 			  {
1340 			    status = tre_add_tag_left(mem, node, tag);
1341 			    tnfa->tag_directions[tag] = direction;
1342 			    if (minimal_tag >= 0)
1343 			      {
1344 				for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1345 				tnfa->minimal_tags[i] = tag;
1346 				tnfa->minimal_tags[i + 1] = minimal_tag;
1347 				tnfa->minimal_tags[i + 2] = -1;
1348 				minimal_tag = -1;
1349 				num_minimals++;
1350 			      }
1351 			    tre_purge_regset(regset, tnfa, tag);
1352 			  }
1353 			else
1354 			  {
1355 			    node->num_tags = 1;
1356 			  }
1357 
1358 			regset[0] = -1;
1359 			tag = next_tag;
1360 			num_tags++;
1361 			next_tag++;
1362 		      }
1363 		  }
1364 		else
1365 		  {
1366 		    assert(!IS_TAG(lit));
1367 		  }
1368 		break;
1369 	      }
1370 	    case CATENATION:
1371 	      {
1372 		tre_catenation_t *cat = node->obj;
1373 		tre_ast_node_t *left = cat->left;
1374 		tre_ast_node_t *right = cat->right;
1375 		int reserved_tag = -1;
1376 
1377 
1378 		/* After processing right child. */
1379 		STACK_PUSHX(stack, voidptr, node);
1380 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1381 
1382 		/* Process right child. */
1383 		STACK_PUSHX(stack, voidptr, right);
1384 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1385 
1386 		/* After processing left child. */
1387 		STACK_PUSHX(stack, int, next_tag + left->num_tags);
1388 		if (left->num_tags > 0 && right->num_tags > 0)
1389 		  {
1390 		    /* Reserve the next tag to the right child. */
1391 		    reserved_tag = next_tag;
1392 		    next_tag++;
1393 		  }
1394 		STACK_PUSHX(stack, int, reserved_tag);
1395 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1396 
1397 		/* Process left child. */
1398 		STACK_PUSHX(stack, voidptr, left);
1399 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1400 
1401 		}
1402 	      break;
1403 	    case ITERATION:
1404 	      {
1405 		tre_iteration_t *iter = node->obj;
1406 
1407 		if (first_pass)
1408 		  {
1409 		    STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1410 		  }
1411 		else
1412 		  {
1413 		    STACK_PUSHX(stack, int, tag);
1414 		    STACK_PUSHX(stack, int, iter->minimal);
1415 		  }
1416 		STACK_PUSHX(stack, voidptr, node);
1417 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1418 
1419 		STACK_PUSHX(stack, voidptr, iter->arg);
1420 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1421 
1422 		/* Regset is not empty, so add a tag here. */
1423 		if (regset[0] >= 0 || iter->minimal)
1424 		  {
1425 		    if (!first_pass)
1426 		      {
1427 			int i;
1428 			status = tre_add_tag_left(mem, node, tag);
1429 			if (iter->minimal)
1430 			  tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1431 			else
1432 			  tnfa->tag_directions[tag] = direction;
1433 			if (minimal_tag >= 0)
1434 			  {
1435 			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1436 			    tnfa->minimal_tags[i] = tag;
1437 			    tnfa->minimal_tags[i + 1] = minimal_tag;
1438 			    tnfa->minimal_tags[i + 2] = -1;
1439 			    minimal_tag = -1;
1440 			    num_minimals++;
1441 			  }
1442 			tre_purge_regset(regset, tnfa, tag);
1443 		      }
1444 
1445 		    regset[0] = -1;
1446 		    tag = next_tag;
1447 		    num_tags++;
1448 		    next_tag++;
1449 		  }
1450 		direction = TRE_TAG_MINIMIZE;
1451 	      }
1452 	      break;
1453 	    case UNION:
1454 	      {
1455 		tre_union_t *uni = node->obj;
1456 		tre_ast_node_t *left = uni->left;
1457 		tre_ast_node_t *right = uni->right;
1458 		int left_tag;
1459 		int right_tag;
1460 
1461 		if (regset[0] >= 0)
1462 		  {
1463 		    left_tag = next_tag;
1464 		    right_tag = next_tag + 1;
1465 		  }
1466 		else
1467 		  {
1468 		    left_tag = tag;
1469 		    right_tag = next_tag;
1470 		  }
1471 
1472 		/* After processing right child. */
1473 		STACK_PUSHX(stack, int, right_tag);
1474 		STACK_PUSHX(stack, int, left_tag);
1475 		STACK_PUSHX(stack, voidptr, regset);
1476 		STACK_PUSHX(stack, int, regset[0] >= 0);
1477 		STACK_PUSHX(stack, voidptr, node);
1478 		STACK_PUSHX(stack, voidptr, right);
1479 		STACK_PUSHX(stack, voidptr, left);
1480 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1481 
1482 		/* Process right child. */
1483 		STACK_PUSHX(stack, voidptr, right);
1484 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1485 
1486 		/* After processing left child. */
1487 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1488 
1489 		/* Process left child. */
1490 		STACK_PUSHX(stack, voidptr, left);
1491 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1492 
1493 		/* Regset is not empty, so add a tag here. */
1494 		if (regset[0] >= 0)
1495 		  {
1496 		    if (!first_pass)
1497 		      {
1498 			int i;
1499 			status = tre_add_tag_left(mem, node, tag);
1500 			tnfa->tag_directions[tag] = direction;
1501 			if (minimal_tag >= 0)
1502 			  {
1503 			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1504 			    tnfa->minimal_tags[i] = tag;
1505 			    tnfa->minimal_tags[i + 1] = minimal_tag;
1506 			    tnfa->minimal_tags[i + 2] = -1;
1507 			    minimal_tag = -1;
1508 			    num_minimals++;
1509 			  }
1510 			tre_purge_regset(regset, tnfa, tag);
1511 		      }
1512 
1513 		    regset[0] = -1;
1514 		    tag = next_tag;
1515 		    num_tags++;
1516 		    next_tag++;
1517 		  }
1518 
1519 		if (node->num_submatches > 0)
1520 		  {
1521 		    /* The next two tags are reserved for markers. */
1522 		    next_tag++;
1523 		    tag = next_tag;
1524 		    next_tag++;
1525 		  }
1526 
1527 		break;
1528 	      }
1529 	    }
1530 
1531 	  if (node->submatch_id >= 0)
1532 	    {
1533 	      int i;
1534 	      /* Push this submatch on the parents stack. */
1535 	      for (i = 0; parents[i] >= 0; i++);
1536 	      parents[i] = node->submatch_id;
1537 	      parents[i + 1] = -1;
1538 	    }
1539 
1540 	  break; /* end case: ADDTAGS_RECURSE */
1541 
1542 	case ADDTAGS_AFTER_ITERATION:
1543 	  {
1544 	    int minimal = 0;
1545 	    int enter_tag;
1546 	    node = tre_stack_pop_voidptr(stack);
1547 	    if (first_pass)
1548 	      {
1549 		node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1550 		  + tre_stack_pop_int(stack);
1551 		minimal_tag = -1;
1552 	      }
1553 	    else
1554 	      {
1555 		minimal = tre_stack_pop_int(stack);
1556 		enter_tag = tre_stack_pop_int(stack);
1557 		if (minimal)
1558 		  minimal_tag = enter_tag;
1559 	      }
1560 
1561 	    if (!first_pass)
1562 	      {
1563 		if (minimal)
1564 		  direction = TRE_TAG_MINIMIZE;
1565 		else
1566 		  direction = TRE_TAG_MAXIMIZE;
1567 	      }
1568 	    break;
1569 	  }
1570 
1571 	case ADDTAGS_AFTER_CAT_LEFT:
1572 	  {
1573 	    int new_tag = tre_stack_pop_int(stack);
1574 	    next_tag = tre_stack_pop_int(stack);
1575 	    if (new_tag >= 0)
1576 	      {
1577 		tag = new_tag;
1578 	      }
1579 	    break;
1580 	  }
1581 
1582 	case ADDTAGS_AFTER_CAT_RIGHT:
1583 	  node = tre_stack_pop_voidptr(stack);
1584 	  if (first_pass)
1585 	    node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1586 	      + ((tre_catenation_t *)node->obj)->right->num_tags;
1587 	  break;
1588 
1589 	case ADDTAGS_AFTER_UNION_LEFT:
1590 	  /* Lift the bottom of the `regset' array so that when processing
1591 	     the right operand the items currently in the array are
1592 	     invisible.	 The original bottom was saved at ADDTAGS_UNION and
1593 	     will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1594 	  while (*regset >= 0)
1595 	    regset++;
1596 	  break;
1597 
1598 	case ADDTAGS_AFTER_UNION_RIGHT:
1599 	  {
1600 	    int added_tags, tag_left, tag_right;
1601 	    tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1602 	    tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1603 	    node = tre_stack_pop_voidptr(stack);
1604 	    added_tags = tre_stack_pop_int(stack);
1605 	    if (first_pass)
1606 	      {
1607 		node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1608 		  + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1609 		  + ((node->num_submatches > 0) ? 2 : 0);
1610 	      }
1611 	    regset = tre_stack_pop_voidptr(stack);
1612 	    tag_left = tre_stack_pop_int(stack);
1613 	    tag_right = tre_stack_pop_int(stack);
1614 
1615 	    /* Add tags after both children, the left child gets a smaller
1616 	       tag than the right child.  This guarantees that we prefer
1617 	       the left child over the right child. */
1618 	    /* XXX - This is not always necessary (if the children have
1619 	       tags which must be seen for every match of that child). */
1620 	    /* XXX - Check if this is the only place where tre_add_tag_right
1621 	       is used.	 If so, use tre_add_tag_left (putting the tag before
1622 	       the child as opposed after the child) and throw away
1623 	       tre_add_tag_right. */
1624 	    if (node->num_submatches > 0)
1625 	      {
1626 		if (!first_pass)
1627 		  {
1628 		    status = tre_add_tag_right(mem, left, tag_left);
1629 		    tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1630 		    if (status == REG_OK)
1631 		      status = tre_add_tag_right(mem, right, tag_right);
1632 		    tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1633 		  }
1634 		num_tags += 2;
1635 	      }
1636 	    direction = TRE_TAG_MAXIMIZE;
1637 	    break;
1638 	  }
1639 
1640 	default:
1641 	  assert(0);
1642 	  break;
1643 
1644 	} /* end switch(symbol) */
1645     } /* end while(tre_stack_num_objects(stack) > bottom) */
1646 
1647   if (!first_pass)
1648     tre_purge_regset(regset, tnfa, tag);
1649 
1650   if (!first_pass && minimal_tag >= 0)
1651     {
1652       int i;
1653       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1654       tnfa->minimal_tags[i] = tag;
1655       tnfa->minimal_tags[i + 1] = minimal_tag;
1656       tnfa->minimal_tags[i + 2] = -1;
1657       minimal_tag = -1;
1658       num_minimals++;
1659     }
1660 
1661   assert(tree->num_tags == num_tags);
1662   tnfa->end_tag = num_tags;
1663   tnfa->num_tags = num_tags;
1664   tnfa->num_minimals = num_minimals;
1665   xfree(orig_regset);
1666   xfree(parents);
1667   xfree(saved_states);
1668   return status;
1669 }
1670 
1671 
1672 
1673 /*
1674   AST to TNFA compilation routines.
1675 */
1676 
1677 typedef enum {
1678   COPY_RECURSE,
1679   COPY_SET_RESULT_PTR
1680 } tre_copyast_symbol_t;
1681 
1682 /* Flags for tre_copy_ast(). */
1683 #define COPY_REMOVE_TAGS	 1
1684 #define COPY_MAXIMIZE_FIRST_TAG	 2
1685 
1686 static reg_errcode_t
tre_copy_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int flags,int * pos_add,tre_tag_direction_t * tag_directions,tre_ast_node_t ** copy,int * max_pos)1687 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1688 	     int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1689 	     tre_ast_node_t **copy, int *max_pos)
1690 {
1691   reg_errcode_t status = REG_OK;
1692   int bottom = tre_stack_num_objects(stack);
1693   int num_copied = 0;
1694   int first_tag = 1;
1695   tre_ast_node_t **result = copy;
1696   tre_copyast_symbol_t symbol;
1697 
1698   STACK_PUSH(stack, voidptr, ast);
1699   STACK_PUSH(stack, int, COPY_RECURSE);
1700 
1701   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1702     {
1703       tre_ast_node_t *node;
1704       if (status != REG_OK)
1705 	break;
1706 
1707       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1708       switch (symbol)
1709 	{
1710 	case COPY_SET_RESULT_PTR:
1711 	  result = tre_stack_pop_voidptr(stack);
1712 	  break;
1713 	case COPY_RECURSE:
1714 	  node = tre_stack_pop_voidptr(stack);
1715 	  switch (node->type)
1716 	    {
1717 	    case LITERAL:
1718 	      {
1719 		tre_literal_t *lit = node->obj;
1720 		int pos = lit->position;
1721 		int min = lit->code_min;
1722 		int max = lit->code_max;
1723 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1724 		  {
1725 		    /* XXX - e.g. [ab] has only one position but two
1726 		       nodes, so we are creating holes in the state space
1727 		       here.  Not fatal, just wastes memory. */
1728 		    pos += *pos_add;
1729 		    num_copied++;
1730 		  }
1731 		else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1732 		  {
1733 		    /* Change this tag to empty. */
1734 		    min = EMPTY;
1735 		    max = pos = -1;
1736 		  }
1737 		else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1738 			 && first_tag)
1739 		  {
1740 		    /* Maximize the first tag. */
1741 		    tag_directions[max] = TRE_TAG_MAXIMIZE;
1742 		    first_tag = 0;
1743 		  }
1744 		*result = tre_ast_new_literal(mem, min, max, pos);
1745 		if (*result == NULL)
1746 		  status = REG_ESPACE;
1747 		else {
1748 		  tre_literal_t *p = (*result)->obj;
1749 		  p->class = lit->class;
1750 		  p->neg_classes = lit->neg_classes;
1751 		}
1752 
1753 		if (pos > *max_pos)
1754 		  *max_pos = pos;
1755 		break;
1756 	      }
1757 	    case UNION:
1758 	      {
1759 		tre_union_t *uni = node->obj;
1760 		tre_union_t *tmp;
1761 		*result = tre_ast_new_union(mem, uni->left, uni->right);
1762 		if (*result == NULL)
1763 		  {
1764 		    status = REG_ESPACE;
1765 		    break;
1766 		  }
1767 		tmp = (*result)->obj;
1768 		result = &tmp->left;
1769 		STACK_PUSHX(stack, voidptr, uni->right);
1770 		STACK_PUSHX(stack, int, COPY_RECURSE);
1771 		STACK_PUSHX(stack, voidptr, &tmp->right);
1772 		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1773 		STACK_PUSHX(stack, voidptr, uni->left);
1774 		STACK_PUSHX(stack, int, COPY_RECURSE);
1775 		break;
1776 	      }
1777 	    case CATENATION:
1778 	      {
1779 		tre_catenation_t *cat = node->obj;
1780 		tre_catenation_t *tmp;
1781 		*result = tre_ast_new_catenation(mem, cat->left, cat->right);
1782 		if (*result == NULL)
1783 		  {
1784 		    status = REG_ESPACE;
1785 		    break;
1786 		  }
1787 		tmp = (*result)->obj;
1788 		tmp->left = NULL;
1789 		tmp->right = NULL;
1790 		result = &tmp->left;
1791 
1792 		STACK_PUSHX(stack, voidptr, cat->right);
1793 		STACK_PUSHX(stack, int, COPY_RECURSE);
1794 		STACK_PUSHX(stack, voidptr, &tmp->right);
1795 		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1796 		STACK_PUSHX(stack, voidptr, cat->left);
1797 		STACK_PUSHX(stack, int, COPY_RECURSE);
1798 		break;
1799 	      }
1800 	    case ITERATION:
1801 	      {
1802 		tre_iteration_t *iter = node->obj;
1803 		STACK_PUSHX(stack, voidptr, iter->arg);
1804 		STACK_PUSHX(stack, int, COPY_RECURSE);
1805 		*result = tre_ast_new_iter(mem, iter->arg, iter->min,
1806 					   iter->max, iter->minimal);
1807 		if (*result == NULL)
1808 		  {
1809 		    status = REG_ESPACE;
1810 		    break;
1811 		  }
1812 		iter = (*result)->obj;
1813 		result = &iter->arg;
1814 		break;
1815 	      }
1816 	    default:
1817 	      assert(0);
1818 	      break;
1819 	    }
1820 	  break;
1821 	}
1822     }
1823   *pos_add += num_copied;
1824   return status;
1825 }
1826 
1827 typedef enum {
1828   EXPAND_RECURSE,
1829   EXPAND_AFTER_ITER
1830 } tre_expand_ast_symbol_t;
1831 
1832 /* Expands each iteration node that has a finite nonzero minimum or maximum
1833    iteration count to a catenated sequence of copies of the node. */
1834 static reg_errcode_t
tre_expand_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int * position,tre_tag_direction_t * tag_directions)1835 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1836 	       int *position, tre_tag_direction_t *tag_directions)
1837 {
1838   reg_errcode_t status = REG_OK;
1839   int bottom = tre_stack_num_objects(stack);
1840   int pos_add = 0;
1841   int pos_add_total = 0;
1842   int max_pos = 0;
1843   int iter_depth = 0;
1844 
1845   STACK_PUSHR(stack, voidptr, ast);
1846   STACK_PUSHR(stack, int, EXPAND_RECURSE);
1847   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1848     {
1849       tre_ast_node_t *node;
1850       tre_expand_ast_symbol_t symbol;
1851 
1852       if (status != REG_OK)
1853 	break;
1854 
1855       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1856       node = tre_stack_pop_voidptr(stack);
1857       switch (symbol)
1858 	{
1859 	case EXPAND_RECURSE:
1860 	  switch (node->type)
1861 	    {
1862 	    case LITERAL:
1863 	      {
1864 		tre_literal_t *lit= node->obj;
1865 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1866 		  {
1867 		    lit->position += pos_add;
1868 		    if (lit->position > max_pos)
1869 		      max_pos = lit->position;
1870 		  }
1871 		break;
1872 	      }
1873 	    case UNION:
1874 	      {
1875 		tre_union_t *uni = node->obj;
1876 		STACK_PUSHX(stack, voidptr, uni->right);
1877 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1878 		STACK_PUSHX(stack, voidptr, uni->left);
1879 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1880 		break;
1881 	      }
1882 	    case CATENATION:
1883 	      {
1884 		tre_catenation_t *cat = node->obj;
1885 		STACK_PUSHX(stack, voidptr, cat->right);
1886 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1887 		STACK_PUSHX(stack, voidptr, cat->left);
1888 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1889 		break;
1890 	      }
1891 	    case ITERATION:
1892 	      {
1893 		tre_iteration_t *iter = node->obj;
1894 		STACK_PUSHX(stack, int, pos_add);
1895 		STACK_PUSHX(stack, voidptr, node);
1896 		STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1897 		STACK_PUSHX(stack, voidptr, iter->arg);
1898 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1899 		/* If we are going to expand this node at EXPAND_AFTER_ITER
1900 		   then don't increase the `pos' fields of the nodes now, it
1901 		   will get done when expanding. */
1902 		if (iter->min > 1 || iter->max > 1)
1903 		  pos_add = 0;
1904 		iter_depth++;
1905 		break;
1906 	      }
1907 	    default:
1908 	      assert(0);
1909 	      break;
1910 	    }
1911 	  break;
1912 	case EXPAND_AFTER_ITER:
1913 	  {
1914 	    tre_iteration_t *iter = node->obj;
1915 	    int pos_add_last;
1916 	    pos_add = tre_stack_pop_int(stack);
1917 	    pos_add_last = pos_add;
1918 	    if (iter->min > 1 || iter->max > 1)
1919 	      {
1920 		tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1921 		int j;
1922 		int pos_add_save = pos_add;
1923 
1924 		/* Create a catenated sequence of copies of the node. */
1925 		for (j = 0; j < iter->min; j++)
1926 		  {
1927 		    tre_ast_node_t *copy;
1928 		    /* Remove tags from all but the last copy. */
1929 		    int flags = ((j + 1 < iter->min)
1930 				 ? COPY_REMOVE_TAGS
1931 				 : COPY_MAXIMIZE_FIRST_TAG);
1932 		    pos_add_save = pos_add;
1933 		    status = tre_copy_ast(mem, stack, iter->arg, flags,
1934 					  &pos_add, tag_directions, &copy,
1935 					  &max_pos);
1936 		    if (status != REG_OK)
1937 		      return status;
1938 		    if (seq1 != NULL)
1939 		      seq1 = tre_ast_new_catenation(mem, seq1, copy);
1940 		    else
1941 		      seq1 = copy;
1942 		    if (seq1 == NULL)
1943 		      return REG_ESPACE;
1944 		  }
1945 
1946 		if (iter->max == -1)
1947 		  {
1948 		    /* No upper limit. */
1949 		    pos_add_save = pos_add;
1950 		    status = tre_copy_ast(mem, stack, iter->arg, 0,
1951 					  &pos_add, NULL, &seq2, &max_pos);
1952 		    if (status != REG_OK)
1953 		      return status;
1954 		    seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1955 		    if (seq2 == NULL)
1956 		      return REG_ESPACE;
1957 		  }
1958 		else
1959 		  {
1960 		    for (j = iter->min; j < iter->max; j++)
1961 		      {
1962 			tre_ast_node_t *tmp, *copy;
1963 			pos_add_save = pos_add;
1964 			status = tre_copy_ast(mem, stack, iter->arg, 0,
1965 					      &pos_add, NULL, &copy, &max_pos);
1966 			if (status != REG_OK)
1967 			  return status;
1968 			if (seq2 != NULL)
1969 			  seq2 = tre_ast_new_catenation(mem, copy, seq2);
1970 			else
1971 			  seq2 = copy;
1972 			if (seq2 == NULL)
1973 			  return REG_ESPACE;
1974 			tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1975 			if (tmp == NULL)
1976 			  return REG_ESPACE;
1977 			seq2 = tre_ast_new_union(mem, tmp, seq2);
1978 			if (seq2 == NULL)
1979 			  return REG_ESPACE;
1980 		      }
1981 		  }
1982 
1983 		pos_add = pos_add_save;
1984 		if (seq1 == NULL)
1985 		  seq1 = seq2;
1986 		else if (seq2 != NULL)
1987 		  seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1988 		if (seq1 == NULL)
1989 		  return REG_ESPACE;
1990 		node->obj = seq1->obj;
1991 		node->type = seq1->type;
1992 	      }
1993 
1994 	    iter_depth--;
1995 	    pos_add_total += pos_add - pos_add_last;
1996 	    if (iter_depth == 0)
1997 	      pos_add = pos_add_total;
1998 
1999 	    break;
2000 	  }
2001 	default:
2002 	  assert(0);
2003 	  break;
2004 	}
2005     }
2006 
2007   *position += pos_add_total;
2008 
2009   /* `max_pos' should never be larger than `*position' if the above
2010      code works, but just an extra safeguard let's make sure
2011      `*position' is set large enough so enough memory will be
2012      allocated for the transition table. */
2013   if (max_pos > *position)
2014     *position = max_pos;
2015 
2016   return status;
2017 }
2018 
2019 static tre_pos_and_tags_t *
tre_set_empty(tre_mem_t mem)2020 tre_set_empty(tre_mem_t mem)
2021 {
2022   tre_pos_and_tags_t *new_set;
2023 
2024   new_set = tre_mem_calloc(mem, sizeof(*new_set));
2025   if (new_set == NULL)
2026     return NULL;
2027 
2028   new_set[0].position = -1;
2029   new_set[0].code_min = -1;
2030   new_set[0].code_max = -1;
2031 
2032   return new_set;
2033 }
2034 
2035 static tre_pos_and_tags_t *
tre_set_one(tre_mem_t mem,int position,int code_min,int code_max,tre_ctype_t class,tre_ctype_t * neg_classes,int backref)2036 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2037 	    tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2038 {
2039   tre_pos_and_tags_t *new_set;
2040 
2041   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2042   if (new_set == NULL)
2043     return NULL;
2044 
2045   new_set[0].position = position;
2046   new_set[0].code_min = code_min;
2047   new_set[0].code_max = code_max;
2048   new_set[0].class = class;
2049   new_set[0].neg_classes = neg_classes;
2050   new_set[0].backref = backref;
2051   new_set[1].position = -1;
2052   new_set[1].code_min = -1;
2053   new_set[1].code_max = -1;
2054 
2055   return new_set;
2056 }
2057 
2058 static tre_pos_and_tags_t *
tre_set_union(tre_mem_t mem,tre_pos_and_tags_t * set1,tre_pos_and_tags_t * set2,int * tags,int assertions)2059 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2060 	      int *tags, int assertions)
2061 {
2062   int s1, s2, i, j;
2063   tre_pos_and_tags_t *new_set;
2064   int *new_tags;
2065   int num_tags;
2066 
2067   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2068   for (s1 = 0; set1[s1].position >= 0; s1++);
2069   for (s2 = 0; set2[s2].position >= 0; s2++);
2070   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2071   if (!new_set )
2072     return NULL;
2073 
2074   for (s1 = 0; set1[s1].position >= 0; s1++)
2075     {
2076       new_set[s1].position = set1[s1].position;
2077       new_set[s1].code_min = set1[s1].code_min;
2078       new_set[s1].code_max = set1[s1].code_max;
2079       new_set[s1].assertions = set1[s1].assertions | assertions;
2080       new_set[s1].class = set1[s1].class;
2081       new_set[s1].neg_classes = set1[s1].neg_classes;
2082       new_set[s1].backref = set1[s1].backref;
2083       if (set1[s1].tags == NULL && tags == NULL)
2084 	new_set[s1].tags = NULL;
2085       else
2086 	{
2087 	  for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2088 	  new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2089 					 * (i + num_tags + 1)));
2090 	  if (new_tags == NULL)
2091 	    return NULL;
2092 	  for (j = 0; j < i; j++)
2093 	    new_tags[j] = set1[s1].tags[j];
2094 	  for (i = 0; i < num_tags; i++)
2095 	    new_tags[j + i] = tags[i];
2096 	  new_tags[j + i] = -1;
2097 	  new_set[s1].tags = new_tags;
2098 	}
2099     }
2100 
2101   for (s2 = 0; set2[s2].position >= 0; s2++)
2102     {
2103       new_set[s1 + s2].position = set2[s2].position;
2104       new_set[s1 + s2].code_min = set2[s2].code_min;
2105       new_set[s1 + s2].code_max = set2[s2].code_max;
2106       /* XXX - why not | assertions here as well? */
2107       new_set[s1 + s2].assertions = set2[s2].assertions;
2108       new_set[s1 + s2].class = set2[s2].class;
2109       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2110       new_set[s1 + s2].backref = set2[s2].backref;
2111       if (set2[s2].tags == NULL)
2112 	new_set[s1 + s2].tags = NULL;
2113       else
2114 	{
2115 	  for (i = 0; set2[s2].tags[i] >= 0; i++);
2116 	  new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2117 	  if (new_tags == NULL)
2118 	    return NULL;
2119 	  for (j = 0; j < i; j++)
2120 	    new_tags[j] = set2[s2].tags[j];
2121 	  new_tags[j] = -1;
2122 	  new_set[s1 + s2].tags = new_tags;
2123 	}
2124     }
2125   new_set[s1 + s2].position = -1;
2126   return new_set;
2127 }
2128 
2129 /* Finds the empty path through `node' which is the one that should be
2130    taken according to POSIX.2 rules, and adds the tags on that path to
2131    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
2132    set to the number of tags seen on the path. */
2133 static reg_errcode_t
tre_match_empty(tre_stack_t * stack,tre_ast_node_t * node,int * tags,int * assertions,int * num_tags_seen)2134 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2135 		int *assertions, int *num_tags_seen)
2136 {
2137   tre_literal_t *lit;
2138   tre_union_t *uni;
2139   tre_catenation_t *cat;
2140   tre_iteration_t *iter;
2141   int i;
2142   int bottom = tre_stack_num_objects(stack);
2143   reg_errcode_t status = REG_OK;
2144   if (num_tags_seen)
2145     *num_tags_seen = 0;
2146 
2147   status = tre_stack_push_voidptr(stack, node);
2148 
2149   /* Walk through the tree recursively. */
2150   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2151     {
2152       node = tre_stack_pop_voidptr(stack);
2153 
2154       switch (node->type)
2155 	{
2156 	case LITERAL:
2157 	  lit = (tre_literal_t *)node->obj;
2158 	  switch (lit->code_min)
2159 	    {
2160 	    case TAG:
2161 	      if (lit->code_max >= 0)
2162 		{
2163 		  if (tags != NULL)
2164 		    {
2165 		      /* Add the tag to `tags'. */
2166 		      for (i = 0; tags[i] >= 0; i++)
2167 			if (tags[i] == lit->code_max)
2168 			  break;
2169 		      if (tags[i] < 0)
2170 			{
2171 			  tags[i] = lit->code_max;
2172 			  tags[i + 1] = -1;
2173 			}
2174 		    }
2175 		  if (num_tags_seen)
2176 		    (*num_tags_seen)++;
2177 		}
2178 	      break;
2179 	    case ASSERTION:
2180 	      assert(lit->code_max >= 1
2181 		     || lit->code_max <= ASSERT_LAST);
2182 	      if (assertions != NULL)
2183 		*assertions |= lit->code_max;
2184 	      break;
2185 	    case EMPTY:
2186 	      break;
2187 	    default:
2188 	      assert(0);
2189 	      break;
2190 	    }
2191 	  break;
2192 
2193 	case UNION:
2194 	  /* Subexpressions starting earlier take priority over ones
2195 	     starting later, so we prefer the left subexpression over the
2196 	     right subexpression. */
2197 	  uni = (tre_union_t *)node->obj;
2198 	  if (uni->left->nullable)
2199 	    STACK_PUSHX(stack, voidptr, uni->left)
2200 	  else if (uni->right->nullable)
2201 	    STACK_PUSHX(stack, voidptr, uni->right)
2202 	  else
2203 	    assert(0);
2204 	  break;
2205 
2206 	case CATENATION:
2207 	  /* The path must go through both children. */
2208 	  cat = (tre_catenation_t *)node->obj;
2209 	  assert(cat->left->nullable);
2210 	  assert(cat->right->nullable);
2211 	  STACK_PUSHX(stack, voidptr, cat->left);
2212 	  STACK_PUSHX(stack, voidptr, cat->right);
2213 	  break;
2214 
2215 	case ITERATION:
2216 	  /* A match with an empty string is preferred over no match at
2217 	     all, so we go through the argument if possible. */
2218 	  iter = (tre_iteration_t *)node->obj;
2219 	  if (iter->arg->nullable)
2220 	    STACK_PUSHX(stack, voidptr, iter->arg);
2221 	  break;
2222 
2223 	default:
2224 	  assert(0);
2225 	  break;
2226 	}
2227     }
2228 
2229   return status;
2230 }
2231 
2232 
2233 typedef enum {
2234   NFL_RECURSE,
2235   NFL_POST_UNION,
2236   NFL_POST_CATENATION,
2237   NFL_POST_ITERATION
2238 } tre_nfl_stack_symbol_t;
2239 
2240 
2241 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2242    the nodes of the AST `tree'. */
2243 static reg_errcode_t
tre_compute_nfl(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree)2244 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2245 {
2246   int bottom = tre_stack_num_objects(stack);
2247 
2248   STACK_PUSHR(stack, voidptr, tree);
2249   STACK_PUSHR(stack, int, NFL_RECURSE);
2250 
2251   while (tre_stack_num_objects(stack) > bottom)
2252     {
2253       tre_nfl_stack_symbol_t symbol;
2254       tre_ast_node_t *node;
2255 
2256       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2257       node = tre_stack_pop_voidptr(stack);
2258       switch (symbol)
2259 	{
2260 	case NFL_RECURSE:
2261 	  switch (node->type)
2262 	    {
2263 	    case LITERAL:
2264 	      {
2265 		tre_literal_t *lit = (tre_literal_t *)node->obj;
2266 		if (IS_BACKREF(lit))
2267 		  {
2268 		    /* Back references: nullable = false, firstpos = {i},
2269 		       lastpos = {i}. */
2270 		    node->nullable = 0;
2271 		    node->firstpos = tre_set_one(mem, lit->position, 0,
2272 					     TRE_CHAR_MAX, 0, NULL, -1);
2273 		    if (!node->firstpos)
2274 		      return REG_ESPACE;
2275 		    node->lastpos = tre_set_one(mem, lit->position, 0,
2276 						TRE_CHAR_MAX, 0, NULL,
2277 						(int)lit->code_max);
2278 		    if (!node->lastpos)
2279 		      return REG_ESPACE;
2280 		  }
2281 		else if (lit->code_min < 0)
2282 		  {
2283 		    /* Tags, empty strings, params, and zero width assertions:
2284 		       nullable = true, firstpos = {}, and lastpos = {}. */
2285 		    node->nullable = 1;
2286 		    node->firstpos = tre_set_empty(mem);
2287 		    if (!node->firstpos)
2288 		      return REG_ESPACE;
2289 		    node->lastpos = tre_set_empty(mem);
2290 		    if (!node->lastpos)
2291 		      return REG_ESPACE;
2292 		  }
2293 		else
2294 		  {
2295 		    /* Literal at position i: nullable = false, firstpos = {i},
2296 		       lastpos = {i}. */
2297 		    node->nullable = 0;
2298 		    node->firstpos =
2299 		      tre_set_one(mem, lit->position, (int)lit->code_min,
2300 				  (int)lit->code_max, 0, NULL, -1);
2301 		    if (!node->firstpos)
2302 		      return REG_ESPACE;
2303 		    node->lastpos = tre_set_one(mem, lit->position,
2304 						(int)lit->code_min,
2305 						(int)lit->code_max,
2306 						lit->class, lit->neg_classes,
2307 						-1);
2308 		    if (!node->lastpos)
2309 		      return REG_ESPACE;
2310 		  }
2311 		break;
2312 	      }
2313 
2314 	    case UNION:
2315 	      /* Compute the attributes for the two subtrees, and after that
2316 		 for this node. */
2317 	      STACK_PUSHR(stack, voidptr, node);
2318 	      STACK_PUSHR(stack, int, NFL_POST_UNION);
2319 	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2320 	      STACK_PUSHR(stack, int, NFL_RECURSE);
2321 	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2322 	      STACK_PUSHR(stack, int, NFL_RECURSE);
2323 	      break;
2324 
2325 	    case CATENATION:
2326 	      /* Compute the attributes for the two subtrees, and after that
2327 		 for this node. */
2328 	      STACK_PUSHR(stack, voidptr, node);
2329 	      STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2330 	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2331 	      STACK_PUSHR(stack, int, NFL_RECURSE);
2332 	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2333 	      STACK_PUSHR(stack, int, NFL_RECURSE);
2334 	      break;
2335 
2336 	    case ITERATION:
2337 	      /* Compute the attributes for the subtree, and after that for
2338 		 this node. */
2339 	      STACK_PUSHR(stack, voidptr, node);
2340 	      STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2341 	      STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2342 	      STACK_PUSHR(stack, int, NFL_RECURSE);
2343 	      break;
2344 	    }
2345 	  break; /* end case: NFL_RECURSE */
2346 
2347 	case NFL_POST_UNION:
2348 	  {
2349 	    tre_union_t *uni = (tre_union_t *)node->obj;
2350 	    node->nullable = uni->left->nullable || uni->right->nullable;
2351 	    node->firstpos = tre_set_union(mem, uni->left->firstpos,
2352 					   uni->right->firstpos, NULL, 0);
2353 	    if (!node->firstpos)
2354 	      return REG_ESPACE;
2355 	    node->lastpos = tre_set_union(mem, uni->left->lastpos,
2356 					  uni->right->lastpos, NULL, 0);
2357 	    if (!node->lastpos)
2358 	      return REG_ESPACE;
2359 	    break;
2360 	  }
2361 
2362 	case NFL_POST_ITERATION:
2363 	  {
2364 	    tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2365 
2366 	    if (iter->min == 0 || iter->arg->nullable)
2367 	      node->nullable = 1;
2368 	    else
2369 	      node->nullable = 0;
2370 	    node->firstpos = iter->arg->firstpos;
2371 	    node->lastpos = iter->arg->lastpos;
2372 	    break;
2373 	  }
2374 
2375 	case NFL_POST_CATENATION:
2376 	  {
2377 	    int num_tags, *tags, assertions;
2378 	    reg_errcode_t status;
2379 	    tre_catenation_t *cat = node->obj;
2380 	    node->nullable = cat->left->nullable && cat->right->nullable;
2381 
2382 	    /* Compute firstpos. */
2383 	    if (cat->left->nullable)
2384 	      {
2385 		/* The left side matches the empty string.  Make a first pass
2386 		   with tre_match_empty() to get the number of tags and
2387 		   parameters. */
2388 		status = tre_match_empty(stack, cat->left,
2389 					 NULL, NULL, &num_tags);
2390 		if (status != REG_OK)
2391 		  return status;
2392 		/* Allocate arrays for the tags and parameters. */
2393 		tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2394 		if (!tags)
2395 		  return REG_ESPACE;
2396 		tags[0] = -1;
2397 		assertions = 0;
2398 		/* Second pass with tre_mach_empty() to get the list of
2399 		   tags and parameters. */
2400 		status = tre_match_empty(stack, cat->left, tags,
2401 					 &assertions, NULL);
2402 		if (status != REG_OK)
2403 		  {
2404 		    xfree(tags);
2405 		    return status;
2406 		  }
2407 		node->firstpos =
2408 		  tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2409 				tags, assertions);
2410 		xfree(tags);
2411 		if (!node->firstpos)
2412 		  return REG_ESPACE;
2413 	      }
2414 	    else
2415 	      {
2416 		node->firstpos = cat->left->firstpos;
2417 	      }
2418 
2419 	    /* Compute lastpos. */
2420 	    if (cat->right->nullable)
2421 	      {
2422 		/* The right side matches the empty string.  Make a first pass
2423 		   with tre_match_empty() to get the number of tags and
2424 		   parameters. */
2425 		status = tre_match_empty(stack, cat->right,
2426 					 NULL, NULL, &num_tags);
2427 		if (status != REG_OK)
2428 		  return status;
2429 		/* Allocate arrays for the tags and parameters. */
2430 		tags = xmalloc(sizeof(int) * (num_tags + 1));
2431 		if (!tags)
2432 		  return REG_ESPACE;
2433 		tags[0] = -1;
2434 		assertions = 0;
2435 		/* Second pass with tre_mach_empty() to get the list of
2436 		   tags and parameters. */
2437 		status = tre_match_empty(stack, cat->right, tags,
2438 					 &assertions, NULL);
2439 		if (status != REG_OK)
2440 		  {
2441 		    xfree(tags);
2442 		    return status;
2443 		  }
2444 		node->lastpos =
2445 		  tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2446 				tags, assertions);
2447 		xfree(tags);
2448 		if (!node->lastpos)
2449 		  return REG_ESPACE;
2450 	      }
2451 	    else
2452 	      {
2453 		node->lastpos = cat->right->lastpos;
2454 	      }
2455 	    break;
2456 	  }
2457 
2458 	default:
2459 	  assert(0);
2460 	  break;
2461 	}
2462     }
2463 
2464   return REG_OK;
2465 }
2466 
2467 
2468 /* Adds a transition from each position in `p1' to each position in `p2'. */
2469 static reg_errcode_t
tre_make_trans(tre_pos_and_tags_t * p1,tre_pos_and_tags_t * p2,tre_tnfa_transition_t * transitions,int * counts,int * offs)2470 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2471 	       tre_tnfa_transition_t *transitions,
2472 	       int *counts, int *offs)
2473 {
2474   tre_pos_and_tags_t *orig_p2 = p2;
2475   tre_tnfa_transition_t *trans;
2476   int i, j, k, l, dup, prev_p2_pos;
2477 
2478   if (transitions != NULL)
2479     while (p1->position >= 0)
2480       {
2481 	p2 = orig_p2;
2482 	prev_p2_pos = -1;
2483 	while (p2->position >= 0)
2484 	  {
2485 	    /* Optimization: if this position was already handled, skip it. */
2486 	    if (p2->position == prev_p2_pos)
2487 	      {
2488 		p2++;
2489 		continue;
2490 	      }
2491 	    prev_p2_pos = p2->position;
2492 	    /* Set `trans' to point to the next unused transition from
2493 	       position `p1->position'. */
2494 	    trans = transitions + offs[p1->position];
2495 	    while (trans->state != NULL)
2496 	      {
2497 #if 0
2498 		/* If we find a previous transition from `p1->position' to
2499 		   `p2->position', it is overwritten.  This can happen only
2500 		   if there are nested loops in the regexp, like in "((a)*)*".
2501 		   In POSIX.2 repetition using the outer loop is always
2502 		   preferred over using the inner loop.	 Therefore the
2503 		   transition for the inner loop is useless and can be thrown
2504 		   away. */
2505 		/* XXX - The same position is used for all nodes in a bracket
2506 		   expression, so this optimization cannot be used (it will
2507 		   break bracket expressions) unless I figure out a way to
2508 		   detect it here. */
2509 		if (trans->state_id == p2->position)
2510 		  {
2511 		    break;
2512 		  }
2513 #endif
2514 		trans++;
2515 	      }
2516 
2517 	    if (trans->state == NULL)
2518 	      (trans + 1)->state = NULL;
2519 	    /* Use the character ranges, assertions, etc. from `p1' for
2520 	       the transition from `p1' to `p2'. */
2521 	    trans->code_min = p1->code_min;
2522 	    trans->code_max = p1->code_max;
2523 	    trans->state = transitions + offs[p2->position];
2524 	    trans->state_id = p2->position;
2525 	    trans->assertions = p1->assertions | p2->assertions
2526 	      | (p1->class ? ASSERT_CHAR_CLASS : 0)
2527 	      | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2528 	    if (p1->backref >= 0)
2529 	      {
2530 		assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2531 		assert(p2->backref < 0);
2532 		trans->u.backref = p1->backref;
2533 		trans->assertions |= ASSERT_BACKREF;
2534 	      }
2535 	    else
2536 	      trans->u.class = p1->class;
2537 	    if (p1->neg_classes != NULL)
2538 	      {
2539 		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2540 		trans->neg_classes =
2541 		  xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2542 		if (trans->neg_classes == NULL)
2543 		  return REG_ESPACE;
2544 		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2545 		  trans->neg_classes[i] = p1->neg_classes[i];
2546 		trans->neg_classes[i] = (tre_ctype_t)0;
2547 	      }
2548 	    else
2549 	      trans->neg_classes = NULL;
2550 
2551 	    /* Find out how many tags this transition has. */
2552 	    i = 0;
2553 	    if (p1->tags != NULL)
2554 	      while(p1->tags[i] >= 0)
2555 		i++;
2556 	    j = 0;
2557 	    if (p2->tags != NULL)
2558 	      while(p2->tags[j] >= 0)
2559 		j++;
2560 
2561 	    /* If we are overwriting a transition, free the old tag array. */
2562 	    if (trans->tags != NULL)
2563 	      xfree(trans->tags);
2564 	    trans->tags = NULL;
2565 
2566 	    /* If there were any tags, allocate an array and fill it. */
2567 	    if (i + j > 0)
2568 	      {
2569 		trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2570 		if (!trans->tags)
2571 		  return REG_ESPACE;
2572 		i = 0;
2573 		if (p1->tags != NULL)
2574 		  while(p1->tags[i] >= 0)
2575 		    {
2576 		      trans->tags[i] = p1->tags[i];
2577 		      i++;
2578 		    }
2579 		l = i;
2580 		j = 0;
2581 		if (p2->tags != NULL)
2582 		  while (p2->tags[j] >= 0)
2583 		    {
2584 		      /* Don't add duplicates. */
2585 		      dup = 0;
2586 		      for (k = 0; k < i; k++)
2587 			if (trans->tags[k] == p2->tags[j])
2588 			  {
2589 			    dup = 1;
2590 			    break;
2591 			  }
2592 		      if (!dup)
2593 			trans->tags[l++] = p2->tags[j];
2594 		      j++;
2595 		    }
2596 		trans->tags[l] = -1;
2597 	      }
2598 
2599 	    p2++;
2600 	  }
2601 	p1++;
2602       }
2603   else
2604     /* Compute a maximum limit for the number of transitions leaving
2605        from each state. */
2606     while (p1->position >= 0)
2607       {
2608 	p2 = orig_p2;
2609 	while (p2->position >= 0)
2610 	  {
2611 	    counts[p1->position]++;
2612 	    p2++;
2613 	  }
2614 	p1++;
2615       }
2616   return REG_OK;
2617 }
2618 
2619 /* Converts the syntax tree to a TNFA.	All the transitions in the TNFA are
2620    labelled with one character range (there are no transitions on empty
2621    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
2622    the regexp. */
2623 static reg_errcode_t
tre_ast_to_tnfa(tre_ast_node_t * node,tre_tnfa_transition_t * transitions,int * counts,int * offs)2624 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2625 		int *counts, int *offs)
2626 {
2627   tre_union_t *uni;
2628   tre_catenation_t *cat;
2629   tre_iteration_t *iter;
2630   reg_errcode_t errcode = REG_OK;
2631 
2632   /* XXX - recurse using a stack!. */
2633   switch (node->type)
2634     {
2635     case LITERAL:
2636       break;
2637     case UNION:
2638       uni = (tre_union_t *)node->obj;
2639       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2640       if (errcode != REG_OK)
2641 	return errcode;
2642       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2643       break;
2644 
2645     case CATENATION:
2646       cat = (tre_catenation_t *)node->obj;
2647       /* Add a transition from each position in cat->left->lastpos
2648 	 to each position in cat->right->firstpos. */
2649       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2650 			       transitions, counts, offs);
2651       if (errcode != REG_OK)
2652 	return errcode;
2653       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2654       if (errcode != REG_OK)
2655 	return errcode;
2656       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2657       break;
2658 
2659     case ITERATION:
2660       iter = (tre_iteration_t *)node->obj;
2661       assert(iter->max == -1 || iter->max == 1);
2662 
2663       if (iter->max == -1)
2664 	{
2665 	  assert(iter->min == 0 || iter->min == 1);
2666 	  /* Add a transition from each last position in the iterated
2667 	     expression to each first position. */
2668 	  errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2669 				   transitions, counts, offs);
2670 	  if (errcode != REG_OK)
2671 	    return errcode;
2672 	}
2673       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2674       break;
2675     }
2676   return errcode;
2677 }
2678 
2679 
2680 #define ERROR_EXIT(err)		  \
2681   do				  \
2682     {				  \
2683       errcode = err;		  \
2684       if (/*CONSTCOND*/1)	  \
2685       	goto error_exit;	  \
2686     }				  \
2687  while (/*CONSTCOND*/0)
2688 
2689 
2690 int
regcomp(regex_t * restrict preg,const char * restrict regex,int cflags)2691 regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
2692 {
2693   tre_stack_t *stack;
2694   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2695   tre_pos_and_tags_t *p;
2696   int *counts = NULL, *offs = NULL;
2697   int i, add = 0;
2698   tre_tnfa_transition_t *transitions, *initial;
2699   tre_tnfa_t *tnfa = NULL;
2700   tre_submatch_data_t *submatch_data;
2701   tre_tag_direction_t *tag_directions = NULL;
2702   reg_errcode_t errcode;
2703   tre_mem_t mem;
2704 
2705   /* Parse context. */
2706   tre_parse_ctx_t parse_ctx;
2707 
2708   /* Allocate a stack used throughout the compilation process for various
2709      purposes. */
2710   stack = tre_stack_new(512, 1024000, 128);
2711   if (!stack)
2712     return REG_ESPACE;
2713   /* Allocate a fast memory allocator. */
2714   mem = tre_mem_new();
2715   if (!mem)
2716     {
2717       tre_stack_destroy(stack);
2718       return REG_ESPACE;
2719     }
2720 
2721   /* Parse the regexp. */
2722   memset(&parse_ctx, 0, sizeof(parse_ctx));
2723   parse_ctx.mem = mem;
2724   parse_ctx.stack = stack;
2725   parse_ctx.start = regex;
2726   parse_ctx.cflags = cflags;
2727   parse_ctx.max_backref = -1;
2728   errcode = tre_parse(&parse_ctx);
2729   if (errcode != REG_OK)
2730     ERROR_EXIT(errcode);
2731   preg->re_nsub = parse_ctx.submatch_id - 1;
2732   tree = parse_ctx.n;
2733 
2734 #ifdef TRE_DEBUG
2735   tre_ast_print(tree);
2736 #endif /* TRE_DEBUG */
2737 
2738   /* Referring to nonexistent subexpressions is illegal. */
2739   if (parse_ctx.max_backref > (int)preg->re_nsub)
2740     ERROR_EXIT(REG_ESUBREG);
2741 
2742   /* Allocate the TNFA struct. */
2743   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2744   if (tnfa == NULL)
2745     ERROR_EXIT(REG_ESPACE);
2746   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2747   tnfa->have_approx = 0;
2748   tnfa->num_submatches = parse_ctx.submatch_id;
2749 
2750   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
2751      regexp does not have back references, this can be skipped. */
2752   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
2753     {
2754 
2755       /* Figure out how many tags we will need. */
2756       errcode = tre_add_tags(NULL, stack, tree, tnfa);
2757       if (errcode != REG_OK)
2758 	ERROR_EXIT(errcode);
2759 
2760       if (tnfa->num_tags > 0)
2761 	{
2762 	  tag_directions = xmalloc(sizeof(*tag_directions)
2763 				   * (tnfa->num_tags + 1));
2764 	  if (tag_directions == NULL)
2765 	    ERROR_EXIT(REG_ESPACE);
2766 	  tnfa->tag_directions = tag_directions;
2767 	  memset(tag_directions, -1,
2768 		 sizeof(*tag_directions) * (tnfa->num_tags + 1));
2769 	}
2770       tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2771 				   sizeof(*tnfa->minimal_tags));
2772       if (tnfa->minimal_tags == NULL)
2773 	ERROR_EXIT(REG_ESPACE);
2774 
2775       submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2776 			      sizeof(*submatch_data));
2777       if (submatch_data == NULL)
2778 	ERROR_EXIT(REG_ESPACE);
2779       tnfa->submatch_data = submatch_data;
2780 
2781       errcode = tre_add_tags(mem, stack, tree, tnfa);
2782       if (errcode != REG_OK)
2783 	ERROR_EXIT(errcode);
2784 
2785     }
2786 
2787   /* Expand iteration nodes. */
2788   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2789 			   tag_directions);
2790   if (errcode != REG_OK)
2791     ERROR_EXIT(errcode);
2792 
2793   /* Add a dummy node for the final state.
2794      XXX - For certain patterns this dummy node can be optimized away,
2795 	   for example "a*" or "ab*".	Figure out a simple way to detect
2796 	   this possibility. */
2797   tmp_ast_l = tree;
2798   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2799   if (tmp_ast_r == NULL)
2800     ERROR_EXIT(REG_ESPACE);
2801 
2802   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2803   if (tree == NULL)
2804     ERROR_EXIT(REG_ESPACE);
2805 
2806   errcode = tre_compute_nfl(mem, stack, tree);
2807   if (errcode != REG_OK)
2808     ERROR_EXIT(errcode);
2809 
2810   counts = xmalloc(sizeof(int) * parse_ctx.position);
2811   if (counts == NULL)
2812     ERROR_EXIT(REG_ESPACE);
2813 
2814   offs = xmalloc(sizeof(int) * parse_ctx.position);
2815   if (offs == NULL)
2816     ERROR_EXIT(REG_ESPACE);
2817 
2818   for (i = 0; i < parse_ctx.position; i++)
2819     counts[i] = 0;
2820   tre_ast_to_tnfa(tree, NULL, counts, NULL);
2821 
2822   add = 0;
2823   for (i = 0; i < parse_ctx.position; i++)
2824     {
2825       offs[i] = add;
2826       add += counts[i] + 1;
2827       counts[i] = 0;
2828     }
2829   transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2830   if (transitions == NULL)
2831     ERROR_EXIT(REG_ESPACE);
2832   tnfa->transitions = transitions;
2833   tnfa->num_transitions = add;
2834 
2835   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2836   if (errcode != REG_OK)
2837     ERROR_EXIT(errcode);
2838 
2839   tnfa->firstpos_chars = NULL;
2840 
2841   p = tree->firstpos;
2842   i = 0;
2843   while (p->position >= 0)
2844     {
2845       i++;
2846       p++;
2847     }
2848 
2849   initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2850   if (initial == NULL)
2851     ERROR_EXIT(REG_ESPACE);
2852   tnfa->initial = initial;
2853 
2854   i = 0;
2855   for (p = tree->firstpos; p->position >= 0; p++)
2856     {
2857       initial[i].state = transitions + offs[p->position];
2858       initial[i].state_id = p->position;
2859       initial[i].tags = NULL;
2860       /* Copy the arrays p->tags, and p->params, they are allocated
2861 	 from a tre_mem object. */
2862       if (p->tags)
2863 	{
2864 	  int j;
2865 	  for (j = 0; p->tags[j] >= 0; j++);
2866 	  initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2867 	  if (!initial[i].tags)
2868 	    ERROR_EXIT(REG_ESPACE);
2869 	  memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2870 	}
2871       initial[i].assertions = p->assertions;
2872       i++;
2873     }
2874   initial[i].state = NULL;
2875 
2876   tnfa->num_transitions = add;
2877   tnfa->final = transitions + offs[tree->lastpos[0].position];
2878   tnfa->num_states = parse_ctx.position;
2879   tnfa->cflags = cflags;
2880 
2881   tre_mem_destroy(mem);
2882   tre_stack_destroy(stack);
2883   xfree(counts);
2884   xfree(offs);
2885 
2886   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2887   return REG_OK;
2888 
2889  error_exit:
2890   /* Free everything that was allocated and return the error code. */
2891   tre_mem_destroy(mem);
2892   if (stack != NULL)
2893     tre_stack_destroy(stack);
2894   if (counts != NULL)
2895     xfree(counts);
2896   if (offs != NULL)
2897     xfree(offs);
2898   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2899   regfree(preg);
2900   return errcode;
2901 }
2902 
2903 
2904 
2905 
2906 void
regfree(regex_t * preg)2907 regfree(regex_t *preg)
2908 {
2909   tre_tnfa_t *tnfa;
2910   unsigned int i;
2911   tre_tnfa_transition_t *trans;
2912 
2913   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2914   if (!tnfa)
2915     return;
2916 
2917   for (i = 0; i < tnfa->num_transitions; i++)
2918     if (tnfa->transitions[i].state)
2919       {
2920 	if (tnfa->transitions[i].tags)
2921 	  xfree(tnfa->transitions[i].tags);
2922 	if (tnfa->transitions[i].neg_classes)
2923 	  xfree(tnfa->transitions[i].neg_classes);
2924       }
2925   if (tnfa->transitions)
2926     xfree(tnfa->transitions);
2927 
2928   if (tnfa->initial)
2929     {
2930       for (trans = tnfa->initial; trans->state; trans++)
2931 	{
2932 	  if (trans->tags)
2933 	    xfree(trans->tags);
2934 	}
2935       xfree(tnfa->initial);
2936     }
2937 
2938   if (tnfa->submatch_data)
2939     {
2940       for (i = 0; i < tnfa->num_submatches; i++)
2941 	if (tnfa->submatch_data[i].parents)
2942 	  xfree(tnfa->submatch_data[i].parents);
2943       xfree(tnfa->submatch_data);
2944     }
2945 
2946   if (tnfa->tag_directions)
2947     xfree(tnfa->tag_directions);
2948   if (tnfa->firstpos_chars)
2949     xfree(tnfa->firstpos_chars);
2950   if (tnfa->minimal_tags)
2951     xfree(tnfa->minimal_tags);
2952   xfree(tnfa);
2953 }
2954