1 /*
2 regcomp.c - TRE POSIX compatible regex compilation functions.
3
4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 */
31
32 #include <string.h>
33 #include <stdlib.h>
34 #include <regex.h>
35 #include <limits.h>
36 #include <stdint.h>
37 #include <ctype.h>
38
39 #include "tre.h"
40
41 #include <assert.h>
42
43 /***********************************************************************
44 from tre-compile.h
45 ***********************************************************************/
46
47 typedef struct {
48 int position;
49 int code_min;
50 int code_max;
51 int *tags;
52 int assertions;
53 tre_ctype_t class;
54 tre_ctype_t *neg_classes;
55 int backref;
56 } tre_pos_and_tags_t;
57
58
59 /***********************************************************************
60 from tre-ast.c and tre-ast.h
61 ***********************************************************************/
62
63 /* The different AST node types. */
64 typedef enum {
65 LITERAL,
66 CATENATION,
67 ITERATION,
68 UNION
69 } tre_ast_type_t;
70
71 /* Special subtypes of TRE_LITERAL. */
72 #define EMPTY -1 /* Empty leaf (denotes empty string). */
73 #define ASSERTION -2 /* Assertion leaf. */
74 #define TAG -3 /* Tag leaf. */
75 #define BACKREF -4 /* Back reference leaf. */
76
77 #define IS_SPECIAL(x) ((x)->code_min < 0)
78 #define IS_EMPTY(x) ((x)->code_min == EMPTY)
79 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
80 #define IS_TAG(x) ((x)->code_min == TAG)
81 #define IS_BACKREF(x) ((x)->code_min == BACKREF)
82
83
84 /* A generic AST node. All AST nodes consist of this node on the top
85 level with `obj' pointing to the actual content. */
86 typedef struct {
87 tre_ast_type_t type; /* Type of the node. */
88 void *obj; /* Pointer to actual node. */
89 int nullable;
90 int submatch_id;
91 int num_submatches;
92 int num_tags;
93 tre_pos_and_tags_t *firstpos;
94 tre_pos_and_tags_t *lastpos;
95 } tre_ast_node_t;
96
97
98 /* A "literal" node. These are created for assertions, back references,
99 tags, matching parameter settings, and all expressions that match one
100 character. */
101 typedef struct {
102 long code_min;
103 long code_max;
104 int position;
105 tre_ctype_t class;
106 tre_ctype_t *neg_classes;
107 } tre_literal_t;
108
109 /* A "catenation" node. These are created when two regexps are concatenated.
110 If there are more than one subexpressions in sequence, the `left' part
111 holds all but the last, and `right' part holds the last subexpression
112 (catenation is left associative). */
113 typedef struct {
114 tre_ast_node_t *left;
115 tre_ast_node_t *right;
116 } tre_catenation_t;
117
118 /* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
119 operators. */
120 typedef struct {
121 /* Subexpression to match. */
122 tre_ast_node_t *arg;
123 /* Minimum number of consecutive matches. */
124 int min;
125 /* Maximum number of consecutive matches. */
126 int max;
127 /* If 0, match as many characters as possible, if 1 match as few as
128 possible. Note that this does not always mean the same thing as
129 matching as many/few repetitions as possible. */
130 unsigned int minimal:1;
131 } tre_iteration_t;
132
133 /* An "union" node. These are created for the "|" operator. */
134 typedef struct {
135 tre_ast_node_t *left;
136 tre_ast_node_t *right;
137 } tre_union_t;
138
139
140 static tre_ast_node_t *
tre_ast_new_node(tre_mem_t mem,int type,void * obj)141 tre_ast_new_node(tre_mem_t mem, int type, void *obj)
142 {
143 tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
144 if (!node || !obj)
145 return 0;
146 node->obj = obj;
147 node->type = type;
148 node->nullable = -1;
149 node->submatch_id = -1;
150 return node;
151 }
152
153 static tre_ast_node_t *
tre_ast_new_literal(tre_mem_t mem,int code_min,int code_max,int position)154 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
155 {
156 tre_ast_node_t *node;
157 tre_literal_t *lit;
158
159 lit = tre_mem_calloc(mem, sizeof *lit);
160 node = tre_ast_new_node(mem, LITERAL, lit);
161 if (!node)
162 return 0;
163 lit->code_min = code_min;
164 lit->code_max = code_max;
165 lit->position = position;
166 return node;
167 }
168
169 static tre_ast_node_t *
tre_ast_new_iter(tre_mem_t mem,tre_ast_node_t * arg,int min,int max,int minimal)170 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
171 {
172 tre_ast_node_t *node;
173 tre_iteration_t *iter;
174
175 iter = tre_mem_calloc(mem, sizeof *iter);
176 node = tre_ast_new_node(mem, ITERATION, iter);
177 if (!node)
178 return 0;
179 iter->arg = arg;
180 iter->min = min;
181 iter->max = max;
182 iter->minimal = minimal;
183 node->num_submatches = arg->num_submatches;
184 return node;
185 }
186
187 static tre_ast_node_t *
tre_ast_new_union(tre_mem_t mem,tre_ast_node_t * left,tre_ast_node_t * right)188 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
189 {
190 tre_ast_node_t *node;
191 tre_union_t *un;
192
193 if (!left)
194 return right;
195 un = tre_mem_calloc(mem, sizeof *un);
196 node = tre_ast_new_node(mem, UNION, un);
197 if (!node || !right)
198 return 0;
199 un->left = left;
200 un->right = right;
201 node->num_submatches = left->num_submatches + right->num_submatches;
202 return node;
203 }
204
205 static tre_ast_node_t *
tre_ast_new_catenation(tre_mem_t mem,tre_ast_node_t * left,tre_ast_node_t * right)206 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
207 {
208 tre_ast_node_t *node;
209 tre_catenation_t *cat;
210
211 if (!left)
212 return right;
213 cat = tre_mem_calloc(mem, sizeof *cat);
214 node = tre_ast_new_node(mem, CATENATION, cat);
215 if (!node)
216 return 0;
217 cat->left = left;
218 cat->right = right;
219 node->num_submatches = left->num_submatches + right->num_submatches;
220 return node;
221 }
222
223
224 /***********************************************************************
225 from tre-stack.c and tre-stack.h
226 ***********************************************************************/
227
228 typedef struct tre_stack_rec tre_stack_t;
229
230 /* Creates a new stack object. `size' is initial size in bytes, `max_size'
231 is maximum size, and `increment' specifies how much more space will be
232 allocated with realloc() if all space gets used up. Returns the stack
233 object or NULL if out of memory. */
234 static tre_stack_t *
235 tre_stack_new(int size, int max_size, int increment);
236
237 /* Frees the stack object. */
238 static void
239 tre_stack_destroy(tre_stack_t *s);
240
241 /* Returns the current number of objects in the stack. */
242 static int
243 tre_stack_num_objects(tre_stack_t *s);
244
245 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
246 `value' on top of stack `s'. Returns REG_ESPACE if out of memory.
247 This tries to realloc() more space before failing if maximum size
248 has not yet been reached. Returns REG_OK if successful. */
249 #define declare_pushf(typetag, type) \
250 static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
251
252 declare_pushf(voidptr, void *);
253 declare_pushf(int, int);
254
255 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
256 element off of stack `s' and returns it. The stack must not be
257 empty. */
258 #define declare_popf(typetag, type) \
259 static type tre_stack_pop_ ## typetag(tre_stack_t *s)
260
261 declare_popf(voidptr, void *);
262 declare_popf(int, int);
263
264 /* Just to save some typing. */
265 #define STACK_PUSH(s, typetag, value) \
266 do \
267 { \
268 status = tre_stack_push_ ## typetag(s, value); \
269 } \
270 while (/*CONSTCOND*/0)
271
272 #define STACK_PUSHX(s, typetag, value) \
273 { \
274 status = tre_stack_push_ ## typetag(s, value); \
275 if (status != REG_OK) \
276 break; \
277 }
278
279 #define STACK_PUSHR(s, typetag, value) \
280 { \
281 reg_errcode_t _status; \
282 _status = tre_stack_push_ ## typetag(s, value); \
283 if (_status != REG_OK) \
284 return _status; \
285 }
286
287 union tre_stack_item {
288 void *voidptr_value;
289 int int_value;
290 };
291
292 struct tre_stack_rec {
293 int size;
294 int max_size;
295 int increment;
296 int ptr;
297 union tre_stack_item *stack;
298 };
299
300
301 static tre_stack_t *
tre_stack_new(int size,int max_size,int increment)302 tre_stack_new(int size, int max_size, int increment)
303 {
304 tre_stack_t *s;
305
306 s = xmalloc(sizeof(*s));
307 if (s != NULL)
308 {
309 s->stack = xmalloc(sizeof(*s->stack) * size);
310 if (s->stack == NULL)
311 {
312 xfree(s);
313 return NULL;
314 }
315 s->size = size;
316 s->max_size = max_size;
317 s->increment = increment;
318 s->ptr = 0;
319 }
320 return s;
321 }
322
323 static void
tre_stack_destroy(tre_stack_t * s)324 tre_stack_destroy(tre_stack_t *s)
325 {
326 xfree(s->stack);
327 xfree(s);
328 }
329
330 static int
tre_stack_num_objects(tre_stack_t * s)331 tre_stack_num_objects(tre_stack_t *s)
332 {
333 return s->ptr;
334 }
335
336 static reg_errcode_t
tre_stack_push(tre_stack_t * s,union tre_stack_item value)337 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
338 {
339 if (s->ptr < s->size)
340 {
341 s->stack[s->ptr] = value;
342 s->ptr++;
343 }
344 else
345 {
346 if (s->size >= s->max_size)
347 {
348 return REG_ESPACE;
349 }
350 else
351 {
352 union tre_stack_item *new_buffer;
353 int new_size;
354 new_size = s->size + s->increment;
355 if (new_size > s->max_size)
356 new_size = s->max_size;
357 new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
358 if (new_buffer == NULL)
359 {
360 return REG_ESPACE;
361 }
362 assert(new_size > s->size);
363 s->size = new_size;
364 s->stack = new_buffer;
365 tre_stack_push(s, value);
366 }
367 }
368 return REG_OK;
369 }
370
371 #define define_pushf(typetag, type) \
372 declare_pushf(typetag, type) { \
373 union tre_stack_item item; \
374 item.typetag ## _value = value; \
375 return tre_stack_push(s, item); \
376 }
377
378 define_pushf(int, int)
379 define_pushf(voidptr, void *)
380
381 #define define_popf(typetag, type) \
382 declare_popf(typetag, type) { \
383 return s->stack[--s->ptr].typetag ## _value; \
384 }
385
386 define_popf(int, int)
387 define_popf(voidptr, void *)
388
389
390 /***********************************************************************
391 from tre-parse.c and tre-parse.h
392 ***********************************************************************/
393
394 /* Parse context. */
395 typedef struct {
396 /* Memory allocator. The AST is allocated using this. */
397 tre_mem_t mem;
398 /* Stack used for keeping track of regexp syntax. */
399 tre_stack_t *stack;
400 /* The parsed node after a parse function returns. */
401 tre_ast_node_t *n;
402 /* Position in the regexp pattern after a parse function returns. */
403 const char *s;
404 /* The first character of the last subexpression parsed. */
405 const char *start;
406 /* Current submatch ID. */
407 int submatch_id;
408 /* Current position (number of literal). */
409 int position;
410 /* The highest back reference or -1 if none seen so far. */
411 int max_backref;
412 /* Compilation flags. */
413 int cflags;
414 } tre_parse_ctx_t;
415
416 /* Some macros for expanding \w, \s, etc. */
417 static const struct {
418 char c;
419 const char *expansion;
420 } tre_macros[] = {
421 {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
422 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
423 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
424 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
425 { 0, 0 }
426 };
427
428 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
429 must have at least `len' items. Sets buf[0] to zero if the there
430 is no match in `tre_macros'. */
tre_expand_macro(const char * s)431 static const char *tre_expand_macro(const char *s)
432 {
433 int i;
434 for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
435 return tre_macros[i].expansion;
436 }
437
438 static int
tre_compare_lit(const void * a,const void * b)439 tre_compare_lit(const void *a, const void *b)
440 {
441 const tre_literal_t *const *la = a;
442 const tre_literal_t *const *lb = b;
443 /* assumes the range of valid code_min is < INT_MAX */
444 return la[0]->code_min - lb[0]->code_min;
445 }
446
447 struct literals {
448 tre_mem_t mem;
449 tre_literal_t **a;
450 int len;
451 int cap;
452 };
453
tre_new_lit(struct literals * p)454 static tre_literal_t *tre_new_lit(struct literals *p)
455 {
456 tre_literal_t **a;
457 if (p->len >= p->cap) {
458 if (p->cap >= 1<<15)
459 return 0;
460 p->cap *= 2;
461 a = xrealloc(p->a, p->cap * sizeof *p->a);
462 if (!a)
463 return 0;
464 p->a = a;
465 }
466 a = p->a + p->len++;
467 *a = tre_mem_calloc(p->mem, sizeof **a);
468 return *a;
469 }
470
add_icase_literals(struct literals * ls,int min,int max)471 static int add_icase_literals(struct literals *ls, int min, int max)
472 {
473 tre_literal_t *lit;
474 int b, e, c;
475 for (c=min; c<=max; ) {
476 /* assumes islower(c) and isupper(c) are exclusive
477 and toupper(c)!=c if islower(c).
478 multiple opposite case characters are not supported */
479 if (tre_islower(c)) {
480 b = e = tre_toupper(c);
481 for (c++, e++; c<=max; c++, e++)
482 if (tre_toupper(c) != e) break;
483 } else if (tre_isupper(c)) {
484 b = e = tre_tolower(c);
485 for (c++, e++; c<=max; c++, e++)
486 if (tre_tolower(c) != e) break;
487 } else {
488 c++;
489 continue;
490 }
491 lit = tre_new_lit(ls);
492 if (!lit)
493 return -1;
494 lit->code_min = b;
495 lit->code_max = e-1;
496 lit->position = -1;
497 }
498 return 0;
499 }
500
501
502 /* Maximum number of character classes in a negated bracket expression. */
503 #define MAX_NEG_CLASSES 64
504
505 struct neg {
506 int negate;
507 int len;
508 tre_ctype_t a[MAX_NEG_CLASSES];
509 };
510
511 // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
512
513 /*
514 bracket grammar:
515 Bracket = '[' List ']' | '[^' List ']'
516 List = Term | List Term
517 Term = Char | Range | Chclass | Eqclass
518 Range = Char '-' Char | Char '-' '-'
519 Char = Coll | coll_single
520 Meta = ']' | '-'
521 Coll = '[.' coll_single '.]' | '[.' coll_multi '.]' | '[.' Meta '.]'
522 Eqclass = '[=' coll_single '=]' | '[=' coll_multi '=]'
523 Chclass = '[:' class ':]'
524
525 coll_single is a single char collating element but it can be
526 '-' only at the beginning or end of a List and
527 ']' only at the beginning of a List and
528 '^' anywhere except after the openning '['
529 */
530
parse_bracket_terms(tre_parse_ctx_t * ctx,const char * s,struct literals * ls,struct neg * neg)531 static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
532 {
533 const char *start = s;
534 tre_ctype_t class;
535 int min, max;
536 wchar_t wc;
537 int len;
538
539 for (;;) {
540 class = 0;
541 len = mbtowc(&wc, s, -1);
542 if (len <= 0)
543 return *s ? REG_BADPAT : REG_EBRACK;
544 if (*s == ']' && s != start) {
545 ctx->s = s+1;
546 return REG_OK;
547 }
548 if (*s == '-' && s != start && s[1] != ']' &&
549 /* extension: [a-z--@] is accepted as [a-z]|[--@] */
550 (s[1] != '-' || s[2] == ']'))
551 return REG_ERANGE;
552 if (*s == '[' && (s[1] == '.' || s[1] == '='))
553 /* collating symbols and equivalence classes are not supported */
554 return REG_ECOLLATE;
555 if (*s == '[' && s[1] == ':') {
556 char tmp[CHARCLASS_NAME_MAX+1];
557 s += 2;
558 for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
559 if (s[len] == ':') {
560 memcpy(tmp, s, len);
561 tmp[len] = 0;
562 class = tre_ctype(tmp);
563 break;
564 }
565 }
566 if (!class || s[len+1] != ']')
567 return REG_ECTYPE;
568 min = 0;
569 max = TRE_CHAR_MAX;
570 s += len+2;
571 } else {
572 min = max = wc;
573 s += len;
574 if (*s == '-' && s[1] != ']') {
575 s++;
576 len = mbtowc(&wc, s, -1);
577 max = wc;
578 /* XXX - Should use collation order instead of
579 encoding values in character ranges. */
580 if (len <= 0 || min > max)
581 return REG_ERANGE;
582 s += len;
583 }
584 }
585
586 if (class && neg->negate) {
587 if (neg->len >= MAX_NEG_CLASSES)
588 return REG_ESPACE;
589 neg->a[neg->len++] = class;
590 } else {
591 tre_literal_t *lit = tre_new_lit(ls);
592 if (!lit)
593 return REG_ESPACE;
594 lit->code_min = min;
595 lit->code_max = max;
596 lit->class = class;
597 lit->position = -1;
598
599 /* Add opposite-case codepoints if REG_ICASE is present.
600 It seems that POSIX requires that bracket negation
601 should happen before case-folding, but most practical
602 implementations do it the other way around. Changing
603 the order would need efficient representation of
604 case-fold ranges and bracket range sets even with
605 simple patterns so this is ok for now. */
606 if (ctx->cflags & REG_ICASE && !class)
607 if (add_icase_literals(ls, min, max))
608 return REG_ESPACE;
609 }
610 }
611 }
612
parse_bracket(tre_parse_ctx_t * ctx,const char * s)613 static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
614 {
615 int i, max, min, negmax, negmin;
616 tre_ast_node_t *node = 0, *n;
617 tre_ctype_t *nc = 0;
618 tre_literal_t *lit;
619 struct literals ls;
620 struct neg neg;
621 reg_errcode_t err;
622
623 ls.mem = ctx->mem;
624 ls.len = 0;
625 ls.cap = 32;
626 ls.a = xmalloc(ls.cap * sizeof *ls.a);
627 if (!ls.a)
628 return REG_ESPACE;
629 neg.len = 0;
630 neg.negate = *s == '^';
631 if (neg.negate)
632 s++;
633
634 err = parse_bracket_terms(ctx, s, &ls, &neg);
635 if (err != REG_OK)
636 goto parse_bracket_done;
637
638 if (neg.negate) {
639 /*
640 * With REG_NEWLINE, POSIX requires that newlines are not matched by
641 * any form of a non-matching list.
642 */
643 if (ctx->cflags & REG_NEWLINE) {
644 lit = tre_new_lit(&ls);
645 if (!lit) {
646 err = REG_ESPACE;
647 goto parse_bracket_done;
648 }
649 lit->code_min = '\n';
650 lit->code_max = '\n';
651 lit->position = -1;
652 }
653 /* Sort the array if we need to negate it. */
654 qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
655 /* extra lit for the last negated range */
656 lit = tre_new_lit(&ls);
657 if (!lit) {
658 err = REG_ESPACE;
659 goto parse_bracket_done;
660 }
661 lit->code_min = TRE_CHAR_MAX+1;
662 lit->code_max = TRE_CHAR_MAX+1;
663 lit->position = -1;
664 /* negated classes */
665 if (neg.len) {
666 nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
667 if (!nc) {
668 err = REG_ESPACE;
669 goto parse_bracket_done;
670 }
671 memcpy(nc, neg.a, neg.len*sizeof *neg.a);
672 nc[neg.len] = 0;
673 }
674 }
675
676 /* Build a union of the items in the array, negated if necessary. */
677 negmax = negmin = 0;
678 for (i = 0; i < ls.len; i++) {
679 lit = ls.a[i];
680 min = lit->code_min;
681 max = lit->code_max;
682 if (neg.negate) {
683 if (min <= negmin) {
684 /* Overlap. */
685 negmin = MAX(max + 1, negmin);
686 continue;
687 }
688 negmax = min - 1;
689 lit->code_min = negmin;
690 lit->code_max = negmax;
691 negmin = max + 1;
692 }
693 lit->position = ctx->position;
694 lit->neg_classes = nc;
695 n = tre_ast_new_node(ctx->mem, LITERAL, lit);
696 node = tre_ast_new_union(ctx->mem, node, n);
697 if (!node) {
698 err = REG_ESPACE;
699 break;
700 }
701 }
702
703 parse_bracket_done:
704 xfree(ls.a);
705 ctx->position++;
706 ctx->n = node;
707 return err;
708 }
709
parse_dup_count(const char * s,int * n)710 static const char *parse_dup_count(const char *s, int *n)
711 {
712 *n = -1;
713 if (!isdigit(*s))
714 return s;
715 *n = 0;
716 for (;;) {
717 *n = 10 * *n + (*s - '0');
718 s++;
719 if (!isdigit(*s) || *n > RE_DUP_MAX)
720 break;
721 }
722 return s;
723 }
724
parse_dup(const char * s,int ere,int * pmin,int * pmax)725 static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
726 {
727 int min, max;
728
729 s = parse_dup_count(s, &min);
730 if (*s == ',')
731 s = parse_dup_count(s+1, &max);
732 else
733 max = min;
734
735 if (
736 (max < min && max >= 0) ||
737 max > RE_DUP_MAX ||
738 min > RE_DUP_MAX ||
739 min < 0 ||
740 (!ere && *s++ != '\\') ||
741 *s++ != '}'
742 )
743 return 0;
744 *pmin = min;
745 *pmax = max;
746 return s;
747 }
748
hexval(unsigned c)749 static int hexval(unsigned c)
750 {
751 if (c-'0'<10) return c-'0';
752 c |= 32;
753 if (c-'a'<6) return c-'a'+10;
754 return -1;
755 }
756
marksub(tre_parse_ctx_t * ctx,tre_ast_node_t * node,int subid)757 static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
758 {
759 if (node->submatch_id >= 0) {
760 tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
761 if (!n)
762 return REG_ESPACE;
763 n = tre_ast_new_catenation(ctx->mem, n, node);
764 if (!n)
765 return REG_ESPACE;
766 n->num_submatches = node->num_submatches;
767 node = n;
768 }
769 node->submatch_id = subid;
770 node->num_submatches++;
771 ctx->n = node;
772 return REG_OK;
773 }
774
775 /*
776 BRE grammar:
777 Regex = Branch | '^' | '$' | '^$' | '^' Branch | Branch '$' | '^' Branch '$'
778 Branch = Atom | Branch Atom
779 Atom = char | quoted_char | '.' | Bracket | Atom Dup | '\(' Branch '\)' | back_ref
780 Dup = '*' | '\{' Count '\}' | '\{' Count ',\}' | '\{' Count ',' Count '\}'
781
782 (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
783
784 ERE grammar:
785 Regex = Branch | Regex '|' Branch
786 Branch = Atom | Branch Atom
787 Atom = char | quoted_char | '.' | Bracket | Atom Dup | '(' Regex ')' | '^' | '$'
788 Dup = '*' | '+' | '?' | '{' Count '}' | '{' Count ',}' | '{' Count ',' Count '}'
789
790 (a*+?, ^*, $+, \X, {, (|a) are unspecified)
791 */
792
parse_atom(tre_parse_ctx_t * ctx,const char * s)793 static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
794 {
795 int len, ere = ctx->cflags & REG_EXTENDED;
796 const char *p;
797 tre_ast_node_t *node;
798 wchar_t wc;
799 switch (*s) {
800 case '[':
801 return parse_bracket(ctx, s+1);
802 case '\\':
803 p = tre_expand_macro(s+1);
804 if (p) {
805 /* assume \X expansion is a single atom */
806 reg_errcode_t err = parse_atom(ctx, p);
807 ctx->s = s+2;
808 return err;
809 }
810 /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
811 switch (*++s) {
812 case 0:
813 return REG_EESCAPE;
814 case 'b':
815 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
816 break;
817 case 'B':
818 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
819 break;
820 case '<':
821 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
822 break;
823 case '>':
824 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
825 break;
826 case 'x':
827 s++;
828 int i, v = 0, c;
829 len = 2;
830 if (*s == '{') {
831 len = 8;
832 s++;
833 }
834 for (i=0; i<len && v<0x110000; i++) {
835 c = hexval(s[i]);
836 if (c < 0) break;
837 v = 16*v + c;
838 }
839 s += i;
840 if (len == 8) {
841 if (*s != '}')
842 return REG_EBRACE;
843 s++;
844 }
845 node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
846 s--;
847 break;
848 case '{':
849 case '+':
850 case '?':
851 /* extension: treat \+, \? as repetitions in BRE */
852 /* reject repetitions after empty expression in BRE */
853 if (!ere)
854 return REG_BADRPT;
855 case '|':
856 /* extension: treat \| as alternation in BRE */
857 if (!ere) {
858 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
859 s--;
860 goto end;
861 }
862 /* fallthrough */
863 default:
864 if (!ere && (unsigned)*s-'1' < 9) {
865 /* back reference */
866 int val = *s - '0';
867 node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
868 ctx->max_backref = MAX(val, ctx->max_backref);
869 } else {
870 /* extension: accept unknown escaped char
871 as a literal */
872 goto parse_literal;
873 }
874 }
875 s++;
876 break;
877 case '.':
878 if (ctx->cflags & REG_NEWLINE) {
879 tre_ast_node_t *tmp1, *tmp2;
880 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
881 tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
882 if (tmp1 && tmp2)
883 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
884 else
885 node = 0;
886 } else {
887 node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
888 }
889 s++;
890 break;
891 case '^':
892 /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
893 if (!ere && s != ctx->start)
894 goto parse_literal;
895 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
896 s++;
897 break;
898 case '$':
899 /* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */
900 if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|')))
901 goto parse_literal;
902 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
903 s++;
904 break;
905 case '*':
906 case '{':
907 case '+':
908 case '?':
909 /* reject repetitions after empty expression in ERE */
910 if (ere)
911 return REG_BADRPT;
912 case '|':
913 if (!ere)
914 goto parse_literal;
915 case 0:
916 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
917 break;
918 default:
919 parse_literal:
920 len = mbtowc(&wc, s, -1);
921 if (len < 0)
922 return REG_BADPAT;
923 if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
924 tre_ast_node_t *tmp1, *tmp2;
925 /* multiple opposite case characters are not supported */
926 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
927 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
928 if (tmp1 && tmp2)
929 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
930 else
931 node = 0;
932 } else {
933 node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
934 }
935 ctx->position++;
936 s += len;
937 break;
938 }
939 end:
940 if (!node)
941 return REG_ESPACE;
942 ctx->n = node;
943 ctx->s = s;
944 return REG_OK;
945 }
946
947 #define PUSHPTR(err, s, v) do { \
948 if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
949 return err; \
950 } while(0)
951
952 #define PUSHINT(err, s, v) do { \
953 if ((err = tre_stack_push_int(s, v)) != REG_OK) \
954 return err; \
955 } while(0)
956
tre_parse(tre_parse_ctx_t * ctx)957 static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
958 {
959 tre_ast_node_t *nbranch=0, *nunion=0;
960 int ere = ctx->cflags & REG_EXTENDED;
961 const char *s = ctx->start;
962 int subid = 0;
963 int depth = 0;
964 reg_errcode_t err;
965 tre_stack_t *stack = ctx->stack;
966
967 PUSHINT(err, stack, subid++);
968 for (;;) {
969 if ((!ere && *s == '\\' && s[1] == '(') ||
970 (ere && *s == '(')) {
971 PUSHPTR(err, stack, nunion);
972 PUSHPTR(err, stack, nbranch);
973 PUSHINT(err, stack, subid++);
974 s++;
975 if (!ere)
976 s++;
977 depth++;
978 nbranch = nunion = 0;
979 ctx->start = s;
980 continue;
981 }
982 if ((!ere && *s == '\\' && s[1] == ')') ||
983 (ere && *s == ')' && depth)) {
984 ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
985 if (!ctx->n)
986 return REG_ESPACE;
987 } else {
988 err = parse_atom(ctx, s);
989 if (err != REG_OK)
990 return err;
991 s = ctx->s;
992 }
993
994 parse_iter:
995 for (;;) {
996 int min, max;
997
998 if (*s!='\\' && *s!='*') {
999 if (!ere)
1000 break;
1001 if (*s!='+' && *s!='?' && *s!='{')
1002 break;
1003 }
1004 if (*s=='\\' && ere)
1005 break;
1006 /* extension: treat \+, \? as repetitions in BRE */
1007 if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{')
1008 break;
1009 if (*s=='\\')
1010 s++;
1011
1012 /* handle ^* at the start of a BRE. */
1013 if (!ere && s==ctx->start+1 && s[-1]=='^')
1014 break;
1015
1016 /* extension: multiple consecutive *+?{,} is unspecified,
1017 but (a+)+ has to be supported so accepting a++ makes
1018 sense, note however that the RE_DUP_MAX limit can be
1019 circumvented: (a{255}){255} uses a lot of memory.. */
1020 if (*s=='{') {
1021 s = parse_dup(s+1, ere, &min, &max);
1022 if (!s)
1023 return REG_BADBR;
1024 } else {
1025 min=0;
1026 max=-1;
1027 if (*s == '+')
1028 min = 1;
1029 if (*s == '?')
1030 max = 1;
1031 s++;
1032 }
1033 if (max == 0)
1034 ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1035 else
1036 ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
1037 if (!ctx->n)
1038 return REG_ESPACE;
1039 }
1040
1041 nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1042 if ((ere && *s == '|') ||
1043 (ere && *s == ')' && depth) ||
1044 (!ere && *s == '\\' && s[1] == ')') ||
1045 /* extension: treat \| as alternation in BRE */
1046 (!ere && *s == '\\' && s[1] == '|') ||
1047 !*s) {
1048 /* extension: empty branch is unspecified (), (|a), (a|)
1049 here they are not rejected but match on empty string */
1050 int c = *s;
1051 nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1052 nbranch = 0;
1053
1054 if (c == '\\' && s[1] == '|') {
1055 s+=2;
1056 ctx->start = s;
1057 } else if (c == '|') {
1058 s++;
1059 ctx->start = s;
1060 } else {
1061 if (c == '\\') {
1062 if (!depth) return REG_EPAREN;
1063 s+=2;
1064 } else if (c == ')')
1065 s++;
1066 depth--;
1067 err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1068 if (err != REG_OK)
1069 return err;
1070 if (!c && depth<0) {
1071 ctx->submatch_id = subid;
1072 return REG_OK;
1073 }
1074 if (!c || depth<0)
1075 return REG_EPAREN;
1076 nbranch = tre_stack_pop_voidptr(stack);
1077 nunion = tre_stack_pop_voidptr(stack);
1078 goto parse_iter;
1079 }
1080 }
1081 }
1082 }
1083
1084
1085 /***********************************************************************
1086 from tre-compile.c
1087 ***********************************************************************/
1088
1089
1090 /*
1091 TODO:
1092 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1093 function calls.
1094 */
1095
1096 /*
1097 Algorithms to setup tags so that submatch addressing can be done.
1098 */
1099
1100
1101 /* Inserts a catenation node to the root of the tree given in `node'.
1102 As the left child a new tag with number `tag_id' to `node' is added,
1103 and the right child is the old root. */
1104 static reg_errcode_t
tre_add_tag_left(tre_mem_t mem,tre_ast_node_t * node,int tag_id)1105 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1106 {
1107 tre_catenation_t *c;
1108
1109 c = tre_mem_alloc(mem, sizeof(*c));
1110 if (c == NULL)
1111 return REG_ESPACE;
1112 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1113 if (c->left == NULL)
1114 return REG_ESPACE;
1115 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1116 if (c->right == NULL)
1117 return REG_ESPACE;
1118
1119 c->right->obj = node->obj;
1120 c->right->type = node->type;
1121 c->right->nullable = -1;
1122 c->right->submatch_id = -1;
1123 c->right->firstpos = NULL;
1124 c->right->lastpos = NULL;
1125 c->right->num_tags = 0;
1126 c->right->num_submatches = 0;
1127 node->obj = c;
1128 node->type = CATENATION;
1129 return REG_OK;
1130 }
1131
1132 /* Inserts a catenation node to the root of the tree given in `node'.
1133 As the right child a new tag with number `tag_id' to `node' is added,
1134 and the left child is the old root. */
1135 static reg_errcode_t
tre_add_tag_right(tre_mem_t mem,tre_ast_node_t * node,int tag_id)1136 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1137 {
1138 tre_catenation_t *c;
1139
1140 c = tre_mem_alloc(mem, sizeof(*c));
1141 if (c == NULL)
1142 return REG_ESPACE;
1143 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1144 if (c->right == NULL)
1145 return REG_ESPACE;
1146 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1147 if (c->left == NULL)
1148 return REG_ESPACE;
1149
1150 c->left->obj = node->obj;
1151 c->left->type = node->type;
1152 c->left->nullable = -1;
1153 c->left->submatch_id = -1;
1154 c->left->firstpos = NULL;
1155 c->left->lastpos = NULL;
1156 c->left->num_tags = 0;
1157 c->left->num_submatches = 0;
1158 node->obj = c;
1159 node->type = CATENATION;
1160 return REG_OK;
1161 }
1162
1163 typedef enum {
1164 ADDTAGS_RECURSE,
1165 ADDTAGS_AFTER_ITERATION,
1166 ADDTAGS_AFTER_UNION_LEFT,
1167 ADDTAGS_AFTER_UNION_RIGHT,
1168 ADDTAGS_AFTER_CAT_LEFT,
1169 ADDTAGS_AFTER_CAT_RIGHT,
1170 ADDTAGS_SET_SUBMATCH_END
1171 } tre_addtags_symbol_t;
1172
1173
1174 typedef struct {
1175 int tag;
1176 int next_tag;
1177 } tre_tag_states_t;
1178
1179
1180 /* Go through `regset' and set submatch data for submatches that are
1181 using this tag. */
1182 static void
tre_purge_regset(int * regset,tre_tnfa_t * tnfa,int tag)1183 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1184 {
1185 int i;
1186
1187 for (i = 0; regset[i] >= 0; i++)
1188 {
1189 int id = regset[i] / 2;
1190 int start = !(regset[i] % 2);
1191 if (start)
1192 tnfa->submatch_data[id].so_tag = tag;
1193 else
1194 tnfa->submatch_data[id].eo_tag = tag;
1195 }
1196 regset[0] = -1;
1197 }
1198
1199
1200 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1201 subexpressions marked for submatch addressing can be traced. */
1202 static reg_errcode_t
tre_add_tags(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree,tre_tnfa_t * tnfa)1203 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1204 tre_tnfa_t *tnfa)
1205 {
1206 reg_errcode_t status = REG_OK;
1207 tre_addtags_symbol_t symbol;
1208 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1209 int bottom = tre_stack_num_objects(stack);
1210 /* True for first pass (counting number of needed tags) */
1211 int first_pass = (mem == NULL || tnfa == NULL);
1212 int *regset, *orig_regset;
1213 int num_tags = 0; /* Total number of tags. */
1214 int num_minimals = 0; /* Number of special minimal tags. */
1215 int tag = 0; /* The tag that is to be added next. */
1216 int next_tag = 1; /* Next tag to use after this one. */
1217 int *parents; /* Stack of submatches the current submatch is
1218 contained in. */
1219 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1220 tre_tag_states_t *saved_states;
1221
1222 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1223 if (!first_pass)
1224 {
1225 tnfa->end_tag = 0;
1226 tnfa->minimal_tags[0] = -1;
1227 }
1228
1229 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1230 if (regset == NULL)
1231 return REG_ESPACE;
1232 regset[0] = -1;
1233 orig_regset = regset;
1234
1235 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1236 if (parents == NULL)
1237 {
1238 xfree(regset);
1239 return REG_ESPACE;
1240 }
1241 parents[0] = -1;
1242
1243 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1244 if (saved_states == NULL)
1245 {
1246 xfree(regset);
1247 xfree(parents);
1248 return REG_ESPACE;
1249 }
1250 else
1251 {
1252 unsigned int i;
1253 for (i = 0; i <= tnfa->num_submatches; i++)
1254 saved_states[i].tag = -1;
1255 }
1256
1257 STACK_PUSH(stack, voidptr, node);
1258 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1259
1260 while (tre_stack_num_objects(stack) > bottom)
1261 {
1262 if (status != REG_OK)
1263 break;
1264
1265 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1266 switch (symbol)
1267 {
1268
1269 case ADDTAGS_SET_SUBMATCH_END:
1270 {
1271 int id = tre_stack_pop_int(stack);
1272 int i;
1273
1274 /* Add end of this submatch to regset. */
1275 for (i = 0; regset[i] >= 0; i++);
1276 regset[i] = id * 2 + 1;
1277 regset[i + 1] = -1;
1278
1279 /* Pop this submatch from the parents stack. */
1280 for (i = 0; parents[i] >= 0; i++);
1281 parents[i - 1] = -1;
1282 break;
1283 }
1284
1285 case ADDTAGS_RECURSE:
1286 node = tre_stack_pop_voidptr(stack);
1287
1288 if (node->submatch_id >= 0)
1289 {
1290 int id = node->submatch_id;
1291 int i;
1292
1293
1294 /* Add start of this submatch to regset. */
1295 for (i = 0; regset[i] >= 0; i++);
1296 regset[i] = id * 2;
1297 regset[i + 1] = -1;
1298
1299 if (!first_pass)
1300 {
1301 for (i = 0; parents[i] >= 0; i++);
1302 tnfa->submatch_data[id].parents = NULL;
1303 if (i > 0)
1304 {
1305 int *p = xmalloc(sizeof(*p) * (i + 1));
1306 if (p == NULL)
1307 {
1308 status = REG_ESPACE;
1309 break;
1310 }
1311 assert(tnfa->submatch_data[id].parents == NULL);
1312 tnfa->submatch_data[id].parents = p;
1313 for (i = 0; parents[i] >= 0; i++)
1314 p[i] = parents[i];
1315 p[i] = -1;
1316 }
1317 }
1318
1319 /* Add end of this submatch to regset after processing this
1320 node. */
1321 STACK_PUSHX(stack, int, node->submatch_id);
1322 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1323 }
1324
1325 switch (node->type)
1326 {
1327 case LITERAL:
1328 {
1329 tre_literal_t *lit = node->obj;
1330
1331 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1332 {
1333 int i;
1334 if (regset[0] >= 0)
1335 {
1336 /* Regset is not empty, so add a tag before the
1337 literal or backref. */
1338 if (!first_pass)
1339 {
1340 status = tre_add_tag_left(mem, node, tag);
1341 tnfa->tag_directions[tag] = direction;
1342 if (minimal_tag >= 0)
1343 {
1344 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1345 tnfa->minimal_tags[i] = tag;
1346 tnfa->minimal_tags[i + 1] = minimal_tag;
1347 tnfa->minimal_tags[i + 2] = -1;
1348 minimal_tag = -1;
1349 num_minimals++;
1350 }
1351 tre_purge_regset(regset, tnfa, tag);
1352 }
1353 else
1354 {
1355 node->num_tags = 1;
1356 }
1357
1358 regset[0] = -1;
1359 tag = next_tag;
1360 num_tags++;
1361 next_tag++;
1362 }
1363 }
1364 else
1365 {
1366 assert(!IS_TAG(lit));
1367 }
1368 break;
1369 }
1370 case CATENATION:
1371 {
1372 tre_catenation_t *cat = node->obj;
1373 tre_ast_node_t *left = cat->left;
1374 tre_ast_node_t *right = cat->right;
1375 int reserved_tag = -1;
1376
1377
1378 /* After processing right child. */
1379 STACK_PUSHX(stack, voidptr, node);
1380 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1381
1382 /* Process right child. */
1383 STACK_PUSHX(stack, voidptr, right);
1384 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1385
1386 /* After processing left child. */
1387 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1388 if (left->num_tags > 0 && right->num_tags > 0)
1389 {
1390 /* Reserve the next tag to the right child. */
1391 reserved_tag = next_tag;
1392 next_tag++;
1393 }
1394 STACK_PUSHX(stack, int, reserved_tag);
1395 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1396
1397 /* Process left child. */
1398 STACK_PUSHX(stack, voidptr, left);
1399 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1400
1401 }
1402 break;
1403 case ITERATION:
1404 {
1405 tre_iteration_t *iter = node->obj;
1406
1407 if (first_pass)
1408 {
1409 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1410 }
1411 else
1412 {
1413 STACK_PUSHX(stack, int, tag);
1414 STACK_PUSHX(stack, int, iter->minimal);
1415 }
1416 STACK_PUSHX(stack, voidptr, node);
1417 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1418
1419 STACK_PUSHX(stack, voidptr, iter->arg);
1420 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1421
1422 /* Regset is not empty, so add a tag here. */
1423 if (regset[0] >= 0 || iter->minimal)
1424 {
1425 if (!first_pass)
1426 {
1427 int i;
1428 status = tre_add_tag_left(mem, node, tag);
1429 if (iter->minimal)
1430 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1431 else
1432 tnfa->tag_directions[tag] = direction;
1433 if (minimal_tag >= 0)
1434 {
1435 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1436 tnfa->minimal_tags[i] = tag;
1437 tnfa->minimal_tags[i + 1] = minimal_tag;
1438 tnfa->minimal_tags[i + 2] = -1;
1439 minimal_tag = -1;
1440 num_minimals++;
1441 }
1442 tre_purge_regset(regset, tnfa, tag);
1443 }
1444
1445 regset[0] = -1;
1446 tag = next_tag;
1447 num_tags++;
1448 next_tag++;
1449 }
1450 direction = TRE_TAG_MINIMIZE;
1451 }
1452 break;
1453 case UNION:
1454 {
1455 tre_union_t *uni = node->obj;
1456 tre_ast_node_t *left = uni->left;
1457 tre_ast_node_t *right = uni->right;
1458 int left_tag;
1459 int right_tag;
1460
1461 if (regset[0] >= 0)
1462 {
1463 left_tag = next_tag;
1464 right_tag = next_tag + 1;
1465 }
1466 else
1467 {
1468 left_tag = tag;
1469 right_tag = next_tag;
1470 }
1471
1472 /* After processing right child. */
1473 STACK_PUSHX(stack, int, right_tag);
1474 STACK_PUSHX(stack, int, left_tag);
1475 STACK_PUSHX(stack, voidptr, regset);
1476 STACK_PUSHX(stack, int, regset[0] >= 0);
1477 STACK_PUSHX(stack, voidptr, node);
1478 STACK_PUSHX(stack, voidptr, right);
1479 STACK_PUSHX(stack, voidptr, left);
1480 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1481
1482 /* Process right child. */
1483 STACK_PUSHX(stack, voidptr, right);
1484 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1485
1486 /* After processing left child. */
1487 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1488
1489 /* Process left child. */
1490 STACK_PUSHX(stack, voidptr, left);
1491 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1492
1493 /* Regset is not empty, so add a tag here. */
1494 if (regset[0] >= 0)
1495 {
1496 if (!first_pass)
1497 {
1498 int i;
1499 status = tre_add_tag_left(mem, node, tag);
1500 tnfa->tag_directions[tag] = direction;
1501 if (minimal_tag >= 0)
1502 {
1503 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1504 tnfa->minimal_tags[i] = tag;
1505 tnfa->minimal_tags[i + 1] = minimal_tag;
1506 tnfa->minimal_tags[i + 2] = -1;
1507 minimal_tag = -1;
1508 num_minimals++;
1509 }
1510 tre_purge_regset(regset, tnfa, tag);
1511 }
1512
1513 regset[0] = -1;
1514 tag = next_tag;
1515 num_tags++;
1516 next_tag++;
1517 }
1518
1519 if (node->num_submatches > 0)
1520 {
1521 /* The next two tags are reserved for markers. */
1522 next_tag++;
1523 tag = next_tag;
1524 next_tag++;
1525 }
1526
1527 break;
1528 }
1529 }
1530
1531 if (node->submatch_id >= 0)
1532 {
1533 int i;
1534 /* Push this submatch on the parents stack. */
1535 for (i = 0; parents[i] >= 0; i++);
1536 parents[i] = node->submatch_id;
1537 parents[i + 1] = -1;
1538 }
1539
1540 break; /* end case: ADDTAGS_RECURSE */
1541
1542 case ADDTAGS_AFTER_ITERATION:
1543 {
1544 int minimal = 0;
1545 int enter_tag;
1546 node = tre_stack_pop_voidptr(stack);
1547 if (first_pass)
1548 {
1549 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1550 + tre_stack_pop_int(stack);
1551 minimal_tag = -1;
1552 }
1553 else
1554 {
1555 minimal = tre_stack_pop_int(stack);
1556 enter_tag = tre_stack_pop_int(stack);
1557 if (minimal)
1558 minimal_tag = enter_tag;
1559 }
1560
1561 if (!first_pass)
1562 {
1563 if (minimal)
1564 direction = TRE_TAG_MINIMIZE;
1565 else
1566 direction = TRE_TAG_MAXIMIZE;
1567 }
1568 break;
1569 }
1570
1571 case ADDTAGS_AFTER_CAT_LEFT:
1572 {
1573 int new_tag = tre_stack_pop_int(stack);
1574 next_tag = tre_stack_pop_int(stack);
1575 if (new_tag >= 0)
1576 {
1577 tag = new_tag;
1578 }
1579 break;
1580 }
1581
1582 case ADDTAGS_AFTER_CAT_RIGHT:
1583 node = tre_stack_pop_voidptr(stack);
1584 if (first_pass)
1585 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1586 + ((tre_catenation_t *)node->obj)->right->num_tags;
1587 break;
1588
1589 case ADDTAGS_AFTER_UNION_LEFT:
1590 /* Lift the bottom of the `regset' array so that when processing
1591 the right operand the items currently in the array are
1592 invisible. The original bottom was saved at ADDTAGS_UNION and
1593 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1594 while (*regset >= 0)
1595 regset++;
1596 break;
1597
1598 case ADDTAGS_AFTER_UNION_RIGHT:
1599 {
1600 int added_tags, tag_left, tag_right;
1601 tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1602 tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1603 node = tre_stack_pop_voidptr(stack);
1604 added_tags = tre_stack_pop_int(stack);
1605 if (first_pass)
1606 {
1607 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1608 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1609 + ((node->num_submatches > 0) ? 2 : 0);
1610 }
1611 regset = tre_stack_pop_voidptr(stack);
1612 tag_left = tre_stack_pop_int(stack);
1613 tag_right = tre_stack_pop_int(stack);
1614
1615 /* Add tags after both children, the left child gets a smaller
1616 tag than the right child. This guarantees that we prefer
1617 the left child over the right child. */
1618 /* XXX - This is not always necessary (if the children have
1619 tags which must be seen for every match of that child). */
1620 /* XXX - Check if this is the only place where tre_add_tag_right
1621 is used. If so, use tre_add_tag_left (putting the tag before
1622 the child as opposed after the child) and throw away
1623 tre_add_tag_right. */
1624 if (node->num_submatches > 0)
1625 {
1626 if (!first_pass)
1627 {
1628 status = tre_add_tag_right(mem, left, tag_left);
1629 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1630 if (status == REG_OK)
1631 status = tre_add_tag_right(mem, right, tag_right);
1632 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1633 }
1634 num_tags += 2;
1635 }
1636 direction = TRE_TAG_MAXIMIZE;
1637 break;
1638 }
1639
1640 default:
1641 assert(0);
1642 break;
1643
1644 } /* end switch(symbol) */
1645 } /* end while(tre_stack_num_objects(stack) > bottom) */
1646
1647 if (!first_pass)
1648 tre_purge_regset(regset, tnfa, tag);
1649
1650 if (!first_pass && minimal_tag >= 0)
1651 {
1652 int i;
1653 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1654 tnfa->minimal_tags[i] = tag;
1655 tnfa->minimal_tags[i + 1] = minimal_tag;
1656 tnfa->minimal_tags[i + 2] = -1;
1657 minimal_tag = -1;
1658 num_minimals++;
1659 }
1660
1661 assert(tree->num_tags == num_tags);
1662 tnfa->end_tag = num_tags;
1663 tnfa->num_tags = num_tags;
1664 tnfa->num_minimals = num_minimals;
1665 xfree(orig_regset);
1666 xfree(parents);
1667 xfree(saved_states);
1668 return status;
1669 }
1670
1671
1672
1673 /*
1674 AST to TNFA compilation routines.
1675 */
1676
1677 typedef enum {
1678 COPY_RECURSE,
1679 COPY_SET_RESULT_PTR
1680 } tre_copyast_symbol_t;
1681
1682 /* Flags for tre_copy_ast(). */
1683 #define COPY_REMOVE_TAGS 1
1684 #define COPY_MAXIMIZE_FIRST_TAG 2
1685
1686 static reg_errcode_t
tre_copy_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int flags,int * pos_add,tre_tag_direction_t * tag_directions,tre_ast_node_t ** copy,int * max_pos)1687 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1688 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1689 tre_ast_node_t **copy, int *max_pos)
1690 {
1691 reg_errcode_t status = REG_OK;
1692 int bottom = tre_stack_num_objects(stack);
1693 int num_copied = 0;
1694 int first_tag = 1;
1695 tre_ast_node_t **result = copy;
1696 tre_copyast_symbol_t symbol;
1697
1698 STACK_PUSH(stack, voidptr, ast);
1699 STACK_PUSH(stack, int, COPY_RECURSE);
1700
1701 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1702 {
1703 tre_ast_node_t *node;
1704 if (status != REG_OK)
1705 break;
1706
1707 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1708 switch (symbol)
1709 {
1710 case COPY_SET_RESULT_PTR:
1711 result = tre_stack_pop_voidptr(stack);
1712 break;
1713 case COPY_RECURSE:
1714 node = tre_stack_pop_voidptr(stack);
1715 switch (node->type)
1716 {
1717 case LITERAL:
1718 {
1719 tre_literal_t *lit = node->obj;
1720 int pos = lit->position;
1721 int min = lit->code_min;
1722 int max = lit->code_max;
1723 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1724 {
1725 /* XXX - e.g. [ab] has only one position but two
1726 nodes, so we are creating holes in the state space
1727 here. Not fatal, just wastes memory. */
1728 pos += *pos_add;
1729 num_copied++;
1730 }
1731 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1732 {
1733 /* Change this tag to empty. */
1734 min = EMPTY;
1735 max = pos = -1;
1736 }
1737 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1738 && first_tag)
1739 {
1740 /* Maximize the first tag. */
1741 tag_directions[max] = TRE_TAG_MAXIMIZE;
1742 first_tag = 0;
1743 }
1744 *result = tre_ast_new_literal(mem, min, max, pos);
1745 if (*result == NULL)
1746 status = REG_ESPACE;
1747 else {
1748 tre_literal_t *p = (*result)->obj;
1749 p->class = lit->class;
1750 p->neg_classes = lit->neg_classes;
1751 }
1752
1753 if (pos > *max_pos)
1754 *max_pos = pos;
1755 break;
1756 }
1757 case UNION:
1758 {
1759 tre_union_t *uni = node->obj;
1760 tre_union_t *tmp;
1761 *result = tre_ast_new_union(mem, uni->left, uni->right);
1762 if (*result == NULL)
1763 {
1764 status = REG_ESPACE;
1765 break;
1766 }
1767 tmp = (*result)->obj;
1768 result = &tmp->left;
1769 STACK_PUSHX(stack, voidptr, uni->right);
1770 STACK_PUSHX(stack, int, COPY_RECURSE);
1771 STACK_PUSHX(stack, voidptr, &tmp->right);
1772 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1773 STACK_PUSHX(stack, voidptr, uni->left);
1774 STACK_PUSHX(stack, int, COPY_RECURSE);
1775 break;
1776 }
1777 case CATENATION:
1778 {
1779 tre_catenation_t *cat = node->obj;
1780 tre_catenation_t *tmp;
1781 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
1782 if (*result == NULL)
1783 {
1784 status = REG_ESPACE;
1785 break;
1786 }
1787 tmp = (*result)->obj;
1788 tmp->left = NULL;
1789 tmp->right = NULL;
1790 result = &tmp->left;
1791
1792 STACK_PUSHX(stack, voidptr, cat->right);
1793 STACK_PUSHX(stack, int, COPY_RECURSE);
1794 STACK_PUSHX(stack, voidptr, &tmp->right);
1795 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1796 STACK_PUSHX(stack, voidptr, cat->left);
1797 STACK_PUSHX(stack, int, COPY_RECURSE);
1798 break;
1799 }
1800 case ITERATION:
1801 {
1802 tre_iteration_t *iter = node->obj;
1803 STACK_PUSHX(stack, voidptr, iter->arg);
1804 STACK_PUSHX(stack, int, COPY_RECURSE);
1805 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
1806 iter->max, iter->minimal);
1807 if (*result == NULL)
1808 {
1809 status = REG_ESPACE;
1810 break;
1811 }
1812 iter = (*result)->obj;
1813 result = &iter->arg;
1814 break;
1815 }
1816 default:
1817 assert(0);
1818 break;
1819 }
1820 break;
1821 }
1822 }
1823 *pos_add += num_copied;
1824 return status;
1825 }
1826
1827 typedef enum {
1828 EXPAND_RECURSE,
1829 EXPAND_AFTER_ITER
1830 } tre_expand_ast_symbol_t;
1831
1832 /* Expands each iteration node that has a finite nonzero minimum or maximum
1833 iteration count to a catenated sequence of copies of the node. */
1834 static reg_errcode_t
tre_expand_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int * position,tre_tag_direction_t * tag_directions)1835 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1836 int *position, tre_tag_direction_t *tag_directions)
1837 {
1838 reg_errcode_t status = REG_OK;
1839 int bottom = tre_stack_num_objects(stack);
1840 int pos_add = 0;
1841 int pos_add_total = 0;
1842 int max_pos = 0;
1843 int iter_depth = 0;
1844
1845 STACK_PUSHR(stack, voidptr, ast);
1846 STACK_PUSHR(stack, int, EXPAND_RECURSE);
1847 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1848 {
1849 tre_ast_node_t *node;
1850 tre_expand_ast_symbol_t symbol;
1851
1852 if (status != REG_OK)
1853 break;
1854
1855 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1856 node = tre_stack_pop_voidptr(stack);
1857 switch (symbol)
1858 {
1859 case EXPAND_RECURSE:
1860 switch (node->type)
1861 {
1862 case LITERAL:
1863 {
1864 tre_literal_t *lit= node->obj;
1865 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1866 {
1867 lit->position += pos_add;
1868 if (lit->position > max_pos)
1869 max_pos = lit->position;
1870 }
1871 break;
1872 }
1873 case UNION:
1874 {
1875 tre_union_t *uni = node->obj;
1876 STACK_PUSHX(stack, voidptr, uni->right);
1877 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1878 STACK_PUSHX(stack, voidptr, uni->left);
1879 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1880 break;
1881 }
1882 case CATENATION:
1883 {
1884 tre_catenation_t *cat = node->obj;
1885 STACK_PUSHX(stack, voidptr, cat->right);
1886 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1887 STACK_PUSHX(stack, voidptr, cat->left);
1888 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1889 break;
1890 }
1891 case ITERATION:
1892 {
1893 tre_iteration_t *iter = node->obj;
1894 STACK_PUSHX(stack, int, pos_add);
1895 STACK_PUSHX(stack, voidptr, node);
1896 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1897 STACK_PUSHX(stack, voidptr, iter->arg);
1898 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1899 /* If we are going to expand this node at EXPAND_AFTER_ITER
1900 then don't increase the `pos' fields of the nodes now, it
1901 will get done when expanding. */
1902 if (iter->min > 1 || iter->max > 1)
1903 pos_add = 0;
1904 iter_depth++;
1905 break;
1906 }
1907 default:
1908 assert(0);
1909 break;
1910 }
1911 break;
1912 case EXPAND_AFTER_ITER:
1913 {
1914 tre_iteration_t *iter = node->obj;
1915 int pos_add_last;
1916 pos_add = tre_stack_pop_int(stack);
1917 pos_add_last = pos_add;
1918 if (iter->min > 1 || iter->max > 1)
1919 {
1920 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1921 int j;
1922 int pos_add_save = pos_add;
1923
1924 /* Create a catenated sequence of copies of the node. */
1925 for (j = 0; j < iter->min; j++)
1926 {
1927 tre_ast_node_t *copy;
1928 /* Remove tags from all but the last copy. */
1929 int flags = ((j + 1 < iter->min)
1930 ? COPY_REMOVE_TAGS
1931 : COPY_MAXIMIZE_FIRST_TAG);
1932 pos_add_save = pos_add;
1933 status = tre_copy_ast(mem, stack, iter->arg, flags,
1934 &pos_add, tag_directions, ©,
1935 &max_pos);
1936 if (status != REG_OK)
1937 return status;
1938 if (seq1 != NULL)
1939 seq1 = tre_ast_new_catenation(mem, seq1, copy);
1940 else
1941 seq1 = copy;
1942 if (seq1 == NULL)
1943 return REG_ESPACE;
1944 }
1945
1946 if (iter->max == -1)
1947 {
1948 /* No upper limit. */
1949 pos_add_save = pos_add;
1950 status = tre_copy_ast(mem, stack, iter->arg, 0,
1951 &pos_add, NULL, &seq2, &max_pos);
1952 if (status != REG_OK)
1953 return status;
1954 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1955 if (seq2 == NULL)
1956 return REG_ESPACE;
1957 }
1958 else
1959 {
1960 for (j = iter->min; j < iter->max; j++)
1961 {
1962 tre_ast_node_t *tmp, *copy;
1963 pos_add_save = pos_add;
1964 status = tre_copy_ast(mem, stack, iter->arg, 0,
1965 &pos_add, NULL, ©, &max_pos);
1966 if (status != REG_OK)
1967 return status;
1968 if (seq2 != NULL)
1969 seq2 = tre_ast_new_catenation(mem, copy, seq2);
1970 else
1971 seq2 = copy;
1972 if (seq2 == NULL)
1973 return REG_ESPACE;
1974 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1975 if (tmp == NULL)
1976 return REG_ESPACE;
1977 seq2 = tre_ast_new_union(mem, tmp, seq2);
1978 if (seq2 == NULL)
1979 return REG_ESPACE;
1980 }
1981 }
1982
1983 pos_add = pos_add_save;
1984 if (seq1 == NULL)
1985 seq1 = seq2;
1986 else if (seq2 != NULL)
1987 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1988 if (seq1 == NULL)
1989 return REG_ESPACE;
1990 node->obj = seq1->obj;
1991 node->type = seq1->type;
1992 }
1993
1994 iter_depth--;
1995 pos_add_total += pos_add - pos_add_last;
1996 if (iter_depth == 0)
1997 pos_add = pos_add_total;
1998
1999 break;
2000 }
2001 default:
2002 assert(0);
2003 break;
2004 }
2005 }
2006
2007 *position += pos_add_total;
2008
2009 /* `max_pos' should never be larger than `*position' if the above
2010 code works, but just an extra safeguard let's make sure
2011 `*position' is set large enough so enough memory will be
2012 allocated for the transition table. */
2013 if (max_pos > *position)
2014 *position = max_pos;
2015
2016 return status;
2017 }
2018
2019 static tre_pos_and_tags_t *
tre_set_empty(tre_mem_t mem)2020 tre_set_empty(tre_mem_t mem)
2021 {
2022 tre_pos_and_tags_t *new_set;
2023
2024 new_set = tre_mem_calloc(mem, sizeof(*new_set));
2025 if (new_set == NULL)
2026 return NULL;
2027
2028 new_set[0].position = -1;
2029 new_set[0].code_min = -1;
2030 new_set[0].code_max = -1;
2031
2032 return new_set;
2033 }
2034
2035 static tre_pos_and_tags_t *
tre_set_one(tre_mem_t mem,int position,int code_min,int code_max,tre_ctype_t class,tre_ctype_t * neg_classes,int backref)2036 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2037 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2038 {
2039 tre_pos_and_tags_t *new_set;
2040
2041 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2042 if (new_set == NULL)
2043 return NULL;
2044
2045 new_set[0].position = position;
2046 new_set[0].code_min = code_min;
2047 new_set[0].code_max = code_max;
2048 new_set[0].class = class;
2049 new_set[0].neg_classes = neg_classes;
2050 new_set[0].backref = backref;
2051 new_set[1].position = -1;
2052 new_set[1].code_min = -1;
2053 new_set[1].code_max = -1;
2054
2055 return new_set;
2056 }
2057
2058 static tre_pos_and_tags_t *
tre_set_union(tre_mem_t mem,tre_pos_and_tags_t * set1,tre_pos_and_tags_t * set2,int * tags,int assertions)2059 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2060 int *tags, int assertions)
2061 {
2062 int s1, s2, i, j;
2063 tre_pos_and_tags_t *new_set;
2064 int *new_tags;
2065 int num_tags;
2066
2067 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2068 for (s1 = 0; set1[s1].position >= 0; s1++);
2069 for (s2 = 0; set2[s2].position >= 0; s2++);
2070 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2071 if (!new_set )
2072 return NULL;
2073
2074 for (s1 = 0; set1[s1].position >= 0; s1++)
2075 {
2076 new_set[s1].position = set1[s1].position;
2077 new_set[s1].code_min = set1[s1].code_min;
2078 new_set[s1].code_max = set1[s1].code_max;
2079 new_set[s1].assertions = set1[s1].assertions | assertions;
2080 new_set[s1].class = set1[s1].class;
2081 new_set[s1].neg_classes = set1[s1].neg_classes;
2082 new_set[s1].backref = set1[s1].backref;
2083 if (set1[s1].tags == NULL && tags == NULL)
2084 new_set[s1].tags = NULL;
2085 else
2086 {
2087 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2088 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2089 * (i + num_tags + 1)));
2090 if (new_tags == NULL)
2091 return NULL;
2092 for (j = 0; j < i; j++)
2093 new_tags[j] = set1[s1].tags[j];
2094 for (i = 0; i < num_tags; i++)
2095 new_tags[j + i] = tags[i];
2096 new_tags[j + i] = -1;
2097 new_set[s1].tags = new_tags;
2098 }
2099 }
2100
2101 for (s2 = 0; set2[s2].position >= 0; s2++)
2102 {
2103 new_set[s1 + s2].position = set2[s2].position;
2104 new_set[s1 + s2].code_min = set2[s2].code_min;
2105 new_set[s1 + s2].code_max = set2[s2].code_max;
2106 /* XXX - why not | assertions here as well? */
2107 new_set[s1 + s2].assertions = set2[s2].assertions;
2108 new_set[s1 + s2].class = set2[s2].class;
2109 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2110 new_set[s1 + s2].backref = set2[s2].backref;
2111 if (set2[s2].tags == NULL)
2112 new_set[s1 + s2].tags = NULL;
2113 else
2114 {
2115 for (i = 0; set2[s2].tags[i] >= 0; i++);
2116 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2117 if (new_tags == NULL)
2118 return NULL;
2119 for (j = 0; j < i; j++)
2120 new_tags[j] = set2[s2].tags[j];
2121 new_tags[j] = -1;
2122 new_set[s1 + s2].tags = new_tags;
2123 }
2124 }
2125 new_set[s1 + s2].position = -1;
2126 return new_set;
2127 }
2128
2129 /* Finds the empty path through `node' which is the one that should be
2130 taken according to POSIX.2 rules, and adds the tags on that path to
2131 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
2132 set to the number of tags seen on the path. */
2133 static reg_errcode_t
tre_match_empty(tre_stack_t * stack,tre_ast_node_t * node,int * tags,int * assertions,int * num_tags_seen)2134 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2135 int *assertions, int *num_tags_seen)
2136 {
2137 tre_literal_t *lit;
2138 tre_union_t *uni;
2139 tre_catenation_t *cat;
2140 tre_iteration_t *iter;
2141 int i;
2142 int bottom = tre_stack_num_objects(stack);
2143 reg_errcode_t status = REG_OK;
2144 if (num_tags_seen)
2145 *num_tags_seen = 0;
2146
2147 status = tre_stack_push_voidptr(stack, node);
2148
2149 /* Walk through the tree recursively. */
2150 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2151 {
2152 node = tre_stack_pop_voidptr(stack);
2153
2154 switch (node->type)
2155 {
2156 case LITERAL:
2157 lit = (tre_literal_t *)node->obj;
2158 switch (lit->code_min)
2159 {
2160 case TAG:
2161 if (lit->code_max >= 0)
2162 {
2163 if (tags != NULL)
2164 {
2165 /* Add the tag to `tags'. */
2166 for (i = 0; tags[i] >= 0; i++)
2167 if (tags[i] == lit->code_max)
2168 break;
2169 if (tags[i] < 0)
2170 {
2171 tags[i] = lit->code_max;
2172 tags[i + 1] = -1;
2173 }
2174 }
2175 if (num_tags_seen)
2176 (*num_tags_seen)++;
2177 }
2178 break;
2179 case ASSERTION:
2180 assert(lit->code_max >= 1
2181 || lit->code_max <= ASSERT_LAST);
2182 if (assertions != NULL)
2183 *assertions |= lit->code_max;
2184 break;
2185 case EMPTY:
2186 break;
2187 default:
2188 assert(0);
2189 break;
2190 }
2191 break;
2192
2193 case UNION:
2194 /* Subexpressions starting earlier take priority over ones
2195 starting later, so we prefer the left subexpression over the
2196 right subexpression. */
2197 uni = (tre_union_t *)node->obj;
2198 if (uni->left->nullable)
2199 STACK_PUSHX(stack, voidptr, uni->left)
2200 else if (uni->right->nullable)
2201 STACK_PUSHX(stack, voidptr, uni->right)
2202 else
2203 assert(0);
2204 break;
2205
2206 case CATENATION:
2207 /* The path must go through both children. */
2208 cat = (tre_catenation_t *)node->obj;
2209 assert(cat->left->nullable);
2210 assert(cat->right->nullable);
2211 STACK_PUSHX(stack, voidptr, cat->left);
2212 STACK_PUSHX(stack, voidptr, cat->right);
2213 break;
2214
2215 case ITERATION:
2216 /* A match with an empty string is preferred over no match at
2217 all, so we go through the argument if possible. */
2218 iter = (tre_iteration_t *)node->obj;
2219 if (iter->arg->nullable)
2220 STACK_PUSHX(stack, voidptr, iter->arg);
2221 break;
2222
2223 default:
2224 assert(0);
2225 break;
2226 }
2227 }
2228
2229 return status;
2230 }
2231
2232
2233 typedef enum {
2234 NFL_RECURSE,
2235 NFL_POST_UNION,
2236 NFL_POST_CATENATION,
2237 NFL_POST_ITERATION
2238 } tre_nfl_stack_symbol_t;
2239
2240
2241 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2242 the nodes of the AST `tree'. */
2243 static reg_errcode_t
tre_compute_nfl(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree)2244 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2245 {
2246 int bottom = tre_stack_num_objects(stack);
2247
2248 STACK_PUSHR(stack, voidptr, tree);
2249 STACK_PUSHR(stack, int, NFL_RECURSE);
2250
2251 while (tre_stack_num_objects(stack) > bottom)
2252 {
2253 tre_nfl_stack_symbol_t symbol;
2254 tre_ast_node_t *node;
2255
2256 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2257 node = tre_stack_pop_voidptr(stack);
2258 switch (symbol)
2259 {
2260 case NFL_RECURSE:
2261 switch (node->type)
2262 {
2263 case LITERAL:
2264 {
2265 tre_literal_t *lit = (tre_literal_t *)node->obj;
2266 if (IS_BACKREF(lit))
2267 {
2268 /* Back references: nullable = false, firstpos = {i},
2269 lastpos = {i}. */
2270 node->nullable = 0;
2271 node->firstpos = tre_set_one(mem, lit->position, 0,
2272 TRE_CHAR_MAX, 0, NULL, -1);
2273 if (!node->firstpos)
2274 return REG_ESPACE;
2275 node->lastpos = tre_set_one(mem, lit->position, 0,
2276 TRE_CHAR_MAX, 0, NULL,
2277 (int)lit->code_max);
2278 if (!node->lastpos)
2279 return REG_ESPACE;
2280 }
2281 else if (lit->code_min < 0)
2282 {
2283 /* Tags, empty strings, params, and zero width assertions:
2284 nullable = true, firstpos = {}, and lastpos = {}. */
2285 node->nullable = 1;
2286 node->firstpos = tre_set_empty(mem);
2287 if (!node->firstpos)
2288 return REG_ESPACE;
2289 node->lastpos = tre_set_empty(mem);
2290 if (!node->lastpos)
2291 return REG_ESPACE;
2292 }
2293 else
2294 {
2295 /* Literal at position i: nullable = false, firstpos = {i},
2296 lastpos = {i}. */
2297 node->nullable = 0;
2298 node->firstpos =
2299 tre_set_one(mem, lit->position, (int)lit->code_min,
2300 (int)lit->code_max, 0, NULL, -1);
2301 if (!node->firstpos)
2302 return REG_ESPACE;
2303 node->lastpos = tre_set_one(mem, lit->position,
2304 (int)lit->code_min,
2305 (int)lit->code_max,
2306 lit->class, lit->neg_classes,
2307 -1);
2308 if (!node->lastpos)
2309 return REG_ESPACE;
2310 }
2311 break;
2312 }
2313
2314 case UNION:
2315 /* Compute the attributes for the two subtrees, and after that
2316 for this node. */
2317 STACK_PUSHR(stack, voidptr, node);
2318 STACK_PUSHR(stack, int, NFL_POST_UNION);
2319 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2320 STACK_PUSHR(stack, int, NFL_RECURSE);
2321 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2322 STACK_PUSHR(stack, int, NFL_RECURSE);
2323 break;
2324
2325 case CATENATION:
2326 /* Compute the attributes for the two subtrees, and after that
2327 for this node. */
2328 STACK_PUSHR(stack, voidptr, node);
2329 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2330 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2331 STACK_PUSHR(stack, int, NFL_RECURSE);
2332 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2333 STACK_PUSHR(stack, int, NFL_RECURSE);
2334 break;
2335
2336 case ITERATION:
2337 /* Compute the attributes for the subtree, and after that for
2338 this node. */
2339 STACK_PUSHR(stack, voidptr, node);
2340 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2341 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2342 STACK_PUSHR(stack, int, NFL_RECURSE);
2343 break;
2344 }
2345 break; /* end case: NFL_RECURSE */
2346
2347 case NFL_POST_UNION:
2348 {
2349 tre_union_t *uni = (tre_union_t *)node->obj;
2350 node->nullable = uni->left->nullable || uni->right->nullable;
2351 node->firstpos = tre_set_union(mem, uni->left->firstpos,
2352 uni->right->firstpos, NULL, 0);
2353 if (!node->firstpos)
2354 return REG_ESPACE;
2355 node->lastpos = tre_set_union(mem, uni->left->lastpos,
2356 uni->right->lastpos, NULL, 0);
2357 if (!node->lastpos)
2358 return REG_ESPACE;
2359 break;
2360 }
2361
2362 case NFL_POST_ITERATION:
2363 {
2364 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2365
2366 if (iter->min == 0 || iter->arg->nullable)
2367 node->nullable = 1;
2368 else
2369 node->nullable = 0;
2370 node->firstpos = iter->arg->firstpos;
2371 node->lastpos = iter->arg->lastpos;
2372 break;
2373 }
2374
2375 case NFL_POST_CATENATION:
2376 {
2377 int num_tags, *tags, assertions;
2378 reg_errcode_t status;
2379 tre_catenation_t *cat = node->obj;
2380 node->nullable = cat->left->nullable && cat->right->nullable;
2381
2382 /* Compute firstpos. */
2383 if (cat->left->nullable)
2384 {
2385 /* The left side matches the empty string. Make a first pass
2386 with tre_match_empty() to get the number of tags and
2387 parameters. */
2388 status = tre_match_empty(stack, cat->left,
2389 NULL, NULL, &num_tags);
2390 if (status != REG_OK)
2391 return status;
2392 /* Allocate arrays for the tags and parameters. */
2393 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2394 if (!tags)
2395 return REG_ESPACE;
2396 tags[0] = -1;
2397 assertions = 0;
2398 /* Second pass with tre_mach_empty() to get the list of
2399 tags and parameters. */
2400 status = tre_match_empty(stack, cat->left, tags,
2401 &assertions, NULL);
2402 if (status != REG_OK)
2403 {
2404 xfree(tags);
2405 return status;
2406 }
2407 node->firstpos =
2408 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2409 tags, assertions);
2410 xfree(tags);
2411 if (!node->firstpos)
2412 return REG_ESPACE;
2413 }
2414 else
2415 {
2416 node->firstpos = cat->left->firstpos;
2417 }
2418
2419 /* Compute lastpos. */
2420 if (cat->right->nullable)
2421 {
2422 /* The right side matches the empty string. Make a first pass
2423 with tre_match_empty() to get the number of tags and
2424 parameters. */
2425 status = tre_match_empty(stack, cat->right,
2426 NULL, NULL, &num_tags);
2427 if (status != REG_OK)
2428 return status;
2429 /* Allocate arrays for the tags and parameters. */
2430 tags = xmalloc(sizeof(int) * (num_tags + 1));
2431 if (!tags)
2432 return REG_ESPACE;
2433 tags[0] = -1;
2434 assertions = 0;
2435 /* Second pass with tre_mach_empty() to get the list of
2436 tags and parameters. */
2437 status = tre_match_empty(stack, cat->right, tags,
2438 &assertions, NULL);
2439 if (status != REG_OK)
2440 {
2441 xfree(tags);
2442 return status;
2443 }
2444 node->lastpos =
2445 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2446 tags, assertions);
2447 xfree(tags);
2448 if (!node->lastpos)
2449 return REG_ESPACE;
2450 }
2451 else
2452 {
2453 node->lastpos = cat->right->lastpos;
2454 }
2455 break;
2456 }
2457
2458 default:
2459 assert(0);
2460 break;
2461 }
2462 }
2463
2464 return REG_OK;
2465 }
2466
2467
2468 /* Adds a transition from each position in `p1' to each position in `p2'. */
2469 static reg_errcode_t
tre_make_trans(tre_pos_and_tags_t * p1,tre_pos_and_tags_t * p2,tre_tnfa_transition_t * transitions,int * counts,int * offs)2470 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2471 tre_tnfa_transition_t *transitions,
2472 int *counts, int *offs)
2473 {
2474 tre_pos_and_tags_t *orig_p2 = p2;
2475 tre_tnfa_transition_t *trans;
2476 int i, j, k, l, dup, prev_p2_pos;
2477
2478 if (transitions != NULL)
2479 while (p1->position >= 0)
2480 {
2481 p2 = orig_p2;
2482 prev_p2_pos = -1;
2483 while (p2->position >= 0)
2484 {
2485 /* Optimization: if this position was already handled, skip it. */
2486 if (p2->position == prev_p2_pos)
2487 {
2488 p2++;
2489 continue;
2490 }
2491 prev_p2_pos = p2->position;
2492 /* Set `trans' to point to the next unused transition from
2493 position `p1->position'. */
2494 trans = transitions + offs[p1->position];
2495 while (trans->state != NULL)
2496 {
2497 #if 0
2498 /* If we find a previous transition from `p1->position' to
2499 `p2->position', it is overwritten. This can happen only
2500 if there are nested loops in the regexp, like in "((a)*)*".
2501 In POSIX.2 repetition using the outer loop is always
2502 preferred over using the inner loop. Therefore the
2503 transition for the inner loop is useless and can be thrown
2504 away. */
2505 /* XXX - The same position is used for all nodes in a bracket
2506 expression, so this optimization cannot be used (it will
2507 break bracket expressions) unless I figure out a way to
2508 detect it here. */
2509 if (trans->state_id == p2->position)
2510 {
2511 break;
2512 }
2513 #endif
2514 trans++;
2515 }
2516
2517 if (trans->state == NULL)
2518 (trans + 1)->state = NULL;
2519 /* Use the character ranges, assertions, etc. from `p1' for
2520 the transition from `p1' to `p2'. */
2521 trans->code_min = p1->code_min;
2522 trans->code_max = p1->code_max;
2523 trans->state = transitions + offs[p2->position];
2524 trans->state_id = p2->position;
2525 trans->assertions = p1->assertions | p2->assertions
2526 | (p1->class ? ASSERT_CHAR_CLASS : 0)
2527 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2528 if (p1->backref >= 0)
2529 {
2530 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2531 assert(p2->backref < 0);
2532 trans->u.backref = p1->backref;
2533 trans->assertions |= ASSERT_BACKREF;
2534 }
2535 else
2536 trans->u.class = p1->class;
2537 if (p1->neg_classes != NULL)
2538 {
2539 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2540 trans->neg_classes =
2541 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2542 if (trans->neg_classes == NULL)
2543 return REG_ESPACE;
2544 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2545 trans->neg_classes[i] = p1->neg_classes[i];
2546 trans->neg_classes[i] = (tre_ctype_t)0;
2547 }
2548 else
2549 trans->neg_classes = NULL;
2550
2551 /* Find out how many tags this transition has. */
2552 i = 0;
2553 if (p1->tags != NULL)
2554 while(p1->tags[i] >= 0)
2555 i++;
2556 j = 0;
2557 if (p2->tags != NULL)
2558 while(p2->tags[j] >= 0)
2559 j++;
2560
2561 /* If we are overwriting a transition, free the old tag array. */
2562 if (trans->tags != NULL)
2563 xfree(trans->tags);
2564 trans->tags = NULL;
2565
2566 /* If there were any tags, allocate an array and fill it. */
2567 if (i + j > 0)
2568 {
2569 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2570 if (!trans->tags)
2571 return REG_ESPACE;
2572 i = 0;
2573 if (p1->tags != NULL)
2574 while(p1->tags[i] >= 0)
2575 {
2576 trans->tags[i] = p1->tags[i];
2577 i++;
2578 }
2579 l = i;
2580 j = 0;
2581 if (p2->tags != NULL)
2582 while (p2->tags[j] >= 0)
2583 {
2584 /* Don't add duplicates. */
2585 dup = 0;
2586 for (k = 0; k < i; k++)
2587 if (trans->tags[k] == p2->tags[j])
2588 {
2589 dup = 1;
2590 break;
2591 }
2592 if (!dup)
2593 trans->tags[l++] = p2->tags[j];
2594 j++;
2595 }
2596 trans->tags[l] = -1;
2597 }
2598
2599 p2++;
2600 }
2601 p1++;
2602 }
2603 else
2604 /* Compute a maximum limit for the number of transitions leaving
2605 from each state. */
2606 while (p1->position >= 0)
2607 {
2608 p2 = orig_p2;
2609 while (p2->position >= 0)
2610 {
2611 counts[p1->position]++;
2612 p2++;
2613 }
2614 p1++;
2615 }
2616 return REG_OK;
2617 }
2618
2619 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
2620 labelled with one character range (there are no transitions on empty
2621 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
2622 the regexp. */
2623 static reg_errcode_t
tre_ast_to_tnfa(tre_ast_node_t * node,tre_tnfa_transition_t * transitions,int * counts,int * offs)2624 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2625 int *counts, int *offs)
2626 {
2627 tre_union_t *uni;
2628 tre_catenation_t *cat;
2629 tre_iteration_t *iter;
2630 reg_errcode_t errcode = REG_OK;
2631
2632 /* XXX - recurse using a stack!. */
2633 switch (node->type)
2634 {
2635 case LITERAL:
2636 break;
2637 case UNION:
2638 uni = (tre_union_t *)node->obj;
2639 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2640 if (errcode != REG_OK)
2641 return errcode;
2642 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2643 break;
2644
2645 case CATENATION:
2646 cat = (tre_catenation_t *)node->obj;
2647 /* Add a transition from each position in cat->left->lastpos
2648 to each position in cat->right->firstpos. */
2649 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2650 transitions, counts, offs);
2651 if (errcode != REG_OK)
2652 return errcode;
2653 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2654 if (errcode != REG_OK)
2655 return errcode;
2656 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2657 break;
2658
2659 case ITERATION:
2660 iter = (tre_iteration_t *)node->obj;
2661 assert(iter->max == -1 || iter->max == 1);
2662
2663 if (iter->max == -1)
2664 {
2665 assert(iter->min == 0 || iter->min == 1);
2666 /* Add a transition from each last position in the iterated
2667 expression to each first position. */
2668 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2669 transitions, counts, offs);
2670 if (errcode != REG_OK)
2671 return errcode;
2672 }
2673 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2674 break;
2675 }
2676 return errcode;
2677 }
2678
2679
2680 #define ERROR_EXIT(err) \
2681 do \
2682 { \
2683 errcode = err; \
2684 if (/*CONSTCOND*/1) \
2685 goto error_exit; \
2686 } \
2687 while (/*CONSTCOND*/0)
2688
2689
2690 int
regcomp(regex_t * restrict preg,const char * restrict regex,int cflags)2691 regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
2692 {
2693 tre_stack_t *stack;
2694 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2695 tre_pos_and_tags_t *p;
2696 int *counts = NULL, *offs = NULL;
2697 int i, add = 0;
2698 tre_tnfa_transition_t *transitions, *initial;
2699 tre_tnfa_t *tnfa = NULL;
2700 tre_submatch_data_t *submatch_data;
2701 tre_tag_direction_t *tag_directions = NULL;
2702 reg_errcode_t errcode;
2703 tre_mem_t mem;
2704
2705 /* Parse context. */
2706 tre_parse_ctx_t parse_ctx;
2707
2708 /* Allocate a stack used throughout the compilation process for various
2709 purposes. */
2710 stack = tre_stack_new(512, 1024000, 128);
2711 if (!stack)
2712 return REG_ESPACE;
2713 /* Allocate a fast memory allocator. */
2714 mem = tre_mem_new();
2715 if (!mem)
2716 {
2717 tre_stack_destroy(stack);
2718 return REG_ESPACE;
2719 }
2720
2721 /* Parse the regexp. */
2722 memset(&parse_ctx, 0, sizeof(parse_ctx));
2723 parse_ctx.mem = mem;
2724 parse_ctx.stack = stack;
2725 parse_ctx.start = regex;
2726 parse_ctx.cflags = cflags;
2727 parse_ctx.max_backref = -1;
2728 errcode = tre_parse(&parse_ctx);
2729 if (errcode != REG_OK)
2730 ERROR_EXIT(errcode);
2731 preg->re_nsub = parse_ctx.submatch_id - 1;
2732 tree = parse_ctx.n;
2733
2734 #ifdef TRE_DEBUG
2735 tre_ast_print(tree);
2736 #endif /* TRE_DEBUG */
2737
2738 /* Referring to nonexistent subexpressions is illegal. */
2739 if (parse_ctx.max_backref > (int)preg->re_nsub)
2740 ERROR_EXIT(REG_ESUBREG);
2741
2742 /* Allocate the TNFA struct. */
2743 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2744 if (tnfa == NULL)
2745 ERROR_EXIT(REG_ESPACE);
2746 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2747 tnfa->have_approx = 0;
2748 tnfa->num_submatches = parse_ctx.submatch_id;
2749
2750 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
2751 regexp does not have back references, this can be skipped. */
2752 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
2753 {
2754
2755 /* Figure out how many tags we will need. */
2756 errcode = tre_add_tags(NULL, stack, tree, tnfa);
2757 if (errcode != REG_OK)
2758 ERROR_EXIT(errcode);
2759
2760 if (tnfa->num_tags > 0)
2761 {
2762 tag_directions = xmalloc(sizeof(*tag_directions)
2763 * (tnfa->num_tags + 1));
2764 if (tag_directions == NULL)
2765 ERROR_EXIT(REG_ESPACE);
2766 tnfa->tag_directions = tag_directions;
2767 memset(tag_directions, -1,
2768 sizeof(*tag_directions) * (tnfa->num_tags + 1));
2769 }
2770 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2771 sizeof(*tnfa->minimal_tags));
2772 if (tnfa->minimal_tags == NULL)
2773 ERROR_EXIT(REG_ESPACE);
2774
2775 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2776 sizeof(*submatch_data));
2777 if (submatch_data == NULL)
2778 ERROR_EXIT(REG_ESPACE);
2779 tnfa->submatch_data = submatch_data;
2780
2781 errcode = tre_add_tags(mem, stack, tree, tnfa);
2782 if (errcode != REG_OK)
2783 ERROR_EXIT(errcode);
2784
2785 }
2786
2787 /* Expand iteration nodes. */
2788 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2789 tag_directions);
2790 if (errcode != REG_OK)
2791 ERROR_EXIT(errcode);
2792
2793 /* Add a dummy node for the final state.
2794 XXX - For certain patterns this dummy node can be optimized away,
2795 for example "a*" or "ab*". Figure out a simple way to detect
2796 this possibility. */
2797 tmp_ast_l = tree;
2798 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2799 if (tmp_ast_r == NULL)
2800 ERROR_EXIT(REG_ESPACE);
2801
2802 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2803 if (tree == NULL)
2804 ERROR_EXIT(REG_ESPACE);
2805
2806 errcode = tre_compute_nfl(mem, stack, tree);
2807 if (errcode != REG_OK)
2808 ERROR_EXIT(errcode);
2809
2810 counts = xmalloc(sizeof(int) * parse_ctx.position);
2811 if (counts == NULL)
2812 ERROR_EXIT(REG_ESPACE);
2813
2814 offs = xmalloc(sizeof(int) * parse_ctx.position);
2815 if (offs == NULL)
2816 ERROR_EXIT(REG_ESPACE);
2817
2818 for (i = 0; i < parse_ctx.position; i++)
2819 counts[i] = 0;
2820 tre_ast_to_tnfa(tree, NULL, counts, NULL);
2821
2822 add = 0;
2823 for (i = 0; i < parse_ctx.position; i++)
2824 {
2825 offs[i] = add;
2826 add += counts[i] + 1;
2827 counts[i] = 0;
2828 }
2829 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2830 if (transitions == NULL)
2831 ERROR_EXIT(REG_ESPACE);
2832 tnfa->transitions = transitions;
2833 tnfa->num_transitions = add;
2834
2835 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2836 if (errcode != REG_OK)
2837 ERROR_EXIT(errcode);
2838
2839 tnfa->firstpos_chars = NULL;
2840
2841 p = tree->firstpos;
2842 i = 0;
2843 while (p->position >= 0)
2844 {
2845 i++;
2846 p++;
2847 }
2848
2849 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2850 if (initial == NULL)
2851 ERROR_EXIT(REG_ESPACE);
2852 tnfa->initial = initial;
2853
2854 i = 0;
2855 for (p = tree->firstpos; p->position >= 0; p++)
2856 {
2857 initial[i].state = transitions + offs[p->position];
2858 initial[i].state_id = p->position;
2859 initial[i].tags = NULL;
2860 /* Copy the arrays p->tags, and p->params, they are allocated
2861 from a tre_mem object. */
2862 if (p->tags)
2863 {
2864 int j;
2865 for (j = 0; p->tags[j] >= 0; j++);
2866 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2867 if (!initial[i].tags)
2868 ERROR_EXIT(REG_ESPACE);
2869 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2870 }
2871 initial[i].assertions = p->assertions;
2872 i++;
2873 }
2874 initial[i].state = NULL;
2875
2876 tnfa->num_transitions = add;
2877 tnfa->final = transitions + offs[tree->lastpos[0].position];
2878 tnfa->num_states = parse_ctx.position;
2879 tnfa->cflags = cflags;
2880
2881 tre_mem_destroy(mem);
2882 tre_stack_destroy(stack);
2883 xfree(counts);
2884 xfree(offs);
2885
2886 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2887 return REG_OK;
2888
2889 error_exit:
2890 /* Free everything that was allocated and return the error code. */
2891 tre_mem_destroy(mem);
2892 if (stack != NULL)
2893 tre_stack_destroy(stack);
2894 if (counts != NULL)
2895 xfree(counts);
2896 if (offs != NULL)
2897 xfree(offs);
2898 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2899 regfree(preg);
2900 return errcode;
2901 }
2902
2903
2904
2905
2906 void
regfree(regex_t * preg)2907 regfree(regex_t *preg)
2908 {
2909 tre_tnfa_t *tnfa;
2910 unsigned int i;
2911 tre_tnfa_transition_t *trans;
2912
2913 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2914 if (!tnfa)
2915 return;
2916
2917 for (i = 0; i < tnfa->num_transitions; i++)
2918 if (tnfa->transitions[i].state)
2919 {
2920 if (tnfa->transitions[i].tags)
2921 xfree(tnfa->transitions[i].tags);
2922 if (tnfa->transitions[i].neg_classes)
2923 xfree(tnfa->transitions[i].neg_classes);
2924 }
2925 if (tnfa->transitions)
2926 xfree(tnfa->transitions);
2927
2928 if (tnfa->initial)
2929 {
2930 for (trans = tnfa->initial; trans->state; trans++)
2931 {
2932 if (trans->tags)
2933 xfree(trans->tags);
2934 }
2935 xfree(tnfa->initial);
2936 }
2937
2938 if (tnfa->submatch_data)
2939 {
2940 for (i = 0; i < tnfa->num_submatches; i++)
2941 if (tnfa->submatch_data[i].parents)
2942 xfree(tnfa->submatch_data[i].parents);
2943 xfree(tnfa->submatch_data);
2944 }
2945
2946 if (tnfa->tag_directions)
2947 xfree(tnfa->tag_directions);
2948 if (tnfa->firstpos_chars)
2949 xfree(tnfa->firstpos_chars);
2950 if (tnfa->minimal_tags)
2951 xfree(tnfa->minimal_tags);
2952 xfree(tnfa);
2953 }
2954