1 /*
2 tre-compile.c - TRE regex compiler
3
4 Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with this library; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
20 */
21
22 /*
23 TODO:
24 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
25 function calls.
26 */
27
28
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif /* HAVE_CONFIG_H */
32 #include <stdio.h>
33 #include <assert.h>
34 #include <string.h>
35
36 #include "tre-internal.h"
37 #include "tre-mem.h"
38 #include "tre-stack.h"
39 #include "tre-ast.h"
40 #include "tre-parse.h"
41 #include "tre-compile.h"
42 #include "regex.h"
43 #include "xmalloc.h"
44
45 /*
46 Algorithms to setup tags so that submatch addressing can be done.
47 */
48
49
50 /* Inserts a catenation node to the root of the tree given in `node'.
51 As the left child a new tag with number `tag_id' to `node' is added,
52 and the right child is the old root. */
53 static reg_errcode_t
tre_add_tag_left(tre_mem_t mem,tre_ast_node_t * node,int tag_id)54 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
55 {
56 tre_catenation_t *c;
57
58 DPRINT(("add_tag_left: tag %d\n", tag_id));
59
60 c = tre_mem_alloc(mem, sizeof(*c));
61 if (c == NULL)
62 return REG_ESPACE;
63 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
64 if (c->left == NULL)
65 return REG_ESPACE;
66 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
67 if (c->right == NULL)
68 return REG_ESPACE;
69
70 c->right->obj = node->obj;
71 c->right->type = node->type;
72 c->right->nullable = -1;
73 c->right->submatch_id = -1;
74 c->right->firstpos = NULL;
75 c->right->lastpos = NULL;
76 c->right->num_tags = 0;
77 node->obj = c;
78 node->type = CATENATION;
79 return REG_OK;
80 }
81
82 /* Inserts a catenation node to the root of the tree given in `node'.
83 As the right child a new tag with number `tag_id' to `node' is added,
84 and the left child is the old root. */
85 static reg_errcode_t
tre_add_tag_right(tre_mem_t mem,tre_ast_node_t * node,int tag_id)86 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
87 {
88 tre_catenation_t *c;
89
90 DPRINT(("tre_add_tag_right: tag %d\n", tag_id));
91
92 c = tre_mem_alloc(mem, sizeof(*c));
93 if (c == NULL)
94 return REG_ESPACE;
95 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
96 if (c->right == NULL)
97 return REG_ESPACE;
98 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
99 if (c->left == NULL)
100 return REG_ESPACE;
101
102 c->left->obj = node->obj;
103 c->left->type = node->type;
104 c->left->nullable = -1;
105 c->left->submatch_id = -1;
106 c->left->firstpos = NULL;
107 c->left->lastpos = NULL;
108 c->left->num_tags = 0;
109 node->obj = c;
110 node->type = CATENATION;
111 return REG_OK;
112 }
113
114 typedef enum {
115 ADDTAGS_RECURSE,
116 ADDTAGS_AFTER_ITERATION,
117 ADDTAGS_AFTER_UNION_LEFT,
118 ADDTAGS_AFTER_UNION_RIGHT,
119 ADDTAGS_AFTER_CAT_LEFT,
120 ADDTAGS_AFTER_CAT_RIGHT,
121 ADDTAGS_SET_SUBMATCH_END
122 } tre_addtags_symbol_t;
123
124
125 typedef struct {
126 int tag;
127 int next_tag;
128 } tre_tag_states_t;
129
130 /* Adds tags to appropriate locations in the parse tree in `tree', so that
131 subexpressions marked for submatch addressing can be traced. */
132 static reg_errcode_t
tre_add_tags(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree,tre_tnfa_t * tnfa)133 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
134 tre_tnfa_t *tnfa)
135 {
136 reg_errcode_t status = REG_OK;
137 tre_addtags_symbol_t symbol;
138 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
139 int bottom = tre_stack_num_objects(stack);
140 /* True for first pass (counting number of needed tags) */
141 int first_pass = (mem == NULL || tnfa == NULL);
142 int *regset, *orig_regset;
143 int num_tags = 0; /* Total number of tags. */
144 int num_minimals = 0; /* Number of special minimal tags. */
145 int tag = 0; /* The tag that is to be added next. */
146 int next_tag = 1; /* Next tag to use after this one. */
147 int *parents; /* Stack of submatches the current submatch is
148 contained in. */
149 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
150 tre_tag_states_t *saved_states;
151
152 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
153 if (!first_pass)
154 {
155 tnfa->end_tag = 0;
156 tnfa->minimal_tags[0] = -1;
157 }
158
159 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
160 if (regset == NULL)
161 return REG_ESPACE;
162 regset[0] = -1;
163 orig_regset = regset;
164
165 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
166 if (parents == NULL)
167 {
168 xfree(regset);
169 return REG_ESPACE;
170 }
171 parents[0] = -1;
172
173 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
174 if (saved_states == NULL)
175 {
176 xfree(regset);
177 xfree(parents);
178 return REG_ESPACE;
179 }
180 else
181 {
182 unsigned int i;
183 for (i = 0; i <= tnfa->num_submatches; i++)
184 saved_states[i].tag = -1;
185 }
186
187 STACK_PUSH(stack, voidptr, node);
188 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
189
190 while (tre_stack_num_objects(stack) > bottom)
191 {
192 if (status != REG_OK)
193 break;
194
195 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
196 switch (symbol)
197 {
198
199 case ADDTAGS_SET_SUBMATCH_END:
200 {
201 int id = tre_stack_pop_int(stack);
202 int i;
203
204 /* Add end of this submatch to regset. */
205 for (i = 0; regset[i] >= 0; i++);
206 regset[i] = id * 2 + 1;
207 regset[i + 1] = -1;
208
209 /* Pop this submatch from the parents stack. */
210 for (i = 0; parents[i] >= 0; i++);
211 parents[i - 1] = -1;
212 break;
213 }
214
215 case ADDTAGS_RECURSE:
216 node = tre_stack_pop_voidptr(stack);
217
218 if (node->submatch_id >= 0)
219 {
220 int id = node->submatch_id;
221 int i;
222
223
224 /* Add start of this submatch to regset. */
225 for (i = 0; regset[i] >= 0; i++);
226 regset[i] = id * 2;
227 regset[i + 1] = -1;
228
229 if (!first_pass)
230 {
231 for (i = 0; parents[i] >= 0; i++);
232 tnfa->submatch_data[id].parents = NULL;
233 if (i > 0)
234 {
235 int *p = xmalloc(sizeof(*p) * (i + 1));
236 if (p == NULL)
237 {
238 status = REG_ESPACE;
239 break;
240 }
241 assert(tnfa->submatch_data[id].parents == NULL);
242 tnfa->submatch_data[id].parents = p;
243 for (i = 0; parents[i] >= 0; i++)
244 p[i] = parents[i];
245 p[i] = -1;
246 }
247 }
248
249 /* Add end of this submatch to regset after processing this
250 node. */
251 STACK_PUSHX(stack, int, node->submatch_id);
252 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
253 }
254
255 switch (node->type)
256 {
257 case LITERAL:
258 {
259 tre_literal_t *lit = node->obj;
260
261 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
262 {
263 int i;
264 DPRINT(("Literal %d-%d\n",
265 (int)lit->code_min, (int)lit->code_max));
266 if (regset[0] >= 0)
267 {
268 /* Regset is not empty, so add a tag before the
269 literal or backref. */
270 if (!first_pass)
271 {
272 status = tre_add_tag_left(mem, node, tag);
273 tnfa->tag_directions[tag] = direction;
274 if (minimal_tag >= 0)
275 {
276 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
277 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
278 tnfa->minimal_tags[i] = tag;
279 tnfa->minimal_tags[i + 1] = minimal_tag;
280 tnfa->minimal_tags[i + 2] = -1;
281 minimal_tag = -1;
282 num_minimals++;
283 }
284 /* Go through the regset and set submatch data for
285 submatches that are using this tag. */
286 for (i = 0; regset[i] >= 0; i++)
287 {
288 int id = regset[i] / 2;
289 int start = !(regset[i] % 2);
290 DPRINT((" Using tag %d for %s offset of "
291 "submatch %d\n", tag,
292 start ? "start" : "end", id));
293 if (start)
294 tnfa->submatch_data[id].so_tag = tag;
295 else
296 tnfa->submatch_data[id].eo_tag = tag;
297 }
298 }
299 else
300 {
301 DPRINT((" num_tags = 1\n"));
302 node->num_tags = 1;
303 }
304
305 DPRINT((" num_tags++\n"));
306 regset[0] = -1;
307 tag = next_tag;
308 num_tags++;
309 next_tag++;
310 }
311 }
312 else
313 {
314 assert(!IS_TAG(lit));
315 }
316 break;
317 }
318 case CATENATION:
319 {
320 tre_catenation_t *cat = node->obj;
321 tre_ast_node_t *left = cat->left;
322 tre_ast_node_t *right = cat->right;
323 int reserved_tag = -1;
324 DPRINT(("Catenation, next_tag = %d\n", next_tag));
325
326
327 /* After processing right child. */
328 STACK_PUSHX(stack, voidptr, node);
329 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
330
331 /* Process right child. */
332 STACK_PUSHX(stack, voidptr, right);
333 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
334
335 /* After processing left child. */
336 STACK_PUSHX(stack, int, next_tag + left->num_tags);
337 DPRINT((" Pushing %d for after left\n",
338 next_tag + left->num_tags));
339 if (left->num_tags > 0 && right->num_tags > 0)
340 {
341 /* Reserve the next tag to the right child. */
342 DPRINT((" Reserving next_tag %d to right child\n",
343 next_tag));
344 reserved_tag = next_tag;
345 next_tag++;
346 }
347 STACK_PUSHX(stack, int, reserved_tag);
348 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
349
350 /* Process left child. */
351 STACK_PUSHX(stack, voidptr, left);
352 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
353
354 }
355 break;
356 case ITERATION:
357 {
358 tre_iteration_t *iter = node->obj;
359 DPRINT(("Iteration\n"));
360
361 if (first_pass)
362 {
363 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
364 }
365 else
366 {
367 STACK_PUSHX(stack, int, tag);
368 STACK_PUSHX(stack, int, iter->minimal);
369 }
370 STACK_PUSHX(stack, voidptr, node);
371 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
372
373 STACK_PUSHX(stack, voidptr, iter->arg);
374 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
375
376 /* Regset is not empty, so add a tag here. */
377 if (regset[0] >= 0 || iter->minimal)
378 {
379 if (!first_pass)
380 {
381 int i;
382 status = tre_add_tag_left(mem, node, tag);
383 if (iter->minimal)
384 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
385 else
386 tnfa->tag_directions[tag] = direction;
387 if (minimal_tag >= 0)
388 {
389 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
390 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
391 tnfa->minimal_tags[i] = tag;
392 tnfa->minimal_tags[i + 1] = minimal_tag;
393 tnfa->minimal_tags[i + 2] = -1;
394 minimal_tag = -1;
395 num_minimals++;
396 }
397 /* Go through the regset and set submatch data for
398 submatches that are using this tag. */
399 for (i = 0; regset[i] >= 0; i++)
400 {
401 int id = regset[i] / 2;
402 int start = !(regset[i] % 2);
403 DPRINT((" Using tag %d for %s offset of "
404 "submatch %d\n", tag,
405 start ? "start" : "end", id));
406 if (start)
407 tnfa->submatch_data[id].so_tag = tag;
408 else
409 tnfa->submatch_data[id].eo_tag = tag;
410 }
411 }
412
413 DPRINT((" num_tags++\n"));
414 regset[0] = -1;
415 tag = next_tag;
416 num_tags++;
417 next_tag++;
418 }
419 direction = TRE_TAG_MINIMIZE;
420 }
421 break;
422 case UNION:
423 {
424 tre_union_t *uni = node->obj;
425 tre_ast_node_t *left = uni->left;
426 tre_ast_node_t *right = uni->right;
427 int left_tag;
428 int right_tag;
429
430 if (regset[0] >= 0)
431 {
432 left_tag = next_tag;
433 right_tag = next_tag + 1;
434 }
435 else
436 {
437 left_tag = tag;
438 right_tag = next_tag;
439 }
440
441 DPRINT(("Union\n"));
442
443 /* After processing right child. */
444 STACK_PUSHX(stack, int, right_tag);
445 STACK_PUSHX(stack, int, left_tag);
446 STACK_PUSHX(stack, voidptr, regset);
447 STACK_PUSHX(stack, int, regset[0] >= 0);
448 STACK_PUSHX(stack, voidptr, node);
449 STACK_PUSHX(stack, voidptr, right);
450 STACK_PUSHX(stack, voidptr, left);
451 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
452
453 /* Process right child. */
454 STACK_PUSHX(stack, voidptr, right);
455 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
456
457 /* After processing left child. */
458 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
459
460 /* Process left child. */
461 STACK_PUSHX(stack, voidptr, left);
462 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
463
464 /* Regset is not empty, so add a tag here. */
465 if (regset[0] >= 0)
466 {
467 if (!first_pass)
468 {
469 int i;
470 status = tre_add_tag_left(mem, node, tag);
471 tnfa->tag_directions[tag] = direction;
472 if (minimal_tag >= 0)
473 {
474 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
475 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
476 tnfa->minimal_tags[i] = tag;
477 tnfa->minimal_tags[i + 1] = minimal_tag;
478 tnfa->minimal_tags[i + 2] = -1;
479 minimal_tag = -1;
480 num_minimals++;
481 }
482 /* Go through the regset and set submatch data for
483 submatches that are using this tag. */
484 for (i = 0; regset[i] >= 0; i++)
485 {
486 int id = regset[i] / 2;
487 int start = !(regset[i] % 2);
488 DPRINT((" Using tag %d for %s offset of "
489 "submatch %d\n", tag,
490 start ? "start" : "end", id));
491 if (start)
492 tnfa->submatch_data[id].so_tag = tag;
493 else
494 tnfa->submatch_data[id].eo_tag = tag;
495 }
496 }
497
498 DPRINT((" num_tags++\n"));
499 regset[0] = -1;
500 tag = next_tag;
501 num_tags++;
502 next_tag++;
503 }
504
505 if (node->num_submatches > 0)
506 {
507 /* The next two tags are reserved for markers. */
508 next_tag++;
509 tag = next_tag;
510 next_tag++;
511 }
512
513 break;
514 }
515 }
516
517 if (node->submatch_id >= 0)
518 {
519 int i;
520 /* Push this submatch on the parents stack. */
521 for (i = 0; parents[i] >= 0; i++);
522 parents[i] = node->submatch_id;
523 parents[i + 1] = -1;
524 }
525
526 break; /* end case: ADDTAGS_RECURSE */
527
528 case ADDTAGS_AFTER_ITERATION:
529 {
530 int minimal = 0;
531 int enter_tag;
532 node = tre_stack_pop_voidptr(stack);
533 if (first_pass)
534 {
535 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
536 + tre_stack_pop_int(stack);
537 minimal_tag = -1;
538 }
539 else
540 {
541 minimal = tre_stack_pop_int(stack);
542 enter_tag = tre_stack_pop_int(stack);
543 if (minimal)
544 minimal_tag = enter_tag;
545 }
546
547 DPRINT(("After iteration\n"));
548 if (!first_pass)
549 {
550 DPRINT((" Setting direction to %s\n",
551 minimal ? "minimize" : "maximize"));
552 if (minimal)
553 direction = TRE_TAG_MINIMIZE;
554 else
555 direction = TRE_TAG_MAXIMIZE;
556 }
557 break;
558 }
559
560 case ADDTAGS_AFTER_CAT_LEFT:
561 {
562 int new_tag = tre_stack_pop_int(stack);
563 next_tag = tre_stack_pop_int(stack);
564 DPRINT(("After cat left, tag = %d, next_tag = %d\n",
565 tag, next_tag));
566 if (new_tag >= 0)
567 {
568 DPRINT((" Setting tag to %d\n", new_tag));
569 tag = new_tag;
570 }
571 break;
572 }
573
574 case ADDTAGS_AFTER_CAT_RIGHT:
575 DPRINT(("After cat right\n"));
576 node = tre_stack_pop_voidptr(stack);
577 if (first_pass)
578 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
579 + ((tre_catenation_t *)node->obj)->right->num_tags;
580 break;
581
582 case ADDTAGS_AFTER_UNION_LEFT:
583 DPRINT(("After union left\n"));
584 /* Lift the bottom of the `regset' array so that when processing
585 the right operand the items currently in the array are
586 invisible. The original bottom was saved at ADDTAGS_UNION and
587 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
588 while (*regset >= 0)
589 regset++;
590 break;
591
592 case ADDTAGS_AFTER_UNION_RIGHT:
593 {
594 int added_tags, tag_left, tag_right;
595 tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
596 tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
597 DPRINT(("After union right\n"));
598 node = tre_stack_pop_voidptr(stack);
599 added_tags = tre_stack_pop_int(stack);
600 if (first_pass)
601 {
602 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
603 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
604 + ((node->num_submatches > 0) ? 2 : 0);
605 }
606 regset = tre_stack_pop_voidptr(stack);
607 tag_left = tre_stack_pop_int(stack);
608 tag_right = tre_stack_pop_int(stack);
609
610 /* Add tags after both children, the left child gets a smaller
611 tag than the right child. This guarantees that we prefer
612 the left child over the right child. */
613 /* XXX - This is not always necessary (if the children have
614 tags which must be seen for every match of that child). */
615 /* XXX - Check if this is the only place where tre_add_tag_right
616 is used. If so, use tre_add_tag_left (putting the tag before
617 the child as opposed after the child) and throw away
618 tre_add_tag_right. */
619 if (node->num_submatches > 0)
620 {
621 if (!first_pass)
622 {
623 status = tre_add_tag_right(mem, left, tag_left);
624 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
625 status = tre_add_tag_right(mem, right, tag_right);
626 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
627 }
628 DPRINT((" num_tags += 2\n"));
629 num_tags += 2;
630 }
631 direction = TRE_TAG_MAXIMIZE;
632 break;
633 }
634
635 default:
636 assert(0);
637 break;
638
639 } /* end switch(symbol) */
640 } /* end while(tre_stack_num_objects(stack) > bottom) */
641
642 if (!first_pass)
643 {
644 int i;
645 /* Go through the regset and set submatch data for
646 submatches that are using this tag. */
647 for (i = 0; regset[i] >= 0; i++)
648 {
649 int id = regset[i] / 2;
650 int start = !(regset[i] % 2);
651 DPRINT((" Using tag %d for %s offset of "
652 "submatch %d\n", num_tags,
653 start ? "start" : "end", id));
654 if (start)
655 tnfa->submatch_data[id].so_tag = num_tags;
656 else
657 tnfa->submatch_data[id].eo_tag = num_tags;
658 }
659 }
660
661 if (!first_pass && minimal_tag >= 0)
662 {
663 int i;
664 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
665 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
666 tnfa->minimal_tags[i] = tag;
667 tnfa->minimal_tags[i + 1] = minimal_tag;
668 tnfa->minimal_tags[i + 2] = -1;
669 minimal_tag = -1;
670 num_minimals++;
671 }
672
673 DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n",
674 first_pass? "First pass" : "Second pass", num_tags));
675
676 assert(tree->num_tags == num_tags);
677 tnfa->end_tag = num_tags;
678 tnfa->num_tags = num_tags;
679 tnfa->num_minimals = num_minimals;
680 xfree(orig_regset);
681 xfree(parents);
682 xfree(saved_states);
683 return status;
684 }
685
686
687
688 /*
689 AST to TNFA compilation routines.
690 */
691
692 typedef enum {
693 COPY_RECURSE,
694 COPY_SET_RESULT_PTR
695 } tre_copyast_symbol_t;
696
697 /* Flags for tre_copy_ast(). */
698 #define COPY_REMOVE_TAGS 1
699 #define COPY_MAXIMIZE_FIRST_TAG 2
700
701 static reg_errcode_t
tre_copy_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int flags,int * pos_add,tre_tag_direction_t * tag_directions,tre_ast_node_t ** copy,int * max_pos)702 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
703 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
704 tre_ast_node_t **copy, int *max_pos)
705 {
706 reg_errcode_t status = REG_OK;
707 int bottom = tre_stack_num_objects(stack);
708 int num_copied = 0;
709 int first_tag = 1;
710 tre_ast_node_t **result = copy;
711 tre_copyast_symbol_t symbol;
712
713 STACK_PUSH(stack, voidptr, ast);
714 STACK_PUSH(stack, int, COPY_RECURSE);
715
716 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
717 {
718 tre_ast_node_t *node;
719 if (status != REG_OK)
720 break;
721
722 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
723 switch (symbol)
724 {
725 case COPY_SET_RESULT_PTR:
726 result = tre_stack_pop_voidptr(stack);
727 break;
728 case COPY_RECURSE:
729 node = tre_stack_pop_voidptr(stack);
730 switch (node->type)
731 {
732 case LITERAL:
733 {
734 tre_literal_t *lit = node->obj;
735 int pos = lit->position;
736 int min = lit->code_min;
737 int max = lit->code_max;
738 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
739 {
740 /* XXX - e.g. [ab] has only one position but two
741 nodes, so we are creating holes in the state space
742 here. Not fatal, just wastes memory. */
743 pos += *pos_add;
744 num_copied++;
745 }
746 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
747 {
748 /* Change this tag to empty. */
749 min = EMPTY;
750 max = pos = -1;
751 }
752 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
753 && first_tag)
754 {
755 /* Maximize the first tag. */
756 tag_directions[max] = TRE_TAG_MAXIMIZE;
757 first_tag = 0;
758 }
759 *result = tre_ast_new_literal(mem, min, max, pos);
760 if (*result == NULL)
761 status = REG_ESPACE;
762
763 if (pos > *max_pos)
764 *max_pos = pos;
765 break;
766 }
767 case UNION:
768 {
769 tre_union_t *uni = node->obj;
770 tre_union_t *tmp;
771 *result = tre_ast_new_union(mem, uni->left, uni->right);
772 if (*result == NULL)
773 {
774 status = REG_ESPACE;
775 break;
776 }
777 tmp = (*result)->obj;
778 result = &tmp->left;
779 STACK_PUSHX(stack, voidptr, uni->right);
780 STACK_PUSHX(stack, int, COPY_RECURSE);
781 STACK_PUSHX(stack, voidptr, &tmp->right);
782 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
783 STACK_PUSHX(stack, voidptr, uni->left);
784 STACK_PUSHX(stack, int, COPY_RECURSE);
785 break;
786 }
787 case CATENATION:
788 {
789 tre_catenation_t *cat = node->obj;
790 tre_catenation_t *tmp;
791 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
792 if (*result == NULL)
793 {
794 status = REG_ESPACE;
795 break;
796 }
797 tmp = (*result)->obj;
798 tmp->left = NULL;
799 tmp->right = NULL;
800 result = &tmp->left;
801
802 STACK_PUSHX(stack, voidptr, cat->right);
803 STACK_PUSHX(stack, int, COPY_RECURSE);
804 STACK_PUSHX(stack, voidptr, &tmp->right);
805 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
806 STACK_PUSHX(stack, voidptr, cat->left);
807 STACK_PUSHX(stack, int, COPY_RECURSE);
808 break;
809 }
810 case ITERATION:
811 {
812 tre_iteration_t *iter = node->obj;
813 STACK_PUSHX(stack, voidptr, iter->arg);
814 STACK_PUSHX(stack, int, COPY_RECURSE);
815 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
816 iter->max, iter->minimal);
817 if (*result == NULL)
818 {
819 status = REG_ESPACE;
820 break;
821 }
822 iter = (*result)->obj;
823 result = &iter->arg;
824 break;
825 }
826 default:
827 assert(0);
828 break;
829 }
830 break;
831 }
832 }
833 *pos_add += num_copied;
834 return status;
835 }
836
837 typedef enum {
838 EXPAND_RECURSE,
839 EXPAND_AFTER_ITER
840 } tre_expand_ast_symbol_t;
841
842 /* Expands each iteration node that has a finite nonzero minimum or maximum
843 iteration count to a catenated sequence of copies of the node. */
844 static reg_errcode_t
tre_expand_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int * position,tre_tag_direction_t * tag_directions,int * max_depth)845 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
846 int *position, tre_tag_direction_t *tag_directions,
847 int *max_depth)
848 {
849 reg_errcode_t status = REG_OK;
850 int bottom = tre_stack_num_objects(stack);
851 int pos_add = 0;
852 int pos_add_total = 0;
853 int max_pos = 0;
854 /* Current approximate matching parameters. */
855 int params[TRE_PARAM_LAST];
856 /* Approximate parameter nesting level. */
857 int params_depth = 0;
858 int iter_depth = 0;
859 int i;
860
861 for (i = 0; i < TRE_PARAM_LAST; i++)
862 params[i] = TRE_PARAM_DEFAULT;
863
864 STACK_PUSHR(stack, voidptr, ast);
865 STACK_PUSHR(stack, int, EXPAND_RECURSE);
866 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
867 {
868 tre_ast_node_t *node;
869 tre_expand_ast_symbol_t symbol;
870
871 if (status != REG_OK)
872 break;
873
874 DPRINT(("pos_add %d\n", pos_add));
875
876 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
877 node = tre_stack_pop_voidptr(stack);
878 switch (symbol)
879 {
880 case EXPAND_RECURSE:
881 switch (node->type)
882 {
883 case LITERAL:
884 {
885 tre_literal_t *lit= node->obj;
886 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
887 {
888 lit->position += pos_add;
889 if (lit->position > max_pos)
890 max_pos = lit->position;
891 }
892 break;
893 }
894 case UNION:
895 {
896 tre_union_t *uni = node->obj;
897 STACK_PUSHX(stack, voidptr, uni->right);
898 STACK_PUSHX(stack, int, EXPAND_RECURSE);
899 STACK_PUSHX(stack, voidptr, uni->left);
900 STACK_PUSHX(stack, int, EXPAND_RECURSE);
901 break;
902 }
903 case CATENATION:
904 {
905 tre_catenation_t *cat = node->obj;
906 STACK_PUSHX(stack, voidptr, cat->right);
907 STACK_PUSHX(stack, int, EXPAND_RECURSE);
908 STACK_PUSHX(stack, voidptr, cat->left);
909 STACK_PUSHX(stack, int, EXPAND_RECURSE);
910 break;
911 }
912 case ITERATION:
913 {
914 tre_iteration_t *iter = node->obj;
915 STACK_PUSHX(stack, int, pos_add);
916 STACK_PUSHX(stack, voidptr, node);
917 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
918 STACK_PUSHX(stack, voidptr, iter->arg);
919 STACK_PUSHX(stack, int, EXPAND_RECURSE);
920 /* If we are going to expand this node at EXPAND_AFTER_ITER
921 then don't increase the `pos' fields of the nodes now, it
922 will get done when expanding. */
923 if (iter->min > 1 || iter->max > 1)
924 pos_add = 0;
925 iter_depth++;
926 DPRINT(("iter\n"));
927 break;
928 }
929 default:
930 assert(0);
931 break;
932 }
933 break;
934 case EXPAND_AFTER_ITER:
935 {
936 tre_iteration_t *iter = node->obj;
937 int pos_add_last;
938 pos_add = tre_stack_pop_int(stack);
939 pos_add_last = pos_add;
940 if (iter->min > 1 || iter->max > 1)
941 {
942 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
943 int i;
944 int pos_add_save = pos_add;
945
946 /* Create a catenated sequence of copies of the node. */
947 for (i = 0; i < iter->min; i++)
948 {
949 tre_ast_node_t *copy;
950 /* Remove tags from all but the last copy. */
951 int flags = ((i + 1 < iter->min)
952 ? COPY_REMOVE_TAGS
953 : COPY_MAXIMIZE_FIRST_TAG);
954 DPRINT((" pos_add %d\n", pos_add));
955 pos_add_save = pos_add;
956 status = tre_copy_ast(mem, stack, iter->arg, flags,
957 &pos_add, tag_directions, ©,
958 &max_pos);
959 if (status != REG_OK)
960 return status;
961 if (seq1 != NULL)
962 seq1 = tre_ast_new_catenation(mem, seq1, copy);
963 else
964 seq1 = copy;
965 if (seq1 == NULL)
966 return REG_ESPACE;
967 }
968
969 if (iter->max == -1)
970 {
971 /* No upper limit. */
972 pos_add_save = pos_add;
973 status = tre_copy_ast(mem, stack, iter->arg, 0,
974 &pos_add, NULL, &seq2, &max_pos);
975 if (status != REG_OK)
976 return status;
977 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
978 if (seq2 == NULL)
979 return REG_ESPACE;
980 }
981 else
982 {
983 for (i = iter->min; i < iter->max; i++)
984 {
985 tre_ast_node_t *tmp, *copy;
986 pos_add_save = pos_add;
987 status = tre_copy_ast(mem, stack, iter->arg, 0,
988 &pos_add, NULL, ©, &max_pos);
989 if (status != REG_OK)
990 return status;
991 if (seq2 != NULL)
992 seq2 = tre_ast_new_catenation(mem, copy, seq2);
993 else
994 seq2 = copy;
995 if (seq2 == NULL)
996 return REG_ESPACE;
997 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
998 if (tmp == NULL)
999 return REG_ESPACE;
1000 seq2 = tre_ast_new_union(mem, tmp, seq2);
1001 if (seq2 == NULL)
1002 return REG_ESPACE;
1003 }
1004 }
1005
1006 pos_add = pos_add_save;
1007 if (seq1 == NULL)
1008 seq1 = seq2;
1009 else if (seq2 != NULL)
1010 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1011 if (seq1 == NULL)
1012 return REG_ESPACE;
1013 node->obj = seq1->obj;
1014 node->type = seq1->type;
1015 }
1016
1017 iter_depth--;
1018 pos_add_total += pos_add - pos_add_last;
1019 if (iter_depth == 0)
1020 pos_add = pos_add_total;
1021
1022 /* If approximate parameters are specified, surround the result
1023 with two parameter setting nodes. The one on the left sets
1024 the specified parameters, and the one on the right restores
1025 the old parameters. */
1026 if (iter->params)
1027 {
1028 tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy;
1029 int *old_params;
1030
1031 tmp_l = tre_ast_new_literal(mem, PARAMETER, 0, -1);
1032 if (!tmp_l)
1033 return REG_ESPACE;
1034 ((tre_literal_t *)tmp_l->obj)->u.params = iter->params;
1035 iter->params[TRE_PARAM_DEPTH] = params_depth + 1;
1036 tmp_r = tre_ast_new_literal(mem, PARAMETER, 0, -1);
1037 if (!tmp_r)
1038 return REG_ESPACE;
1039 old_params = tre_mem_alloc(mem, sizeof(*old_params)
1040 * TRE_PARAM_LAST);
1041 if (!old_params)
1042 return REG_ESPACE;
1043 for (i = 0; i < TRE_PARAM_LAST; i++)
1044 old_params[i] = params[i];
1045 ((tre_literal_t *)tmp_r->obj)->u.params = old_params;
1046 old_params[TRE_PARAM_DEPTH] = params_depth;
1047 /* XXX - this is the only place where ast_new_node is
1048 needed -- should be moved inside AST module. */
1049 node_copy = tre_ast_new_node(mem, ITERATION,
1050 sizeof(tre_iteration_t));
1051 if (!node_copy)
1052 return REG_ESPACE;
1053 node_copy->obj = node->obj;
1054 tmp_node = tre_ast_new_catenation(mem, tmp_l, node_copy);
1055 if (!tmp_node)
1056 return REG_ESPACE;
1057 tmp_node = tre_ast_new_catenation(mem, tmp_node, tmp_r);
1058 if (!tmp_node)
1059 return REG_ESPACE;
1060 /* Replace the contents of `node' with `tmp_node'. */
1061 memcpy(node, tmp_node, sizeof(*node));
1062 node->obj = tmp_node->obj;
1063 node->type = tmp_node->type;
1064 params_depth++;
1065 if (params_depth > *max_depth)
1066 *max_depth = params_depth;
1067 }
1068 break;
1069 }
1070 default:
1071 assert(0);
1072 break;
1073 }
1074 }
1075
1076 *position += pos_add_total;
1077
1078 /* `max_pos' should never be larger than `*position' if the above
1079 code works, but just an extra safeguard let's make sure
1080 `*position' is set large enough so enough memory will be
1081 allocated for the transition table. */
1082 if (max_pos > *position)
1083 *position = max_pos;
1084
1085 #ifdef TRE_DEBUG
1086 DPRINT(("Expanded AST:\n"));
1087 tre_ast_print(ast);
1088 DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
1089 #endif
1090
1091 return status;
1092 }
1093
1094 static tre_pos_and_tags_t *
tre_set_empty(tre_mem_t mem)1095 tre_set_empty(tre_mem_t mem)
1096 {
1097 tre_pos_and_tags_t *new_set;
1098
1099 new_set = tre_mem_calloc(mem, sizeof(*new_set));
1100 if (new_set == NULL)
1101 return NULL;
1102
1103 new_set[0].position = -1;
1104 new_set[0].code_min = -1;
1105 new_set[0].code_max = -1;
1106
1107 return new_set;
1108 }
1109
1110 static tre_pos_and_tags_t *
tre_set_one(tre_mem_t mem,int position,int code_min,int code_max,tre_ctype_t class,tre_ctype_t * neg_classes,int backref)1111 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
1112 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
1113 {
1114 tre_pos_and_tags_t *new_set;
1115
1116 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
1117 if (new_set == NULL)
1118 return NULL;
1119
1120 new_set[0].position = position;
1121 new_set[0].code_min = code_min;
1122 new_set[0].code_max = code_max;
1123 new_set[0].class = class;
1124 new_set[0].neg_classes = neg_classes;
1125 new_set[0].backref = backref;
1126 new_set[1].position = -1;
1127 new_set[1].code_min = -1;
1128 new_set[1].code_max = -1;
1129
1130 return new_set;
1131 }
1132
1133 static tre_pos_and_tags_t *
tre_set_union(tre_mem_t mem,tre_pos_and_tags_t * set1,tre_pos_and_tags_t * set2,int * tags,int assertions,int * params)1134 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
1135 int *tags, int assertions, int *params)
1136 {
1137 int s1, s2, i, j;
1138 tre_pos_and_tags_t *new_set;
1139 int *new_tags;
1140 int num_tags;
1141
1142 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
1143 for (s1 = 0; set1[s1].position >= 0; s1++);
1144 for (s2 = 0; set2[s2].position >= 0; s2++);
1145 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
1146 if (!new_set )
1147 return NULL;
1148
1149 for (s1 = 0; set1[s1].position >= 0; s1++)
1150 {
1151 new_set[s1].position = set1[s1].position;
1152 new_set[s1].code_min = set1[s1].code_min;
1153 new_set[s1].code_max = set1[s1].code_max;
1154 new_set[s1].assertions = set1[s1].assertions | assertions;
1155 new_set[s1].class = set1[s1].class;
1156 new_set[s1].neg_classes = set1[s1].neg_classes;
1157 new_set[s1].backref = set1[s1].backref;
1158 if (set1[s1].tags == NULL && tags == NULL)
1159 new_set[s1].tags = NULL;
1160 else
1161 {
1162 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
1163 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
1164 * (i + num_tags + 1)));
1165 if (new_tags == NULL)
1166 return NULL;
1167 for (j = 0; j < i; j++)
1168 new_tags[j] = set1[s1].tags[j];
1169 for (i = 0; i < num_tags; i++)
1170 new_tags[j + i] = tags[i];
1171 new_tags[j + i] = -1;
1172 new_set[s1].tags = new_tags;
1173 }
1174 if (set1[s1].params)
1175 new_set[s1].params = set1[s1].params;
1176 if (params)
1177 {
1178 if (!new_set[s1].params)
1179 new_set[s1].params = params;
1180 else
1181 {
1182 new_set[s1].params = tre_mem_alloc(mem, sizeof(*params) *
1183 TRE_PARAM_LAST);
1184 if (!new_set[s1].params)
1185 return NULL;
1186 for (i = 0; i < TRE_PARAM_LAST; i++)
1187 if (params[i] != TRE_PARAM_UNSET)
1188 new_set[s1].params[i] = params[i];
1189 }
1190 }
1191 }
1192
1193 for (s2 = 0; set2[s2].position >= 0; s2++)
1194 {
1195 new_set[s1 + s2].position = set2[s2].position;
1196 new_set[s1 + s2].code_min = set2[s2].code_min;
1197 new_set[s1 + s2].code_max = set2[s2].code_max;
1198 /* XXX - why not | assertions here as well? */
1199 new_set[s1 + s2].assertions = set2[s2].assertions;
1200 new_set[s1 + s2].class = set2[s2].class;
1201 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
1202 new_set[s1 + s2].backref = set2[s2].backref;
1203 if (set2[s2].tags == NULL)
1204 new_set[s1 + s2].tags = NULL;
1205 else
1206 {
1207 for (i = 0; set2[s2].tags[i] >= 0; i++);
1208 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
1209 if (new_tags == NULL)
1210 return NULL;
1211 for (j = 0; j < i; j++)
1212 new_tags[j] = set2[s2].tags[j];
1213 new_tags[j] = -1;
1214 new_set[s1 + s2].tags = new_tags;
1215 }
1216 if (set2[s2].params)
1217 new_set[s1 + s2].params = set2[s2].params;
1218 if (params)
1219 {
1220 if (!new_set[s1 + s2].params)
1221 new_set[s1 + s2].params = params;
1222 else
1223 {
1224 new_set[s1 + s2].params = tre_mem_alloc(mem, sizeof(*params) *
1225 TRE_PARAM_LAST);
1226 if (!new_set[s1 + s2].params)
1227 return NULL;
1228 for (i = 0; i < TRE_PARAM_LAST; i++)
1229 if (params[i] != TRE_PARAM_UNSET)
1230 new_set[s1 + s2].params[i] = params[i];
1231 }
1232 }
1233 }
1234 new_set[s1 + s2].position = -1;
1235 return new_set;
1236 }
1237
1238 /* Finds the empty path through `node' which is the one that should be
1239 taken according to POSIX.2 rules, and adds the tags on that path to
1240 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
1241 set to the number of tags seen on the path. */
1242 static reg_errcode_t
tre_match_empty(tre_stack_t * stack,tre_ast_node_t * node,int * tags,int * assertions,int * params,int * num_tags_seen,int * params_seen)1243 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
1244 int *assertions, int *params, int *num_tags_seen,
1245 int *params_seen)
1246 {
1247 tre_literal_t *lit;
1248 tre_union_t *uni;
1249 tre_catenation_t *cat;
1250 tre_iteration_t *iter;
1251 int i;
1252 int bottom = tre_stack_num_objects(stack);
1253 reg_errcode_t status = REG_OK;
1254 if (num_tags_seen)
1255 *num_tags_seen = 0;
1256 if (params_seen)
1257 *params_seen = 0;
1258
1259 status = tre_stack_push_voidptr(stack, node);
1260
1261 /* Walk through the tree recursively. */
1262 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1263 {
1264 node = tre_stack_pop_voidptr(stack);
1265
1266 switch (node->type)
1267 {
1268 case LITERAL:
1269 lit = (tre_literal_t *)node->obj;
1270 switch (lit->code_min)
1271 {
1272 case TAG:
1273 if (lit->code_max >= 0)
1274 {
1275 if (tags != NULL)
1276 {
1277 /* Add the tag to `tags'. */
1278 for (i = 0; tags[i] >= 0; i++)
1279 if (tags[i] == lit->code_max)
1280 break;
1281 if (tags[i] < 0)
1282 {
1283 tags[i] = lit->code_max;
1284 tags[i + 1] = -1;
1285 }
1286 }
1287 if (num_tags_seen)
1288 (*num_tags_seen)++;
1289 }
1290 break;
1291 case ASSERTION:
1292 assert(lit->code_max >= 1
1293 || lit->code_max <= ASSERT_LAST);
1294 if (assertions != NULL)
1295 *assertions |= lit->code_max;
1296 break;
1297 case PARAMETER:
1298 if (params != NULL)
1299 for (i = 0; i < TRE_PARAM_LAST; i++)
1300 params[i] = lit->u.params[i];
1301 if (params_seen != NULL)
1302 *params_seen = 1;
1303 break;
1304 case EMPTY:
1305 break;
1306 default:
1307 assert(0);
1308 break;
1309 }
1310 break;
1311
1312 case UNION:
1313 /* Subexpressions starting earlier take priority over ones
1314 starting later, so we prefer the left subexpression over the
1315 right subexpression. */
1316 uni = (tre_union_t *)node->obj;
1317 if (uni->left->nullable)
1318 STACK_PUSHX(stack, voidptr, uni->left)
1319 else if (uni->right->nullable)
1320 STACK_PUSHX(stack, voidptr, uni->right)
1321 else
1322 assert(0);
1323 break;
1324
1325 case CATENATION:
1326 /* The path must go through both children. */
1327 cat = (tre_catenation_t *)node->obj;
1328 assert(cat->left->nullable);
1329 assert(cat->right->nullable);
1330 STACK_PUSHX(stack, voidptr, cat->left);
1331 STACK_PUSHX(stack, voidptr, cat->right);
1332 break;
1333
1334 case ITERATION:
1335 /* A match with an empty string is preferred over no match at
1336 all, so we go through the argument if possible. */
1337 iter = (tre_iteration_t *)node->obj;
1338 if (iter->arg->nullable)
1339 STACK_PUSHX(stack, voidptr, iter->arg);
1340 break;
1341
1342 default:
1343 assert(0);
1344 break;
1345 }
1346 }
1347
1348 return status;
1349 }
1350
1351
1352 typedef enum {
1353 NFL_RECURSE,
1354 NFL_POST_UNION,
1355 NFL_POST_CATENATION,
1356 NFL_POST_ITERATION
1357 } tre_nfl_stack_symbol_t;
1358
1359
1360 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
1361 the nodes of the AST `tree'. */
1362 static reg_errcode_t
tre_compute_nfl(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree)1363 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
1364 {
1365 int bottom = tre_stack_num_objects(stack);
1366
1367 STACK_PUSHR(stack, voidptr, tree);
1368 STACK_PUSHR(stack, int, NFL_RECURSE);
1369
1370 while (tre_stack_num_objects(stack) > bottom)
1371 {
1372 tre_nfl_stack_symbol_t symbol;
1373 tre_ast_node_t *node;
1374
1375 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
1376 node = tre_stack_pop_voidptr(stack);
1377 switch (symbol)
1378 {
1379 case NFL_RECURSE:
1380 switch (node->type)
1381 {
1382 case LITERAL:
1383 {
1384 tre_literal_t *lit = (tre_literal_t *)node->obj;
1385 if (IS_BACKREF(lit))
1386 {
1387 /* Back references: nullable = false, firstpos = {i},
1388 lastpos = {i}. */
1389 node->nullable = 0;
1390 node->firstpos = tre_set_one(mem, lit->position, 0,
1391 TRE_CHAR_MAX, 0, NULL, -1);
1392 if (!node->firstpos)
1393 return REG_ESPACE;
1394 node->lastpos = tre_set_one(mem, lit->position, 0,
1395 TRE_CHAR_MAX, 0, NULL,
1396 lit->code_max);
1397 if (!node->lastpos)
1398 return REG_ESPACE;
1399 }
1400 else if (lit->code_min < 0)
1401 {
1402 /* Tags, empty strings, params, and zero width assertions:
1403 nullable = true, firstpos = {}, and lastpos = {}. */
1404 node->nullable = 1;
1405 node->firstpos = tre_set_empty(mem);
1406 if (!node->firstpos)
1407 return REG_ESPACE;
1408 node->lastpos = tre_set_empty(mem);
1409 if (!node->lastpos)
1410 return REG_ESPACE;
1411 }
1412 else
1413 {
1414 /* Literal at position i: nullable = false, firstpos = {i},
1415 lastpos = {i}. */
1416 node->nullable = 0;
1417 node->firstpos =
1418 tre_set_one(mem, lit->position, lit->code_min,
1419 lit->code_max, 0, NULL, -1);
1420 if (!node->firstpos)
1421 return REG_ESPACE;
1422 node->lastpos = tre_set_one(mem, lit->position,
1423 lit->code_min, lit->code_max,
1424 lit->u.class, lit->neg_classes,
1425 -1);
1426 if (!node->lastpos)
1427 return REG_ESPACE;
1428 }
1429 break;
1430 }
1431
1432 case UNION:
1433 /* Compute the attributes for the two subtrees, and after that
1434 for this node. */
1435 STACK_PUSHR(stack, voidptr, node);
1436 STACK_PUSHR(stack, int, NFL_POST_UNION);
1437 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
1438 STACK_PUSHR(stack, int, NFL_RECURSE);
1439 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
1440 STACK_PUSHR(stack, int, NFL_RECURSE);
1441 break;
1442
1443 case CATENATION:
1444 /* Compute the attributes for the two subtrees, and after that
1445 for this node. */
1446 STACK_PUSHR(stack, voidptr, node);
1447 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
1448 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
1449 STACK_PUSHR(stack, int, NFL_RECURSE);
1450 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
1451 STACK_PUSHR(stack, int, NFL_RECURSE);
1452 break;
1453
1454 case ITERATION:
1455 /* Compute the attributes for the subtree, and after that for
1456 this node. */
1457 STACK_PUSHR(stack, voidptr, node);
1458 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
1459 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
1460 STACK_PUSHR(stack, int, NFL_RECURSE);
1461 break;
1462 }
1463 break; /* end case: NFL_RECURSE */
1464
1465 case NFL_POST_UNION:
1466 {
1467 tre_union_t *uni = (tre_union_t *)node->obj;
1468 node->nullable = uni->left->nullable || uni->right->nullable;
1469 node->firstpos = tre_set_union(mem, uni->left->firstpos,
1470 uni->right->firstpos, NULL, 0, NULL);
1471 if (!node->firstpos)
1472 return REG_ESPACE;
1473 node->lastpos = tre_set_union(mem, uni->left->lastpos,
1474 uni->right->lastpos, NULL, 0, NULL);
1475 if (!node->lastpos)
1476 return REG_ESPACE;
1477 break;
1478 }
1479
1480 case NFL_POST_ITERATION:
1481 {
1482 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
1483
1484 if (iter->min == 0 || iter->arg->nullable)
1485 node->nullable = 1;
1486 else
1487 node->nullable = 0;
1488 node->firstpos = iter->arg->firstpos;
1489 node->lastpos = iter->arg->lastpos;
1490 break;
1491 }
1492
1493 case NFL_POST_CATENATION:
1494 {
1495 int num_tags, *tags, assertions, params_seen;
1496 int *params;
1497 reg_errcode_t status;
1498 tre_catenation_t *cat = node->obj;
1499 node->nullable = cat->left->nullable && cat->right->nullable;
1500
1501 /* Compute firstpos. */
1502 if (cat->left->nullable)
1503 {
1504 /* The left side matches the empty string. Make a first pass
1505 with tre_match_empty() to get the number of tags and
1506 parameters. */
1507 status = tre_match_empty(stack, cat->left,
1508 NULL, NULL, NULL, &num_tags,
1509 ¶ms_seen);
1510 if (status != REG_OK)
1511 return status;
1512 /* Allocate arrays for the tags and parameters. */
1513 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
1514 if (!tags)
1515 return REG_ESPACE;
1516 tags[0] = -1;
1517 assertions = 0;
1518 params = NULL;
1519 if (params_seen)
1520 {
1521 params = tre_mem_alloc(mem, sizeof(*params)
1522 * TRE_PARAM_LAST);
1523 if (!params)
1524 {
1525 xfree(tags);
1526 return REG_ESPACE;
1527 }
1528 }
1529 /* Second pass with tre_mach_empty() to get the list of
1530 tags and parameters. */
1531 status = tre_match_empty(stack, cat->left, tags,
1532 &assertions, params, NULL, NULL);
1533 if (status != REG_OK)
1534 {
1535 xfree(tags);
1536 return status;
1537 }
1538 node->firstpos =
1539 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
1540 tags, assertions, params);
1541 xfree(tags);
1542 if (!node->firstpos)
1543 return REG_ESPACE;
1544 }
1545 else
1546 {
1547 node->firstpos = cat->left->firstpos;
1548 }
1549
1550 /* Compute lastpos. */
1551 if (cat->right->nullable)
1552 {
1553 /* The right side matches the empty string. Make a first pass
1554 with tre_match_empty() to get the number of tags and
1555 parameters. */
1556 status = tre_match_empty(stack, cat->right,
1557 NULL, NULL, NULL, &num_tags,
1558 ¶ms_seen);
1559 if (status != REG_OK)
1560 return status;
1561 /* Allocate arrays for the tags and parameters. */
1562 tags = xmalloc(sizeof(int) * (num_tags + 1));
1563 if (!tags)
1564 return REG_ESPACE;
1565 tags[0] = -1;
1566 assertions = 0;
1567 params = NULL;
1568 if (params_seen)
1569 {
1570 params = tre_mem_alloc(mem, sizeof(*params)
1571 * TRE_PARAM_LAST);
1572 if (!params)
1573 {
1574 xfree(tags);
1575 return REG_ESPACE;
1576 }
1577 }
1578 /* Second pass with tre_mach_empty() to get the list of
1579 tags and parameters. */
1580 status = tre_match_empty(stack, cat->right, tags,
1581 &assertions, params, NULL, NULL);
1582 if (status != REG_OK)
1583 {
1584 xfree(tags);
1585 return status;
1586 }
1587 node->lastpos =
1588 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
1589 tags, assertions, params);
1590 xfree(tags);
1591 if (!node->lastpos)
1592 return REG_ESPACE;
1593 }
1594 else
1595 {
1596 node->lastpos = cat->right->lastpos;
1597 }
1598 break;
1599 }
1600
1601 default:
1602 assert(0);
1603 break;
1604 }
1605 }
1606
1607 return REG_OK;
1608 }
1609
1610
1611 /* Adds a transition from each position in `p1' to each position in `p2'. */
1612 static reg_errcode_t
tre_make_trans(tre_pos_and_tags_t * p1,tre_pos_and_tags_t * p2,tre_tnfa_transition_t * transitions,int * counts,int * offs)1613 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
1614 tre_tnfa_transition_t *transitions,
1615 int *counts, int *offs)
1616 {
1617 tre_pos_and_tags_t *orig_p2 = p2;
1618 tre_tnfa_transition_t *trans;
1619 int i, j, k, l, dup, prev_p2_pos;
1620
1621 if (transitions != NULL)
1622 while (p1->position >= 0)
1623 {
1624 p2 = orig_p2;
1625 prev_p2_pos = -1;
1626 while (p2->position >= 0)
1627 {
1628 /* Optimization: if this position was already handled, skip it. */
1629 if (p2->position == prev_p2_pos)
1630 {
1631 p2++;
1632 continue;
1633 }
1634 prev_p2_pos = p2->position;
1635 /* Set `trans' to point to the next unused transition from
1636 position `p1->position'. */
1637 trans = transitions + offs[p1->position];
1638 while (trans->state != NULL)
1639 {
1640 #if 0
1641 /* If we find a previous transition from `p1->position' to
1642 `p2->position', it is overwritten. This can happen only
1643 if there are nested loops in the regexp, like in "((a)*)*".
1644 In POSIX.2 repetition using the outer loop is always
1645 preferred over using the inner loop. Therefore the
1646 transition for the inner loop is useless and can be thrown
1647 away. */
1648 /* XXX - The same position is used for all nodes in a bracket
1649 expression, so this optimization cannot be used (it will
1650 break bracket expressions) unless I figure out a way to
1651 detect it here. */
1652 if (trans->state_id == p2->position)
1653 {
1654 DPRINT(("*"));
1655 break;
1656 }
1657 #endif
1658 trans++;
1659 }
1660
1661 if (trans->state == NULL)
1662 (trans + 1)->state = NULL;
1663 /* Use the character ranges, assertions, etc. from `p1' for
1664 the transition from `p1' to `p2'. */
1665 trans->code_min = p1->code_min;
1666 trans->code_max = p1->code_max;
1667 trans->state = transitions + offs[p2->position];
1668 trans->state_id = p2->position;
1669 trans->assertions = p1->assertions | p2->assertions
1670 | (p1->class ? ASSERT_CHAR_CLASS : 0)
1671 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
1672 if (p1->backref >= 0)
1673 {
1674 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
1675 assert(p2->backref < 0);
1676 trans->u.backref = p1->backref;
1677 trans->assertions |= ASSERT_BACKREF;
1678 }
1679 else
1680 trans->u.class = p1->class;
1681 if (p1->neg_classes != NULL)
1682 {
1683 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
1684 trans->neg_classes =
1685 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
1686 if (trans->neg_classes == NULL)
1687 return REG_ESPACE;
1688 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
1689 trans->neg_classes[i] = p1->neg_classes[i];
1690 trans->neg_classes[i] = (tre_ctype_t)0;
1691 }
1692 else
1693 trans->neg_classes = NULL;
1694
1695 /* Find out how many tags this transition has. */
1696 i = 0;
1697 if (p1->tags != NULL)
1698 while(p1->tags[i] >= 0)
1699 i++;
1700 j = 0;
1701 if (p2->tags != NULL)
1702 while(p2->tags[j] >= 0)
1703 j++;
1704
1705 /* If we are overwriting a transition, free the old tag array. */
1706 if (trans->tags != NULL)
1707 xfree(trans->tags);
1708 trans->tags = NULL;
1709
1710 /* If there were any tags, allocate an array and fill it. */
1711 if (i + j > 0)
1712 {
1713 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
1714 if (!trans->tags)
1715 return REG_ESPACE;
1716 i = 0;
1717 if (p1->tags != NULL)
1718 while(p1->tags[i] >= 0)
1719 {
1720 trans->tags[i] = p1->tags[i];
1721 i++;
1722 }
1723 l = i;
1724 j = 0;
1725 if (p2->tags != NULL)
1726 while (p2->tags[j] >= 0)
1727 {
1728 /* Don't add duplicates. */
1729 dup = 0;
1730 for (k = 0; k < i; k++)
1731 if (trans->tags[k] == p2->tags[j])
1732 {
1733 dup = 1;
1734 break;
1735 }
1736 if (!dup)
1737 trans->tags[l++] = p2->tags[j];
1738 j++;
1739 }
1740 trans->tags[l] = -1;
1741 }
1742
1743 /* Set the parameter array. If both `p2' and `p1' have same
1744 parameters, the values in `p2' override those in `p1'. */
1745 if (p1->params || p2->params)
1746 {
1747 if (!trans->params)
1748 trans->params = xmalloc(sizeof(*trans->params)
1749 * TRE_PARAM_LAST);
1750 if (!trans->params)
1751 return REG_ESPACE;
1752 for (i = 0; i < TRE_PARAM_LAST; i++)
1753 {
1754 trans->params[i] = TRE_PARAM_UNSET;
1755 if (p1->params && p1->params[i] != TRE_PARAM_UNSET)
1756 trans->params[i] = p1->params[i];
1757 if (p2->params && p2->params[i] != TRE_PARAM_UNSET)
1758 trans->params[i] = p2->params[i];
1759 }
1760 }
1761 else
1762 {
1763 if (trans->params)
1764 xfree(trans->params);
1765 trans->params = NULL;
1766 }
1767
1768
1769 #ifdef TRE_DEBUG
1770 {
1771 int *tags;
1772
1773 DPRINT((" %2d -> %2d on %3d", p1->position, p2->position,
1774 p1->code_min));
1775 if (p1->code_max != p1->code_min)
1776 DPRINT(("-%3d", p1->code_max));
1777 tags = trans->tags;
1778 if (tags)
1779 {
1780 DPRINT((", tags ["));
1781 while (*tags >= 0)
1782 {
1783 DPRINT(("%d", *tags));
1784 tags++;
1785 if (*tags >= 0)
1786 DPRINT((","));
1787 }
1788 DPRINT(("]"));
1789 }
1790 if (trans->assertions)
1791 DPRINT((", assert %d", trans->assertions));
1792 if (trans->assertions & ASSERT_BACKREF)
1793 DPRINT((", backref %d", trans->u.backref));
1794 else if (trans->u.class)
1795 DPRINT((", class %ld", (long)trans->u.class));
1796 if (trans->neg_classes)
1797 DPRINT((", neg_classes %p", trans->neg_classes));
1798 if (trans->params)
1799 {
1800 DPRINT((", "));
1801 tre_print_params(trans->params);
1802 }
1803 DPRINT(("\n"));
1804 }
1805 #endif /* TRE_DEBUG */
1806 p2++;
1807 }
1808 p1++;
1809 }
1810 else
1811 /* Compute a maximum limit for the number of transitions leaving
1812 from each state. */
1813 while (p1->position >= 0)
1814 {
1815 p2 = orig_p2;
1816 while (p2->position >= 0)
1817 {
1818 counts[p1->position]++;
1819 p2++;
1820 }
1821 p1++;
1822 }
1823 return REG_OK;
1824 }
1825
1826 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
1827 labelled with one character range (there are no transitions on empty
1828 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
1829 the regexp. */
1830 static reg_errcode_t
tre_ast_to_tnfa(tre_ast_node_t * node,tre_tnfa_transition_t * transitions,int * counts,int * offs)1831 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
1832 int *counts, int *offs)
1833 {
1834 tre_union_t *uni;
1835 tre_catenation_t *cat;
1836 tre_iteration_t *iter;
1837 reg_errcode_t errcode = REG_OK;
1838
1839 /* XXX - recurse using a stack!. */
1840 switch (node->type)
1841 {
1842 case LITERAL:
1843 break;
1844 case UNION:
1845 uni = (tre_union_t *)node->obj;
1846 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
1847 if (errcode != REG_OK)
1848 return errcode;
1849 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
1850 break;
1851
1852 case CATENATION:
1853 cat = (tre_catenation_t *)node->obj;
1854 /* Add a transition from each position in cat->left->lastpos
1855 to each position in cat->right->firstpos. */
1856 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
1857 transitions, counts, offs);
1858 if (errcode != REG_OK)
1859 return errcode;
1860 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
1861 if (errcode != REG_OK)
1862 return errcode;
1863 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
1864 break;
1865
1866 case ITERATION:
1867 iter = (tre_iteration_t *)node->obj;
1868 assert(iter->max == -1 || iter->max == 1);
1869
1870 if (iter->max == -1)
1871 {
1872 assert(iter->min == 0 || iter->min == 1);
1873 /* Add a transition from each last position in the iterated
1874 expression to each first position. */
1875 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
1876 transitions, counts, offs);
1877 if (errcode != REG_OK)
1878 return errcode;
1879 }
1880 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
1881 break;
1882 }
1883 return errcode;
1884 }
1885
1886
1887 #define ERROR_EXIT(err) \
1888 do \
1889 { \
1890 errcode = err; \
1891 if (1) goto error_exit; \
1892 } \
1893 while (0)
1894
1895
1896 int
tre_compile(regex_t * preg,const tre_char_t * regex,size_t n,int cflags)1897 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
1898 {
1899 tre_stack_t *stack;
1900 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
1901 tre_pos_and_tags_t *p;
1902 int *counts = NULL, *offs = NULL;
1903 int i, add = 0;
1904 tre_tnfa_transition_t *transitions, *initial;
1905 tre_tnfa_t *tnfa = NULL;
1906 tre_submatch_data_t *submatch_data;
1907 tre_tag_direction_t *tag_directions = NULL;
1908 reg_errcode_t errcode;
1909 tre_mem_t mem;
1910
1911 /* Parse context. */
1912 tre_parse_ctx_t parse_ctx;
1913
1914 /* Allocate a stack used throughout the compilation process for various
1915 purposes. */
1916 stack = tre_stack_new(512, 10240, 128);
1917 if (!stack)
1918 return REG_ESPACE;
1919 /* Allocate a fast memory allocator. */
1920 mem = tre_mem_new();
1921 if (!mem)
1922 {
1923 tre_stack_destroy(stack);
1924 return REG_ESPACE;
1925 }
1926
1927 /* Parse the regexp. */
1928 memset(&parse_ctx, 0, sizeof(parse_ctx));
1929 parse_ctx.mem = mem;
1930 parse_ctx.stack = stack;
1931 parse_ctx.re = regex;
1932 parse_ctx.len = n;
1933 parse_ctx.cflags = cflags;
1934 parse_ctx.max_backref = -1;
1935 DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex));
1936 errcode = tre_parse(&parse_ctx);
1937 if (errcode != REG_OK)
1938 ERROR_EXIT(errcode);
1939 preg->re_nsub = parse_ctx.submatch_id - 1;
1940 tree = parse_ctx.result;
1941
1942 /* Back references and approximate matching cannot currently be used
1943 in the same regexp. */
1944 if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
1945 ERROR_EXIT(REG_BADPAT);
1946
1947 #ifdef TRE_DEBUG
1948 tre_ast_print(tree);
1949 #endif /* TRE_DEBUG */
1950
1951 /* Referring to nonexistent subexpressions is illegal. */
1952 if (parse_ctx.max_backref > (int)preg->re_nsub)
1953 ERROR_EXIT(REG_ESUBREG);
1954
1955 /* Allocate the TNFA struct. */
1956 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
1957 if (tnfa == NULL)
1958 ERROR_EXIT(REG_ESPACE);
1959 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
1960 tnfa->have_approx = parse_ctx.have_approx;
1961 tnfa->num_submatches = parse_ctx.submatch_id;
1962
1963 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
1964 regexp does not have back references, this can be skipped. */
1965 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
1966 {
1967 DPRINT(("tre_compile: setting up tags\n"));
1968
1969 /* Figure out how many tags we will need. */
1970 errcode = tre_add_tags(NULL, stack, tree, tnfa);
1971 if (errcode != REG_OK)
1972 ERROR_EXIT(errcode);
1973 #ifdef TRE_DEBUG
1974 tre_ast_print(tree);
1975 #endif /* TRE_DEBUG */
1976
1977 if (tnfa->num_tags > 0)
1978 {
1979 tag_directions = xmalloc(sizeof(*tag_directions)
1980 * (tnfa->num_tags + 1));
1981 if (tag_directions == NULL)
1982 ERROR_EXIT(REG_ESPACE);
1983 tnfa->tag_directions = tag_directions;
1984 memset(tag_directions, -1,
1985 sizeof(*tag_directions) * (tnfa->num_tags + 1));
1986 }
1987 tnfa->minimal_tags = xcalloc(tnfa->num_tags * 2 + 1,
1988 sizeof(tnfa->minimal_tags));
1989 if (tnfa->minimal_tags == NULL)
1990 ERROR_EXIT(REG_ESPACE);
1991
1992 submatch_data = xcalloc(parse_ctx.submatch_id, sizeof(*submatch_data));
1993 if (submatch_data == NULL)
1994 ERROR_EXIT(REG_ESPACE);
1995 tnfa->submatch_data = submatch_data;
1996
1997 errcode = tre_add_tags(mem, stack, tree, tnfa);
1998 if (errcode != REG_OK)
1999 ERROR_EXIT(errcode);
2000
2001 #ifdef TRE_DEBUG
2002 for (i = 0; i < parse_ctx.submatch_id; i++)
2003 DPRINT(("pmatch[%d] = {t%d, t%d}\n",
2004 i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
2005 for (i = 0; i < tnfa->num_tags; i++)
2006 DPRINT(("t%d is %s\n", i,
2007 tag_directions[i] == TRE_TAG_MINIMIZE ?
2008 "minimized" : "maximized"));
2009 #endif /* TRE_DEBUG */
2010 }
2011
2012 /* Expand iteration nodes. */
2013 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2014 tag_directions, &tnfa->params_depth);
2015 if (errcode != REG_OK)
2016 ERROR_EXIT(errcode);
2017
2018 /* Add a dummy node for the final state.
2019 XXX - For certain patterns this dummy node can be optimized away,
2020 for example "a*" or "ab*". Figure out a simple way to detect
2021 this possibility. */
2022 tmp_ast_l = tree;
2023 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2024 if (tmp_ast_r == NULL)
2025 ERROR_EXIT(REG_ESPACE);
2026
2027 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2028 if (tree == NULL)
2029 ERROR_EXIT(REG_ESPACE);
2030
2031 #ifdef TRE_DEBUG
2032 tre_ast_print(tree);
2033 DPRINT(("Number of states: %d\n", parse_ctx.position));
2034 #endif /* TRE_DEBUG */
2035
2036 errcode = tre_compute_nfl(mem, stack, tree);
2037 if (errcode != REG_OK)
2038 ERROR_EXIT(errcode);
2039
2040 counts = xmalloc(sizeof(int) * parse_ctx.position);
2041 if (counts == NULL)
2042 ERROR_EXIT(REG_ESPACE);
2043
2044 offs = xmalloc(sizeof(int) * parse_ctx.position);
2045 if (offs == NULL)
2046 ERROR_EXIT(REG_ESPACE);
2047
2048 for (i = 0; i < parse_ctx.position; i++)
2049 counts[i] = 0;
2050 tre_ast_to_tnfa(tree, NULL, counts, NULL);
2051
2052 add = 0;
2053 for (i = 0; i < parse_ctx.position; i++)
2054 {
2055 offs[i] = add;
2056 add += counts[i] + 1;
2057 counts[i] = 0;
2058 }
2059 transitions = xcalloc(add + 1, sizeof(*transitions));
2060 if (transitions == NULL)
2061 ERROR_EXIT(REG_ESPACE);
2062 tnfa->transitions = transitions;
2063 tnfa->num_transitions = add;
2064
2065 DPRINT(("Converting to TNFA:\n"));
2066 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2067 if (errcode != REG_OK)
2068 ERROR_EXIT(errcode);
2069
2070 /* If in eight bit mode, compute a table of characters that can be the
2071 first character of a match. */
2072 tnfa->first_char = -1;
2073 if (TRE_MB_CUR_MAX == 1 && !tmp_ast_l->nullable)
2074 {
2075 int count = 0;
2076 int k;
2077 DPRINT(("Characters that can start a match:"));
2078 tnfa->firstpos_chars = xcalloc(256, sizeof(char));
2079 if (tnfa->firstpos_chars == NULL)
2080 ERROR_EXIT(REG_ESPACE);
2081 for (p = tree->firstpos; p->position >= 0; p++)
2082 {
2083 tre_tnfa_transition_t *j = transitions + offs[p->position];
2084 while (j->state != NULL)
2085 {
2086 for (k = j->code_min; k <= j->code_max && k < 256; k++)
2087 {
2088 DPRINT((" %d", k));
2089 tnfa->firstpos_chars[k] = 1;
2090 count++;
2091 }
2092 j++;
2093 }
2094 }
2095 DPRINT(("\n"));
2096 #define TRE_OPTIMIZE_FIRST_CHAR 1
2097 #if TRE_OPTIMIZE_FIRST_CHAR
2098 if (count == 1)
2099 {
2100 for (k = 0; k < 256; k++)
2101 if (tnfa->firstpos_chars[k])
2102 {
2103 DPRINT(("first char must be %d\n", k));
2104 tnfa->first_char = k;
2105 xfree(tnfa->firstpos_chars);
2106 tnfa->firstpos_chars = NULL;
2107 break;
2108 }
2109 }
2110 #endif
2111
2112 }
2113 else
2114 tnfa->firstpos_chars = NULL;
2115
2116
2117 p = tree->firstpos;
2118 i = 0;
2119 while (p->position >= 0)
2120 {
2121 i++;
2122
2123 #ifdef TRE_DEBUG
2124 {
2125 int *tags;
2126 DPRINT(("initial: %d", p->position));
2127 tags = p->tags;
2128 if (tags != NULL)
2129 {
2130 if (*tags >= 0)
2131 DPRINT(("/"));
2132 while (*tags >= 0)
2133 {
2134 DPRINT(("%d", *tags));
2135 tags++;
2136 if (*tags >= 0)
2137 DPRINT((","));
2138 }
2139 }
2140 DPRINT((", assert %d", p->assertions));
2141 if (p->params)
2142 {
2143 DPRINT((", "));
2144 tre_print_params(p->params);
2145 }
2146 DPRINT(("\n"));
2147 }
2148 #endif /* TRE_DEBUG */
2149
2150 p++;
2151 }
2152
2153 initial = xcalloc(i + 1, sizeof(tre_tnfa_transition_t));
2154 if (initial == NULL)
2155 ERROR_EXIT(REG_ESPACE);
2156 tnfa->initial = initial;
2157
2158 i = 0;
2159 for (p = tree->firstpos; p->position >= 0; p++)
2160 {
2161 initial[i].state = transitions + offs[p->position];
2162 initial[i].state_id = p->position;
2163 initial[i].tags = NULL;
2164 /* Copy the arrays p->tags, and p->params, they are allocated
2165 from a tre_mem object. */
2166 if (p->tags)
2167 {
2168 int j;
2169 for (j = 0; p->tags[j] >= 0; j++);
2170 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2171 if (!initial[i].tags)
2172 ERROR_EXIT(REG_ESPACE);
2173 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2174 }
2175 initial[i].params = NULL;
2176 if (p->params)
2177 {
2178 initial[i].params = xmalloc(sizeof(*p->params) * TRE_PARAM_LAST);
2179 if (!initial[i].params)
2180 ERROR_EXIT(REG_ESPACE);
2181 memcpy(initial[i].params, p->params,
2182 sizeof(*p->params) * TRE_PARAM_LAST);
2183 }
2184 initial[i].assertions = p->assertions;
2185 i++;
2186 }
2187 initial[i].state = NULL;
2188
2189 tnfa->num_transitions = add;
2190 tnfa->final = transitions + offs[tree->lastpos[0].position];
2191 tnfa->num_states = parse_ctx.position;
2192 tnfa->cflags = cflags;
2193
2194 DPRINT(("final state %p\n", (void *)tnfa->final));
2195
2196 tre_mem_destroy(mem);
2197 tre_stack_destroy(stack);
2198 xfree(counts);
2199 xfree(offs);
2200
2201 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2202 return REG_OK;
2203
2204 error_exit:
2205 /* Free everything that was allocated and return the error code. */
2206 tre_mem_destroy(mem);
2207 if (stack != NULL)
2208 tre_stack_destroy(stack);
2209 if (counts != NULL)
2210 xfree(counts);
2211 if (offs != NULL)
2212 xfree(offs);
2213 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2214 tre_free(preg);
2215 return errcode;
2216 }
2217
2218
2219
2220
2221 void
tre_free(regex_t * preg)2222 tre_free(regex_t *preg)
2223 {
2224 tre_tnfa_t *tnfa;
2225 unsigned int i;
2226 tre_tnfa_transition_t *trans;
2227
2228 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2229 if (!tnfa)
2230 return;
2231
2232 for (i = 0; i < tnfa->num_transitions; i++)
2233 if (tnfa->transitions[i].state)
2234 {
2235 if (tnfa->transitions[i].tags)
2236 xfree(tnfa->transitions[i].tags);
2237 if (tnfa->transitions[i].neg_classes)
2238 xfree(tnfa->transitions[i].neg_classes);
2239 if (tnfa->transitions[i].params)
2240 xfree(tnfa->transitions[i].params);
2241 }
2242 if (tnfa->transitions)
2243 xfree(tnfa->transitions);
2244
2245 if (tnfa->initial)
2246 {
2247 for (trans = tnfa->initial; trans->state; trans++)
2248 {
2249 if (trans->tags)
2250 xfree(trans->tags);
2251 if (trans->params)
2252 xfree(trans->params);
2253 }
2254 xfree(tnfa->initial);
2255 }
2256
2257 if (tnfa->submatch_data)
2258 {
2259 for (i = 0; i < tnfa->num_submatches; i++)
2260 if (tnfa->submatch_data[i].parents)
2261 xfree(tnfa->submatch_data[i].parents);
2262 xfree(tnfa->submatch_data);
2263 }
2264
2265 if (tnfa->tag_directions)
2266 xfree(tnfa->tag_directions);
2267 if (tnfa->firstpos_chars)
2268 xfree(tnfa->firstpos_chars);
2269 if (tnfa->minimal_tags)
2270 xfree(tnfa->minimal_tags);
2271 xfree(tnfa);
2272 }
2273
2274 char *
tre_version(void)2275 tre_version(void)
2276 {
2277 static char str[256];
2278 char *version;
2279
2280 if (str[0] == 0)
2281 {
2282 tre_config(TRE_CONFIG_VERSION, &version);
2283 sprintf(str, "TRE %s (LGPL)", version);
2284 }
2285 return str;
2286 }
2287
2288 int
tre_config(int query,void * result)2289 tre_config(int query, void *result)
2290 {
2291 int *int_result = result;
2292 char **string_result = result;
2293
2294 switch (query)
2295 {
2296 case TRE_CONFIG_APPROX:
2297 #ifdef TRE_APPROX
2298 *int_result = 1;
2299 #else /* !TRE_APPROX */
2300 *int_result = 0;
2301 #endif /* !TRE_APPROX */
2302 return REG_OK;
2303
2304 case TRE_CONFIG_WCHAR:
2305 #ifdef TRE_WCHAR
2306 *int_result = 1;
2307 #else /* !TRE_WCHAR */
2308 *int_result = 0;
2309 #endif /* !TRE_WCHAR */
2310 return REG_OK;
2311
2312 case TRE_CONFIG_MULTIBYTE:
2313 #ifdef TRE_MULTIBYTE
2314 *int_result = 1;
2315 #else /* !TRE_MULTIBYTE */
2316 *int_result = 0;
2317 #endif /* !TRE_MULTIBYTE */
2318 return REG_OK;
2319
2320 case TRE_CONFIG_SYSTEM_ABI:
2321 #ifdef TRE_CONFIG_SYSTEM_ABI
2322 *int_result = 1;
2323 #else /* !TRE_CONFIG_SYSTEM_ABI */
2324 *int_result = 0;
2325 #endif /* !TRE_CONFIG_SYSTEM_ABI */
2326 return REG_OK;
2327
2328 case TRE_CONFIG_VERSION:
2329 *string_result = TRE_VERSION;
2330 return REG_OK;
2331 }
2332
2333 return REG_NOMATCH;
2334 }
2335
2336
2337 /* EOF */
2338