1 /*
2 tre-compile.c - TRE regex compiler
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7 */
8
9 /*
10 TODO:
11 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
12 function calls.
13 */
14
15
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <stdio.h>
20 #include <assert.h>
21 #include <string.h>
22
23 #include "tre-internal.h"
24 #include "tre-mem.h"
25 #include "tre-stack.h"
26 #include "tre-ast.h"
27 #include "tre-parse.h"
28 #include "tre-compile.h"
29 #include "tre.h"
30 #include "xmalloc.h"
31
32 /*
33 Algorithms to setup tags so that submatch addressing can be done.
34 */
35
36
37 /* Inserts a catenation node to the root of the tree given in `node'.
38 As the left child a new tag with number `tag_id' to `node' is added,
39 and the right child is the old root. */
40 static reg_errcode_t
tre_add_tag_left(tre_mem_t mem,tre_ast_node_t * node,int tag_id)41 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
42 {
43 tre_catenation_t *c;
44
45 DPRINT(("add_tag_left: tag %d\n", tag_id));
46
47 c = tre_mem_alloc(mem, sizeof(*c));
48 if (c == NULL)
49 return REG_ESPACE;
50 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
51 if (c->left == NULL)
52 return REG_ESPACE;
53 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
54 if (c->right == NULL)
55 return REG_ESPACE;
56
57 c->right->obj = node->obj;
58 c->right->type = node->type;
59 c->right->nullable = -1;
60 c->right->submatch_id = -1;
61 c->right->firstpos = NULL;
62 c->right->lastpos = NULL;
63 c->right->num_tags = 0;
64 node->obj = c;
65 node->type = CATENATION;
66 return REG_OK;
67 }
68
69 /* Inserts a catenation node to the root of the tree given in `node'.
70 As the right child a new tag with number `tag_id' to `node' is added,
71 and the left child is the old root. */
72 static reg_errcode_t
tre_add_tag_right(tre_mem_t mem,tre_ast_node_t * node,int tag_id)73 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
74 {
75 tre_catenation_t *c;
76
77 DPRINT(("tre_add_tag_right: tag %d\n", tag_id));
78
79 c = tre_mem_alloc(mem, sizeof(*c));
80 if (c == NULL)
81 return REG_ESPACE;
82 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
83 if (c->right == NULL)
84 return REG_ESPACE;
85 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
86 if (c->left == NULL)
87 return REG_ESPACE;
88
89 c->left->obj = node->obj;
90 c->left->type = node->type;
91 c->left->nullable = -1;
92 c->left->submatch_id = -1;
93 c->left->firstpos = NULL;
94 c->left->lastpos = NULL;
95 c->left->num_tags = 0;
96 node->obj = c;
97 node->type = CATENATION;
98 return REG_OK;
99 }
100
101 typedef enum {
102 ADDTAGS_RECURSE,
103 ADDTAGS_AFTER_ITERATION,
104 ADDTAGS_AFTER_UNION_LEFT,
105 ADDTAGS_AFTER_UNION_RIGHT,
106 ADDTAGS_AFTER_CAT_LEFT,
107 ADDTAGS_AFTER_CAT_RIGHT,
108 ADDTAGS_SET_SUBMATCH_END
109 } tre_addtags_symbol_t;
110
111
112 typedef struct {
113 int tag;
114 int next_tag;
115 } tre_tag_states_t;
116
117
118 /* Go through `regset' and set submatch data for submatches that are
119 using this tag. */
120 static void
tre_purge_regset(int * regset,tre_tnfa_t * tnfa,int tag)121 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
122 {
123 int i;
124
125 for (i = 0; regset[i] >= 0; i++)
126 {
127 int id = regset[i] / 2;
128 int start = !(regset[i] % 2);
129 DPRINT((" Using tag %d for %s offset of "
130 "submatch %d\n", tag,
131 start ? "start" : "end", id));
132 if (start)
133 tnfa->submatch_data[id].so_tag = tag;
134 else
135 tnfa->submatch_data[id].eo_tag = tag;
136 }
137 regset[0] = -1;
138 }
139
140
141 /* Adds tags to appropriate locations in the parse tree in `tree', so that
142 subexpressions marked for submatch addressing can be traced. */
143 static reg_errcode_t
tre_add_tags(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree,tre_tnfa_t * tnfa)144 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
145 tre_tnfa_t *tnfa)
146 {
147 reg_errcode_t status = REG_OK;
148 tre_addtags_symbol_t symbol;
149 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
150 int bottom = tre_stack_num_objects(stack);
151 /* True for first pass (counting number of needed tags) */
152 int first_pass = (mem == NULL || tnfa == NULL);
153 int *regset, *orig_regset;
154 int num_tags = 0; /* Total number of tags. */
155 int num_minimals = 0; /* Number of special minimal tags. */
156 int tag = 0; /* The tag that is to be added next. */
157 int next_tag = 1; /* Next tag to use after this one. */
158 int *parents; /* Stack of submatches the current submatch is
159 contained in. */
160 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
161 tre_tag_states_t *saved_states;
162
163 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
164 if (!first_pass)
165 {
166 tnfa->end_tag = 0;
167 tnfa->minimal_tags[0] = -1;
168 }
169
170 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
171 if (regset == NULL)
172 return REG_ESPACE;
173 regset[0] = -1;
174 orig_regset = regset;
175
176 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
177 if (parents == NULL)
178 {
179 xfree(regset);
180 return REG_ESPACE;
181 }
182 parents[0] = -1;
183
184 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
185 if (saved_states == NULL)
186 {
187 xfree(regset);
188 xfree(parents);
189 return REG_ESPACE;
190 }
191 else
192 {
193 unsigned int i;
194 for (i = 0; i <= tnfa->num_submatches; i++)
195 saved_states[i].tag = -1;
196 }
197
198 STACK_PUSH(stack, voidptr, node);
199 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
200
201 while (tre_stack_num_objects(stack) > bottom)
202 {
203 if (status != REG_OK)
204 break;
205
206 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
207 switch (symbol)
208 {
209
210 case ADDTAGS_SET_SUBMATCH_END:
211 {
212 int id = tre_stack_pop_int(stack);
213 int i;
214
215 /* Add end of this submatch to regset. */
216 for (i = 0; regset[i] >= 0; i++);
217 regset[i] = id * 2 + 1;
218 regset[i + 1] = -1;
219
220 /* Pop this submatch from the parents stack. */
221 for (i = 0; parents[i] >= 0; i++);
222 parents[i - 1] = -1;
223 break;
224 }
225
226 case ADDTAGS_RECURSE:
227 node = tre_stack_pop_voidptr(stack);
228
229 if (node->submatch_id >= 0)
230 {
231 int id = node->submatch_id;
232 int i;
233
234
235 /* Add start of this submatch to regset. */
236 for (i = 0; regset[i] >= 0; i++);
237 regset[i] = id * 2;
238 regset[i + 1] = -1;
239
240 if (!first_pass)
241 {
242 for (i = 0; parents[i] >= 0; i++);
243 tnfa->submatch_data[id].parents = NULL;
244 if (i > 0)
245 {
246 int *p = xmalloc(sizeof(*p) * (i + 1));
247 if (p == NULL)
248 {
249 status = REG_ESPACE;
250 break;
251 }
252 assert(tnfa->submatch_data[id].parents == NULL);
253 tnfa->submatch_data[id].parents = p;
254 for (i = 0; parents[i] >= 0; i++)
255 p[i] = parents[i];
256 p[i] = -1;
257 }
258 }
259
260 /* Add end of this submatch to regset after processing this
261 node. */
262 STACK_PUSHX(stack, int, node->submatch_id);
263 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
264 }
265
266 switch (node->type)
267 {
268 case LITERAL:
269 {
270 tre_literal_t *lit = node->obj;
271
272 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
273 {
274 int i;
275 DPRINT(("Literal %d-%d\n",
276 (int)lit->code_min, (int)lit->code_max));
277 if (regset[0] >= 0)
278 {
279 /* Regset is not empty, so add a tag before the
280 literal or backref. */
281 if (!first_pass)
282 {
283 status = tre_add_tag_left(mem, node, tag);
284 tnfa->tag_directions[tag] = direction;
285 if (minimal_tag >= 0)
286 {
287 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
288 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
289 tnfa->minimal_tags[i] = tag;
290 tnfa->minimal_tags[i + 1] = minimal_tag;
291 tnfa->minimal_tags[i + 2] = -1;
292 minimal_tag = -1;
293 num_minimals++;
294 }
295 tre_purge_regset(regset, tnfa, tag);
296 }
297 else
298 {
299 DPRINT((" num_tags = 1\n"));
300 node->num_tags = 1;
301 }
302
303 DPRINT((" num_tags++\n"));
304 regset[0] = -1;
305 tag = next_tag;
306 num_tags++;
307 next_tag++;
308 }
309 }
310 else
311 {
312 assert(!IS_TAG(lit));
313 }
314 break;
315 }
316 case CATENATION:
317 {
318 tre_catenation_t *cat = node->obj;
319 tre_ast_node_t *left = cat->left;
320 tre_ast_node_t *right = cat->right;
321 int reserved_tag = -1;
322 DPRINT(("Catenation, next_tag = %d\n", next_tag));
323
324
325 /* After processing right child. */
326 STACK_PUSHX(stack, voidptr, node);
327 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
328
329 /* Process right child. */
330 STACK_PUSHX(stack, voidptr, right);
331 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
332
333 /* After processing left child. */
334 STACK_PUSHX(stack, int, next_tag + left->num_tags);
335 DPRINT((" Pushing %d for after left\n",
336 next_tag + left->num_tags));
337 if (left->num_tags > 0 && right->num_tags > 0)
338 {
339 /* Reserve the next tag to the right child. */
340 DPRINT((" Reserving next_tag %d to right child\n",
341 next_tag));
342 reserved_tag = next_tag;
343 next_tag++;
344 }
345 STACK_PUSHX(stack, int, reserved_tag);
346 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
347
348 /* Process left child. */
349 STACK_PUSHX(stack, voidptr, left);
350 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
351
352 }
353 break;
354 case ITERATION:
355 {
356 tre_iteration_t *iter = node->obj;
357 DPRINT(("Iteration\n"));
358
359 if (first_pass)
360 {
361 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
362 }
363 else
364 {
365 STACK_PUSHX(stack, int, tag);
366 STACK_PUSHX(stack, int, iter->minimal);
367 }
368 STACK_PUSHX(stack, voidptr, node);
369 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
370
371 STACK_PUSHX(stack, voidptr, iter->arg);
372 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
373
374 /* Regset is not empty, so add a tag here. */
375 if (regset[0] >= 0 || iter->minimal)
376 {
377 if (!first_pass)
378 {
379 int i;
380 status = tre_add_tag_left(mem, node, tag);
381 if (iter->minimal)
382 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
383 else
384 tnfa->tag_directions[tag] = direction;
385 if (minimal_tag >= 0)
386 {
387 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
388 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
389 tnfa->minimal_tags[i] = tag;
390 tnfa->minimal_tags[i + 1] = minimal_tag;
391 tnfa->minimal_tags[i + 2] = -1;
392 minimal_tag = -1;
393 num_minimals++;
394 }
395 tre_purge_regset(regset, tnfa, tag);
396 }
397
398 DPRINT((" num_tags++\n"));
399 regset[0] = -1;
400 tag = next_tag;
401 num_tags++;
402 next_tag++;
403 }
404 direction = TRE_TAG_MINIMIZE;
405 }
406 break;
407 case UNION:
408 {
409 tre_union_t *uni = node->obj;
410 tre_ast_node_t *left = uni->left;
411 tre_ast_node_t *right = uni->right;
412 int left_tag;
413 int right_tag;
414
415 if (regset[0] >= 0)
416 {
417 left_tag = next_tag;
418 right_tag = next_tag + 1;
419 }
420 else
421 {
422 left_tag = tag;
423 right_tag = next_tag;
424 }
425
426 DPRINT(("Union\n"));
427
428 /* After processing right child. */
429 STACK_PUSHX(stack, int, right_tag);
430 STACK_PUSHX(stack, int, left_tag);
431 STACK_PUSHX(stack, voidptr, regset);
432 STACK_PUSHX(stack, int, regset[0] >= 0);
433 STACK_PUSHX(stack, voidptr, node);
434 STACK_PUSHX(stack, voidptr, right);
435 STACK_PUSHX(stack, voidptr, left);
436 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
437
438 /* Process right child. */
439 STACK_PUSHX(stack, voidptr, right);
440 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
441
442 /* After processing left child. */
443 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
444
445 /* Process left child. */
446 STACK_PUSHX(stack, voidptr, left);
447 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
448
449 /* Regset is not empty, so add a tag here. */
450 if (regset[0] >= 0)
451 {
452 if (!first_pass)
453 {
454 int i;
455 status = tre_add_tag_left(mem, node, tag);
456 tnfa->tag_directions[tag] = direction;
457 if (minimal_tag >= 0)
458 {
459 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
460 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
461 tnfa->minimal_tags[i] = tag;
462 tnfa->minimal_tags[i + 1] = minimal_tag;
463 tnfa->minimal_tags[i + 2] = -1;
464 minimal_tag = -1;
465 num_minimals++;
466 }
467 tre_purge_regset(regset, tnfa, tag);
468 }
469
470 DPRINT((" num_tags++\n"));
471 regset[0] = -1;
472 tag = next_tag;
473 num_tags++;
474 next_tag++;
475 }
476
477 if (node->num_submatches > 0)
478 {
479 /* The next two tags are reserved for markers. */
480 next_tag++;
481 tag = next_tag;
482 next_tag++;
483 }
484
485 break;
486 }
487 }
488
489 if (node->submatch_id >= 0)
490 {
491 int i;
492 /* Push this submatch on the parents stack. */
493 for (i = 0; parents[i] >= 0; i++);
494 parents[i] = node->submatch_id;
495 parents[i + 1] = -1;
496 }
497
498 break; /* end case: ADDTAGS_RECURSE */
499
500 case ADDTAGS_AFTER_ITERATION:
501 {
502 int minimal = 0;
503 int enter_tag;
504 node = tre_stack_pop_voidptr(stack);
505 if (first_pass)
506 {
507 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
508 + tre_stack_pop_int(stack);
509 minimal_tag = -1;
510 }
511 else
512 {
513 minimal = tre_stack_pop_int(stack);
514 enter_tag = tre_stack_pop_int(stack);
515 if (minimal)
516 minimal_tag = enter_tag;
517 }
518
519 DPRINT(("After iteration\n"));
520 if (!first_pass)
521 {
522 DPRINT((" Setting direction to %s\n",
523 minimal ? "minimize" : "maximize"));
524 if (minimal)
525 direction = TRE_TAG_MINIMIZE;
526 else
527 direction = TRE_TAG_MAXIMIZE;
528 }
529 break;
530 }
531
532 case ADDTAGS_AFTER_CAT_LEFT:
533 {
534 int new_tag = tre_stack_pop_int(stack);
535 next_tag = tre_stack_pop_int(stack);
536 DPRINT(("After cat left, tag = %d, next_tag = %d\n",
537 tag, next_tag));
538 if (new_tag >= 0)
539 {
540 DPRINT((" Setting tag to %d\n", new_tag));
541 tag = new_tag;
542 }
543 break;
544 }
545
546 case ADDTAGS_AFTER_CAT_RIGHT:
547 DPRINT(("After cat right\n"));
548 node = tre_stack_pop_voidptr(stack);
549 if (first_pass)
550 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
551 + ((tre_catenation_t *)node->obj)->right->num_tags;
552 break;
553
554 case ADDTAGS_AFTER_UNION_LEFT:
555 DPRINT(("After union left\n"));
556 /* Lift the bottom of the `regset' array so that when processing
557 the right operand the items currently in the array are
558 invisible. The original bottom was saved at ADDTAGS_UNION and
559 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
560 while (*regset >= 0)
561 regset++;
562 break;
563
564 case ADDTAGS_AFTER_UNION_RIGHT:
565 {
566 int added_tags, tag_left, tag_right;
567 tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
568 tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
569 DPRINT(("After union right\n"));
570 node = tre_stack_pop_voidptr(stack);
571 added_tags = tre_stack_pop_int(stack);
572 if (first_pass)
573 {
574 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
575 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
576 + ((node->num_submatches > 0) ? 2 : 0);
577 }
578 regset = tre_stack_pop_voidptr(stack);
579 tag_left = tre_stack_pop_int(stack);
580 tag_right = tre_stack_pop_int(stack);
581
582 /* Add tags after both children, the left child gets a smaller
583 tag than the right child. This guarantees that we prefer
584 the left child over the right child. */
585 /* XXX - This is not always necessary (if the children have
586 tags which must be seen for every match of that child). */
587 /* XXX - Check if this is the only place where tre_add_tag_right
588 is used. If so, use tre_add_tag_left (putting the tag before
589 the child as opposed after the child) and throw away
590 tre_add_tag_right. */
591 if (node->num_submatches > 0)
592 {
593 if (!first_pass)
594 {
595 status = tre_add_tag_right(mem, left, tag_left);
596 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
597 status = tre_add_tag_right(mem, right, tag_right);
598 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
599 }
600 DPRINT((" num_tags += 2\n"));
601 num_tags += 2;
602 }
603 direction = TRE_TAG_MAXIMIZE;
604 break;
605 }
606
607 default:
608 assert(0);
609 break;
610
611 } /* end switch(symbol) */
612 } /* end while(tre_stack_num_objects(stack) > bottom) */
613
614 if (!first_pass)
615 tre_purge_regset(regset, tnfa, tag);
616
617 if (!first_pass && minimal_tag >= 0)
618 {
619 int i;
620 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
621 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
622 tnfa->minimal_tags[i] = tag;
623 tnfa->minimal_tags[i + 1] = minimal_tag;
624 tnfa->minimal_tags[i + 2] = -1;
625 minimal_tag = -1;
626 num_minimals++;
627 }
628
629 DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n",
630 first_pass? "First pass" : "Second pass", num_tags));
631
632 assert(tree->num_tags == num_tags);
633 tnfa->end_tag = num_tags;
634 tnfa->num_tags = num_tags;
635 tnfa->num_minimals = num_minimals;
636 xfree(orig_regset);
637 xfree(parents);
638 xfree(saved_states);
639 return status;
640 }
641
642
643
644 /*
645 AST to TNFA compilation routines.
646 */
647
648 typedef enum {
649 COPY_RECURSE,
650 COPY_SET_RESULT_PTR
651 } tre_copyast_symbol_t;
652
653 /* Flags for tre_copy_ast(). */
654 #define COPY_REMOVE_TAGS 1
655 #define COPY_MAXIMIZE_FIRST_TAG 2
656
657 static reg_errcode_t
tre_copy_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int flags,int * pos_add,tre_tag_direction_t * tag_directions,tre_ast_node_t ** copy,int * max_pos)658 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
659 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
660 tre_ast_node_t **copy, int *max_pos)
661 {
662 reg_errcode_t status = REG_OK;
663 int bottom = tre_stack_num_objects(stack);
664 int num_copied = 0;
665 int first_tag = 1;
666 tre_ast_node_t **result = copy;
667 tre_copyast_symbol_t symbol;
668
669 STACK_PUSH(stack, voidptr, ast);
670 STACK_PUSH(stack, int, COPY_RECURSE);
671
672 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
673 {
674 tre_ast_node_t *node;
675 if (status != REG_OK)
676 break;
677
678 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
679 switch (symbol)
680 {
681 case COPY_SET_RESULT_PTR:
682 result = tre_stack_pop_voidptr(stack);
683 break;
684 case COPY_RECURSE:
685 node = tre_stack_pop_voidptr(stack);
686 switch (node->type)
687 {
688 case LITERAL:
689 {
690 tre_literal_t *lit = node->obj;
691 int pos = lit->position;
692 int min = lit->code_min;
693 int max = lit->code_max;
694 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
695 {
696 /* XXX - e.g. [ab] has only one position but two
697 nodes, so we are creating holes in the state space
698 here. Not fatal, just wastes memory. */
699 pos += *pos_add;
700 num_copied++;
701 }
702 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
703 {
704 /* Change this tag to empty. */
705 min = EMPTY;
706 max = pos = -1;
707 }
708 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
709 && first_tag)
710 {
711 /* Maximize the first tag. */
712 tag_directions[max] = TRE_TAG_MAXIMIZE;
713 first_tag = 0;
714 }
715 *result = tre_ast_new_literal(mem, min, max, pos);
716 if (*result == NULL)
717 status = REG_ESPACE;
718
719 if (pos > *max_pos)
720 *max_pos = pos;
721 break;
722 }
723 case UNION:
724 {
725 tre_union_t *uni = node->obj;
726 tre_union_t *tmp;
727 *result = tre_ast_new_union(mem, uni->left, uni->right);
728 if (*result == NULL)
729 {
730 status = REG_ESPACE;
731 break;
732 }
733 tmp = (*result)->obj;
734 result = &tmp->left;
735 STACK_PUSHX(stack, voidptr, uni->right);
736 STACK_PUSHX(stack, int, COPY_RECURSE);
737 STACK_PUSHX(stack, voidptr, &tmp->right);
738 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
739 STACK_PUSHX(stack, voidptr, uni->left);
740 STACK_PUSHX(stack, int, COPY_RECURSE);
741 break;
742 }
743 case CATENATION:
744 {
745 tre_catenation_t *cat = node->obj;
746 tre_catenation_t *tmp;
747 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
748 if (*result == NULL)
749 {
750 status = REG_ESPACE;
751 break;
752 }
753 tmp = (*result)->obj;
754 tmp->left = NULL;
755 tmp->right = NULL;
756 result = &tmp->left;
757
758 STACK_PUSHX(stack, voidptr, cat->right);
759 STACK_PUSHX(stack, int, COPY_RECURSE);
760 STACK_PUSHX(stack, voidptr, &tmp->right);
761 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
762 STACK_PUSHX(stack, voidptr, cat->left);
763 STACK_PUSHX(stack, int, COPY_RECURSE);
764 break;
765 }
766 case ITERATION:
767 {
768 tre_iteration_t *iter = node->obj;
769 STACK_PUSHX(stack, voidptr, iter->arg);
770 STACK_PUSHX(stack, int, COPY_RECURSE);
771 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
772 iter->max, iter->minimal);
773 if (*result == NULL)
774 {
775 status = REG_ESPACE;
776 break;
777 }
778 iter = (*result)->obj;
779 result = &iter->arg;
780 break;
781 }
782 default:
783 assert(0);
784 break;
785 }
786 break;
787 }
788 }
789 *pos_add += num_copied;
790 return status;
791 }
792
793 typedef enum {
794 EXPAND_RECURSE,
795 EXPAND_AFTER_ITER
796 } tre_expand_ast_symbol_t;
797
798 /* Expands each iteration node that has a finite nonzero minimum or maximum
799 iteration count to a catenated sequence of copies of the node. */
800 static reg_errcode_t
tre_expand_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int * position,tre_tag_direction_t * tag_directions,int * max_depth)801 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
802 int *position, tre_tag_direction_t *tag_directions,
803 int *max_depth)
804 {
805 reg_errcode_t status = REG_OK;
806 int bottom = tre_stack_num_objects(stack);
807 int pos_add = 0;
808 int pos_add_total = 0;
809 int max_pos = 0;
810 /* Current approximate matching parameters. */
811 int params[TRE_PARAM_LAST];
812 /* Approximate parameter nesting level. */
813 int params_depth = 0;
814 int iter_depth = 0;
815 int i;
816
817 for (i = 0; i < TRE_PARAM_LAST; i++)
818 params[i] = TRE_PARAM_DEFAULT;
819
820 STACK_PUSHR(stack, voidptr, ast);
821 STACK_PUSHR(stack, int, EXPAND_RECURSE);
822 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
823 {
824 tre_ast_node_t *node;
825 tre_expand_ast_symbol_t symbol;
826
827 if (status != REG_OK)
828 break;
829
830 DPRINT(("pos_add %d\n", pos_add));
831
832 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
833 node = tre_stack_pop_voidptr(stack);
834 switch (symbol)
835 {
836 case EXPAND_RECURSE:
837 switch (node->type)
838 {
839 case LITERAL:
840 {
841 tre_literal_t *lit= node->obj;
842 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
843 {
844 lit->position += pos_add;
845 if (lit->position > max_pos)
846 max_pos = lit->position;
847 }
848 break;
849 }
850 case UNION:
851 {
852 tre_union_t *uni = node->obj;
853 STACK_PUSHX(stack, voidptr, uni->right);
854 STACK_PUSHX(stack, int, EXPAND_RECURSE);
855 STACK_PUSHX(stack, voidptr, uni->left);
856 STACK_PUSHX(stack, int, EXPAND_RECURSE);
857 break;
858 }
859 case CATENATION:
860 {
861 tre_catenation_t *cat = node->obj;
862 STACK_PUSHX(stack, voidptr, cat->right);
863 STACK_PUSHX(stack, int, EXPAND_RECURSE);
864 STACK_PUSHX(stack, voidptr, cat->left);
865 STACK_PUSHX(stack, int, EXPAND_RECURSE);
866 break;
867 }
868 case ITERATION:
869 {
870 tre_iteration_t *iter = node->obj;
871 STACK_PUSHX(stack, int, pos_add);
872 STACK_PUSHX(stack, voidptr, node);
873 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
874 STACK_PUSHX(stack, voidptr, iter->arg);
875 STACK_PUSHX(stack, int, EXPAND_RECURSE);
876 /* If we are going to expand this node at EXPAND_AFTER_ITER
877 then don't increase the `pos' fields of the nodes now, it
878 will get done when expanding. */
879 if (iter->min > 1 || iter->max > 1)
880 pos_add = 0;
881 iter_depth++;
882 DPRINT(("iter\n"));
883 break;
884 }
885 default:
886 assert(0);
887 break;
888 }
889 break;
890 case EXPAND_AFTER_ITER:
891 {
892 tre_iteration_t *iter = node->obj;
893 int pos_add_last;
894 pos_add = tre_stack_pop_int(stack);
895 pos_add_last = pos_add;
896 if (iter->min > 1 || iter->max > 1)
897 {
898 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
899 int j;
900 int pos_add_save = pos_add;
901
902 /* Create a catenated sequence of copies of the node. */
903 for (j = 0; j < iter->min; j++)
904 {
905 tre_ast_node_t *copy;
906 /* Remove tags from all but the last copy. */
907 int flags = ((j + 1 < iter->min)
908 ? COPY_REMOVE_TAGS
909 : COPY_MAXIMIZE_FIRST_TAG);
910 DPRINT((" pos_add %d\n", pos_add));
911 pos_add_save = pos_add;
912 status = tre_copy_ast(mem, stack, iter->arg, flags,
913 &pos_add, tag_directions, ©,
914 &max_pos);
915 if (status != REG_OK)
916 return status;
917 if (seq1 != NULL)
918 seq1 = tre_ast_new_catenation(mem, seq1, copy);
919 else
920 seq1 = copy;
921 if (seq1 == NULL)
922 return REG_ESPACE;
923 }
924
925 if (iter->max == -1)
926 {
927 /* No upper limit. */
928 pos_add_save = pos_add;
929 status = tre_copy_ast(mem, stack, iter->arg, 0,
930 &pos_add, NULL, &seq2, &max_pos);
931 if (status != REG_OK)
932 return status;
933 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
934 if (seq2 == NULL)
935 return REG_ESPACE;
936 }
937 else
938 {
939 for (j = iter->min; j < iter->max; j++)
940 {
941 tre_ast_node_t *tmp, *copy;
942 pos_add_save = pos_add;
943 status = tre_copy_ast(mem, stack, iter->arg, 0,
944 &pos_add, NULL, ©, &max_pos);
945 if (status != REG_OK)
946 return status;
947 if (seq2 != NULL)
948 seq2 = tre_ast_new_catenation(mem, copy, seq2);
949 else
950 seq2 = copy;
951 if (seq2 == NULL)
952 return REG_ESPACE;
953 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
954 if (tmp == NULL)
955 return REG_ESPACE;
956 seq2 = tre_ast_new_union(mem, tmp, seq2);
957 if (seq2 == NULL)
958 return REG_ESPACE;
959 }
960 }
961
962 pos_add = pos_add_save;
963 if (seq1 == NULL)
964 seq1 = seq2;
965 else if (seq2 != NULL)
966 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
967 if (seq1 == NULL)
968 return REG_ESPACE;
969 node->obj = seq1->obj;
970 node->type = seq1->type;
971 }
972
973 iter_depth--;
974 pos_add_total += pos_add - pos_add_last;
975 if (iter_depth == 0)
976 pos_add = pos_add_total;
977
978 /* If approximate parameters are specified, surround the result
979 with two parameter setting nodes. The one on the left sets
980 the specified parameters, and the one on the right restores
981 the old parameters. */
982 if (iter->params)
983 {
984 tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy;
985 int *old_params;
986
987 tmp_l = tre_ast_new_literal(mem, PARAMETER, 0, -1);
988 if (!tmp_l)
989 return REG_ESPACE;
990 ((tre_literal_t *)tmp_l->obj)->u.params = iter->params;
991 iter->params[TRE_PARAM_DEPTH] = params_depth + 1;
992 tmp_r = tre_ast_new_literal(mem, PARAMETER, 0, -1);
993 if (!tmp_r)
994 return REG_ESPACE;
995 old_params = tre_mem_alloc(mem, sizeof(*old_params)
996 * TRE_PARAM_LAST);
997 if (!old_params)
998 return REG_ESPACE;
999 for (i = 0; i < TRE_PARAM_LAST; i++)
1000 old_params[i] = params[i];
1001 ((tre_literal_t *)tmp_r->obj)->u.params = old_params;
1002 old_params[TRE_PARAM_DEPTH] = params_depth;
1003 /* XXX - this is the only place where ast_new_node is
1004 needed -- should be moved inside AST module. */
1005 node_copy = tre_ast_new_node(mem, ITERATION,
1006 sizeof(tre_iteration_t));
1007 if (!node_copy)
1008 return REG_ESPACE;
1009 node_copy->obj = node->obj;
1010 tmp_node = tre_ast_new_catenation(mem, tmp_l, node_copy);
1011 if (!tmp_node)
1012 return REG_ESPACE;
1013 tmp_node = tre_ast_new_catenation(mem, tmp_node, tmp_r);
1014 if (!tmp_node)
1015 return REG_ESPACE;
1016 /* Replace the contents of `node' with `tmp_node'. */
1017 memcpy(node, tmp_node, sizeof(*node));
1018 node->obj = tmp_node->obj;
1019 node->type = tmp_node->type;
1020 params_depth++;
1021 if (params_depth > *max_depth)
1022 *max_depth = params_depth;
1023 }
1024 break;
1025 }
1026 default:
1027 assert(0);
1028 break;
1029 }
1030 }
1031
1032 *position += pos_add_total;
1033
1034 /* `max_pos' should never be larger than `*position' if the above
1035 code works, but just an extra safeguard let's make sure
1036 `*position' is set large enough so enough memory will be
1037 allocated for the transition table. */
1038 if (max_pos > *position)
1039 *position = max_pos;
1040
1041 #ifdef TRE_DEBUG
1042 DPRINT(("Expanded AST:\n"));
1043 tre_ast_print(ast);
1044 DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
1045 #endif
1046
1047 return status;
1048 }
1049
1050 static tre_pos_and_tags_t *
tre_set_empty(tre_mem_t mem)1051 tre_set_empty(tre_mem_t mem)
1052 {
1053 tre_pos_and_tags_t *new_set;
1054
1055 new_set = tre_mem_calloc(mem, sizeof(*new_set));
1056 if (new_set == NULL)
1057 return NULL;
1058
1059 new_set[0].position = -1;
1060 new_set[0].code_min = -1;
1061 new_set[0].code_max = -1;
1062
1063 return new_set;
1064 }
1065
1066 static tre_pos_and_tags_t *
tre_set_one(tre_mem_t mem,int position,int code_min,int code_max,tre_ctype_t class,tre_ctype_t * neg_classes,int backref)1067 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
1068 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
1069 {
1070 tre_pos_and_tags_t *new_set;
1071
1072 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
1073 if (new_set == NULL)
1074 return NULL;
1075
1076 new_set[0].position = position;
1077 new_set[0].code_min = code_min;
1078 new_set[0].code_max = code_max;
1079 new_set[0].class = class;
1080 new_set[0].neg_classes = neg_classes;
1081 new_set[0].backref = backref;
1082 new_set[1].position = -1;
1083 new_set[1].code_min = -1;
1084 new_set[1].code_max = -1;
1085
1086 return new_set;
1087 }
1088
1089 static tre_pos_and_tags_t *
tre_set_union(tre_mem_t mem,tre_pos_and_tags_t * set1,tre_pos_and_tags_t * set2,int * tags,int assertions,int * params)1090 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
1091 int *tags, int assertions, int *params)
1092 {
1093 int s1, s2, i, j;
1094 tre_pos_and_tags_t *new_set;
1095 int *new_tags;
1096 int num_tags;
1097
1098 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
1099 for (s1 = 0; set1[s1].position >= 0; s1++);
1100 for (s2 = 0; set2[s2].position >= 0; s2++);
1101 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
1102 if (!new_set )
1103 return NULL;
1104
1105 for (s1 = 0; set1[s1].position >= 0; s1++)
1106 {
1107 new_set[s1].position = set1[s1].position;
1108 new_set[s1].code_min = set1[s1].code_min;
1109 new_set[s1].code_max = set1[s1].code_max;
1110 new_set[s1].assertions = set1[s1].assertions | assertions;
1111 new_set[s1].class = set1[s1].class;
1112 new_set[s1].neg_classes = set1[s1].neg_classes;
1113 new_set[s1].backref = set1[s1].backref;
1114 if (set1[s1].tags == NULL && tags == NULL)
1115 new_set[s1].tags = NULL;
1116 else
1117 {
1118 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
1119 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
1120 * (i + num_tags + 1)));
1121 if (new_tags == NULL)
1122 return NULL;
1123 for (j = 0; j < i; j++)
1124 new_tags[j] = set1[s1].tags[j];
1125 for (i = 0; i < num_tags; i++)
1126 new_tags[j + i] = tags[i];
1127 new_tags[j + i] = -1;
1128 new_set[s1].tags = new_tags;
1129 }
1130 if (set1[s1].params)
1131 new_set[s1].params = set1[s1].params;
1132 if (params)
1133 {
1134 if (!new_set[s1].params)
1135 new_set[s1].params = params;
1136 else
1137 {
1138 new_set[s1].params = tre_mem_alloc(mem, sizeof(*params) *
1139 TRE_PARAM_LAST);
1140 if (!new_set[s1].params)
1141 return NULL;
1142 for (i = 0; i < TRE_PARAM_LAST; i++)
1143 if (params[i] != TRE_PARAM_UNSET)
1144 new_set[s1].params[i] = params[i];
1145 }
1146 }
1147 }
1148
1149 for (s2 = 0; set2[s2].position >= 0; s2++)
1150 {
1151 new_set[s1 + s2].position = set2[s2].position;
1152 new_set[s1 + s2].code_min = set2[s2].code_min;
1153 new_set[s1 + s2].code_max = set2[s2].code_max;
1154 /* XXX - why not | assertions here as well? */
1155 new_set[s1 + s2].assertions = set2[s2].assertions;
1156 new_set[s1 + s2].class = set2[s2].class;
1157 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
1158 new_set[s1 + s2].backref = set2[s2].backref;
1159 if (set2[s2].tags == NULL)
1160 new_set[s1 + s2].tags = NULL;
1161 else
1162 {
1163 for (i = 0; set2[s2].tags[i] >= 0; i++);
1164 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
1165 if (new_tags == NULL)
1166 return NULL;
1167 for (j = 0; j < i; j++)
1168 new_tags[j] = set2[s2].tags[j];
1169 new_tags[j] = -1;
1170 new_set[s1 + s2].tags = new_tags;
1171 }
1172 if (set2[s2].params)
1173 new_set[s1 + s2].params = set2[s2].params;
1174 if (params)
1175 {
1176 if (!new_set[s1 + s2].params)
1177 new_set[s1 + s2].params = params;
1178 else
1179 {
1180 new_set[s1 + s2].params = tre_mem_alloc(mem, sizeof(*params) *
1181 TRE_PARAM_LAST);
1182 if (!new_set[s1 + s2].params)
1183 return NULL;
1184 for (i = 0; i < TRE_PARAM_LAST; i++)
1185 if (params[i] != TRE_PARAM_UNSET)
1186 new_set[s1 + s2].params[i] = params[i];
1187 }
1188 }
1189 }
1190 new_set[s1 + s2].position = -1;
1191 return new_set;
1192 }
1193
1194 /* Finds the empty path through `node' which is the one that should be
1195 taken according to POSIX.2 rules, and adds the tags on that path to
1196 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
1197 set to the number of tags seen on the path. */
1198 static reg_errcode_t
tre_match_empty(tre_stack_t * stack,tre_ast_node_t * node,int * tags,int * assertions,int * params,int * num_tags_seen,int * params_seen)1199 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
1200 int *assertions, int *params, int *num_tags_seen,
1201 int *params_seen)
1202 {
1203 tre_literal_t *lit;
1204 tre_union_t *uni;
1205 tre_catenation_t *cat;
1206 tre_iteration_t *iter;
1207 int i;
1208 int bottom = tre_stack_num_objects(stack);
1209 reg_errcode_t status = REG_OK;
1210 if (num_tags_seen)
1211 *num_tags_seen = 0;
1212 if (params_seen)
1213 *params_seen = 0;
1214
1215 status = tre_stack_push_voidptr(stack, node);
1216
1217 /* Walk through the tree recursively. */
1218 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1219 {
1220 node = tre_stack_pop_voidptr(stack);
1221
1222 switch (node->type)
1223 {
1224 case LITERAL:
1225 lit = (tre_literal_t *)node->obj;
1226 switch (lit->code_min)
1227 {
1228 case TAG:
1229 if (lit->code_max >= 0)
1230 {
1231 if (tags != NULL)
1232 {
1233 /* Add the tag to `tags'. */
1234 for (i = 0; tags[i] >= 0; i++)
1235 if (tags[i] == lit->code_max)
1236 break;
1237 if (tags[i] < 0)
1238 {
1239 tags[i] = lit->code_max;
1240 tags[i + 1] = -1;
1241 }
1242 }
1243 if (num_tags_seen)
1244 (*num_tags_seen)++;
1245 }
1246 break;
1247 case ASSERTION:
1248 assert(lit->code_max >= 1
1249 || lit->code_max <= ASSERT_LAST);
1250 if (assertions != NULL)
1251 *assertions |= lit->code_max;
1252 break;
1253 case PARAMETER:
1254 if (params != NULL)
1255 for (i = 0; i < TRE_PARAM_LAST; i++)
1256 params[i] = lit->u.params[i];
1257 if (params_seen != NULL)
1258 *params_seen = 1;
1259 break;
1260 case EMPTY:
1261 break;
1262 default:
1263 assert(0);
1264 break;
1265 }
1266 break;
1267
1268 case UNION:
1269 /* Subexpressions starting earlier take priority over ones
1270 starting later, so we prefer the left subexpression over the
1271 right subexpression. */
1272 uni = (tre_union_t *)node->obj;
1273 if (uni->left->nullable)
1274 STACK_PUSHX(stack, voidptr, uni->left)
1275 else if (uni->right->nullable)
1276 STACK_PUSHX(stack, voidptr, uni->right)
1277 else
1278 assert(0);
1279 break;
1280
1281 case CATENATION:
1282 /* The path must go through both children. */
1283 cat = (tre_catenation_t *)node->obj;
1284 assert(cat->left->nullable);
1285 assert(cat->right->nullable);
1286 STACK_PUSHX(stack, voidptr, cat->left);
1287 STACK_PUSHX(stack, voidptr, cat->right);
1288 break;
1289
1290 case ITERATION:
1291 /* A match with an empty string is preferred over no match at
1292 all, so we go through the argument if possible. */
1293 iter = (tre_iteration_t *)node->obj;
1294 if (iter->arg->nullable)
1295 STACK_PUSHX(stack, voidptr, iter->arg);
1296 break;
1297
1298 default:
1299 assert(0);
1300 break;
1301 }
1302 }
1303
1304 return status;
1305 }
1306
1307
1308 typedef enum {
1309 NFL_RECURSE,
1310 NFL_POST_UNION,
1311 NFL_POST_CATENATION,
1312 NFL_POST_ITERATION
1313 } tre_nfl_stack_symbol_t;
1314
1315
1316 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
1317 the nodes of the AST `tree'. */
1318 static reg_errcode_t
tre_compute_nfl(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree)1319 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
1320 {
1321 int bottom = tre_stack_num_objects(stack);
1322
1323 STACK_PUSHR(stack, voidptr, tree);
1324 STACK_PUSHR(stack, int, NFL_RECURSE);
1325
1326 while (tre_stack_num_objects(stack) > bottom)
1327 {
1328 tre_nfl_stack_symbol_t symbol;
1329 tre_ast_node_t *node;
1330
1331 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
1332 node = tre_stack_pop_voidptr(stack);
1333 switch (symbol)
1334 {
1335 case NFL_RECURSE:
1336 switch (node->type)
1337 {
1338 case LITERAL:
1339 {
1340 tre_literal_t *lit = (tre_literal_t *)node->obj;
1341 if (IS_BACKREF(lit))
1342 {
1343 /* Back references: nullable = false, firstpos = {i},
1344 lastpos = {i}. */
1345 node->nullable = 0;
1346 node->firstpos = tre_set_one(mem, lit->position, 0,
1347 TRE_CHAR_MAX, 0, NULL, -1);
1348 if (!node->firstpos)
1349 return REG_ESPACE;
1350 node->lastpos = tre_set_one(mem, lit->position, 0,
1351 TRE_CHAR_MAX, 0, NULL,
1352 (int)lit->code_max);
1353 if (!node->lastpos)
1354 return REG_ESPACE;
1355 }
1356 else if (lit->code_min < 0)
1357 {
1358 /* Tags, empty strings, params, and zero width assertions:
1359 nullable = true, firstpos = {}, and lastpos = {}. */
1360 node->nullable = 1;
1361 node->firstpos = tre_set_empty(mem);
1362 if (!node->firstpos)
1363 return REG_ESPACE;
1364 node->lastpos = tre_set_empty(mem);
1365 if (!node->lastpos)
1366 return REG_ESPACE;
1367 }
1368 else
1369 {
1370 /* Literal at position i: nullable = false, firstpos = {i},
1371 lastpos = {i}. */
1372 node->nullable = 0;
1373 node->firstpos =
1374 tre_set_one(mem, lit->position, (int)lit->code_min,
1375 (int)lit->code_max, 0, NULL, -1);
1376 if (!node->firstpos)
1377 return REG_ESPACE;
1378 node->lastpos = tre_set_one(mem, lit->position,
1379 (int)lit->code_min,
1380 (int)lit->code_max,
1381 lit->u.class, lit->neg_classes,
1382 -1);
1383 if (!node->lastpos)
1384 return REG_ESPACE;
1385 }
1386 break;
1387 }
1388
1389 case UNION:
1390 /* Compute the attributes for the two subtrees, and after that
1391 for this node. */
1392 STACK_PUSHR(stack, voidptr, node);
1393 STACK_PUSHR(stack, int, NFL_POST_UNION);
1394 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
1395 STACK_PUSHR(stack, int, NFL_RECURSE);
1396 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
1397 STACK_PUSHR(stack, int, NFL_RECURSE);
1398 break;
1399
1400 case CATENATION:
1401 /* Compute the attributes for the two subtrees, and after that
1402 for this node. */
1403 STACK_PUSHR(stack, voidptr, node);
1404 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
1405 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
1406 STACK_PUSHR(stack, int, NFL_RECURSE);
1407 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
1408 STACK_PUSHR(stack, int, NFL_RECURSE);
1409 break;
1410
1411 case ITERATION:
1412 /* Compute the attributes for the subtree, and after that for
1413 this node. */
1414 STACK_PUSHR(stack, voidptr, node);
1415 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
1416 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
1417 STACK_PUSHR(stack, int, NFL_RECURSE);
1418 break;
1419 }
1420 break; /* end case: NFL_RECURSE */
1421
1422 case NFL_POST_UNION:
1423 {
1424 tre_union_t *uni = (tre_union_t *)node->obj;
1425 node->nullable = uni->left->nullable || uni->right->nullable;
1426 node->firstpos = tre_set_union(mem, uni->left->firstpos,
1427 uni->right->firstpos, NULL, 0, NULL);
1428 if (!node->firstpos)
1429 return REG_ESPACE;
1430 node->lastpos = tre_set_union(mem, uni->left->lastpos,
1431 uni->right->lastpos, NULL, 0, NULL);
1432 if (!node->lastpos)
1433 return REG_ESPACE;
1434 break;
1435 }
1436
1437 case NFL_POST_ITERATION:
1438 {
1439 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
1440
1441 if (iter->min == 0 || iter->arg->nullable)
1442 node->nullable = 1;
1443 else
1444 node->nullable = 0;
1445 node->firstpos = iter->arg->firstpos;
1446 node->lastpos = iter->arg->lastpos;
1447 break;
1448 }
1449
1450 case NFL_POST_CATENATION:
1451 {
1452 int num_tags, *tags, assertions, params_seen;
1453 int *params;
1454 reg_errcode_t status;
1455 tre_catenation_t *cat = node->obj;
1456 node->nullable = cat->left->nullable && cat->right->nullable;
1457
1458 /* Compute firstpos. */
1459 if (cat->left->nullable)
1460 {
1461 /* The left side matches the empty string. Make a first pass
1462 with tre_match_empty() to get the number of tags and
1463 parameters. */
1464 status = tre_match_empty(stack, cat->left,
1465 NULL, NULL, NULL, &num_tags,
1466 ¶ms_seen);
1467 if (status != REG_OK)
1468 return status;
1469 /* Allocate arrays for the tags and parameters. */
1470 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
1471 if (!tags)
1472 return REG_ESPACE;
1473 tags[0] = -1;
1474 assertions = 0;
1475 params = NULL;
1476 if (params_seen)
1477 {
1478 params = tre_mem_alloc(mem, sizeof(*params)
1479 * TRE_PARAM_LAST);
1480 if (!params)
1481 {
1482 xfree(tags);
1483 return REG_ESPACE;
1484 }
1485 }
1486 /* Second pass with tre_mach_empty() to get the list of
1487 tags and parameters. */
1488 status = tre_match_empty(stack, cat->left, tags,
1489 &assertions, params, NULL, NULL);
1490 if (status != REG_OK)
1491 {
1492 xfree(tags);
1493 return status;
1494 }
1495 node->firstpos =
1496 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
1497 tags, assertions, params);
1498 xfree(tags);
1499 if (!node->firstpos)
1500 return REG_ESPACE;
1501 }
1502 else
1503 {
1504 node->firstpos = cat->left->firstpos;
1505 }
1506
1507 /* Compute lastpos. */
1508 if (cat->right->nullable)
1509 {
1510 /* The right side matches the empty string. Make a first pass
1511 with tre_match_empty() to get the number of tags and
1512 parameters. */
1513 status = tre_match_empty(stack, cat->right,
1514 NULL, NULL, NULL, &num_tags,
1515 ¶ms_seen);
1516 if (status != REG_OK)
1517 return status;
1518 /* Allocate arrays for the tags and parameters. */
1519 tags = xmalloc(sizeof(int) * (num_tags + 1));
1520 if (!tags)
1521 return REG_ESPACE;
1522 tags[0] = -1;
1523 assertions = 0;
1524 params = NULL;
1525 if (params_seen)
1526 {
1527 params = tre_mem_alloc(mem, sizeof(*params)
1528 * TRE_PARAM_LAST);
1529 if (!params)
1530 {
1531 xfree(tags);
1532 return REG_ESPACE;
1533 }
1534 }
1535 /* Second pass with tre_mach_empty() to get the list of
1536 tags and parameters. */
1537 status = tre_match_empty(stack, cat->right, tags,
1538 &assertions, params, NULL, NULL);
1539 if (status != REG_OK)
1540 {
1541 xfree(tags);
1542 return status;
1543 }
1544 node->lastpos =
1545 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
1546 tags, assertions, params);
1547 xfree(tags);
1548 if (!node->lastpos)
1549 return REG_ESPACE;
1550 }
1551 else
1552 {
1553 node->lastpos = cat->right->lastpos;
1554 }
1555 break;
1556 }
1557
1558 default:
1559 assert(0);
1560 break;
1561 }
1562 }
1563
1564 return REG_OK;
1565 }
1566
1567
1568 /* Adds a transition from each position in `p1' to each position in `p2'. */
1569 static reg_errcode_t
tre_make_trans(tre_pos_and_tags_t * p1,tre_pos_and_tags_t * p2,tre_tnfa_transition_t * transitions,int * counts,int * offs)1570 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
1571 tre_tnfa_transition_t *transitions,
1572 int *counts, int *offs)
1573 {
1574 tre_pos_and_tags_t *orig_p2 = p2;
1575 tre_tnfa_transition_t *trans;
1576 int i, j, k, l, dup, prev_p2_pos;
1577
1578 if (transitions != NULL)
1579 while (p1->position >= 0)
1580 {
1581 p2 = orig_p2;
1582 prev_p2_pos = -1;
1583 while (p2->position >= 0)
1584 {
1585 /* Optimization: if this position was already handled, skip it. */
1586 if (p2->position == prev_p2_pos)
1587 {
1588 p2++;
1589 continue;
1590 }
1591 prev_p2_pos = p2->position;
1592 /* Set `trans' to point to the next unused transition from
1593 position `p1->position'. */
1594 trans = transitions + offs[p1->position];
1595 while (trans->state != NULL)
1596 {
1597 #if 0
1598 /* If we find a previous transition from `p1->position' to
1599 `p2->position', it is overwritten. This can happen only
1600 if there are nested loops in the regexp, like in "((a)*)*".
1601 In POSIX.2 repetition using the outer loop is always
1602 preferred over using the inner loop. Therefore the
1603 transition for the inner loop is useless and can be thrown
1604 away. */
1605 /* XXX - The same position is used for all nodes in a bracket
1606 expression, so this optimization cannot be used (it will
1607 break bracket expressions) unless I figure out a way to
1608 detect it here. */
1609 if (trans->state_id == p2->position)
1610 {
1611 DPRINT(("*"));
1612 break;
1613 }
1614 #endif
1615 trans++;
1616 }
1617
1618 if (trans->state == NULL)
1619 (trans + 1)->state = NULL;
1620 /* Use the character ranges, assertions, etc. from `p1' for
1621 the transition from `p1' to `p2'. */
1622 trans->code_min = p1->code_min;
1623 trans->code_max = p1->code_max;
1624 trans->state = transitions + offs[p2->position];
1625 trans->state_id = p2->position;
1626 trans->assertions = p1->assertions | p2->assertions
1627 | (p1->class ? ASSERT_CHAR_CLASS : 0)
1628 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
1629 if (p1->backref >= 0)
1630 {
1631 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
1632 assert(p2->backref < 0);
1633 trans->u.backref = p1->backref;
1634 trans->assertions |= ASSERT_BACKREF;
1635 }
1636 else
1637 trans->u.class = p1->class;
1638 if (p1->neg_classes != NULL)
1639 {
1640 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
1641 trans->neg_classes =
1642 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
1643 if (trans->neg_classes == NULL)
1644 return REG_ESPACE;
1645 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
1646 trans->neg_classes[i] = p1->neg_classes[i];
1647 trans->neg_classes[i] = (tre_ctype_t)0;
1648 }
1649 else
1650 trans->neg_classes = NULL;
1651
1652 /* Find out how many tags this transition has. */
1653 i = 0;
1654 if (p1->tags != NULL)
1655 while(p1->tags[i] >= 0)
1656 i++;
1657 j = 0;
1658 if (p2->tags != NULL)
1659 while(p2->tags[j] >= 0)
1660 j++;
1661
1662 /* If we are overwriting a transition, free the old tag array. */
1663 if (trans->tags != NULL)
1664 xfree(trans->tags);
1665 trans->tags = NULL;
1666
1667 /* If there were any tags, allocate an array and fill it. */
1668 if (i + j > 0)
1669 {
1670 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
1671 if (!trans->tags)
1672 return REG_ESPACE;
1673 i = 0;
1674 if (p1->tags != NULL)
1675 while(p1->tags[i] >= 0)
1676 {
1677 trans->tags[i] = p1->tags[i];
1678 i++;
1679 }
1680 l = i;
1681 j = 0;
1682 if (p2->tags != NULL)
1683 while (p2->tags[j] >= 0)
1684 {
1685 /* Don't add duplicates. */
1686 dup = 0;
1687 for (k = 0; k < i; k++)
1688 if (trans->tags[k] == p2->tags[j])
1689 {
1690 dup = 1;
1691 break;
1692 }
1693 if (!dup)
1694 trans->tags[l++] = p2->tags[j];
1695 j++;
1696 }
1697 trans->tags[l] = -1;
1698 }
1699
1700 /* Set the parameter array. If both `p2' and `p1' have same
1701 parameters, the values in `p2' override those in `p1'. */
1702 if (p1->params || p2->params)
1703 {
1704 if (!trans->params)
1705 trans->params = xmalloc(sizeof(*trans->params)
1706 * TRE_PARAM_LAST);
1707 if (!trans->params)
1708 return REG_ESPACE;
1709 for (i = 0; i < TRE_PARAM_LAST; i++)
1710 {
1711 trans->params[i] = TRE_PARAM_UNSET;
1712 if (p1->params && p1->params[i] != TRE_PARAM_UNSET)
1713 trans->params[i] = p1->params[i];
1714 if (p2->params && p2->params[i] != TRE_PARAM_UNSET)
1715 trans->params[i] = p2->params[i];
1716 }
1717 }
1718 else
1719 {
1720 if (trans->params)
1721 xfree(trans->params);
1722 trans->params = NULL;
1723 }
1724
1725
1726 #ifdef TRE_DEBUG
1727 {
1728 int *tags;
1729
1730 DPRINT((" %2d -> %2d on %3d", p1->position, p2->position,
1731 p1->code_min));
1732 if (p1->code_max != p1->code_min)
1733 DPRINT(("-%3d", p1->code_max));
1734 tags = trans->tags;
1735 if (tags)
1736 {
1737 DPRINT((", tags ["));
1738 while (*tags >= 0)
1739 {
1740 DPRINT(("%d", *tags));
1741 tags++;
1742 if (*tags >= 0)
1743 DPRINT((","));
1744 }
1745 DPRINT(("]"));
1746 }
1747 if (trans->assertions)
1748 DPRINT((", assert %d", trans->assertions));
1749 if (trans->assertions & ASSERT_BACKREF)
1750 DPRINT((", backref %d", trans->u.backref));
1751 else if (trans->u.class)
1752 DPRINT((", class %ld", (long)trans->u.class));
1753 if (trans->neg_classes)
1754 DPRINT((", neg_classes %p", trans->neg_classes));
1755 if (trans->params)
1756 {
1757 DPRINT((", "));
1758 tre_print_params(trans->params);
1759 }
1760 DPRINT(("\n"));
1761 }
1762 #endif /* TRE_DEBUG */
1763 p2++;
1764 }
1765 p1++;
1766 }
1767 else
1768 /* Compute a maximum limit for the number of transitions leaving
1769 from each state. */
1770 while (p1->position >= 0)
1771 {
1772 p2 = orig_p2;
1773 while (p2->position >= 0)
1774 {
1775 counts[p1->position]++;
1776 p2++;
1777 }
1778 p1++;
1779 }
1780 return REG_OK;
1781 }
1782
1783 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
1784 labelled with one character range (there are no transitions on empty
1785 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
1786 the regexp. */
1787 static reg_errcode_t
tre_ast_to_tnfa(tre_ast_node_t * node,tre_tnfa_transition_t * transitions,int * counts,int * offs)1788 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
1789 int *counts, int *offs)
1790 {
1791 tre_union_t *uni;
1792 tre_catenation_t *cat;
1793 tre_iteration_t *iter;
1794 reg_errcode_t errcode = REG_OK;
1795
1796 /* XXX - recurse using a stack!. */
1797 switch (node->type)
1798 {
1799 case LITERAL:
1800 break;
1801 case UNION:
1802 uni = (tre_union_t *)node->obj;
1803 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
1804 if (errcode != REG_OK)
1805 return errcode;
1806 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
1807 break;
1808
1809 case CATENATION:
1810 cat = (tre_catenation_t *)node->obj;
1811 /* Add a transition from each position in cat->left->lastpos
1812 to each position in cat->right->firstpos. */
1813 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
1814 transitions, counts, offs);
1815 if (errcode != REG_OK)
1816 return errcode;
1817 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
1818 if (errcode != REG_OK)
1819 return errcode;
1820 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
1821 break;
1822
1823 case ITERATION:
1824 iter = (tre_iteration_t *)node->obj;
1825 assert(iter->max == -1 || iter->max == 1);
1826
1827 if (iter->max == -1)
1828 {
1829 assert(iter->min == 0 || iter->min == 1);
1830 /* Add a transition from each last position in the iterated
1831 expression to each first position. */
1832 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
1833 transitions, counts, offs);
1834 if (errcode != REG_OK)
1835 return errcode;
1836 }
1837 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
1838 break;
1839 }
1840 return errcode;
1841 }
1842
1843
1844 #define ERROR_EXIT(err) \
1845 do \
1846 { \
1847 errcode = err; \
1848 if (/*CONSTCOND*/1) \
1849 goto error_exit; \
1850 } \
1851 while (/*CONSTCOND*/0)
1852
1853
1854 int
tre_compile(regex_t * preg,const tre_char_t * regex,size_t n,int cflags)1855 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
1856 {
1857 tre_stack_t *stack;
1858 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
1859 tre_pos_and_tags_t *p;
1860 int *counts = NULL, *offs = NULL;
1861 int i, add = 0;
1862 tre_tnfa_transition_t *transitions, *initial;
1863 tre_tnfa_t *tnfa = NULL;
1864 tre_submatch_data_t *submatch_data;
1865 tre_tag_direction_t *tag_directions = NULL;
1866 reg_errcode_t errcode;
1867 tre_mem_t mem;
1868
1869 /* Parse context. */
1870 tre_parse_ctx_t parse_ctx;
1871
1872 /* Allocate a stack used throughout the compilation process for various
1873 purposes. */
1874 stack = tre_stack_new(512, 10240, 128);
1875 if (!stack)
1876 return REG_ESPACE;
1877 /* Allocate a fast memory allocator. */
1878 mem = tre_mem_new();
1879 if (!mem)
1880 {
1881 tre_stack_destroy(stack);
1882 return REG_ESPACE;
1883 }
1884
1885 /* Parse the regexp. */
1886 memset(&parse_ctx, 0, sizeof(parse_ctx));
1887 parse_ctx.mem = mem;
1888 parse_ctx.stack = stack;
1889 parse_ctx.re = regex;
1890 parse_ctx.len = n;
1891 parse_ctx.cflags = cflags;
1892 parse_ctx.max_backref = -1;
1893 DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex));
1894 errcode = tre_parse(&parse_ctx);
1895 if (errcode != REG_OK)
1896 ERROR_EXIT(errcode);
1897 preg->re_nsub = parse_ctx.submatch_id - 1;
1898 tree = parse_ctx.result;
1899
1900 /* Back references and approximate matching cannot currently be used
1901 in the same regexp. */
1902 if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
1903 ERROR_EXIT(REG_BADPAT);
1904
1905 #ifdef TRE_DEBUG
1906 tre_ast_print(tree);
1907 #endif /* TRE_DEBUG */
1908
1909 /* Referring to nonexistent subexpressions is illegal. */
1910 if (parse_ctx.max_backref > (int)preg->re_nsub)
1911 ERROR_EXIT(REG_ESUBREG);
1912
1913 /* Allocate the TNFA struct. */
1914 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
1915 if (tnfa == NULL)
1916 ERROR_EXIT(REG_ESPACE);
1917 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
1918 tnfa->have_approx = parse_ctx.have_approx;
1919 tnfa->num_submatches = parse_ctx.submatch_id;
1920
1921 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
1922 regexp does not have back references, this can be skipped. */
1923 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
1924 {
1925 DPRINT(("tre_compile: setting up tags\n"));
1926
1927 /* Figure out how many tags we will need. */
1928 errcode = tre_add_tags(NULL, stack, tree, tnfa);
1929 if (errcode != REG_OK)
1930 ERROR_EXIT(errcode);
1931 #ifdef TRE_DEBUG
1932 tre_ast_print(tree);
1933 #endif /* TRE_DEBUG */
1934
1935 if (tnfa->num_tags > 0)
1936 {
1937 tag_directions = xmalloc(sizeof(*tag_directions)
1938 * (tnfa->num_tags + 1));
1939 if (tag_directions == NULL)
1940 ERROR_EXIT(REG_ESPACE);
1941 tnfa->tag_directions = tag_directions;
1942 memset(tag_directions, -1,
1943 sizeof(*tag_directions) * (tnfa->num_tags + 1));
1944 }
1945 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
1946 sizeof(tnfa->minimal_tags));
1947 if (tnfa->minimal_tags == NULL)
1948 ERROR_EXIT(REG_ESPACE);
1949
1950 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
1951 sizeof(*submatch_data));
1952 if (submatch_data == NULL)
1953 ERROR_EXIT(REG_ESPACE);
1954 tnfa->submatch_data = submatch_data;
1955
1956 errcode = tre_add_tags(mem, stack, tree, tnfa);
1957 if (errcode != REG_OK)
1958 ERROR_EXIT(errcode);
1959
1960 #ifdef TRE_DEBUG
1961 for (i = 0; i < parse_ctx.submatch_id; i++)
1962 DPRINT(("pmatch[%d] = {t%d, t%d}\n",
1963 i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
1964 for (i = 0; i < tnfa->num_tags; i++)
1965 DPRINT(("t%d is %s\n", i,
1966 tag_directions[i] == TRE_TAG_MINIMIZE ?
1967 "minimized" : "maximized"));
1968 #endif /* TRE_DEBUG */
1969 }
1970
1971 /* Expand iteration nodes. */
1972 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
1973 tag_directions, &tnfa->params_depth);
1974 if (errcode != REG_OK)
1975 ERROR_EXIT(errcode);
1976
1977 /* Add a dummy node for the final state.
1978 XXX - For certain patterns this dummy node can be optimized away,
1979 for example "a*" or "ab*". Figure out a simple way to detect
1980 this possibility. */
1981 tmp_ast_l = tree;
1982 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
1983 if (tmp_ast_r == NULL)
1984 ERROR_EXIT(REG_ESPACE);
1985
1986 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
1987 if (tree == NULL)
1988 ERROR_EXIT(REG_ESPACE);
1989
1990 #ifdef TRE_DEBUG
1991 tre_ast_print(tree);
1992 DPRINT(("Number of states: %d\n", parse_ctx.position));
1993 #endif /* TRE_DEBUG */
1994
1995 errcode = tre_compute_nfl(mem, stack, tree);
1996 if (errcode != REG_OK)
1997 ERROR_EXIT(errcode);
1998
1999 counts = xmalloc(sizeof(int) * parse_ctx.position);
2000 if (counts == NULL)
2001 ERROR_EXIT(REG_ESPACE);
2002
2003 offs = xmalloc(sizeof(int) * parse_ctx.position);
2004 if (offs == NULL)
2005 ERROR_EXIT(REG_ESPACE);
2006
2007 for (i = 0; i < parse_ctx.position; i++)
2008 counts[i] = 0;
2009 tre_ast_to_tnfa(tree, NULL, counts, NULL);
2010
2011 add = 0;
2012 for (i = 0; i < parse_ctx.position; i++)
2013 {
2014 offs[i] = add;
2015 add += counts[i] + 1;
2016 counts[i] = 0;
2017 }
2018 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2019 if (transitions == NULL)
2020 ERROR_EXIT(REG_ESPACE);
2021 tnfa->transitions = transitions;
2022 tnfa->num_transitions = add;
2023
2024 DPRINT(("Converting to TNFA:\n"));
2025 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2026 if (errcode != REG_OK)
2027 ERROR_EXIT(errcode);
2028
2029 /* If in eight bit mode, compute a table of characters that can be the
2030 first character of a match. */
2031 tnfa->first_char = -1;
2032 if (TRE_MB_CUR_MAX == 1 && !tmp_ast_l->nullable)
2033 {
2034 int count = 0;
2035 tre_cint_t k;
2036 DPRINT(("Characters that can start a match:"));
2037 tnfa->firstpos_chars = xcalloc(256, sizeof(char));
2038 if (tnfa->firstpos_chars == NULL)
2039 ERROR_EXIT(REG_ESPACE);
2040 for (p = tree->firstpos; p->position >= 0; p++)
2041 {
2042 tre_tnfa_transition_t *j = transitions + offs[p->position];
2043 while (j->state != NULL)
2044 {
2045 for (k = j->code_min; k <= j->code_max && k < 256; k++)
2046 {
2047 DPRINT((" %d", k));
2048 tnfa->firstpos_chars[k] = 1;
2049 count++;
2050 }
2051 j++;
2052 }
2053 }
2054 DPRINT(("\n"));
2055 #define TRE_OPTIMIZE_FIRST_CHAR 1
2056 #if TRE_OPTIMIZE_FIRST_CHAR
2057 if (count == 1)
2058 {
2059 for (k = 0; k < 256; k++)
2060 if (tnfa->firstpos_chars[k])
2061 {
2062 DPRINT(("first char must be %d\n", k));
2063 tnfa->first_char = k;
2064 xfree(tnfa->firstpos_chars);
2065 tnfa->firstpos_chars = NULL;
2066 break;
2067 }
2068 }
2069 #endif
2070
2071 }
2072 else
2073 tnfa->firstpos_chars = NULL;
2074
2075
2076 p = tree->firstpos;
2077 i = 0;
2078 while (p->position >= 0)
2079 {
2080 i++;
2081
2082 #ifdef TRE_DEBUG
2083 {
2084 int *tags;
2085 DPRINT(("initial: %d", p->position));
2086 tags = p->tags;
2087 if (tags != NULL)
2088 {
2089 if (*tags >= 0)
2090 DPRINT(("/"));
2091 while (*tags >= 0)
2092 {
2093 DPRINT(("%d", *tags));
2094 tags++;
2095 if (*tags >= 0)
2096 DPRINT((","));
2097 }
2098 }
2099 DPRINT((", assert %d", p->assertions));
2100 if (p->params)
2101 {
2102 DPRINT((", "));
2103 tre_print_params(p->params);
2104 }
2105 DPRINT(("\n"));
2106 }
2107 #endif /* TRE_DEBUG */
2108
2109 p++;
2110 }
2111
2112 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2113 if (initial == NULL)
2114 ERROR_EXIT(REG_ESPACE);
2115 tnfa->initial = initial;
2116
2117 i = 0;
2118 for (p = tree->firstpos; p->position >= 0; p++)
2119 {
2120 initial[i].state = transitions + offs[p->position];
2121 initial[i].state_id = p->position;
2122 initial[i].tags = NULL;
2123 /* Copy the arrays p->tags, and p->params, they are allocated
2124 from a tre_mem object. */
2125 if (p->tags)
2126 {
2127 int j;
2128 for (j = 0; p->tags[j] >= 0; j++);
2129 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2130 if (!initial[i].tags)
2131 ERROR_EXIT(REG_ESPACE);
2132 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2133 }
2134 initial[i].params = NULL;
2135 if (p->params)
2136 {
2137 initial[i].params = xmalloc(sizeof(*p->params) * TRE_PARAM_LAST);
2138 if (!initial[i].params)
2139 ERROR_EXIT(REG_ESPACE);
2140 memcpy(initial[i].params, p->params,
2141 sizeof(*p->params) * TRE_PARAM_LAST);
2142 }
2143 initial[i].assertions = p->assertions;
2144 i++;
2145 }
2146 initial[i].state = NULL;
2147
2148 tnfa->num_transitions = add;
2149 tnfa->final = transitions + offs[tree->lastpos[0].position];
2150 tnfa->num_states = parse_ctx.position;
2151 tnfa->cflags = cflags;
2152
2153 DPRINT(("final state %p\n", (void *)tnfa->final));
2154
2155 tre_mem_destroy(mem);
2156 tre_stack_destroy(stack);
2157 xfree(counts);
2158 xfree(offs);
2159
2160 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2161 return REG_OK;
2162
2163 error_exit:
2164 /* Free everything that was allocated and return the error code. */
2165 tre_mem_destroy(mem);
2166 if (stack != NULL)
2167 tre_stack_destroy(stack);
2168 if (counts != NULL)
2169 xfree(counts);
2170 if (offs != NULL)
2171 xfree(offs);
2172 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2173 tre_free(preg);
2174 return errcode;
2175 }
2176
2177
2178
2179
2180 void
tre_free(regex_t * preg)2181 tre_free(regex_t *preg)
2182 {
2183 tre_tnfa_t *tnfa;
2184 unsigned int i;
2185 tre_tnfa_transition_t *trans;
2186
2187 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2188 if (!tnfa)
2189 return;
2190
2191 for (i = 0; i < tnfa->num_transitions; i++)
2192 if (tnfa->transitions[i].state)
2193 {
2194 if (tnfa->transitions[i].tags)
2195 xfree(tnfa->transitions[i].tags);
2196 if (tnfa->transitions[i].neg_classes)
2197 xfree(tnfa->transitions[i].neg_classes);
2198 if (tnfa->transitions[i].params)
2199 xfree(tnfa->transitions[i].params);
2200 }
2201 if (tnfa->transitions)
2202 xfree(tnfa->transitions);
2203
2204 if (tnfa->initial)
2205 {
2206 for (trans = tnfa->initial; trans->state; trans++)
2207 {
2208 if (trans->tags)
2209 xfree(trans->tags);
2210 if (trans->params)
2211 xfree(trans->params);
2212 }
2213 xfree(tnfa->initial);
2214 }
2215
2216 if (tnfa->submatch_data)
2217 {
2218 for (i = 0; i < tnfa->num_submatches; i++)
2219 if (tnfa->submatch_data[i].parents)
2220 xfree(tnfa->submatch_data[i].parents);
2221 xfree(tnfa->submatch_data);
2222 }
2223
2224 if (tnfa->tag_directions)
2225 xfree(tnfa->tag_directions);
2226 if (tnfa->firstpos_chars)
2227 xfree(tnfa->firstpos_chars);
2228 if (tnfa->minimal_tags)
2229 xfree(tnfa->minimal_tags);
2230 xfree(tnfa);
2231 }
2232
2233 char *
tre_version(void)2234 tre_version(void)
2235 {
2236 static char str[256];
2237 char *version;
2238
2239 if (str[0] == 0)
2240 {
2241 (void) tre_config(TRE_CONFIG_VERSION, &version);
2242 (void) snprintf(str, sizeof(str), "TRE %s (BSD)", version);
2243 }
2244 return str;
2245 }
2246
2247 int
tre_config(int query,void * result)2248 tre_config(int query, void *result)
2249 {
2250 int *int_result = result;
2251 const char **string_result = result;
2252
2253 switch (query)
2254 {
2255 case TRE_CONFIG_APPROX:
2256 #ifdef TRE_APPROX
2257 *int_result = 1;
2258 #else /* !TRE_APPROX */
2259 *int_result = 0;
2260 #endif /* !TRE_APPROX */
2261 return REG_OK;
2262
2263 case TRE_CONFIG_WCHAR:
2264 #ifdef TRE_WCHAR
2265 *int_result = 1;
2266 #else /* !TRE_WCHAR */
2267 *int_result = 0;
2268 #endif /* !TRE_WCHAR */
2269 return REG_OK;
2270
2271 case TRE_CONFIG_MULTIBYTE:
2272 #ifdef TRE_MULTIBYTE
2273 *int_result = 1;
2274 #else /* !TRE_MULTIBYTE */
2275 *int_result = 0;
2276 #endif /* !TRE_MULTIBYTE */
2277 return REG_OK;
2278
2279 case TRE_CONFIG_SYSTEM_ABI:
2280 #ifdef TRE_CONFIG_SYSTEM_ABI
2281 *int_result = 1;
2282 #else /* !TRE_CONFIG_SYSTEM_ABI */
2283 *int_result = 0;
2284 #endif /* !TRE_CONFIG_SYSTEM_ABI */
2285 return REG_OK;
2286
2287 case TRE_CONFIG_VERSION:
2288 *string_result = TRE_VERSION;
2289 return REG_OK;
2290 }
2291
2292 return REG_NOMATCH;
2293 }
2294
2295
2296 /* EOF */
2297