1 /*
2 tre-compile.c - TRE regex compiler
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7 */
8
9 /*
10 TODO:
11 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
12 function calls.
13 */
14
15
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <stdio.h>
20 //#include <assert.h>
21 #include <string.h>
22
23 #include "tre-internal.h"
24 #include "tre-mem.h"
25 #include "tre-stack.h"
26 #include "tre-ast.h"
27 #include "tre-parse.h"
28 #include "tre-compile.h"
29 #include "tre.h"
30 #include "xmalloc.h"
31
32 #define assert(a) R_assert(a)
33
34 /*
35 Algorithms to setup tags so that submatch addressing can be done.
36 */
37
38
39 /* Inserts a catenation node to the root of the tree given in `node'.
40 As the left child a new tag with number `tag_id' to `node' is added,
41 and the right child is the old root. */
42 static reg_errcode_t
tre_add_tag_left(tre_mem_t mem,tre_ast_node_t * node,int tag_id)43 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
44 {
45 tre_catenation_t *c;
46
47 DPRINT(("add_tag_left: tag %d\n", tag_id));
48
49 c = tre_mem_alloc(mem, sizeof(*c));
50 if (c == NULL)
51 return REG_ESPACE;
52 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
53 if (c->left == NULL)
54 return REG_ESPACE;
55 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
56 if (c->right == NULL)
57 return REG_ESPACE;
58
59 c->right->obj = node->obj;
60 c->right->type = node->type;
61 c->right->nullable = -1;
62 c->right->submatch_id = -1;
63 c->right->firstpos = NULL;
64 c->right->lastpos = NULL;
65 c->right->num_tags = 0;
66 node->obj = c;
67 node->type = CATENATION;
68 return REG_OK;
69 }
70
71 /* Inserts a catenation node to the root of the tree given in `node'.
72 As the right child a new tag with number `tag_id' to `node' is added,
73 and the left child is the old root. */
74 static reg_errcode_t
tre_add_tag_right(tre_mem_t mem,tre_ast_node_t * node,int tag_id)75 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
76 {
77 tre_catenation_t *c;
78
79 DPRINT(("tre_add_tag_right: tag %d\n", tag_id));
80
81 c = tre_mem_alloc(mem, sizeof(*c));
82 if (c == NULL)
83 return REG_ESPACE;
84 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
85 if (c->right == NULL)
86 return REG_ESPACE;
87 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
88 if (c->left == NULL)
89 return REG_ESPACE;
90
91 c->left->obj = node->obj;
92 c->left->type = node->type;
93 c->left->nullable = -1;
94 c->left->submatch_id = -1;
95 c->left->firstpos = NULL;
96 c->left->lastpos = NULL;
97 c->left->num_tags = 0;
98 node->obj = c;
99 node->type = CATENATION;
100 return REG_OK;
101 }
102
103 typedef enum {
104 ADDTAGS_RECURSE,
105 ADDTAGS_AFTER_ITERATION,
106 ADDTAGS_AFTER_UNION_LEFT,
107 ADDTAGS_AFTER_UNION_RIGHT,
108 ADDTAGS_AFTER_CAT_LEFT,
109 ADDTAGS_AFTER_CAT_RIGHT,
110 ADDTAGS_SET_SUBMATCH_END
111 } tre_addtags_symbol_t;
112
113
114 typedef struct {
115 int tag;
116 int next_tag;
117 } tre_tag_states_t;
118
119
120 /* Go through `regset' and set submatch data for submatches that are
121 using this tag. */
122 static void
tre_purge_regset(int * regset,tre_tnfa_t * tnfa,int tag)123 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
124 {
125 int i;
126
127 for (i = 0; regset[i] >= 0; i++)
128 {
129 int id = regset[i] / 2;
130 int start = !(regset[i] % 2);
131 DPRINT((" Using tag %d for %s offset of "
132 "submatch %d\n", tag,
133 start ? "start" : "end", id));
134 if (start)
135 tnfa->submatch_data[id].so_tag = tag;
136 else
137 tnfa->submatch_data[id].eo_tag = tag;
138 }
139 regset[0] = -1;
140 }
141
142
143 /* Adds tags to appropriate locations in the parse tree in `tree', so that
144 subexpressions marked for submatch addressing can be traced. */
145 static reg_errcode_t
tre_add_tags(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree,tre_tnfa_t * tnfa)146 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
147 tre_tnfa_t *tnfa)
148 {
149 reg_errcode_t status = REG_OK;
150 tre_addtags_symbol_t symbol;
151 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
152 int bottom = tre_stack_num_objects(stack);
153 /* True for first pass (counting number of needed tags) */
154 int first_pass = (mem == NULL || tnfa == NULL);
155 int *regset, *orig_regset;
156 int num_tags = 0; /* Total number of tags. */
157 int num_minimals = 0; /* Number of special minimal tags. */
158 int tag = 0; /* The tag that is to be added next. */
159 int next_tag = 1; /* Next tag to use after this one. */
160 int *parents; /* Stack of submatches the current submatch is
161 contained in. */
162 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
163 tre_tag_states_t *saved_states;
164
165 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
166 if (!first_pass)
167 {
168 tnfa->end_tag = 0;
169 tnfa->minimal_tags[0] = -1;
170 }
171
172 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
173 if (regset == NULL)
174 return REG_ESPACE;
175 regset[0] = -1;
176 orig_regset = regset;
177
178 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
179 if (parents == NULL)
180 {
181 xfree(regset);
182 return REG_ESPACE;
183 }
184 parents[0] = -1;
185
186 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
187 if (saved_states == NULL)
188 {
189 xfree(regset);
190 xfree(parents);
191 return REG_ESPACE;
192 }
193 else
194 {
195 unsigned int i;
196 for (i = 0; i <= tnfa->num_submatches; i++)
197 saved_states[i].tag = -1;
198 }
199
200 STACK_PUSH(stack, voidptr, node);
201 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
202
203 while (tre_stack_num_objects(stack) > bottom)
204 {
205 if (status != REG_OK)
206 break;
207
208 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
209 switch (symbol)
210 {
211
212 case ADDTAGS_SET_SUBMATCH_END:
213 {
214 int id = tre_stack_pop_int(stack);
215 int i;
216
217 /* Add end of this submatch to regset. */
218 for (i = 0; regset[i] >= 0; i++);
219 regset[i] = id * 2 + 1;
220 regset[i + 1] = -1;
221
222 /* Pop this submatch from the parents stack. */
223 for (i = 0; parents[i] >= 0; i++);
224 parents[i - 1] = -1;
225 break;
226 }
227
228 case ADDTAGS_RECURSE:
229 node = tre_stack_pop_voidptr(stack);
230
231 if (node->submatch_id >= 0)
232 {
233 int id = node->submatch_id;
234 int i;
235
236
237 /* Add start of this submatch to regset. */
238 for (i = 0; regset[i] >= 0; i++);
239 regset[i] = id * 2;
240 regset[i + 1] = -1;
241
242 if (!first_pass)
243 {
244 for (i = 0; parents[i] >= 0; i++);
245 tnfa->submatch_data[id].parents = NULL;
246 if (i > 0)
247 {
248 int *p = xmalloc(sizeof(*p) * (i + 1));
249 if (p == NULL)
250 {
251 status = REG_ESPACE;
252 break;
253 }
254 assert(tnfa->submatch_data[id].parents == NULL);
255 tnfa->submatch_data[id].parents = p;
256 for (i = 0; parents[i] >= 0; i++)
257 p[i] = parents[i];
258 p[i] = -1;
259 }
260 }
261
262 /* Add end of this submatch to regset after processing this
263 node. */
264 STACK_PUSHX(stack, int, node->submatch_id);
265 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
266 }
267
268 switch (node->type)
269 {
270 case LITERAL:
271 {
272 tre_literal_t *lit = node->obj;
273
274 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
275 {
276 int i;
277 DPRINT(("Literal %d-%d\n",
278 (int)lit->code_min, (int)lit->code_max));
279 if (regset[0] >= 0)
280 {
281 /* Regset is not empty, so add a tag before the
282 literal or backref. */
283 if (!first_pass)
284 {
285 status = tre_add_tag_left(mem, node, tag);
286 tnfa->tag_directions[tag] = direction;
287 if (minimal_tag >= 0)
288 {
289 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
290 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
291 tnfa->minimal_tags[i] = tag;
292 tnfa->minimal_tags[i + 1] = minimal_tag;
293 tnfa->minimal_tags[i + 2] = -1;
294 minimal_tag = -1;
295 num_minimals++;
296 }
297 tre_purge_regset(regset, tnfa, tag);
298 }
299 else
300 {
301 DPRINT((" num_tags = 1\n"));
302 node->num_tags = 1;
303 }
304
305 DPRINT((" num_tags++\n"));
306 regset[0] = -1;
307 tag = next_tag;
308 num_tags++;
309 next_tag++;
310 }
311 }
312 else
313 {
314 assert(!IS_TAG(lit));
315 }
316 break;
317 }
318 case CATENATION:
319 {
320 tre_catenation_t *cat = node->obj;
321 tre_ast_node_t *left = cat->left;
322 tre_ast_node_t *right = cat->right;
323 int reserved_tag = -1;
324 DPRINT(("Catenation, next_tag = %d\n", next_tag));
325
326
327 /* After processing right child. */
328 STACK_PUSHX(stack, voidptr, node);
329 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
330
331 /* Process right child. */
332 STACK_PUSHX(stack, voidptr, right);
333 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
334
335 /* After processing left child. */
336 STACK_PUSHX(stack, int, next_tag + left->num_tags);
337 DPRINT((" Pushing %d for after left\n",
338 next_tag + left->num_tags));
339 if (left->num_tags > 0 && right->num_tags > 0)
340 {
341 /* Reserve the next tag to the right child. */
342 DPRINT((" Reserving next_tag %d to right child\n",
343 next_tag));
344 reserved_tag = next_tag;
345 next_tag++;
346 }
347 STACK_PUSHX(stack, int, reserved_tag);
348 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
349
350 /* Process left child. */
351 STACK_PUSHX(stack, voidptr, left);
352 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
353
354 }
355 break;
356 case ITERATION:
357 {
358 tre_iteration_t *iter = node->obj;
359 DPRINT(("Iteration\n"));
360
361 if (first_pass)
362 {
363 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
364 }
365 else
366 {
367 STACK_PUSHX(stack, int, tag);
368 STACK_PUSHX(stack, int, iter->minimal);
369 }
370 STACK_PUSHX(stack, voidptr, node);
371 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
372
373 STACK_PUSHX(stack, voidptr, iter->arg);
374 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
375
376 /* Regset is not empty, so add a tag here. */
377 if (regset[0] >= 0 || iter->minimal)
378 {
379 if (!first_pass)
380 {
381 int i;
382 status = tre_add_tag_left(mem, node, tag);
383 if (iter->minimal)
384 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
385 else
386 tnfa->tag_directions[tag] = direction;
387 if (minimal_tag >= 0)
388 {
389 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
390 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
391 tnfa->minimal_tags[i] = tag;
392 tnfa->minimal_tags[i + 1] = minimal_tag;
393 tnfa->minimal_tags[i + 2] = -1;
394 minimal_tag = -1;
395 num_minimals++;
396 }
397 tre_purge_regset(regset, tnfa, tag);
398 }
399
400 DPRINT((" num_tags++\n"));
401 regset[0] = -1;
402 tag = next_tag;
403 num_tags++;
404 next_tag++;
405 }
406 direction = TRE_TAG_MINIMIZE;
407 }
408 break;
409 case UNION:
410 {
411 tre_union_t *uni = node->obj;
412 tre_ast_node_t *left = uni->left;
413 tre_ast_node_t *right = uni->right;
414 int left_tag;
415 int right_tag;
416
417 if (regset[0] >= 0)
418 {
419 left_tag = next_tag;
420 right_tag = next_tag + 1;
421 }
422 else
423 {
424 left_tag = tag;
425 right_tag = next_tag;
426 }
427
428 DPRINT(("Union\n"));
429
430 /* After processing right child. */
431 STACK_PUSHX(stack, int, right_tag);
432 STACK_PUSHX(stack, int, left_tag);
433 STACK_PUSHX(stack, voidptr, regset);
434 STACK_PUSHX(stack, int, regset[0] >= 0);
435 STACK_PUSHX(stack, voidptr, node);
436 STACK_PUSHX(stack, voidptr, right);
437 STACK_PUSHX(stack, voidptr, left);
438 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
439
440 /* Process right child. */
441 STACK_PUSHX(stack, voidptr, right);
442 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
443
444 /* After processing left child. */
445 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
446
447 /* Process left child. */
448 STACK_PUSHX(stack, voidptr, left);
449 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
450
451 /* Regset is not empty, so add a tag here. */
452 if (regset[0] >= 0)
453 {
454 if (!first_pass)
455 {
456 int i;
457 status = tre_add_tag_left(mem, node, tag);
458 tnfa->tag_directions[tag] = direction;
459 if (minimal_tag >= 0)
460 {
461 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
462 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
463 tnfa->minimal_tags[i] = tag;
464 tnfa->minimal_tags[i + 1] = minimal_tag;
465 tnfa->minimal_tags[i + 2] = -1;
466 minimal_tag = -1;
467 num_minimals++;
468 }
469 tre_purge_regset(regset, tnfa, tag);
470 }
471
472 DPRINT((" num_tags++\n"));
473 regset[0] = -1;
474 tag = next_tag;
475 num_tags++;
476 next_tag++;
477 }
478
479 if (node->num_submatches > 0)
480 {
481 /* The next two tags are reserved for markers. */
482 next_tag++;
483 tag = next_tag;
484 next_tag++;
485 }
486
487 break;
488 }
489 }
490
491 if (node->submatch_id >= 0)
492 {
493 int i;
494 /* Push this submatch on the parents stack. */
495 for (i = 0; parents[i] >= 0; i++);
496 parents[i] = node->submatch_id;
497 parents[i + 1] = -1;
498 }
499
500 break; /* end case: ADDTAGS_RECURSE */
501
502 case ADDTAGS_AFTER_ITERATION:
503 {
504 int minimal = 0;
505 int enter_tag;
506 node = tre_stack_pop_voidptr(stack);
507 if (first_pass)
508 {
509 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
510 + tre_stack_pop_int(stack);
511 minimal_tag = -1;
512 }
513 else
514 {
515 minimal = tre_stack_pop_int(stack);
516 enter_tag = tre_stack_pop_int(stack);
517 if (minimal)
518 minimal_tag = enter_tag;
519 }
520
521 DPRINT(("After iteration\n"));
522 if (!first_pass)
523 {
524 DPRINT((" Setting direction to %s\n",
525 minimal ? "minimize" : "maximize"));
526 if (minimal)
527 direction = TRE_TAG_MINIMIZE;
528 else
529 direction = TRE_TAG_MAXIMIZE;
530 }
531 break;
532 }
533
534 case ADDTAGS_AFTER_CAT_LEFT:
535 {
536 int new_tag = tre_stack_pop_int(stack);
537 next_tag = tre_stack_pop_int(stack);
538 DPRINT(("After cat left, tag = %d, next_tag = %d\n",
539 tag, next_tag));
540 if (new_tag >= 0)
541 {
542 DPRINT((" Setting tag to %d\n", new_tag));
543 tag = new_tag;
544 }
545 break;
546 }
547
548 case ADDTAGS_AFTER_CAT_RIGHT:
549 DPRINT(("After cat right\n"));
550 node = tre_stack_pop_voidptr(stack);
551 if (first_pass)
552 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
553 + ((tre_catenation_t *)node->obj)->right->num_tags;
554 break;
555
556 case ADDTAGS_AFTER_UNION_LEFT:
557 DPRINT(("After union left\n"));
558 /* Lift the bottom of the `regset' array so that when processing
559 the right operand the items currently in the array are
560 invisible. The original bottom was saved at ADDTAGS_UNION and
561 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
562 while (*regset >= 0)
563 regset++;
564 break;
565
566 case ADDTAGS_AFTER_UNION_RIGHT:
567 {
568 int added_tags, tag_left, tag_right;
569 tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
570 tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
571 DPRINT(("After union right\n"));
572 node = tre_stack_pop_voidptr(stack);
573 added_tags = tre_stack_pop_int(stack);
574 if (first_pass)
575 {
576 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
577 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
578 + ((node->num_submatches > 0) ? 2 : 0);
579 }
580 regset = tre_stack_pop_voidptr(stack);
581 tag_left = tre_stack_pop_int(stack);
582 tag_right = tre_stack_pop_int(stack);
583
584 /* Add tags after both children, the left child gets a smaller
585 tag than the right child. This guarantees that we prefer
586 the left child over the right child. */
587 /* XXX - This is not always necessary (if the children have
588 tags which must be seen for every match of that child). */
589 /* XXX - Check if this is the only place where tre_add_tag_right
590 is used. If so, use tre_add_tag_left (putting the tag before
591 the child as opposed after the child) and throw away
592 tre_add_tag_right. */
593 if (node->num_submatches > 0)
594 {
595 if (!first_pass)
596 {
597 status = tre_add_tag_right(mem, left, tag_left);
598 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
599 status = tre_add_tag_right(mem, right, tag_right);
600 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
601 }
602 DPRINT((" num_tags += 2\n"));
603 num_tags += 2;
604 }
605 direction = TRE_TAG_MAXIMIZE;
606 break;
607 }
608
609 default:
610 assert(0);
611 break;
612
613 } /* end switch(symbol) */
614 } /* end while(tre_stack_num_objects(stack) > bottom) */
615
616 if (!first_pass)
617 tre_purge_regset(regset, tnfa, tag);
618
619 if (!first_pass && minimal_tag >= 0)
620 {
621 int i;
622 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
623 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
624 tnfa->minimal_tags[i] = tag;
625 tnfa->minimal_tags[i + 1] = minimal_tag;
626 tnfa->minimal_tags[i + 2] = -1;
627 minimal_tag = -1;
628 num_minimals++;
629 }
630
631 DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n",
632 first_pass? "First pass" : "Second pass", num_tags));
633
634 assert(tree->num_tags == num_tags);
635 tnfa->end_tag = num_tags;
636 tnfa->num_tags = num_tags;
637 tnfa->num_minimals = num_minimals;
638 xfree(orig_regset);
639 xfree(parents);
640 xfree(saved_states);
641 return status;
642 }
643
644
645
646 /*
647 AST to TNFA compilation routines.
648 */
649
650 typedef enum {
651 COPY_RECURSE,
652 COPY_SET_RESULT_PTR
653 } tre_copyast_symbol_t;
654
655 /* Flags for tre_copy_ast(). */
656 #define COPY_REMOVE_TAGS 1
657 #define COPY_MAXIMIZE_FIRST_TAG 2
658
659 static reg_errcode_t
tre_copy_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int flags,int * pos_add,tre_tag_direction_t * tag_directions,tre_ast_node_t ** copy,int * max_pos)660 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
661 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
662 tre_ast_node_t **copy, int *max_pos)
663 {
664 reg_errcode_t status = REG_OK;
665 int bottom = tre_stack_num_objects(stack);
666 int num_copied = 0;
667 int first_tag = 1;
668 tre_ast_node_t **result = copy;
669 tre_copyast_symbol_t symbol;
670
671 STACK_PUSH(stack, voidptr, ast);
672 STACK_PUSH(stack, int, COPY_RECURSE);
673
674 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
675 {
676 tre_ast_node_t *node;
677 if (status != REG_OK)
678 break;
679
680 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
681 switch (symbol)
682 {
683 case COPY_SET_RESULT_PTR:
684 result = tre_stack_pop_voidptr(stack);
685 break;
686 case COPY_RECURSE:
687 node = tre_stack_pop_voidptr(stack);
688 switch (node->type)
689 {
690 case LITERAL:
691 {
692 tre_literal_t *lit = node->obj;
693 int pos = lit->position;
694 int min = (int)lit->code_min;
695 int max = (int)lit->code_max;
696 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
697 {
698 /* XXX - e.g. [ab] has only one position but two
699 nodes, so we are creating holes in the state space
700 here. Not fatal, just wastes memory. */
701 pos += *pos_add;
702 num_copied++;
703 }
704 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
705 {
706 /* Change this tag to empty. */
707 min = EMPTY;
708 max = pos = -1;
709 }
710 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
711 && first_tag)
712 {
713 /* Maximize the first tag. */
714 tag_directions[max] = TRE_TAG_MAXIMIZE;
715 first_tag = 0;
716 }
717 *result = tre_ast_new_literal(mem, min, max, pos);
718 if (*result == NULL)
719 status = REG_ESPACE;
720
721 ((tre_literal_t*)(*result)->obj)->u.class = lit->u.class;
722 if (pos > *max_pos)
723 *max_pos = pos;
724 break;
725 }
726 case UNION:
727 {
728 tre_union_t *uni = node->obj;
729 tre_union_t *tmp;
730 *result = tre_ast_new_union(mem, uni->left, uni->right);
731 if (*result == NULL)
732 {
733 status = REG_ESPACE;
734 break;
735 }
736 tmp = (*result)->obj;
737 result = &tmp->left;
738 STACK_PUSHX(stack, voidptr, uni->right);
739 STACK_PUSHX(stack, int, COPY_RECURSE);
740 STACK_PUSHX(stack, voidptr, &tmp->right);
741 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
742 STACK_PUSHX(stack, voidptr, uni->left);
743 STACK_PUSHX(stack, int, COPY_RECURSE);
744 break;
745 }
746 case CATENATION:
747 {
748 tre_catenation_t *cat = node->obj;
749 tre_catenation_t *tmp;
750 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
751 if (*result == NULL)
752 {
753 status = REG_ESPACE;
754 break;
755 }
756 tmp = (*result)->obj;
757 tmp->left = NULL;
758 tmp->right = NULL;
759 result = &tmp->left;
760
761 STACK_PUSHX(stack, voidptr, cat->right);
762 STACK_PUSHX(stack, int, COPY_RECURSE);
763 STACK_PUSHX(stack, voidptr, &tmp->right);
764 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
765 STACK_PUSHX(stack, voidptr, cat->left);
766 STACK_PUSHX(stack, int, COPY_RECURSE);
767 break;
768 }
769 case ITERATION:
770 {
771 tre_iteration_t *iter = node->obj;
772 STACK_PUSHX(stack, voidptr, iter->arg);
773 STACK_PUSHX(stack, int, COPY_RECURSE);
774 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
775 iter->max, iter->minimal);
776 if (*result == NULL)
777 {
778 status = REG_ESPACE;
779 break;
780 }
781 iter = (*result)->obj;
782 result = &iter->arg;
783 break;
784 }
785 default:
786 assert(0);
787 break;
788 }
789 break;
790 }
791 }
792 *pos_add += num_copied;
793 return status;
794 }
795
796 typedef enum {
797 EXPAND_RECURSE,
798 EXPAND_AFTER_ITER
799 } tre_expand_ast_symbol_t;
800
801 /* Expands each iteration node that has a finite nonzero minimum or maximum
802 iteration count to a catenated sequence of copies of the node. */
803 static reg_errcode_t
tre_expand_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int * position,tre_tag_direction_t * tag_directions,int * max_depth)804 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
805 int *position, tre_tag_direction_t *tag_directions,
806 int *max_depth)
807 {
808 reg_errcode_t status = REG_OK;
809 int bottom = tre_stack_num_objects(stack);
810 int pos_add = 0;
811 int pos_add_total = 0;
812 int max_pos = 0;
813 /* Current approximate matching parameters. */
814 int params[TRE_PARAM_LAST];
815 /* Approximate parameter nesting level. */
816 int params_depth = 0;
817 int iter_depth = 0;
818 int i;
819
820 for (i = 0; i < TRE_PARAM_LAST; i++)
821 params[i] = TRE_PARAM_DEFAULT;
822
823 STACK_PUSHR(stack, voidptr, ast);
824 STACK_PUSHR(stack, int, EXPAND_RECURSE);
825 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
826 {
827 tre_ast_node_t *node;
828 tre_expand_ast_symbol_t symbol;
829
830 if (status != REG_OK)
831 break;
832
833 DPRINT(("pos_add %d\n", pos_add));
834
835 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
836 node = tre_stack_pop_voidptr(stack);
837 switch (symbol)
838 {
839 case EXPAND_RECURSE:
840 switch (node->type)
841 {
842 case LITERAL:
843 {
844 tre_literal_t *lit= node->obj;
845 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
846 {
847 lit->position += pos_add;
848 if (lit->position > max_pos)
849 max_pos = lit->position;
850 }
851 break;
852 }
853 case UNION:
854 {
855 tre_union_t *uni = node->obj;
856 STACK_PUSHX(stack, voidptr, uni->right);
857 STACK_PUSHX(stack, int, EXPAND_RECURSE);
858 STACK_PUSHX(stack, voidptr, uni->left);
859 STACK_PUSHX(stack, int, EXPAND_RECURSE);
860 break;
861 }
862 case CATENATION:
863 {
864 tre_catenation_t *cat = node->obj;
865 STACK_PUSHX(stack, voidptr, cat->right);
866 STACK_PUSHX(stack, int, EXPAND_RECURSE);
867 STACK_PUSHX(stack, voidptr, cat->left);
868 STACK_PUSHX(stack, int, EXPAND_RECURSE);
869 break;
870 }
871 case ITERATION:
872 {
873 tre_iteration_t *iter = node->obj;
874 STACK_PUSHX(stack, int, pos_add);
875 STACK_PUSHX(stack, voidptr, node);
876 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
877 STACK_PUSHX(stack, voidptr, iter->arg);
878 STACK_PUSHX(stack, int, EXPAND_RECURSE);
879 /* If we are going to expand this node at EXPAND_AFTER_ITER
880 then don't increase the `pos' fields of the nodes now, it
881 will get done when expanding. */
882 if (iter->min > 1 || iter->max > 1)
883 pos_add = 0;
884 iter_depth++;
885 DPRINT(("iter\n"));
886 break;
887 }
888 default:
889 assert(0);
890 break;
891 }
892 break;
893 case EXPAND_AFTER_ITER:
894 {
895 tre_iteration_t *iter = node->obj;
896 int pos_add_last;
897 pos_add = tre_stack_pop_int(stack);
898 pos_add_last = pos_add;
899 if (iter->min > 1 || iter->max > 1)
900 {
901 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
902 int j;
903 int pos_add_save = pos_add;
904
905 /* Create a catenated sequence of copies of the node. */
906 for (j = 0; j < iter->min; j++)
907 {
908 tre_ast_node_t *copy;
909 /* Remove tags from all but the last copy. */
910 int flags = ((j + 1 < iter->min)
911 ? COPY_REMOVE_TAGS
912 : COPY_MAXIMIZE_FIRST_TAG);
913 DPRINT((" pos_add %d\n", pos_add));
914 pos_add_save = pos_add;
915 status = tre_copy_ast(mem, stack, iter->arg, flags,
916 &pos_add, tag_directions, ©,
917 &max_pos);
918 if (status != REG_OK)
919 return status;
920 if (seq1 != NULL)
921 seq1 = tre_ast_new_catenation(mem, seq1, copy);
922 else
923 seq1 = copy;
924 if (seq1 == NULL)
925 return REG_ESPACE;
926 }
927
928 if (iter->max == -1)
929 {
930 /* No upper limit. */
931 pos_add_save = pos_add;
932 status = tre_copy_ast(mem, stack, iter->arg, 0,
933 &pos_add, NULL, &seq2, &max_pos);
934 if (status != REG_OK)
935 return status;
936 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
937 if (seq2 == NULL)
938 return REG_ESPACE;
939 }
940 else
941 {
942 for (j = iter->min; j < iter->max; j++)
943 {
944 tre_ast_node_t *tmp, *copy;
945 pos_add_save = pos_add;
946 status = tre_copy_ast(mem, stack, iter->arg, 0,
947 &pos_add, NULL, ©, &max_pos);
948 if (status != REG_OK)
949 return status;
950 if (seq2 != NULL)
951 seq2 = tre_ast_new_catenation(mem, copy, seq2);
952 else
953 seq2 = copy;
954 if (seq2 == NULL)
955 return REG_ESPACE;
956 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
957 if (tmp == NULL)
958 return REG_ESPACE;
959 seq2 = tre_ast_new_union(mem, tmp, seq2);
960 if (seq2 == NULL)
961 return REG_ESPACE;
962 }
963 }
964
965 pos_add = pos_add_save;
966 if (seq1 == NULL)
967 seq1 = seq2;
968 else if (seq2 != NULL)
969 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
970 if (seq1 == NULL)
971 return REG_ESPACE;
972 node->obj = seq1->obj;
973 node->type = seq1->type;
974 }
975
976 iter_depth--;
977 pos_add_total += pos_add - pos_add_last;
978 if (iter_depth == 0)
979 pos_add = pos_add_total;
980
981 /* If approximate parameters are specified, surround the result
982 with two parameter setting nodes. The one on the left sets
983 the specified parameters, and the one on the right restores
984 the old parameters. */
985 if (iter->params)
986 {
987 tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy;
988 int *old_params;
989
990 tmp_l = tre_ast_new_literal(mem, PARAMETER, 0, -1);
991 if (!tmp_l)
992 return REG_ESPACE;
993 ((tre_literal_t *)tmp_l->obj)->u.params = iter->params;
994 iter->params[TRE_PARAM_DEPTH] = params_depth + 1;
995 tmp_r = tre_ast_new_literal(mem, PARAMETER, 0, -1);
996 if (!tmp_r)
997 return REG_ESPACE;
998 old_params = tre_mem_alloc(mem, sizeof(*old_params)
999 * TRE_PARAM_LAST);
1000 if (!old_params)
1001 return REG_ESPACE;
1002 for (i = 0; i < TRE_PARAM_LAST; i++)
1003 old_params[i] = params[i];
1004 ((tre_literal_t *)tmp_r->obj)->u.params = old_params;
1005 old_params[TRE_PARAM_DEPTH] = params_depth;
1006 /* XXX - this is the only place where ast_new_node is
1007 needed -- should be moved inside AST module. */
1008 node_copy = tre_ast_new_node(mem, ITERATION,
1009 sizeof(tre_iteration_t));
1010 if (!node_copy)
1011 return REG_ESPACE;
1012 node_copy->obj = node->obj;
1013 tmp_node = tre_ast_new_catenation(mem, tmp_l, node_copy);
1014 if (!tmp_node)
1015 return REG_ESPACE;
1016 tmp_node = tre_ast_new_catenation(mem, tmp_node, tmp_r);
1017 if (!tmp_node)
1018 return REG_ESPACE;
1019 /* Replace the contents of `node' with `tmp_node'. */
1020 memcpy(node, tmp_node, sizeof(*node));
1021 node->obj = tmp_node->obj;
1022 node->type = tmp_node->type;
1023 params_depth++;
1024 if (params_depth > *max_depth)
1025 *max_depth = params_depth;
1026 }
1027 break;
1028 }
1029 default:
1030 assert(0);
1031 break;
1032 }
1033 }
1034
1035 *position += pos_add_total;
1036
1037 /* `max_pos' should never be larger than `*position' if the above
1038 code works, but just an extra safeguard let's make sure
1039 `*position' is set large enough so enough memory will be
1040 allocated for the transition table. */
1041 if (max_pos > *position)
1042 *position = max_pos;
1043
1044 #ifdef TRE_DEBUG
1045 DPRINT(("Expanded AST:\n"));
1046 tre_ast_print(ast);
1047 DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
1048 #endif
1049
1050 return status;
1051 }
1052
1053 static tre_pos_and_tags_t *
tre_set_empty(tre_mem_t mem)1054 tre_set_empty(tre_mem_t mem)
1055 {
1056 tre_pos_and_tags_t *new_set;
1057
1058 new_set = tre_mem_calloc(mem, sizeof(*new_set));
1059 if (new_set == NULL)
1060 return NULL;
1061
1062 new_set[0].position = -1;
1063 new_set[0].code_min = -1;
1064 new_set[0].code_max = -1;
1065
1066 return new_set;
1067 }
1068
1069 static tre_pos_and_tags_t *
tre_set_one(tre_mem_t mem,int position,int code_min,int code_max,tre_ctype_t class,tre_ctype_t * neg_classes,int backref)1070 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
1071 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
1072 {
1073 tre_pos_and_tags_t *new_set;
1074
1075 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
1076 if (new_set == NULL)
1077 return NULL;
1078
1079 new_set[0].position = position;
1080 new_set[0].code_min = code_min;
1081 new_set[0].code_max = code_max;
1082 new_set[0].class = class;
1083 new_set[0].neg_classes = neg_classes;
1084 new_set[0].backref = backref;
1085 new_set[1].position = -1;
1086 new_set[1].code_min = -1;
1087 new_set[1].code_max = -1;
1088
1089 return new_set;
1090 }
1091
1092 static tre_pos_and_tags_t *
tre_set_union(tre_mem_t mem,tre_pos_and_tags_t * set1,tre_pos_and_tags_t * set2,int * tags,int assertions,int * params)1093 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
1094 int *tags, int assertions, int *params)
1095 {
1096 int s1, s2, i, j;
1097 tre_pos_and_tags_t *new_set;
1098 int *new_tags;
1099 int num_tags;
1100
1101 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
1102 for (s1 = 0; set1[s1].position >= 0; s1++);
1103 for (s2 = 0; set2[s2].position >= 0; s2++);
1104 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
1105 if (!new_set )
1106 return NULL;
1107
1108 for (s1 = 0; set1[s1].position >= 0; s1++)
1109 {
1110 new_set[s1].position = set1[s1].position;
1111 new_set[s1].code_min = set1[s1].code_min;
1112 new_set[s1].code_max = set1[s1].code_max;
1113 new_set[s1].assertions = set1[s1].assertions | assertions;
1114 new_set[s1].class = set1[s1].class;
1115 new_set[s1].neg_classes = set1[s1].neg_classes;
1116 new_set[s1].backref = set1[s1].backref;
1117 if (set1[s1].tags == NULL && tags == NULL)
1118 new_set[s1].tags = NULL;
1119 else
1120 {
1121 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
1122 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
1123 * (i + num_tags + 1)));
1124 if (new_tags == NULL)
1125 return NULL;
1126 for (j = 0; j < i; j++)
1127 new_tags[j] = set1[s1].tags[j];
1128 for (i = 0; i < num_tags; i++)
1129 new_tags[j + i] = tags[i];
1130 new_tags[j + i] = -1;
1131 new_set[s1].tags = new_tags;
1132 }
1133 if (set1[s1].params)
1134 new_set[s1].params = set1[s1].params;
1135 if (params)
1136 {
1137 if (!new_set[s1].params)
1138 new_set[s1].params = params;
1139 else
1140 {
1141 new_set[s1].params = tre_mem_alloc(mem, sizeof(*params) *
1142 TRE_PARAM_LAST);
1143 if (!new_set[s1].params)
1144 return NULL;
1145 for (i = 0; i < TRE_PARAM_LAST; i++)
1146 if (params[i] != TRE_PARAM_UNSET)
1147 new_set[s1].params[i] = params[i];
1148 }
1149 }
1150 }
1151
1152 for (s2 = 0; set2[s2].position >= 0; s2++)
1153 {
1154 new_set[s1 + s2].position = set2[s2].position;
1155 new_set[s1 + s2].code_min = set2[s2].code_min;
1156 new_set[s1 + s2].code_max = set2[s2].code_max;
1157 /* XXX - why not | assertions here as well? */
1158 new_set[s1 + s2].assertions = set2[s2].assertions;
1159 new_set[s1 + s2].class = set2[s2].class;
1160 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
1161 new_set[s1 + s2].backref = set2[s2].backref;
1162 if (set2[s2].tags == NULL)
1163 new_set[s1 + s2].tags = NULL;
1164 else
1165 {
1166 for (i = 0; set2[s2].tags[i] >= 0; i++);
1167 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
1168 if (new_tags == NULL)
1169 return NULL;
1170 for (j = 0; j < i; j++)
1171 new_tags[j] = set2[s2].tags[j];
1172 new_tags[j] = -1;
1173 new_set[s1 + s2].tags = new_tags;
1174 }
1175 if (set2[s2].params)
1176 new_set[s1 + s2].params = set2[s2].params;
1177 if (params)
1178 {
1179 if (!new_set[s1 + s2].params)
1180 new_set[s1 + s2].params = params;
1181 else
1182 {
1183 new_set[s1 + s2].params = tre_mem_alloc(mem, sizeof(*params) *
1184 TRE_PARAM_LAST);
1185 if (!new_set[s1 + s2].params)
1186 return NULL;
1187 for (i = 0; i < TRE_PARAM_LAST; i++)
1188 if (params[i] != TRE_PARAM_UNSET)
1189 new_set[s1 + s2].params[i] = params[i];
1190 }
1191 }
1192 }
1193 new_set[s1 + s2].position = -1;
1194 return new_set;
1195 }
1196
1197 /* Finds the empty path through `node' which is the one that should be
1198 taken according to POSIX.2 rules, and adds the tags on that path to
1199 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
1200 set to the number of tags seen on the path. */
1201 static reg_errcode_t
tre_match_empty(tre_stack_t * stack,tre_ast_node_t * node,int * tags,int * assertions,int * params,int * num_tags_seen,int * params_seen)1202 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
1203 int *assertions, int *params, int *num_tags_seen,
1204 int *params_seen)
1205 {
1206 tre_literal_t *lit;
1207 tre_union_t *uni;
1208 tre_catenation_t *cat;
1209 tre_iteration_t *iter;
1210 int i;
1211 int bottom = tre_stack_num_objects(stack);
1212 reg_errcode_t status = REG_OK;
1213 if (num_tags_seen)
1214 *num_tags_seen = 0;
1215 if (params_seen)
1216 *params_seen = 0;
1217
1218 status = tre_stack_push_voidptr(stack, node);
1219
1220 /* Walk through the tree recursively. */
1221 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1222 {
1223 node = tre_stack_pop_voidptr(stack);
1224
1225 switch (node->type)
1226 {
1227 case LITERAL:
1228 lit = (tre_literal_t *)node->obj;
1229 switch (lit->code_min)
1230 {
1231 case TAG:
1232 if (lit->code_max >= 0)
1233 {
1234 if (tags != NULL)
1235 {
1236 /* Add the tag to `tags'. */
1237 for (i = 0; tags[i] >= 0; i++)
1238 if (tags[i] == lit->code_max)
1239 break;
1240 if (tags[i] < 0)
1241 {
1242 tags[i] = (int)lit->code_max;
1243 tags[i + 1] = -1;
1244 }
1245 }
1246 if (num_tags_seen)
1247 (*num_tags_seen)++;
1248 }
1249 break;
1250 case ASSERTION:
1251 assert(lit->code_max >= 1
1252 || lit->code_max <= ASSERT_LAST);
1253 if (assertions != NULL)
1254 *assertions |= lit->code_max;
1255 break;
1256 case PARAMETER:
1257 if (params != NULL)
1258 for (i = 0; i < TRE_PARAM_LAST; i++)
1259 params[i] = lit->u.params[i];
1260 if (params_seen != NULL)
1261 *params_seen = 1;
1262 break;
1263 case EMPTY:
1264 break;
1265 default:
1266 assert(0);
1267 break;
1268 }
1269 break;
1270
1271 case UNION:
1272 /* Subexpressions starting earlier take priority over ones
1273 starting later, so we prefer the left subexpression over the
1274 right subexpression. */
1275 uni = (tre_union_t *)node->obj;
1276 if (uni->left->nullable)
1277 STACK_PUSHX(stack, voidptr, uni->left)
1278 else if (uni->right->nullable)
1279 STACK_PUSHX(stack, voidptr, uni->right)
1280 else
1281 assert(0);
1282 break;
1283
1284 case CATENATION:
1285 /* The path must go through both children. */
1286 cat = (tre_catenation_t *)node->obj;
1287 assert(cat->left->nullable);
1288 assert(cat->right->nullable);
1289 STACK_PUSHX(stack, voidptr, cat->left);
1290 STACK_PUSHX(stack, voidptr, cat->right);
1291 break;
1292
1293 case ITERATION:
1294 /* A match with an empty string is preferred over no match at
1295 all, so we go through the argument if possible. */
1296 iter = (tre_iteration_t *)node->obj;
1297 if (iter->arg->nullable)
1298 STACK_PUSHX(stack, voidptr, iter->arg);
1299 break;
1300
1301 default:
1302 assert(0);
1303 break;
1304 }
1305 }
1306
1307 return status;
1308 }
1309
1310
1311 typedef enum {
1312 NFL_RECURSE,
1313 NFL_POST_UNION,
1314 NFL_POST_CATENATION,
1315 NFL_POST_ITERATION
1316 } tre_nfl_stack_symbol_t;
1317
1318
1319 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
1320 the nodes of the AST `tree'. */
1321 static reg_errcode_t
tre_compute_nfl(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree)1322 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
1323 {
1324 int bottom = tre_stack_num_objects(stack);
1325
1326 STACK_PUSHR(stack, voidptr, tree);
1327 STACK_PUSHR(stack, int, NFL_RECURSE);
1328
1329 while (tre_stack_num_objects(stack) > bottom)
1330 {
1331 tre_nfl_stack_symbol_t symbol;
1332 tre_ast_node_t *node;
1333
1334 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
1335 node = tre_stack_pop_voidptr(stack);
1336 switch (symbol)
1337 {
1338 case NFL_RECURSE:
1339 switch (node->type)
1340 {
1341 case LITERAL:
1342 {
1343 tre_literal_t *lit = (tre_literal_t *)node->obj;
1344 if (IS_BACKREF(lit))
1345 {
1346 /* Back references: nullable = false, firstpos = {i},
1347 lastpos = {i}. */
1348 node->nullable = 0;
1349 node->firstpos = tre_set_one(mem, lit->position, 0,
1350 TRE_CHAR_MAX, 0, NULL, -1);
1351 if (!node->firstpos)
1352 return REG_ESPACE;
1353 node->lastpos = tre_set_one(mem, lit->position, 0,
1354 TRE_CHAR_MAX, 0, NULL,
1355 (int)lit->code_max);
1356 if (!node->lastpos)
1357 return REG_ESPACE;
1358 }
1359 else if (lit->code_min < 0)
1360 {
1361 /* Tags, empty strings, params, and zero width assertions:
1362 nullable = true, firstpos = {}, and lastpos = {}. */
1363 node->nullable = 1;
1364 node->firstpos = tre_set_empty(mem);
1365 if (!node->firstpos)
1366 return REG_ESPACE;
1367 node->lastpos = tre_set_empty(mem);
1368 if (!node->lastpos)
1369 return REG_ESPACE;
1370 }
1371 else
1372 {
1373 /* Literal at position i: nullable = false, firstpos = {i},
1374 lastpos = {i}. */
1375 node->nullable = 0;
1376 node->firstpos =
1377 tre_set_one(mem, lit->position, (int)lit->code_min,
1378 (int)lit->code_max, 0, NULL, -1);
1379 if (!node->firstpos)
1380 return REG_ESPACE;
1381 node->lastpos = tre_set_one(mem, lit->position,
1382 (int)lit->code_min,
1383 (int)lit->code_max,
1384 lit->u.class, lit->neg_classes,
1385 -1);
1386 if (!node->lastpos)
1387 return REG_ESPACE;
1388 }
1389 break;
1390 }
1391
1392 case UNION:
1393 /* Compute the attributes for the two subtrees, and after that
1394 for this node. */
1395 STACK_PUSHR(stack, voidptr, node);
1396 STACK_PUSHR(stack, int, NFL_POST_UNION);
1397 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
1398 STACK_PUSHR(stack, int, NFL_RECURSE);
1399 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
1400 STACK_PUSHR(stack, int, NFL_RECURSE);
1401 break;
1402
1403 case CATENATION:
1404 /* Compute the attributes for the two subtrees, and after that
1405 for this node. */
1406 STACK_PUSHR(stack, voidptr, node);
1407 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
1408 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
1409 STACK_PUSHR(stack, int, NFL_RECURSE);
1410 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
1411 STACK_PUSHR(stack, int, NFL_RECURSE);
1412 break;
1413
1414 case ITERATION:
1415 /* Compute the attributes for the subtree, and after that for
1416 this node. */
1417 STACK_PUSHR(stack, voidptr, node);
1418 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
1419 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
1420 STACK_PUSHR(stack, int, NFL_RECURSE);
1421 break;
1422 }
1423 break; /* end case: NFL_RECURSE */
1424
1425 case NFL_POST_UNION:
1426 {
1427 tre_union_t *uni = (tre_union_t *)node->obj;
1428 node->nullable = uni->left->nullable || uni->right->nullable;
1429 node->firstpos = tre_set_union(mem, uni->left->firstpos,
1430 uni->right->firstpos, NULL, 0, NULL);
1431 if (!node->firstpos)
1432 return REG_ESPACE;
1433 node->lastpos = tre_set_union(mem, uni->left->lastpos,
1434 uni->right->lastpos, NULL, 0, NULL);
1435 if (!node->lastpos)
1436 return REG_ESPACE;
1437 break;
1438 }
1439
1440 case NFL_POST_ITERATION:
1441 {
1442 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
1443
1444 if (iter->min == 0 || iter->arg->nullable)
1445 node->nullable = 1;
1446 else
1447 node->nullable = 0;
1448 node->firstpos = iter->arg->firstpos;
1449 node->lastpos = iter->arg->lastpos;
1450 break;
1451 }
1452
1453 case NFL_POST_CATENATION:
1454 {
1455 int num_tags, *tags, assertions, params_seen;
1456 int *params;
1457 reg_errcode_t status;
1458 tre_catenation_t *cat = node->obj;
1459 node->nullable = cat->left->nullable && cat->right->nullable;
1460
1461 /* Compute firstpos. */
1462 if (cat->left->nullable)
1463 {
1464 /* The left side matches the empty string. Make a first pass
1465 with tre_match_empty() to get the number of tags and
1466 parameters. */
1467 status = tre_match_empty(stack, cat->left,
1468 NULL, NULL, NULL, &num_tags,
1469 ¶ms_seen);
1470 if (status != REG_OK)
1471 return status;
1472 /* Allocate arrays for the tags and parameters. */
1473 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
1474 if (!tags)
1475 return REG_ESPACE;
1476 tags[0] = -1;
1477 assertions = 0;
1478 params = NULL;
1479 if (params_seen)
1480 {
1481 params = tre_mem_alloc(mem, sizeof(*params)
1482 * TRE_PARAM_LAST);
1483 if (!params)
1484 {
1485 xfree(tags);
1486 return REG_ESPACE;
1487 }
1488 }
1489 /* Second pass with tre_mach_empty() to get the list of
1490 tags and parameters. */
1491 status = tre_match_empty(stack, cat->left, tags,
1492 &assertions, params, NULL, NULL);
1493 if (status != REG_OK)
1494 {
1495 xfree(tags);
1496 return status;
1497 }
1498 node->firstpos =
1499 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
1500 tags, assertions, params);
1501 xfree(tags);
1502 if (!node->firstpos)
1503 return REG_ESPACE;
1504 }
1505 else
1506 {
1507 node->firstpos = cat->left->firstpos;
1508 }
1509
1510 /* Compute lastpos. */
1511 if (cat->right->nullable)
1512 {
1513 /* The right side matches the empty string. Make a first pass
1514 with tre_match_empty() to get the number of tags and
1515 parameters. */
1516 status = tre_match_empty(stack, cat->right,
1517 NULL, NULL, NULL, &num_tags,
1518 ¶ms_seen);
1519 if (status != REG_OK)
1520 return status;
1521 /* Allocate arrays for the tags and parameters. */
1522 tags = xmalloc(sizeof(int) * (num_tags + 1));
1523 if (!tags)
1524 return REG_ESPACE;
1525 tags[0] = -1;
1526 assertions = 0;
1527 params = NULL;
1528 if (params_seen)
1529 {
1530 params = tre_mem_alloc(mem, sizeof(*params)
1531 * TRE_PARAM_LAST);
1532 if (!params)
1533 {
1534 xfree(tags);
1535 return REG_ESPACE;
1536 }
1537 }
1538 /* Second pass with tre_mach_empty() to get the list of
1539 tags and parameters. */
1540 status = tre_match_empty(stack, cat->right, tags,
1541 &assertions, params, NULL, NULL);
1542 if (status != REG_OK)
1543 {
1544 xfree(tags);
1545 return status;
1546 }
1547 node->lastpos =
1548 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
1549 tags, assertions, params);
1550 xfree(tags);
1551 if (!node->lastpos)
1552 return REG_ESPACE;
1553 }
1554 else
1555 {
1556 node->lastpos = cat->right->lastpos;
1557 }
1558 break;
1559 }
1560
1561 default:
1562 assert(0);
1563 break;
1564 }
1565 }
1566
1567 return REG_OK;
1568 }
1569
1570
1571 /* Adds a transition from each position in `p1' to each position in `p2'. */
1572 static reg_errcode_t
tre_make_trans(tre_pos_and_tags_t * p1,tre_pos_and_tags_t * p2,tre_tnfa_transition_t * transitions,int * counts,int * offs)1573 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
1574 tre_tnfa_transition_t *transitions,
1575 int *counts, int *offs)
1576 {
1577 tre_pos_and_tags_t *orig_p2 = p2;
1578 tre_tnfa_transition_t *trans;
1579 int i, j, k, l, dup, prev_p2_pos;
1580
1581 if (transitions != NULL)
1582 while (p1->position >= 0)
1583 {
1584 p2 = orig_p2;
1585 prev_p2_pos = -1;
1586 while (p2->position >= 0)
1587 {
1588 /* Optimization: if this position was already handled, skip it. */
1589 if (p2->position == prev_p2_pos)
1590 {
1591 p2++;
1592 continue;
1593 }
1594 prev_p2_pos = p2->position;
1595 /* Set `trans' to point to the next unused transition from
1596 position `p1->position'. */
1597 trans = transitions + offs[p1->position];
1598 while (trans->state != NULL)
1599 {
1600 #if 0
1601 /* If we find a previous transition from `p1->position' to
1602 `p2->position', it is overwritten. This can happen only
1603 if there are nested loops in the regexp, like in "((a)*)*".
1604 In POSIX.2 repetition using the outer loop is always
1605 preferred over using the inner loop. Therefore the
1606 transition for the inner loop is useless and can be thrown
1607 away. */
1608 /* XXX - The same position is used for all nodes in a bracket
1609 expression, so this optimization cannot be used (it will
1610 break bracket expressions) unless I figure out a way to
1611 detect it here. */
1612 if (trans->state_id == p2->position)
1613 {
1614 DPRINT(("*"));
1615 break;
1616 }
1617 #endif
1618 trans++;
1619 }
1620
1621 if (trans->state == NULL)
1622 (trans + 1)->state = NULL;
1623 /* Use the character ranges, assertions, etc. from `p1' for
1624 the transition from `p1' to `p2'. */
1625 trans->code_min = (tre_cint_t) p1->code_min;
1626 trans->code_max = (tre_cint_t) p1->code_max;
1627 trans->state = transitions + offs[p2->position];
1628 trans->state_id = p2->position;
1629 trans->assertions = p1->assertions | p2->assertions
1630 | (p1->class ? ASSERT_CHAR_CLASS : 0)
1631 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
1632 if (p1->backref >= 0)
1633 {
1634 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
1635 assert(p2->backref < 0);
1636 trans->u.backref = p1->backref;
1637 trans->assertions |= ASSERT_BACKREF;
1638 }
1639 else
1640 trans->u.class = p1->class;
1641 if (p1->neg_classes != NULL)
1642 {
1643 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
1644 trans->neg_classes =
1645 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
1646 if (trans->neg_classes == NULL)
1647 return REG_ESPACE;
1648 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
1649 trans->neg_classes[i] = p1->neg_classes[i];
1650 trans->neg_classes[i] = (tre_ctype_t)0;
1651 }
1652 else
1653 trans->neg_classes = NULL;
1654
1655 /* Find out how many tags this transition has. */
1656 i = 0;
1657 if (p1->tags != NULL)
1658 while(p1->tags[i] >= 0)
1659 i++;
1660 j = 0;
1661 if (p2->tags != NULL)
1662 while(p2->tags[j] >= 0)
1663 j++;
1664
1665 /* If we are overwriting a transition, free the old tag array. */
1666 if (trans->tags != NULL)
1667 xfree(trans->tags);
1668 trans->tags = NULL;
1669
1670 /* If there were any tags, allocate an array and fill it. */
1671 if (i + j > 0)
1672 {
1673 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
1674 if (!trans->tags)
1675 return REG_ESPACE;
1676 i = 0;
1677 if (p1->tags != NULL)
1678 while(p1->tags[i] >= 0)
1679 {
1680 trans->tags[i] = p1->tags[i];
1681 i++;
1682 }
1683 l = i;
1684 j = 0;
1685 if (p2->tags != NULL)
1686 while (p2->tags[j] >= 0)
1687 {
1688 /* Don't add duplicates. */
1689 dup = 0;
1690 for (k = 0; k < i; k++)
1691 if (trans->tags[k] == p2->tags[j])
1692 {
1693 dup = 1;
1694 break;
1695 }
1696 if (!dup)
1697 trans->tags[l++] = p2->tags[j];
1698 j++;
1699 }
1700 trans->tags[l] = -1;
1701 }
1702
1703 /* Set the parameter array. If both `p2' and `p1' have same
1704 parameters, the values in `p2' override those in `p1'. */
1705 if (p1->params || p2->params)
1706 {
1707 if (!trans->params)
1708 trans->params = xmalloc(sizeof(*trans->params)
1709 * TRE_PARAM_LAST);
1710 if (!trans->params)
1711 return REG_ESPACE;
1712 for (i = 0; i < TRE_PARAM_LAST; i++)
1713 {
1714 trans->params[i] = TRE_PARAM_UNSET;
1715 if (p1->params && p1->params[i] != TRE_PARAM_UNSET)
1716 trans->params[i] = p1->params[i];
1717 if (p2->params && p2->params[i] != TRE_PARAM_UNSET)
1718 trans->params[i] = p2->params[i];
1719 }
1720 }
1721 else
1722 {
1723 if (trans->params)
1724 xfree(trans->params);
1725 trans->params = NULL;
1726 }
1727
1728
1729 #ifdef TRE_DEBUG
1730 {
1731 int *tags;
1732
1733 DPRINT((" %2d -> %2d on %3d", p1->position, p2->position,
1734 p1->code_min));
1735 if (p1->code_max != p1->code_min)
1736 DPRINT(("-%3d", p1->code_max));
1737 tags = trans->tags;
1738 if (tags)
1739 {
1740 DPRINT((", tags ["));
1741 while (*tags >= 0)
1742 {
1743 DPRINT(("%d", *tags));
1744 tags++;
1745 if (*tags >= 0)
1746 DPRINT((","));
1747 }
1748 DPRINT(("]"));
1749 }
1750 if (trans->assertions)
1751 DPRINT((", assert %d", trans->assertions));
1752 if (trans->assertions & ASSERT_BACKREF)
1753 DPRINT((", backref %d", trans->u.backref));
1754 else if (trans->u.class)
1755 DPRINT((", class %ld", (long)trans->u.class));
1756 if (trans->neg_classes)
1757 DPRINT((", neg_classes %p", trans->neg_classes));
1758 if (trans->params)
1759 {
1760 DPRINT((", "));
1761 tre_print_params(trans->params);
1762 }
1763 DPRINT(("\n"));
1764 }
1765 #endif /* TRE_DEBUG */
1766 p2++;
1767 }
1768 p1++;
1769 }
1770 else
1771 /* Compute a maximum limit for the number of transitions leaving
1772 from each state. */
1773 while (p1->position >= 0)
1774 {
1775 p2 = orig_p2;
1776 while (p2->position >= 0)
1777 {
1778 counts[p1->position]++;
1779 p2++;
1780 }
1781 p1++;
1782 }
1783 return REG_OK;
1784 }
1785
1786 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
1787 labelled with one character range (there are no transitions on empty
1788 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
1789 the regexp.
1790 This is the iterative version using a stack (previously used recursion
1791 which will fail for large patterns)
1792 */
1793 static reg_errcode_t
tre_ast_to_tnfa_iter(tre_stack_t * stack,tre_ast_node_t * node,tre_tnfa_transition_t * transitions,int * counts,int * offs)1794 tre_ast_to_tnfa_iter(tre_stack_t *stack, tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
1795 int *counts, int *offs)
1796 {
1797 tre_union_t *uni;
1798 tre_catenation_t *cat;
1799 tre_iteration_t *iter;
1800 reg_errcode_t errcode = REG_OK;
1801
1802 STACK_PUSHR(stack, voidptr, node);
1803
1804 while (tre_stack_num_objects(stack)) {
1805 node = (tre_ast_node_t*) tre_stack_pop_voidptr(stack);
1806
1807 switch (node->type)
1808 {
1809 case LITERAL:
1810 break;
1811
1812 case UNION:
1813 uni = (tre_union_t *)node->obj;
1814 STACK_PUSHR(stack, voidptr, uni->right);
1815 STACK_PUSHR(stack, voidptr, uni->left);
1816 break;
1817
1818 case CATENATION:
1819 cat = (tre_catenation_t *)node->obj;
1820 /* Add a transition from each position in cat->left->lastpos
1821 to each position in cat->right->firstpos. */
1822 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
1823 transitions, counts, offs);
1824 if (errcode != REG_OK)
1825 return errcode;
1826
1827 STACK_PUSHR(stack, voidptr, cat->right);
1828 STACK_PUSHR(stack, voidptr, cat->left);
1829 break;
1830
1831 case ITERATION:
1832 iter = (tre_iteration_t *)node->obj;
1833 // assert(iter->max == -1 || iter->max == 1);
1834 if(!(iter->max == -1 || iter->max == 1)) return REG_BADBR;
1835
1836 if (iter->max == -1)
1837 {
1838 // assert(iter->min == 0 || iter->min == 1);
1839 if(!(iter->min == 0 || iter->min == 1)) return REG_BADBR;
1840 /* Add a transition from each last position in the iterated
1841 expression to each first position. */
1842 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
1843 transitions, counts, offs);
1844 if (errcode != REG_OK)
1845 return errcode;
1846 }
1847 STACK_PUSHR(stack, voidptr, iter->arg);
1848 break;
1849 }
1850 }
1851 return REG_OK;
1852 }
1853
1854 static reg_errcode_t
tre_ast_to_tnfa(tre_ast_node_t * node,tre_tnfa_transition_t * transitions,int * counts,int * offs)1855 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
1856 int *counts, int *offs)
1857 {
1858 reg_errcode_t errcode;
1859 tre_stack_t *stack;
1860 /* I made up max_size, there is no reason for that particular value */
1861 stack = tre_stack_new(1024, 256*1024, 4096);
1862 errcode = tre_ast_to_tnfa_iter(stack, node, transitions, counts, offs);
1863 tre_stack_destroy(stack);
1864 return errcode;
1865 }
1866
1867
1868 #define ERROR_EXIT(err) \
1869 do \
1870 { \
1871 errcode = err; \
1872 if (/*CONSTCOND*/(void)1,1) \
1873 goto error_exit; \
1874 } \
1875 while (/*CONSTCOND*/(void)0,0)
1876
1877
1878 int
tre_compile(regex_t * preg,const tre_char_t * regex,size_t n,int cflags)1879 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
1880 {
1881 tre_stack_t *stack;
1882 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
1883 tre_pos_and_tags_t *p;
1884 int *counts = NULL, *offs = NULL;
1885 int i, add = 0;
1886 tre_tnfa_transition_t *transitions, *initial;
1887 tre_tnfa_t *tnfa = NULL;
1888 tre_submatch_data_t *submatch_data;
1889 tre_tag_direction_t *tag_directions = NULL;
1890 reg_errcode_t errcode;
1891 tre_mem_t mem;
1892
1893 /* Parse context. */
1894 tre_parse_ctx_t parse_ctx;
1895
1896 /* Allocate a stack used throughout the compilation process for various
1897 purposes. */
1898 stack = tre_stack_new(512, 10240, 128);
1899 if (!stack)
1900 return REG_ESPACE;
1901 /* Allocate a fast memory allocator. */
1902 mem = tre_mem_new();
1903 if (!mem)
1904 {
1905 tre_stack_destroy(stack);
1906 return REG_ESPACE;
1907 }
1908
1909 /* Parse the regexp. */
1910 memset(&parse_ctx, 0, sizeof(parse_ctx));
1911 parse_ctx.mem = mem;
1912 parse_ctx.stack = stack;
1913 parse_ctx.re = regex;
1914 parse_ctx.len = n;
1915 parse_ctx.cflags = cflags;
1916 parse_ctx.max_backref = -1;
1917 /* workaround for PR#14408: use 8-bit optimizations in 8-bit mode */
1918 parse_ctx.cur_max = (cflags & REG_USEBYTES) ? 1 : TRE_MB_CUR_MAX;
1919 DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex));
1920 errcode = tre_parse(&parse_ctx);
1921 if (errcode != REG_OK)
1922 ERROR_EXIT(errcode);
1923 preg->re_nsub = parse_ctx.submatch_id - 1;
1924 tree = parse_ctx.result;
1925
1926 /* Back references and approximate matching cannot currently be used
1927 in the same regexp. */
1928 if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
1929 ERROR_EXIT(REG_BADPAT);
1930
1931 #ifdef TRE_DEBUG
1932 tre_ast_print(tree);
1933 #endif /* TRE_DEBUG */
1934
1935 /* Referring to nonexistent subexpressions is illegal. */
1936 if (parse_ctx.max_backref > (int)preg->re_nsub)
1937 ERROR_EXIT(REG_ESUBREG);
1938
1939 /* Allocate the TNFA struct. */
1940 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
1941 if (tnfa == NULL)
1942 ERROR_EXIT(REG_ESPACE);
1943 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
1944 tnfa->have_approx = parse_ctx.have_approx;
1945 tnfa->num_submatches = parse_ctx.submatch_id;
1946
1947 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
1948 regexp does not have back references, this can be skipped. */
1949 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
1950 {
1951 DPRINT(("tre_compile: setting up tags\n"));
1952
1953 /* Figure out how many tags we will need. */
1954 errcode = tre_add_tags(NULL, stack, tree, tnfa);
1955 if (errcode != REG_OK)
1956 ERROR_EXIT(errcode);
1957 #ifdef TRE_DEBUG
1958 tre_ast_print(tree);
1959 #endif /* TRE_DEBUG */
1960
1961 if (tnfa->num_tags > 0)
1962 {
1963 tag_directions = xmalloc(sizeof(*tag_directions)
1964 * (tnfa->num_tags + 1));
1965 if (tag_directions == NULL)
1966 ERROR_EXIT(REG_ESPACE);
1967 tnfa->tag_directions = tag_directions;
1968 memset(tag_directions, -1,
1969 sizeof(*tag_directions) * (tnfa->num_tags + 1));
1970 }
1971 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
1972 sizeof(tnfa->minimal_tags));
1973 if (tnfa->minimal_tags == NULL)
1974 ERROR_EXIT(REG_ESPACE);
1975
1976 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
1977 sizeof(*submatch_data));
1978 if (submatch_data == NULL)
1979 ERROR_EXIT(REG_ESPACE);
1980 tnfa->submatch_data = submatch_data;
1981
1982 errcode = tre_add_tags(mem, stack, tree, tnfa);
1983 if (errcode != REG_OK)
1984 ERROR_EXIT(errcode);
1985
1986 #ifdef TRE_DEBUG
1987 for (i = 0; i < parse_ctx.submatch_id; i++)
1988 DPRINT(("pmatch[%d] = {t%d, t%d}\n",
1989 i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
1990 for (i = 0; i < tnfa->num_tags; i++)
1991 DPRINT(("t%d is %s\n", i,
1992 tag_directions[i] == TRE_TAG_MINIMIZE ?
1993 "minimized" : "maximized"));
1994 #endif /* TRE_DEBUG */
1995 }
1996
1997 /* Expand iteration nodes. */
1998 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
1999 tag_directions, &tnfa->params_depth);
2000 if (errcode != REG_OK)
2001 ERROR_EXIT(errcode);
2002
2003 /* Add a dummy node for the final state.
2004 XXX - For certain patterns this dummy node can be optimized away,
2005 for example "a*" or "ab*". Figure out a simple way to detect
2006 this possibility. */
2007 tmp_ast_l = tree;
2008 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2009 if (tmp_ast_r == NULL)
2010 ERROR_EXIT(REG_ESPACE);
2011
2012 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2013 if (tree == NULL)
2014 ERROR_EXIT(REG_ESPACE);
2015
2016 #ifdef TRE_DEBUG
2017 tre_ast_print(tree);
2018 DPRINT(("Number of states: %d\n", parse_ctx.position));
2019 #endif /* TRE_DEBUG */
2020
2021 errcode = tre_compute_nfl(mem, stack, tree);
2022 if (errcode != REG_OK)
2023 ERROR_EXIT(errcode);
2024
2025 counts = xmalloc(sizeof(int) * parse_ctx.position);
2026 if (counts == NULL)
2027 ERROR_EXIT(REG_ESPACE);
2028
2029 offs = xmalloc(sizeof(int) * parse_ctx.position);
2030 if (offs == NULL)
2031 ERROR_EXIT(REG_ESPACE);
2032
2033 for (i = 0; i < parse_ctx.position; i++)
2034 counts[i] = 0;
2035 tre_ast_to_tnfa(tree, NULL, counts, NULL);
2036
2037 add = 0;
2038 for (i = 0; i < parse_ctx.position; i++)
2039 {
2040 offs[i] = add;
2041 add += counts[i] + 1;
2042 counts[i] = 0;
2043 }
2044 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2045 if (transitions == NULL)
2046 ERROR_EXIT(REG_ESPACE);
2047 tnfa->transitions = transitions;
2048 tnfa->num_transitions = add;
2049
2050 DPRINT(("Converting to TNFA:\n"));
2051 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2052 if (errcode != REG_OK)
2053 ERROR_EXIT(errcode);
2054
2055 /* If in eight bit mode, compute a table of characters that can be the
2056 first character of a match. */
2057 tnfa->first_char = -1;
2058 if (TRE_MB_CUR_MAX == 1 && !tmp_ast_l->nullable)
2059 {
2060 int count = 0;
2061 tre_cint_t k;
2062 DPRINT(("Characters that can start a match:"));
2063 tnfa->firstpos_chars = xcalloc(256, sizeof(char));
2064 if (tnfa->firstpos_chars == NULL)
2065 ERROR_EXIT(REG_ESPACE);
2066 for (p = tree->firstpos; p->position >= 0; p++)
2067 {
2068 tre_tnfa_transition_t *j = transitions + offs[p->position];
2069 while (j->state != NULL)
2070 {
2071 for (k = j->code_min; k <= j->code_max && k < 256; k++)
2072 {
2073 DPRINT((" %d", k));
2074 tnfa->firstpos_chars[k] = 1;
2075 count++;
2076 }
2077 j++;
2078 }
2079 }
2080 DPRINT(("\n"));
2081 #define TRE_OPTIMIZE_FIRST_CHAR 1
2082 #if TRE_OPTIMIZE_FIRST_CHAR
2083 if (count == 1)
2084 {
2085 for (k = 0; k < 256; k++)
2086 if (tnfa->firstpos_chars[k])
2087 {
2088 DPRINT(("first char must be %d\n", k));
2089 tnfa->first_char = k;
2090 xfree(tnfa->firstpos_chars);
2091 tnfa->firstpos_chars = NULL;
2092 break;
2093 }
2094 }
2095 #endif
2096
2097 }
2098 else
2099 tnfa->firstpos_chars = NULL;
2100
2101
2102 p = tree->firstpos;
2103 i = 0;
2104 while (p->position >= 0)
2105 {
2106 i++;
2107
2108 #ifdef TRE_DEBUG
2109 {
2110 int *tags;
2111 DPRINT(("initial: %d", p->position));
2112 tags = p->tags;
2113 if (tags != NULL)
2114 {
2115 if (*tags >= 0)
2116 DPRINT(("/"));
2117 while (*tags >= 0)
2118 {
2119 DPRINT(("%d", *tags));
2120 tags++;
2121 if (*tags >= 0)
2122 DPRINT((","));
2123 }
2124 }
2125 DPRINT((", assert %d", p->assertions));
2126 if (p->params)
2127 {
2128 DPRINT((", "));
2129 tre_print_params(p->params);
2130 }
2131 DPRINT(("\n"));
2132 }
2133 #endif /* TRE_DEBUG */
2134
2135 p++;
2136 }
2137
2138 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2139 if (initial == NULL)
2140 ERROR_EXIT(REG_ESPACE);
2141 tnfa->initial = initial;
2142
2143 i = 0;
2144 for (p = tree->firstpos; p->position >= 0; p++)
2145 {
2146 initial[i].state = transitions + offs[p->position];
2147 initial[i].state_id = p->position;
2148 initial[i].tags = NULL;
2149 /* Copy the arrays p->tags, and p->params, they are allocated
2150 from a tre_mem object. */
2151 if (p->tags)
2152 {
2153 int j;
2154 for (j = 0; p->tags[j] >= 0; j++);
2155 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2156 if (!initial[i].tags)
2157 ERROR_EXIT(REG_ESPACE);
2158 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2159 }
2160 initial[i].params = NULL;
2161 if (p->params)
2162 {
2163 initial[i].params = xmalloc(sizeof(*p->params) * TRE_PARAM_LAST);
2164 if (!initial[i].params)
2165 ERROR_EXIT(REG_ESPACE);
2166 memcpy(initial[i].params, p->params,
2167 sizeof(*p->params) * TRE_PARAM_LAST);
2168 }
2169 initial[i].assertions = p->assertions;
2170 i++;
2171 }
2172 initial[i].state = NULL;
2173
2174 tnfa->num_transitions = add;
2175 tnfa->final = transitions + offs[tree->lastpos[0].position];
2176 tnfa->num_states = parse_ctx.position;
2177 tnfa->cflags = cflags;
2178
2179 DPRINT(("final state %p\n", (void *)tnfa->final));
2180
2181 tre_mem_destroy(mem);
2182 tre_stack_destroy(stack);
2183 xfree(counts);
2184 xfree(offs);
2185
2186 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2187 return REG_OK;
2188
2189 error_exit:
2190 /* Free everything that was allocated and return the error code. */
2191 tre_mem_destroy(mem);
2192 if (stack != NULL)
2193 tre_stack_destroy(stack);
2194 if (counts != NULL)
2195 xfree(counts);
2196 if (offs != NULL)
2197 xfree(offs);
2198 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2199 tre_free(preg);
2200 return errcode;
2201 }
2202
2203
2204
2205
2206 void
tre_free(regex_t * preg)2207 tre_free(regex_t *preg)
2208 {
2209 tre_tnfa_t *tnfa;
2210 unsigned int i;
2211 tre_tnfa_transition_t *trans;
2212
2213 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2214 if (!tnfa)
2215 return;
2216
2217 for (i = 0; i < tnfa->num_transitions; i++)
2218 if (tnfa->transitions[i].state)
2219 {
2220 if (tnfa->transitions[i].tags)
2221 xfree(tnfa->transitions[i].tags);
2222 if (tnfa->transitions[i].neg_classes)
2223 xfree(tnfa->transitions[i].neg_classes);
2224 if (tnfa->transitions[i].params)
2225 xfree(tnfa->transitions[i].params);
2226 }
2227 if (tnfa->transitions)
2228 xfree(tnfa->transitions);
2229
2230 if (tnfa->initial)
2231 {
2232 for (trans = tnfa->initial; trans->state; trans++)
2233 {
2234 if (trans->tags)
2235 xfree(trans->tags);
2236 if (trans->params)
2237 xfree(trans->params);
2238 }
2239 xfree(tnfa->initial);
2240 }
2241
2242 if (tnfa->submatch_data)
2243 {
2244 for (i = 0; i < tnfa->num_submatches; i++)
2245 if (tnfa->submatch_data[i].parents)
2246 xfree(tnfa->submatch_data[i].parents);
2247 xfree(tnfa->submatch_data);
2248 }
2249
2250 if (tnfa->tag_directions)
2251 xfree(tnfa->tag_directions);
2252 if (tnfa->firstpos_chars)
2253 xfree(tnfa->firstpos_chars);
2254 if (tnfa->minimal_tags)
2255 xfree(tnfa->minimal_tags);
2256 xfree(tnfa);
2257 }
2258
2259 char *
tre_version(void)2260 tre_version(void)
2261 {
2262 static char str[256];
2263 char *version;
2264
2265 if (str[0] == 0)
2266 {
2267 (void) tre_config(TRE_CONFIG_VERSION, &version);
2268 assert(strlen(version) < 200);
2269 #if defined(_MSC_VER)
2270 (void) _snprintf(str, sizeof(str), "TRE %s R_fixes (BSD)", version);
2271 #else
2272 (void) snprintf(str, sizeof(str), "TRE %s R_fixes (BSD)", version);
2273 #endif
2274 }
2275 return str;
2276 }
2277
2278 int
tre_config(int query,void * result)2279 tre_config(int query, void *result)
2280 {
2281 int *int_result = result;
2282 const char **string_result = result;
2283
2284 switch (query)
2285 {
2286 case TRE_CONFIG_APPROX:
2287 #ifdef TRE_APPROX
2288 *int_result = 1;
2289 #else /* !TRE_APPROX */
2290 *int_result = 0;
2291 #endif /* !TRE_APPROX */
2292 return REG_OK;
2293
2294 case TRE_CONFIG_WCHAR:
2295 #ifdef TRE_WCHAR
2296 *int_result = 1;
2297 #else /* !TRE_WCHAR */
2298 *int_result = 0;
2299 #endif /* !TRE_WCHAR */
2300 return REG_OK;
2301
2302 case TRE_CONFIG_MULTIBYTE:
2303 #ifdef TRE_MULTIBYTE
2304 *int_result = 1;
2305 #else /* !TRE_MULTIBYTE */
2306 *int_result = 0;
2307 #endif /* !TRE_MULTIBYTE */
2308 return REG_OK;
2309
2310 case TRE_CONFIG_SYSTEM_ABI:
2311 #ifdef TRE_CONFIG_SYSTEM_ABI
2312 *int_result = 1;
2313 #else /* !TRE_CONFIG_SYSTEM_ABI */
2314 *int_result = 0;
2315 #endif /* !TRE_CONFIG_SYSTEM_ABI */
2316 return REG_OK;
2317
2318 case TRE_CONFIG_VERSION:
2319 *string_result = TRE_VERSION;
2320 return REG_OK;
2321 }
2322
2323 return REG_NOMATCH;
2324 }
2325
2326
2327 /* EOF */
2328