1 /*
2   tre-compile.c - TRE regex compiler
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   TODO:
11    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
12      function calls.
13 */
14 
15 
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <stdio.h>
20 //#include <assert.h>
21 #include <string.h>
22 
23 #include "tre-internal.h"
24 #include "tre-mem.h"
25 #include "tre-stack.h"
26 #include "tre-ast.h"
27 #include "tre-parse.h"
28 #include "tre-compile.h"
29 #include "tre.h"
30 #include "xmalloc.h"
31 
32 #define assert(a) R_assert(a)
33 
34 /*
35   Algorithms to setup tags so that submatch addressing can be done.
36 */
37 
38 
39 /* Inserts a catenation node to the root of the tree given in `node'.
40    As the left child a new tag with number `tag_id' to `node' is added,
41    and the right child is the old root. */
42 static reg_errcode_t
tre_add_tag_left(tre_mem_t mem,tre_ast_node_t * node,int tag_id)43 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
44 {
45   tre_catenation_t *c;
46 
47   DPRINT(("add_tag_left: tag %d\n", tag_id));
48 
49   c = tre_mem_alloc(mem, sizeof(*c));
50   if (c == NULL)
51     return REG_ESPACE;
52   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
53   if (c->left == NULL)
54     return REG_ESPACE;
55   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
56   if (c->right == NULL)
57     return REG_ESPACE;
58 
59   c->right->obj = node->obj;
60   c->right->type = node->type;
61   c->right->nullable = -1;
62   c->right->submatch_id = -1;
63   c->right->firstpos = NULL;
64   c->right->lastpos = NULL;
65   c->right->num_tags = 0;
66   node->obj = c;
67   node->type = CATENATION;
68   return REG_OK;
69 }
70 
71 /* Inserts a catenation node to the root of the tree given in `node'.
72    As the right child a new tag with number `tag_id' to `node' is added,
73    and the left child is the old root. */
74 static reg_errcode_t
tre_add_tag_right(tre_mem_t mem,tre_ast_node_t * node,int tag_id)75 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
76 {
77   tre_catenation_t *c;
78 
79   DPRINT(("tre_add_tag_right: tag %d\n", tag_id));
80 
81   c = tre_mem_alloc(mem, sizeof(*c));
82   if (c == NULL)
83     return REG_ESPACE;
84   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
85   if (c->right == NULL)
86     return REG_ESPACE;
87   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
88   if (c->left == NULL)
89     return REG_ESPACE;
90 
91   c->left->obj = node->obj;
92   c->left->type = node->type;
93   c->left->nullable = -1;
94   c->left->submatch_id = -1;
95   c->left->firstpos = NULL;
96   c->left->lastpos = NULL;
97   c->left->num_tags = 0;
98   node->obj = c;
99   node->type = CATENATION;
100   return REG_OK;
101 }
102 
103 typedef enum {
104   ADDTAGS_RECURSE,
105   ADDTAGS_AFTER_ITERATION,
106   ADDTAGS_AFTER_UNION_LEFT,
107   ADDTAGS_AFTER_UNION_RIGHT,
108   ADDTAGS_AFTER_CAT_LEFT,
109   ADDTAGS_AFTER_CAT_RIGHT,
110   ADDTAGS_SET_SUBMATCH_END
111 } tre_addtags_symbol_t;
112 
113 
114 typedef struct {
115   int tag;
116   int next_tag;
117 } tre_tag_states_t;
118 
119 
120 /* Go through `regset' and set submatch data for submatches that are
121    using this tag. */
122 static void
tre_purge_regset(int * regset,tre_tnfa_t * tnfa,int tag)123 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
124 {
125   int i;
126 
127   for (i = 0; regset[i] >= 0; i++)
128     {
129       int id = regset[i] / 2;
130       int start = !(regset[i] % 2);
131       DPRINT(("  Using tag %d for %s offset of "
132 	      "submatch %d\n", tag,
133 	      start ? "start" : "end", id));
134       if (start)
135 	tnfa->submatch_data[id].so_tag = tag;
136       else
137 	tnfa->submatch_data[id].eo_tag = tag;
138     }
139   regset[0] = -1;
140 }
141 
142 
143 /* Adds tags to appropriate locations in the parse tree in `tree', so that
144    subexpressions marked for submatch addressing can be traced. */
145 static reg_errcode_t
tre_add_tags(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree,tre_tnfa_t * tnfa)146 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
147 	     tre_tnfa_t *tnfa)
148 {
149   reg_errcode_t status = REG_OK;
150   tre_addtags_symbol_t symbol;
151   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
152   int bottom = tre_stack_num_objects(stack);
153   /* True for first pass (counting number of needed tags) */
154   int first_pass = (mem == NULL || tnfa == NULL);
155   int *regset, *orig_regset;
156   int num_tags = 0; /* Total number of tags. */
157   int num_minimals = 0;	 /* Number of special minimal tags. */
158   int tag = 0;	    /* The tag that is to be added next. */
159   int next_tag = 1; /* Next tag to use after this one. */
160   int *parents;	    /* Stack of submatches the current submatch is
161 		       contained in. */
162   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
163   tre_tag_states_t *saved_states;
164 
165   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
166   if (!first_pass)
167     {
168       tnfa->end_tag = 0;
169       tnfa->minimal_tags[0] = -1;
170     }
171 
172   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
173   if (regset == NULL)
174     return REG_ESPACE;
175   regset[0] = -1;
176   orig_regset = regset;
177 
178   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
179   if (parents == NULL)
180     {
181       xfree(regset);
182       return REG_ESPACE;
183     }
184   parents[0] = -1;
185 
186   saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
187   if (saved_states == NULL)
188     {
189       xfree(regset);
190       xfree(parents);
191       return REG_ESPACE;
192     }
193   else
194     {
195       unsigned int i;
196       for (i = 0; i <= tnfa->num_submatches; i++)
197 	saved_states[i].tag = -1;
198     }
199 
200   STACK_PUSH(stack, voidptr, node);
201   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
202 
203   while (tre_stack_num_objects(stack) > bottom)
204     {
205       if (status != REG_OK)
206 	break;
207 
208       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
209       switch (symbol)
210 	{
211 
212 	case ADDTAGS_SET_SUBMATCH_END:
213 	  {
214 	    int id = tre_stack_pop_int(stack);
215 	    int i;
216 
217 	    /* Add end of this submatch to regset. */
218 	    for (i = 0; regset[i] >= 0; i++);
219 	    regset[i] = id * 2 + 1;
220 	    regset[i + 1] = -1;
221 
222 	    /* Pop this submatch from the parents stack. */
223 	    for (i = 0; parents[i] >= 0; i++);
224 	    parents[i - 1] = -1;
225 	    break;
226 	  }
227 
228 	case ADDTAGS_RECURSE:
229 	  node = tre_stack_pop_voidptr(stack);
230 
231 	  if (node->submatch_id >= 0)
232 	    {
233 	      int id = node->submatch_id;
234 	      int i;
235 
236 
237 	      /* Add start of this submatch to regset. */
238 	      for (i = 0; regset[i] >= 0; i++);
239 	      regset[i] = id * 2;
240 	      regset[i + 1] = -1;
241 
242 	      if (!first_pass)
243 		{
244 		  for (i = 0; parents[i] >= 0; i++);
245 		  tnfa->submatch_data[id].parents = NULL;
246 		  if (i > 0)
247 		    {
248 		      int *p = xmalloc(sizeof(*p) * (i + 1));
249 		      if (p == NULL)
250 			{
251 			  status = REG_ESPACE;
252 			  break;
253 			}
254 		      assert(tnfa->submatch_data[id].parents == NULL);
255 		      tnfa->submatch_data[id].parents = p;
256 		      for (i = 0; parents[i] >= 0; i++)
257 			p[i] = parents[i];
258 		      p[i] = -1;
259 		    }
260 		}
261 
262 	      /* Add end of this submatch to regset after processing this
263 		 node. */
264 	      STACK_PUSHX(stack, int, node->submatch_id);
265 	      STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
266 	    }
267 
268 	  switch (node->type)
269 	    {
270 	    case LITERAL:
271 	      {
272 		tre_literal_t *lit = node->obj;
273 
274 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
275 		  {
276 		    int i;
277 		    DPRINT(("Literal %d-%d\n",
278 			    (int)lit->code_min, (int)lit->code_max));
279 		    if (regset[0] >= 0)
280 		      {
281 			/* Regset is not empty, so add a tag before the
282 			   literal or backref. */
283 			if (!first_pass)
284 			  {
285 			    status = tre_add_tag_left(mem, node, tag);
286 			    tnfa->tag_directions[tag] = direction;
287 			    if (minimal_tag >= 0)
288 			      {
289 				DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
290 				for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
291 				tnfa->minimal_tags[i] = tag;
292 				tnfa->minimal_tags[i + 1] = minimal_tag;
293 				tnfa->minimal_tags[i + 2] = -1;
294 				minimal_tag = -1;
295 				num_minimals++;
296 			      }
297 			    tre_purge_regset(regset, tnfa, tag);
298 			  }
299 			else
300 			  {
301 			    DPRINT(("  num_tags = 1\n"));
302 			    node->num_tags = 1;
303 			  }
304 
305 			DPRINT(("  num_tags++\n"));
306 			regset[0] = -1;
307 			tag = next_tag;
308 			num_tags++;
309 			next_tag++;
310 		      }
311 		  }
312 		else
313 		  {
314 		    assert(!IS_TAG(lit));
315 		  }
316 		break;
317 	      }
318 	    case CATENATION:
319 	      {
320 		tre_catenation_t *cat = node->obj;
321 		tre_ast_node_t *left = cat->left;
322 		tre_ast_node_t *right = cat->right;
323 		int reserved_tag = -1;
324 		DPRINT(("Catenation, next_tag = %d\n", next_tag));
325 
326 
327 		/* After processing right child. */
328 		STACK_PUSHX(stack, voidptr, node);
329 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
330 
331 		/* Process right child. */
332 		STACK_PUSHX(stack, voidptr, right);
333 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
334 
335 		/* After processing left child. */
336 		STACK_PUSHX(stack, int, next_tag + left->num_tags);
337 		DPRINT(("  Pushing %d for after left\n",
338 			next_tag + left->num_tags));
339 		if (left->num_tags > 0 && right->num_tags > 0)
340 		  {
341 		    /* Reserve the next tag to the right child. */
342 		    DPRINT(("  Reserving next_tag %d to right child\n",
343 			    next_tag));
344 		    reserved_tag = next_tag;
345 		    next_tag++;
346 		  }
347 		STACK_PUSHX(stack, int, reserved_tag);
348 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
349 
350 		/* Process left child. */
351 		STACK_PUSHX(stack, voidptr, left);
352 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
353 
354 		}
355 	      break;
356 	    case ITERATION:
357 	      {
358 		tre_iteration_t *iter = node->obj;
359 		DPRINT(("Iteration\n"));
360 
361 		if (first_pass)
362 		  {
363 		    STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
364 		  }
365 		else
366 		  {
367 		    STACK_PUSHX(stack, int, tag);
368 		    STACK_PUSHX(stack, int, iter->minimal);
369 		  }
370 		STACK_PUSHX(stack, voidptr, node);
371 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
372 
373 		STACK_PUSHX(stack, voidptr, iter->arg);
374 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
375 
376 		/* Regset is not empty, so add a tag here. */
377 		if (regset[0] >= 0 || iter->minimal)
378 		  {
379 		    if (!first_pass)
380 		      {
381 			int i;
382 			status = tre_add_tag_left(mem, node, tag);
383 			if (iter->minimal)
384 			  tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
385 			else
386 			  tnfa->tag_directions[tag] = direction;
387 			if (minimal_tag >= 0)
388 			  {
389 			    DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
390 			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
391 			    tnfa->minimal_tags[i] = tag;
392 			    tnfa->minimal_tags[i + 1] = minimal_tag;
393 			    tnfa->minimal_tags[i + 2] = -1;
394 			    minimal_tag = -1;
395 			    num_minimals++;
396 			  }
397 			tre_purge_regset(regset, tnfa, tag);
398 		      }
399 
400 		    DPRINT(("  num_tags++\n"));
401 		    regset[0] = -1;
402 		    tag = next_tag;
403 		    num_tags++;
404 		    next_tag++;
405 		  }
406 		direction = TRE_TAG_MINIMIZE;
407 	      }
408 	      break;
409 	    case UNION:
410 	      {
411 		tre_union_t *uni = node->obj;
412 		tre_ast_node_t *left = uni->left;
413 		tre_ast_node_t *right = uni->right;
414 		int left_tag;
415 		int right_tag;
416 
417 		if (regset[0] >= 0)
418 		  {
419 		    left_tag = next_tag;
420 		    right_tag = next_tag + 1;
421 		  }
422 		else
423 		  {
424 		    left_tag = tag;
425 		    right_tag = next_tag;
426 		  }
427 
428 		DPRINT(("Union\n"));
429 
430 		/* After processing right child. */
431 		STACK_PUSHX(stack, int, right_tag);
432 		STACK_PUSHX(stack, int, left_tag);
433 		STACK_PUSHX(stack, voidptr, regset);
434 		STACK_PUSHX(stack, int, regset[0] >= 0);
435 		STACK_PUSHX(stack, voidptr, node);
436 		STACK_PUSHX(stack, voidptr, right);
437 		STACK_PUSHX(stack, voidptr, left);
438 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
439 
440 		/* Process right child. */
441 		STACK_PUSHX(stack, voidptr, right);
442 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
443 
444 		/* After processing left child. */
445 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
446 
447 		/* Process left child. */
448 		STACK_PUSHX(stack, voidptr, left);
449 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
450 
451 		/* Regset is not empty, so add a tag here. */
452 		if (regset[0] >= 0)
453 		  {
454 		    if (!first_pass)
455 		      {
456 			int i;
457 			status = tre_add_tag_left(mem, node, tag);
458 			tnfa->tag_directions[tag] = direction;
459 			if (minimal_tag >= 0)
460 			  {
461 			    DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
462 			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
463 			    tnfa->minimal_tags[i] = tag;
464 			    tnfa->minimal_tags[i + 1] = minimal_tag;
465 			    tnfa->minimal_tags[i + 2] = -1;
466 			    minimal_tag = -1;
467 			    num_minimals++;
468 			  }
469 			tre_purge_regset(regset, tnfa, tag);
470 		      }
471 
472 		    DPRINT(("  num_tags++\n"));
473 		    regset[0] = -1;
474 		    tag = next_tag;
475 		    num_tags++;
476 		    next_tag++;
477 		  }
478 
479 		if (node->num_submatches > 0)
480 		  {
481 		    /* The next two tags are reserved for markers. */
482 		    next_tag++;
483 		    tag = next_tag;
484 		    next_tag++;
485 		  }
486 
487 		break;
488 	      }
489 	    }
490 
491 	  if (node->submatch_id >= 0)
492 	    {
493 	      int i;
494 	      /* Push this submatch on the parents stack. */
495 	      for (i = 0; parents[i] >= 0; i++);
496 	      parents[i] = node->submatch_id;
497 	      parents[i + 1] = -1;
498 	    }
499 
500 	  break; /* end case: ADDTAGS_RECURSE */
501 
502 	case ADDTAGS_AFTER_ITERATION:
503 	  {
504 	    int minimal = 0;
505 	    int enter_tag;
506 	    node = tre_stack_pop_voidptr(stack);
507 	    if (first_pass)
508 	      {
509 		node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
510 		  + tre_stack_pop_int(stack);
511 		minimal_tag = -1;
512 	      }
513 	    else
514 	      {
515 		minimal = tre_stack_pop_int(stack);
516 		enter_tag = tre_stack_pop_int(stack);
517 		if (minimal)
518 		  minimal_tag = enter_tag;
519 	      }
520 
521 	    DPRINT(("After iteration\n"));
522 	    if (!first_pass)
523 	      {
524 		DPRINT(("  Setting direction to %s\n",
525 			minimal ? "minimize" : "maximize"));
526 		if (minimal)
527 		  direction = TRE_TAG_MINIMIZE;
528 		else
529 		  direction = TRE_TAG_MAXIMIZE;
530 	      }
531 	    break;
532 	  }
533 
534 	case ADDTAGS_AFTER_CAT_LEFT:
535 	  {
536 	    int new_tag = tre_stack_pop_int(stack);
537 	    next_tag = tre_stack_pop_int(stack);
538 	    DPRINT(("After cat left, tag = %d, next_tag = %d\n",
539 		    tag, next_tag));
540 	    if (new_tag >= 0)
541 	      {
542 		DPRINT(("  Setting tag to %d\n", new_tag));
543 		tag = new_tag;
544 	      }
545 	    break;
546 	  }
547 
548 	case ADDTAGS_AFTER_CAT_RIGHT:
549 	  DPRINT(("After cat right\n"));
550 	  node = tre_stack_pop_voidptr(stack);
551 	  if (first_pass)
552 	    node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
553 	      + ((tre_catenation_t *)node->obj)->right->num_tags;
554 	  break;
555 
556 	case ADDTAGS_AFTER_UNION_LEFT:
557 	  DPRINT(("After union left\n"));
558 	  /* Lift the bottom of the `regset' array so that when processing
559 	     the right operand the items currently in the array are
560 	     invisible.	 The original bottom was saved at ADDTAGS_UNION and
561 	     will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
562 	  while (*regset >= 0)
563 	    regset++;
564 	  break;
565 
566 	case ADDTAGS_AFTER_UNION_RIGHT:
567 	  {
568 	    int added_tags, tag_left, tag_right;
569 	    tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
570 	    tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
571 	    DPRINT(("After union right\n"));
572 	    node = tre_stack_pop_voidptr(stack);
573 	    added_tags = tre_stack_pop_int(stack);
574 	    if (first_pass)
575 	      {
576 		node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
577 		  + ((tre_union_t *)node->obj)->right->num_tags + added_tags
578 		  + ((node->num_submatches > 0) ? 2 : 0);
579 	      }
580 	    regset = tre_stack_pop_voidptr(stack);
581 	    tag_left = tre_stack_pop_int(stack);
582 	    tag_right = tre_stack_pop_int(stack);
583 
584 	    /* Add tags after both children, the left child gets a smaller
585 	       tag than the right child.  This guarantees that we prefer
586 	       the left child over the right child. */
587 	    /* XXX - This is not always necessary (if the children have
588 	       tags which must be seen for every match of that child). */
589 	    /* XXX - Check if this is the only place where tre_add_tag_right
590 	       is used.	 If so, use tre_add_tag_left (putting the tag before
591 	       the child as opposed after the child) and throw away
592 	       tre_add_tag_right. */
593 	    if (node->num_submatches > 0)
594 	      {
595 		if (!first_pass)
596 		  {
597 		    status = tre_add_tag_right(mem, left, tag_left);
598 		    tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
599 		    status = tre_add_tag_right(mem, right, tag_right);
600 		    tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
601 		  }
602 		DPRINT(("  num_tags += 2\n"));
603 		num_tags += 2;
604 	      }
605 	    direction = TRE_TAG_MAXIMIZE;
606 	    break;
607 	  }
608 
609 	default:
610 	  assert(0);
611 	  break;
612 
613 	} /* end switch(symbol) */
614     } /* end while(tre_stack_num_objects(stack) > bottom) */
615 
616   if (!first_pass)
617     tre_purge_regset(regset, tnfa, tag);
618 
619   if (!first_pass && minimal_tag >= 0)
620     {
621       int i;
622       DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
623       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
624       tnfa->minimal_tags[i] = tag;
625       tnfa->minimal_tags[i + 1] = minimal_tag;
626       tnfa->minimal_tags[i + 2] = -1;
627       minimal_tag = -1;
628       num_minimals++;
629     }
630 
631   DPRINT(("tre_add_tags: %s complete.  Number of tags %d.\n",
632 	  first_pass? "First pass" : "Second pass", num_tags));
633 
634   assert(tree->num_tags == num_tags);
635   tnfa->end_tag = num_tags;
636   tnfa->num_tags = num_tags;
637   tnfa->num_minimals = num_minimals;
638   xfree(orig_regset);
639   xfree(parents);
640   xfree(saved_states);
641   return status;
642 }
643 
644 
645 
646 /*
647   AST to TNFA compilation routines.
648 */
649 
650 typedef enum {
651   COPY_RECURSE,
652   COPY_SET_RESULT_PTR
653 } tre_copyast_symbol_t;
654 
655 /* Flags for tre_copy_ast(). */
656 #define COPY_REMOVE_TAGS	 1
657 #define COPY_MAXIMIZE_FIRST_TAG	 2
658 
659 static reg_errcode_t
tre_copy_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int flags,int * pos_add,tre_tag_direction_t * tag_directions,tre_ast_node_t ** copy,int * max_pos)660 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
661 	     int flags, int *pos_add, tre_tag_direction_t *tag_directions,
662 	     tre_ast_node_t **copy, int *max_pos)
663 {
664   reg_errcode_t status = REG_OK;
665   int bottom = tre_stack_num_objects(stack);
666   int num_copied = 0;
667   int first_tag = 1;
668   tre_ast_node_t **result = copy;
669   tre_copyast_symbol_t symbol;
670 
671   STACK_PUSH(stack, voidptr, ast);
672   STACK_PUSH(stack, int, COPY_RECURSE);
673 
674   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
675     {
676       tre_ast_node_t *node;
677       if (status != REG_OK)
678 	break;
679 
680       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
681       switch (symbol)
682 	{
683 	case COPY_SET_RESULT_PTR:
684 	  result = tre_stack_pop_voidptr(stack);
685 	  break;
686 	case COPY_RECURSE:
687 	  node = tre_stack_pop_voidptr(stack);
688 	  switch (node->type)
689 	    {
690 	    case LITERAL:
691 	      {
692 		tre_literal_t *lit = node->obj;
693 		int pos = lit->position;
694 		int min = (int)lit->code_min;
695 		int max = (int)lit->code_max;
696 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
697 		  {
698 		    /* XXX - e.g. [ab] has only one position but two
699 		       nodes, so we are creating holes in the state space
700 		       here.  Not fatal, just wastes memory. */
701 		    pos += *pos_add;
702 		    num_copied++;
703 		  }
704 		else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
705 		  {
706 		    /* Change this tag to empty. */
707 		    min = EMPTY;
708 		    max = pos = -1;
709 		  }
710 		else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
711 			 && first_tag)
712 		  {
713 		    /* Maximize the first tag. */
714 		    tag_directions[max] = TRE_TAG_MAXIMIZE;
715 		    first_tag = 0;
716 		  }
717 		*result = tre_ast_new_literal(mem, min, max, pos);
718 		if (*result == NULL)
719 		  status = REG_ESPACE;
720 
721                 ((tre_literal_t*)(*result)->obj)->u.class = lit->u.class;
722 		if (pos > *max_pos)
723 		  *max_pos = pos;
724 		break;
725 	      }
726 	    case UNION:
727 	      {
728 		tre_union_t *uni = node->obj;
729 		tre_union_t *tmp;
730 		*result = tre_ast_new_union(mem, uni->left, uni->right);
731 		if (*result == NULL)
732 		  {
733 		    status = REG_ESPACE;
734 		    break;
735 		  }
736 		tmp = (*result)->obj;
737 		result = &tmp->left;
738 		STACK_PUSHX(stack, voidptr, uni->right);
739 		STACK_PUSHX(stack, int, COPY_RECURSE);
740 		STACK_PUSHX(stack, voidptr, &tmp->right);
741 		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
742 		STACK_PUSHX(stack, voidptr, uni->left);
743 		STACK_PUSHX(stack, int, COPY_RECURSE);
744 		break;
745 	      }
746 	    case CATENATION:
747 	      {
748 		tre_catenation_t *cat = node->obj;
749 		tre_catenation_t *tmp;
750 		*result = tre_ast_new_catenation(mem, cat->left, cat->right);
751 		if (*result == NULL)
752 		  {
753 		    status = REG_ESPACE;
754 		    break;
755 		  }
756 		tmp = (*result)->obj;
757 		tmp->left = NULL;
758 		tmp->right = NULL;
759 		result = &tmp->left;
760 
761 		STACK_PUSHX(stack, voidptr, cat->right);
762 		STACK_PUSHX(stack, int, COPY_RECURSE);
763 		STACK_PUSHX(stack, voidptr, &tmp->right);
764 		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
765 		STACK_PUSHX(stack, voidptr, cat->left);
766 		STACK_PUSHX(stack, int, COPY_RECURSE);
767 		break;
768 	      }
769 	    case ITERATION:
770 	      {
771 		tre_iteration_t *iter = node->obj;
772 		STACK_PUSHX(stack, voidptr, iter->arg);
773 		STACK_PUSHX(stack, int, COPY_RECURSE);
774 		*result = tre_ast_new_iter(mem, iter->arg, iter->min,
775 					   iter->max, iter->minimal);
776 		if (*result == NULL)
777 		  {
778 		    status = REG_ESPACE;
779 		    break;
780 		  }
781 		iter = (*result)->obj;
782 		result = &iter->arg;
783 		break;
784 	      }
785 	    default:
786 	      assert(0);
787 	      break;
788 	    }
789 	  break;
790 	}
791     }
792   *pos_add += num_copied;
793   return status;
794 }
795 
796 typedef enum {
797   EXPAND_RECURSE,
798   EXPAND_AFTER_ITER
799 } tre_expand_ast_symbol_t;
800 
801 /* Expands each iteration node that has a finite nonzero minimum or maximum
802    iteration count to a catenated sequence of copies of the node. */
803 static reg_errcode_t
tre_expand_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int * position,tre_tag_direction_t * tag_directions,int * max_depth)804 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
805 	       int *position, tre_tag_direction_t *tag_directions,
806 	       int *max_depth)
807 {
808   reg_errcode_t status = REG_OK;
809   int bottom = tre_stack_num_objects(stack);
810   int pos_add = 0;
811   int pos_add_total = 0;
812   int max_pos = 0;
813   /* Current approximate matching parameters. */
814   int params[TRE_PARAM_LAST];
815   /* Approximate parameter nesting level. */
816   int params_depth = 0;
817   int iter_depth = 0;
818   int i;
819 
820   for (i = 0; i < TRE_PARAM_LAST; i++)
821     params[i] = TRE_PARAM_DEFAULT;
822 
823   STACK_PUSHR(stack, voidptr, ast);
824   STACK_PUSHR(stack, int, EXPAND_RECURSE);
825   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
826     {
827       tre_ast_node_t *node;
828       tre_expand_ast_symbol_t symbol;
829 
830       if (status != REG_OK)
831 	break;
832 
833       DPRINT(("pos_add %d\n", pos_add));
834 
835       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
836       node = tre_stack_pop_voidptr(stack);
837       switch (symbol)
838 	{
839 	case EXPAND_RECURSE:
840 	  switch (node->type)
841 	    {
842 	    case LITERAL:
843 	      {
844 		tre_literal_t *lit= node->obj;
845 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
846 		  {
847 		    lit->position += pos_add;
848 		    if (lit->position > max_pos)
849 		      max_pos = lit->position;
850 		  }
851 		break;
852 	      }
853 	    case UNION:
854 	      {
855 		tre_union_t *uni = node->obj;
856 		STACK_PUSHX(stack, voidptr, uni->right);
857 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
858 		STACK_PUSHX(stack, voidptr, uni->left);
859 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
860 		break;
861 	      }
862 	    case CATENATION:
863 	      {
864 		tre_catenation_t *cat = node->obj;
865 		STACK_PUSHX(stack, voidptr, cat->right);
866 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
867 		STACK_PUSHX(stack, voidptr, cat->left);
868 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
869 		break;
870 	      }
871 	    case ITERATION:
872 	      {
873 		tre_iteration_t *iter = node->obj;
874 		STACK_PUSHX(stack, int, pos_add);
875 		STACK_PUSHX(stack, voidptr, node);
876 		STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
877 		STACK_PUSHX(stack, voidptr, iter->arg);
878 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
879 		/* If we are going to expand this node at EXPAND_AFTER_ITER
880 		   then don't increase the `pos' fields of the nodes now, it
881 		   will get done when expanding. */
882 		if (iter->min > 1 || iter->max > 1)
883 		  pos_add = 0;
884 		iter_depth++;
885 		DPRINT(("iter\n"));
886 		break;
887 	      }
888 	    default:
889 	      assert(0);
890 	      break;
891 	    }
892 	  break;
893 	case EXPAND_AFTER_ITER:
894 	  {
895 	    tre_iteration_t *iter = node->obj;
896 	    int pos_add_last;
897 	    pos_add = tre_stack_pop_int(stack);
898 	    pos_add_last = pos_add;
899 	    if (iter->min > 1 || iter->max > 1)
900 	      {
901 		tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
902 		int j;
903 		int pos_add_save = pos_add;
904 
905 		/* Create a catenated sequence of copies of the node. */
906 		for (j = 0; j < iter->min; j++)
907 		  {
908 		    tre_ast_node_t *copy;
909 		    /* Remove tags from all but the last copy. */
910 		    int flags = ((j + 1 < iter->min)
911 				 ? COPY_REMOVE_TAGS
912 				 : COPY_MAXIMIZE_FIRST_TAG);
913 		    DPRINT(("  pos_add %d\n", pos_add));
914 		    pos_add_save = pos_add;
915 		    status = tre_copy_ast(mem, stack, iter->arg, flags,
916 					  &pos_add, tag_directions, &copy,
917 					  &max_pos);
918 		    if (status != REG_OK)
919 		      return status;
920 		    if (seq1 != NULL)
921 		      seq1 = tre_ast_new_catenation(mem, seq1, copy);
922 		    else
923 		      seq1 = copy;
924 		    if (seq1 == NULL)
925 		      return REG_ESPACE;
926 		  }
927 
928 		if (iter->max == -1)
929 		  {
930 		    /* No upper limit. */
931 		    pos_add_save = pos_add;
932 		    status = tre_copy_ast(mem, stack, iter->arg, 0,
933 					  &pos_add, NULL, &seq2, &max_pos);
934 		    if (status != REG_OK)
935 		      return status;
936 		    seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
937 		    if (seq2 == NULL)
938 		      return REG_ESPACE;
939 		  }
940 		else
941 		  {
942 		    for (j = iter->min; j < iter->max; j++)
943 		      {
944 			tre_ast_node_t *tmp, *copy;
945 			pos_add_save = pos_add;
946 			status = tre_copy_ast(mem, stack, iter->arg, 0,
947 					      &pos_add, NULL, &copy, &max_pos);
948 			if (status != REG_OK)
949 			  return status;
950 			if (seq2 != NULL)
951 			  seq2 = tre_ast_new_catenation(mem, copy, seq2);
952 			else
953 			  seq2 = copy;
954 			if (seq2 == NULL)
955 			  return REG_ESPACE;
956 			tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
957 			if (tmp == NULL)
958 			  return REG_ESPACE;
959 			seq2 = tre_ast_new_union(mem, tmp, seq2);
960 			if (seq2 == NULL)
961 			  return REG_ESPACE;
962 		      }
963 		  }
964 
965 		pos_add = pos_add_save;
966 		if (seq1 == NULL)
967 		  seq1 = seq2;
968 		else if (seq2 != NULL)
969 		  seq1 = tre_ast_new_catenation(mem, seq1, seq2);
970 		if (seq1 == NULL)
971 		  return REG_ESPACE;
972 		node->obj = seq1->obj;
973 		node->type = seq1->type;
974 	      }
975 
976 	    iter_depth--;
977 	    pos_add_total += pos_add - pos_add_last;
978 	    if (iter_depth == 0)
979 	      pos_add = pos_add_total;
980 
981 	    /* If approximate parameters are specified, surround the result
982 	       with two parameter setting nodes.  The one on the left sets
983 	       the specified parameters, and the one on the right restores
984 	       the old parameters. */
985 	    if (iter->params)
986 	      {
987 		tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy;
988 		int *old_params;
989 
990 		tmp_l = tre_ast_new_literal(mem, PARAMETER, 0, -1);
991 		if (!tmp_l)
992 		  return REG_ESPACE;
993 		((tre_literal_t *)tmp_l->obj)->u.params = iter->params;
994 		iter->params[TRE_PARAM_DEPTH] = params_depth + 1;
995 		tmp_r = tre_ast_new_literal(mem, PARAMETER, 0, -1);
996 		if (!tmp_r)
997 		  return REG_ESPACE;
998 		old_params = tre_mem_alloc(mem, sizeof(*old_params)
999 					   * TRE_PARAM_LAST);
1000 		if (!old_params)
1001 		  return REG_ESPACE;
1002 		for (i = 0; i < TRE_PARAM_LAST; i++)
1003 		  old_params[i] = params[i];
1004 		((tre_literal_t *)tmp_r->obj)->u.params = old_params;
1005 		old_params[TRE_PARAM_DEPTH] = params_depth;
1006 		/* XXX - this is the only place where ast_new_node is
1007 		   needed -- should be moved inside AST module. */
1008 		node_copy = tre_ast_new_node(mem, ITERATION,
1009 					     sizeof(tre_iteration_t));
1010 		if (!node_copy)
1011 		  return REG_ESPACE;
1012 		node_copy->obj = node->obj;
1013 		tmp_node = tre_ast_new_catenation(mem, tmp_l, node_copy);
1014 		if (!tmp_node)
1015 		  return REG_ESPACE;
1016 		tmp_node = tre_ast_new_catenation(mem, tmp_node, tmp_r);
1017 		if (!tmp_node)
1018 		  return REG_ESPACE;
1019 		/* Replace the contents of `node' with `tmp_node'. */
1020 		memcpy(node, tmp_node, sizeof(*node));
1021 		node->obj = tmp_node->obj;
1022 		node->type = tmp_node->type;
1023 		params_depth++;
1024 		if (params_depth > *max_depth)
1025 		  *max_depth = params_depth;
1026 	      }
1027 	    break;
1028 	  }
1029 	default:
1030 	  assert(0);
1031 	  break;
1032 	}
1033     }
1034 
1035   *position += pos_add_total;
1036 
1037   /* `max_pos' should never be larger than `*position' if the above
1038      code works, but just an extra safeguard let's make sure
1039      `*position' is set large enough so enough memory will be
1040      allocated for the transition table. */
1041   if (max_pos > *position)
1042     *position = max_pos;
1043 
1044 #ifdef TRE_DEBUG
1045   DPRINT(("Expanded AST:\n"));
1046   tre_ast_print(ast);
1047   DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
1048 #endif
1049 
1050   return status;
1051 }
1052 
1053 static tre_pos_and_tags_t *
tre_set_empty(tre_mem_t mem)1054 tre_set_empty(tre_mem_t mem)
1055 {
1056   tre_pos_and_tags_t *new_set;
1057 
1058   new_set = tre_mem_calloc(mem, sizeof(*new_set));
1059   if (new_set == NULL)
1060     return NULL;
1061 
1062   new_set[0].position = -1;
1063   new_set[0].code_min = -1;
1064   new_set[0].code_max = -1;
1065 
1066   return new_set;
1067 }
1068 
1069 static tre_pos_and_tags_t *
tre_set_one(tre_mem_t mem,int position,int code_min,int code_max,tre_ctype_t class,tre_ctype_t * neg_classes,int backref)1070 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
1071 	    tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
1072 {
1073   tre_pos_and_tags_t *new_set;
1074 
1075   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
1076   if (new_set == NULL)
1077     return NULL;
1078 
1079   new_set[0].position = position;
1080   new_set[0].code_min = code_min;
1081   new_set[0].code_max = code_max;
1082   new_set[0].class = class;
1083   new_set[0].neg_classes = neg_classes;
1084   new_set[0].backref = backref;
1085   new_set[1].position = -1;
1086   new_set[1].code_min = -1;
1087   new_set[1].code_max = -1;
1088 
1089   return new_set;
1090 }
1091 
1092 static tre_pos_and_tags_t *
tre_set_union(tre_mem_t mem,tre_pos_and_tags_t * set1,tre_pos_and_tags_t * set2,int * tags,int assertions,int * params)1093 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
1094 	      int *tags, int assertions, int *params)
1095 {
1096   int s1, s2, i, j;
1097   tre_pos_and_tags_t *new_set;
1098   int *new_tags;
1099   int num_tags;
1100 
1101   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
1102   for (s1 = 0; set1[s1].position >= 0; s1++);
1103   for (s2 = 0; set2[s2].position >= 0; s2++);
1104   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
1105   if (!new_set )
1106     return NULL;
1107 
1108   for (s1 = 0; set1[s1].position >= 0; s1++)
1109     {
1110       new_set[s1].position = set1[s1].position;
1111       new_set[s1].code_min = set1[s1].code_min;
1112       new_set[s1].code_max = set1[s1].code_max;
1113       new_set[s1].assertions = set1[s1].assertions | assertions;
1114       new_set[s1].class = set1[s1].class;
1115       new_set[s1].neg_classes = set1[s1].neg_classes;
1116       new_set[s1].backref = set1[s1].backref;
1117       if (set1[s1].tags == NULL && tags == NULL)
1118 	new_set[s1].tags = NULL;
1119       else
1120 	{
1121 	  for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
1122 	  new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
1123 					 * (i + num_tags + 1)));
1124 	  if (new_tags == NULL)
1125 	    return NULL;
1126 	  for (j = 0; j < i; j++)
1127 	    new_tags[j] = set1[s1].tags[j];
1128 	  for (i = 0; i < num_tags; i++)
1129 	    new_tags[j + i] = tags[i];
1130 	  new_tags[j + i] = -1;
1131 	  new_set[s1].tags = new_tags;
1132 	}
1133       if (set1[s1].params)
1134 	new_set[s1].params = set1[s1].params;
1135       if (params)
1136 	{
1137 	  if (!new_set[s1].params)
1138 	    new_set[s1].params = params;
1139 	  else
1140 	    {
1141 	      new_set[s1].params = tre_mem_alloc(mem, sizeof(*params) *
1142 						 TRE_PARAM_LAST);
1143 	      if (!new_set[s1].params)
1144 		return NULL;
1145 	      for (i = 0; i < TRE_PARAM_LAST; i++)
1146 		if (params[i] != TRE_PARAM_UNSET)
1147 		  new_set[s1].params[i] = params[i];
1148 	    }
1149 	}
1150     }
1151 
1152   for (s2 = 0; set2[s2].position >= 0; s2++)
1153     {
1154       new_set[s1 + s2].position = set2[s2].position;
1155       new_set[s1 + s2].code_min = set2[s2].code_min;
1156       new_set[s1 + s2].code_max = set2[s2].code_max;
1157       /* XXX - why not | assertions here as well? */
1158       new_set[s1 + s2].assertions = set2[s2].assertions;
1159       new_set[s1 + s2].class = set2[s2].class;
1160       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
1161       new_set[s1 + s2].backref = set2[s2].backref;
1162       if (set2[s2].tags == NULL)
1163 	new_set[s1 + s2].tags = NULL;
1164       else
1165 	{
1166 	  for (i = 0; set2[s2].tags[i] >= 0; i++);
1167 	  new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
1168 	  if (new_tags == NULL)
1169 	    return NULL;
1170 	  for (j = 0; j < i; j++)
1171 	    new_tags[j] = set2[s2].tags[j];
1172 	  new_tags[j] = -1;
1173 	  new_set[s1 + s2].tags = new_tags;
1174 	}
1175       if (set2[s2].params)
1176 	new_set[s1 + s2].params = set2[s2].params;
1177       if (params)
1178 	{
1179 	  if (!new_set[s1 + s2].params)
1180 	    new_set[s1 + s2].params = params;
1181 	  else
1182 	    {
1183 	      new_set[s1 + s2].params = tre_mem_alloc(mem, sizeof(*params) *
1184 						      TRE_PARAM_LAST);
1185 	      if (!new_set[s1 + s2].params)
1186 		return NULL;
1187 	      for (i = 0; i < TRE_PARAM_LAST; i++)
1188 		if (params[i] != TRE_PARAM_UNSET)
1189 		  new_set[s1 + s2].params[i] = params[i];
1190 	    }
1191 	}
1192     }
1193   new_set[s1 + s2].position = -1;
1194   return new_set;
1195 }
1196 
1197 /* Finds the empty path through `node' which is the one that should be
1198    taken according to POSIX.2 rules, and adds the tags on that path to
1199    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
1200    set to the number of tags seen on the path. */
1201 static reg_errcode_t
tre_match_empty(tre_stack_t * stack,tre_ast_node_t * node,int * tags,int * assertions,int * params,int * num_tags_seen,int * params_seen)1202 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
1203 		int *assertions, int *params, int *num_tags_seen,
1204 		int *params_seen)
1205 {
1206   tre_literal_t *lit;
1207   tre_union_t *uni;
1208   tre_catenation_t *cat;
1209   tre_iteration_t *iter;
1210   int i;
1211   int bottom = tre_stack_num_objects(stack);
1212   reg_errcode_t status = REG_OK;
1213   if (num_tags_seen)
1214     *num_tags_seen = 0;
1215   if (params_seen)
1216     *params_seen = 0;
1217 
1218   status = tre_stack_push_voidptr(stack, node);
1219 
1220   /* Walk through the tree recursively. */
1221   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1222     {
1223       node = tre_stack_pop_voidptr(stack);
1224 
1225       switch (node->type)
1226 	{
1227 	case LITERAL:
1228 	  lit = (tre_literal_t *)node->obj;
1229 	  switch (lit->code_min)
1230 	    {
1231 	    case TAG:
1232 	      if (lit->code_max >= 0)
1233 		{
1234 		  if (tags != NULL)
1235 		    {
1236 		      /* Add the tag to `tags'. */
1237 		      for (i = 0; tags[i] >= 0; i++)
1238 			if (tags[i] == lit->code_max)
1239 			  break;
1240 		      if (tags[i] < 0)
1241 			{
1242 			  tags[i] = (int)lit->code_max;
1243 			  tags[i + 1] = -1;
1244 			}
1245 		    }
1246 		  if (num_tags_seen)
1247 		    (*num_tags_seen)++;
1248 		}
1249 	      break;
1250 	    case ASSERTION:
1251 	      assert(lit->code_max >= 1
1252 		     || lit->code_max <= ASSERT_LAST);
1253 	      if (assertions != NULL)
1254 		*assertions |= lit->code_max;
1255 	      break;
1256 	    case PARAMETER:
1257 	      if (params != NULL)
1258 		for (i = 0; i < TRE_PARAM_LAST; i++)
1259 		  params[i] = lit->u.params[i];
1260 	      if (params_seen != NULL)
1261 		*params_seen = 1;
1262 	      break;
1263 	    case EMPTY:
1264 	      break;
1265 	    default:
1266 	      assert(0);
1267 	      break;
1268 	    }
1269 	  break;
1270 
1271 	case UNION:
1272 	  /* Subexpressions starting earlier take priority over ones
1273 	     starting later, so we prefer the left subexpression over the
1274 	     right subexpression. */
1275 	  uni = (tre_union_t *)node->obj;
1276 	  if (uni->left->nullable)
1277 	    STACK_PUSHX(stack, voidptr, uni->left)
1278 	  else if (uni->right->nullable)
1279 	    STACK_PUSHX(stack, voidptr, uni->right)
1280 	  else
1281 	    assert(0);
1282 	  break;
1283 
1284 	case CATENATION:
1285 	  /* The path must go through both children. */
1286 	  cat = (tre_catenation_t *)node->obj;
1287 	  assert(cat->left->nullable);
1288 	  assert(cat->right->nullable);
1289 	  STACK_PUSHX(stack, voidptr, cat->left);
1290 	  STACK_PUSHX(stack, voidptr, cat->right);
1291 	  break;
1292 
1293 	case ITERATION:
1294 	  /* A match with an empty string is preferred over no match at
1295 	     all, so we go through the argument if possible. */
1296 	  iter = (tre_iteration_t *)node->obj;
1297 	  if (iter->arg->nullable)
1298 	    STACK_PUSHX(stack, voidptr, iter->arg);
1299 	  break;
1300 
1301 	default:
1302 	  assert(0);
1303 	  break;
1304 	}
1305     }
1306 
1307   return status;
1308 }
1309 
1310 
1311 typedef enum {
1312   NFL_RECURSE,
1313   NFL_POST_UNION,
1314   NFL_POST_CATENATION,
1315   NFL_POST_ITERATION
1316 } tre_nfl_stack_symbol_t;
1317 
1318 
1319 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
1320    the nodes of the AST `tree'. */
1321 static reg_errcode_t
tre_compute_nfl(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree)1322 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
1323 {
1324   int bottom = tre_stack_num_objects(stack);
1325 
1326   STACK_PUSHR(stack, voidptr, tree);
1327   STACK_PUSHR(stack, int, NFL_RECURSE);
1328 
1329   while (tre_stack_num_objects(stack) > bottom)
1330     {
1331       tre_nfl_stack_symbol_t symbol;
1332       tre_ast_node_t *node;
1333 
1334       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
1335       node = tre_stack_pop_voidptr(stack);
1336       switch (symbol)
1337 	{
1338 	case NFL_RECURSE:
1339 	  switch (node->type)
1340 	    {
1341 	    case LITERAL:
1342 	      {
1343 		tre_literal_t *lit = (tre_literal_t *)node->obj;
1344 		if (IS_BACKREF(lit))
1345 		  {
1346 		    /* Back references: nullable = false, firstpos = {i},
1347 		       lastpos = {i}. */
1348 		    node->nullable = 0;
1349 		    node->firstpos = tre_set_one(mem, lit->position, 0,
1350 					     TRE_CHAR_MAX, 0, NULL, -1);
1351 		    if (!node->firstpos)
1352 		      return REG_ESPACE;
1353 		    node->lastpos = tre_set_one(mem, lit->position, 0,
1354 						TRE_CHAR_MAX, 0, NULL,
1355 						(int)lit->code_max);
1356 		    if (!node->lastpos)
1357 		      return REG_ESPACE;
1358 		  }
1359 		else if (lit->code_min < 0)
1360 		  {
1361 		    /* Tags, empty strings, params, and zero width assertions:
1362 		       nullable = true, firstpos = {}, and lastpos = {}. */
1363 		    node->nullable = 1;
1364 		    node->firstpos = tre_set_empty(mem);
1365 		    if (!node->firstpos)
1366 		      return REG_ESPACE;
1367 		    node->lastpos = tre_set_empty(mem);
1368 		    if (!node->lastpos)
1369 		      return REG_ESPACE;
1370 		  }
1371 		else
1372 		  {
1373 		    /* Literal at position i: nullable = false, firstpos = {i},
1374 		       lastpos = {i}. */
1375 		    node->nullable = 0;
1376 		    node->firstpos =
1377 		      tre_set_one(mem, lit->position, (int)lit->code_min,
1378 				  (int)lit->code_max, 0, NULL, -1);
1379 		    if (!node->firstpos)
1380 		      return REG_ESPACE;
1381 		    node->lastpos = tre_set_one(mem, lit->position,
1382 						(int)lit->code_min,
1383 						(int)lit->code_max,
1384 						lit->u.class, lit->neg_classes,
1385 						-1);
1386 		    if (!node->lastpos)
1387 		      return REG_ESPACE;
1388 		  }
1389 		break;
1390 	      }
1391 
1392 	    case UNION:
1393 	      /* Compute the attributes for the two subtrees, and after that
1394 		 for this node. */
1395 	      STACK_PUSHR(stack, voidptr, node);
1396 	      STACK_PUSHR(stack, int, NFL_POST_UNION);
1397 	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
1398 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1399 	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
1400 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1401 	      break;
1402 
1403 	    case CATENATION:
1404 	      /* Compute the attributes for the two subtrees, and after that
1405 		 for this node. */
1406 	      STACK_PUSHR(stack, voidptr, node);
1407 	      STACK_PUSHR(stack, int, NFL_POST_CATENATION);
1408 	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
1409 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1410 	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
1411 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1412 	      break;
1413 
1414 	    case ITERATION:
1415 	      /* Compute the attributes for the subtree, and after that for
1416 		 this node. */
1417 	      STACK_PUSHR(stack, voidptr, node);
1418 	      STACK_PUSHR(stack, int, NFL_POST_ITERATION);
1419 	      STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
1420 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1421 	      break;
1422 	    }
1423 	  break; /* end case: NFL_RECURSE */
1424 
1425 	case NFL_POST_UNION:
1426 	  {
1427 	    tre_union_t *uni = (tre_union_t *)node->obj;
1428 	    node->nullable = uni->left->nullable || uni->right->nullable;
1429 	    node->firstpos = tre_set_union(mem, uni->left->firstpos,
1430 					   uni->right->firstpos, NULL, 0, NULL);
1431 	    if (!node->firstpos)
1432 	      return REG_ESPACE;
1433 	    node->lastpos = tre_set_union(mem, uni->left->lastpos,
1434 					  uni->right->lastpos, NULL, 0, NULL);
1435 	    if (!node->lastpos)
1436 	      return REG_ESPACE;
1437 	    break;
1438 	  }
1439 
1440 	case NFL_POST_ITERATION:
1441 	  {
1442 	    tre_iteration_t *iter = (tre_iteration_t *)node->obj;
1443 
1444 	    if (iter->min == 0 || iter->arg->nullable)
1445 	      node->nullable = 1;
1446 	    else
1447 	      node->nullable = 0;
1448 	    node->firstpos = iter->arg->firstpos;
1449 	    node->lastpos = iter->arg->lastpos;
1450 	    break;
1451 	  }
1452 
1453 	case NFL_POST_CATENATION:
1454 	  {
1455 	    int num_tags, *tags, assertions, params_seen;
1456 	    int *params;
1457 	    reg_errcode_t status;
1458 	    tre_catenation_t *cat = node->obj;
1459 	    node->nullable = cat->left->nullable && cat->right->nullable;
1460 
1461 	    /* Compute firstpos. */
1462 	    if (cat->left->nullable)
1463 	      {
1464 		/* The left side matches the empty string.  Make a first pass
1465 		   with tre_match_empty() to get the number of tags and
1466 		   parameters. */
1467 		status = tre_match_empty(stack, cat->left,
1468 					 NULL, NULL, NULL, &num_tags,
1469 					 &params_seen);
1470 		if (status != REG_OK)
1471 		  return status;
1472 		/* Allocate arrays for the tags and parameters. */
1473 		tags = xmalloc(sizeof(*tags) * (num_tags + 1));
1474 		if (!tags)
1475 		  return REG_ESPACE;
1476 		tags[0] = -1;
1477 		assertions = 0;
1478 		params = NULL;
1479 		if (params_seen)
1480 		  {
1481 		    params = tre_mem_alloc(mem, sizeof(*params)
1482 					   * TRE_PARAM_LAST);
1483 		    if (!params)
1484 		      {
1485 			xfree(tags);
1486 			return REG_ESPACE;
1487 		      }
1488 		  }
1489 		/* Second pass with tre_mach_empty() to get the list of
1490 		   tags and parameters. */
1491 		status = tre_match_empty(stack, cat->left, tags,
1492 					 &assertions, params, NULL, NULL);
1493 		if (status != REG_OK)
1494 		  {
1495 		    xfree(tags);
1496 		    return status;
1497 		  }
1498 		node->firstpos =
1499 		  tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
1500 				tags, assertions, params);
1501 		xfree(tags);
1502 		if (!node->firstpos)
1503 		  return REG_ESPACE;
1504 	      }
1505 	    else
1506 	      {
1507 		node->firstpos = cat->left->firstpos;
1508 	      }
1509 
1510 	    /* Compute lastpos. */
1511 	    if (cat->right->nullable)
1512 	      {
1513 		/* The right side matches the empty string.  Make a first pass
1514 		   with tre_match_empty() to get the number of tags and
1515 		   parameters. */
1516 		status = tre_match_empty(stack, cat->right,
1517 					 NULL, NULL, NULL, &num_tags,
1518 					 &params_seen);
1519 		if (status != REG_OK)
1520 		  return status;
1521 		/* Allocate arrays for the tags and parameters. */
1522 		tags = xmalloc(sizeof(int) * (num_tags + 1));
1523 		if (!tags)
1524 		  return REG_ESPACE;
1525 		tags[0] = -1;
1526 		assertions = 0;
1527 		params = NULL;
1528 		if (params_seen)
1529 		  {
1530 		    params = tre_mem_alloc(mem, sizeof(*params)
1531 					   * TRE_PARAM_LAST);
1532 		    if (!params)
1533 		      {
1534 			xfree(tags);
1535 			return REG_ESPACE;
1536 		      }
1537 		  }
1538 		/* Second pass with tre_mach_empty() to get the list of
1539 		   tags and parameters. */
1540 		status = tre_match_empty(stack, cat->right, tags,
1541 					 &assertions, params, NULL, NULL);
1542 		if (status != REG_OK)
1543 		  {
1544 		    xfree(tags);
1545 		    return status;
1546 		  }
1547 		node->lastpos =
1548 		  tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
1549 				tags, assertions, params);
1550 		xfree(tags);
1551 		if (!node->lastpos)
1552 		  return REG_ESPACE;
1553 	      }
1554 	    else
1555 	      {
1556 		node->lastpos = cat->right->lastpos;
1557 	      }
1558 	    break;
1559 	  }
1560 
1561 	default:
1562 	  assert(0);
1563 	  break;
1564 	}
1565     }
1566 
1567   return REG_OK;
1568 }
1569 
1570 
1571 /* Adds a transition from each position in `p1' to each position in `p2'. */
1572 static reg_errcode_t
tre_make_trans(tre_pos_and_tags_t * p1,tre_pos_and_tags_t * p2,tre_tnfa_transition_t * transitions,int * counts,int * offs)1573 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
1574 	       tre_tnfa_transition_t *transitions,
1575 	       int *counts, int *offs)
1576 {
1577   tre_pos_and_tags_t *orig_p2 = p2;
1578   tre_tnfa_transition_t *trans;
1579   int i, j, k, l, dup, prev_p2_pos;
1580 
1581   if (transitions != NULL)
1582     while (p1->position >= 0)
1583       {
1584 	p2 = orig_p2;
1585 	prev_p2_pos = -1;
1586 	while (p2->position >= 0)
1587 	  {
1588 	    /* Optimization: if this position was already handled, skip it. */
1589 	    if (p2->position == prev_p2_pos)
1590 	      {
1591 		p2++;
1592 		continue;
1593 	      }
1594 	    prev_p2_pos = p2->position;
1595 	    /* Set `trans' to point to the next unused transition from
1596 	       position `p1->position'. */
1597 	    trans = transitions + offs[p1->position];
1598 	    while (trans->state != NULL)
1599 	      {
1600 #if 0
1601 		/* If we find a previous transition from `p1->position' to
1602 		   `p2->position', it is overwritten.  This can happen only
1603 		   if there are nested loops in the regexp, like in "((a)*)*".
1604 		   In POSIX.2 repetition using the outer loop is always
1605 		   preferred over using the inner loop.	 Therefore the
1606 		   transition for the inner loop is useless and can be thrown
1607 		   away. */
1608 		/* XXX - The same position is used for all nodes in a bracket
1609 		   expression, so this optimization cannot be used (it will
1610 		   break bracket expressions) unless I figure out a way to
1611 		   detect it here. */
1612 		if (trans->state_id == p2->position)
1613 		  {
1614 		    DPRINT(("*"));
1615 		    break;
1616 		  }
1617 #endif
1618 		trans++;
1619 	      }
1620 
1621 	    if (trans->state == NULL)
1622 	      (trans + 1)->state = NULL;
1623 	    /* Use the character ranges, assertions, etc. from `p1' for
1624 	       the transition from `p1' to `p2'. */
1625 	    trans->code_min = (tre_cint_t) p1->code_min;
1626 	    trans->code_max = (tre_cint_t) p1->code_max;
1627 	    trans->state = transitions + offs[p2->position];
1628 	    trans->state_id = p2->position;
1629 	    trans->assertions = p1->assertions | p2->assertions
1630 	      | (p1->class ? ASSERT_CHAR_CLASS : 0)
1631 	      | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
1632 	    if (p1->backref >= 0)
1633 	      {
1634 		assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
1635 		assert(p2->backref < 0);
1636 		trans->u.backref = p1->backref;
1637 		trans->assertions |= ASSERT_BACKREF;
1638 	      }
1639 	    else
1640 	      trans->u.class = p1->class;
1641 	    if (p1->neg_classes != NULL)
1642 	      {
1643 		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
1644 		trans->neg_classes =
1645 		  xmalloc(sizeof(*trans->neg_classes) * (i + 1));
1646 		if (trans->neg_classes == NULL)
1647 		  return REG_ESPACE;
1648 		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
1649 		  trans->neg_classes[i] = p1->neg_classes[i];
1650 		trans->neg_classes[i] = (tre_ctype_t)0;
1651 	      }
1652 	    else
1653 	      trans->neg_classes = NULL;
1654 
1655 	    /* Find out how many tags this transition has. */
1656 	    i = 0;
1657 	    if (p1->tags != NULL)
1658 	      while(p1->tags[i] >= 0)
1659 		i++;
1660 	    j = 0;
1661 	    if (p2->tags != NULL)
1662 	      while(p2->tags[j] >= 0)
1663 		j++;
1664 
1665 	    /* If we are overwriting a transition, free the old tag array. */
1666 	    if (trans->tags != NULL)
1667 	      xfree(trans->tags);
1668 	    trans->tags = NULL;
1669 
1670 	    /* If there were any tags, allocate an array and fill it. */
1671 	    if (i + j > 0)
1672 	      {
1673 		trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
1674 		if (!trans->tags)
1675 		  return REG_ESPACE;
1676 		i = 0;
1677 		if (p1->tags != NULL)
1678 		  while(p1->tags[i] >= 0)
1679 		    {
1680 		      trans->tags[i] = p1->tags[i];
1681 		      i++;
1682 		    }
1683 		l = i;
1684 		j = 0;
1685 		if (p2->tags != NULL)
1686 		  while (p2->tags[j] >= 0)
1687 		    {
1688 		      /* Don't add duplicates. */
1689 		      dup = 0;
1690 		      for (k = 0; k < i; k++)
1691 			if (trans->tags[k] == p2->tags[j])
1692 			  {
1693 			    dup = 1;
1694 			    break;
1695 			  }
1696 		      if (!dup)
1697 			trans->tags[l++] = p2->tags[j];
1698 		      j++;
1699 		    }
1700 		trans->tags[l] = -1;
1701 	      }
1702 
1703 	    /* Set the parameter array.	 If both `p2' and `p1' have same
1704 	       parameters, the values in `p2' override those in `p1'. */
1705 	    if (p1->params || p2->params)
1706 	      {
1707 		if (!trans->params)
1708 		  trans->params = xmalloc(sizeof(*trans->params)
1709 					  * TRE_PARAM_LAST);
1710 		if (!trans->params)
1711 		  return REG_ESPACE;
1712 		for (i = 0; i < TRE_PARAM_LAST; i++)
1713 		  {
1714 		    trans->params[i] = TRE_PARAM_UNSET;
1715 		    if (p1->params && p1->params[i] != TRE_PARAM_UNSET)
1716 		      trans->params[i] = p1->params[i];
1717 		    if (p2->params && p2->params[i] != TRE_PARAM_UNSET)
1718 		      trans->params[i] = p2->params[i];
1719 		  }
1720 	      }
1721 	    else
1722 	      {
1723 		if (trans->params)
1724 		  xfree(trans->params);
1725 		trans->params = NULL;
1726 	      }
1727 
1728 
1729 #ifdef TRE_DEBUG
1730 	    {
1731 	      int *tags;
1732 
1733 	      DPRINT(("	 %2d -> %2d on %3d", p1->position, p2->position,
1734 		      p1->code_min));
1735 	      if (p1->code_max != p1->code_min)
1736 		DPRINT(("-%3d", p1->code_max));
1737 	      tags = trans->tags;
1738 	      if (tags)
1739 		{
1740 		  DPRINT((", tags ["));
1741 		  while (*tags >= 0)
1742 		    {
1743 		      DPRINT(("%d", *tags));
1744 		      tags++;
1745 		      if (*tags >= 0)
1746 			DPRINT((","));
1747 		    }
1748 		  DPRINT(("]"));
1749 		}
1750 	      if (trans->assertions)
1751 		DPRINT((", assert %d", trans->assertions));
1752 	      if (trans->assertions & ASSERT_BACKREF)
1753 		DPRINT((", backref %d", trans->u.backref));
1754 	      else if (trans->u.class)
1755 		DPRINT((", class %ld", (long)trans->u.class));
1756 	      if (trans->neg_classes)
1757 		DPRINT((", neg_classes %p", trans->neg_classes));
1758 	      if (trans->params)
1759 		{
1760 		  DPRINT((", "));
1761 		  tre_print_params(trans->params);
1762 		}
1763 	      DPRINT(("\n"));
1764 	    }
1765 #endif /* TRE_DEBUG */
1766 	    p2++;
1767 	  }
1768 	p1++;
1769       }
1770   else
1771     /* Compute a maximum limit for the number of transitions leaving
1772        from each state. */
1773     while (p1->position >= 0)
1774       {
1775 	p2 = orig_p2;
1776 	while (p2->position >= 0)
1777 	  {
1778 	    counts[p1->position]++;
1779 	    p2++;
1780 	  }
1781 	p1++;
1782       }
1783   return REG_OK;
1784 }
1785 
1786 /* Converts the syntax tree to a TNFA.	All the transitions in the TNFA are
1787    labelled with one character range (there are no transitions on empty
1788    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
1789    the regexp.
1790    This is the iterative version using a stack (previously used recursion
1791    which will fail for large patterns)
1792  */
1793 static reg_errcode_t
tre_ast_to_tnfa_iter(tre_stack_t * stack,tre_ast_node_t * node,tre_tnfa_transition_t * transitions,int * counts,int * offs)1794 tre_ast_to_tnfa_iter(tre_stack_t *stack, tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
1795 		     int *counts, int *offs)
1796 {
1797   tre_union_t *uni;
1798   tre_catenation_t *cat;
1799   tre_iteration_t *iter;
1800   reg_errcode_t errcode = REG_OK;
1801 
1802   STACK_PUSHR(stack, voidptr, node);
1803 
1804   while (tre_stack_num_objects(stack)) {
1805     node = (tre_ast_node_t*) tre_stack_pop_voidptr(stack);
1806 
1807     switch (node->type)
1808       {
1809       case LITERAL:
1810 	break;
1811 
1812       case UNION:
1813 	uni = (tre_union_t *)node->obj;
1814 	STACK_PUSHR(stack, voidptr, uni->right);
1815 	STACK_PUSHR(stack, voidptr, uni->left);
1816 	break;
1817 
1818       case CATENATION:
1819 	cat = (tre_catenation_t *)node->obj;
1820 	/* Add a transition from each position in cat->left->lastpos
1821 	   to each position in cat->right->firstpos. */
1822 	errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
1823 				 transitions, counts, offs);
1824 	if (errcode != REG_OK)
1825 	  return errcode;
1826 
1827 	STACK_PUSHR(stack, voidptr, cat->right);
1828 	STACK_PUSHR(stack, voidptr, cat->left);
1829 	break;
1830 
1831       case ITERATION:
1832 	iter = (tre_iteration_t *)node->obj;
1833 	// assert(iter->max == -1 || iter->max == 1);
1834 	if(!(iter->max == -1 || iter->max == 1)) return REG_BADBR;
1835 
1836 	if (iter->max == -1)
1837 	  {
1838 	    // assert(iter->min == 0 || iter->min == 1);
1839 	    if(!(iter->min == 0 || iter->min == 1)) return REG_BADBR;
1840 	    /* Add a transition from each last position in the iterated
1841 	       expression to each first position. */
1842 	    errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
1843 				     transitions, counts, offs);
1844 	    if (errcode != REG_OK)
1845 	      return errcode;
1846 	  }
1847 	STACK_PUSHR(stack, voidptr, iter->arg);
1848 	break;
1849       }
1850   }
1851   return REG_OK;
1852 }
1853 
1854 static reg_errcode_t
tre_ast_to_tnfa(tre_ast_node_t * node,tre_tnfa_transition_t * transitions,int * counts,int * offs)1855 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
1856 		int *counts, int *offs)
1857 {
1858   reg_errcode_t errcode;
1859   tre_stack_t *stack;
1860   /* I made up max_size, there is no reason for that particular value */
1861   stack = tre_stack_new(1024, 256*1024, 4096);
1862   errcode = tre_ast_to_tnfa_iter(stack, node, transitions, counts, offs);
1863   tre_stack_destroy(stack);
1864   return errcode;
1865 }
1866 
1867 
1868 #define ERROR_EXIT(err)		  \
1869   do				  \
1870     {				  \
1871       errcode = err;		  \
1872       if (/*CONSTCOND*/(void)1,1)	  \
1873       	goto error_exit;	  \
1874     }				  \
1875  while (/*CONSTCOND*/(void)0,0)
1876 
1877 
1878 int
tre_compile(regex_t * preg,const tre_char_t * regex,size_t n,int cflags)1879 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
1880 {
1881   tre_stack_t *stack;
1882   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
1883   tre_pos_and_tags_t *p;
1884   int *counts = NULL, *offs = NULL;
1885   int i, add = 0;
1886   tre_tnfa_transition_t *transitions, *initial;
1887   tre_tnfa_t *tnfa = NULL;
1888   tre_submatch_data_t *submatch_data;
1889   tre_tag_direction_t *tag_directions = NULL;
1890   reg_errcode_t errcode;
1891   tre_mem_t mem;
1892 
1893   /* Parse context. */
1894   tre_parse_ctx_t parse_ctx;
1895 
1896   /* Allocate a stack used throughout the compilation process for various
1897      purposes. */
1898   stack = tre_stack_new(512, 10240, 128);
1899   if (!stack)
1900     return REG_ESPACE;
1901   /* Allocate a fast memory allocator. */
1902   mem = tre_mem_new();
1903   if (!mem)
1904     {
1905       tre_stack_destroy(stack);
1906       return REG_ESPACE;
1907     }
1908 
1909   /* Parse the regexp. */
1910   memset(&parse_ctx, 0, sizeof(parse_ctx));
1911   parse_ctx.mem = mem;
1912   parse_ctx.stack = stack;
1913   parse_ctx.re = regex;
1914   parse_ctx.len = n;
1915   parse_ctx.cflags = cflags;
1916   parse_ctx.max_backref = -1;
1917   /* workaround for PR#14408: use 8-bit optimizations in 8-bit mode */
1918   parse_ctx.cur_max = (cflags & REG_USEBYTES) ? 1 : TRE_MB_CUR_MAX;
1919   DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex));
1920   errcode = tre_parse(&parse_ctx);
1921   if (errcode != REG_OK)
1922     ERROR_EXIT(errcode);
1923   preg->re_nsub = parse_ctx.submatch_id - 1;
1924   tree = parse_ctx.result;
1925 
1926   /* Back references and approximate matching cannot currently be used
1927      in the same regexp. */
1928   if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
1929     ERROR_EXIT(REG_BADPAT);
1930 
1931 #ifdef TRE_DEBUG
1932   tre_ast_print(tree);
1933 #endif /* TRE_DEBUG */
1934 
1935   /* Referring to nonexistent subexpressions is illegal. */
1936   if (parse_ctx.max_backref > (int)preg->re_nsub)
1937     ERROR_EXIT(REG_ESUBREG);
1938 
1939   /* Allocate the TNFA struct. */
1940   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
1941   if (tnfa == NULL)
1942     ERROR_EXIT(REG_ESPACE);
1943   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
1944   tnfa->have_approx = parse_ctx.have_approx;
1945   tnfa->num_submatches = parse_ctx.submatch_id;
1946 
1947   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
1948      regexp does not have back references, this can be skipped. */
1949   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
1950     {
1951       DPRINT(("tre_compile: setting up tags\n"));
1952 
1953       /* Figure out how many tags we will need. */
1954       errcode = tre_add_tags(NULL, stack, tree, tnfa);
1955       if (errcode != REG_OK)
1956 	ERROR_EXIT(errcode);
1957 #ifdef TRE_DEBUG
1958       tre_ast_print(tree);
1959 #endif /* TRE_DEBUG */
1960 
1961       if (tnfa->num_tags > 0)
1962 	{
1963 	  tag_directions = xmalloc(sizeof(*tag_directions)
1964 				   * (tnfa->num_tags + 1));
1965 	  if (tag_directions == NULL)
1966 	    ERROR_EXIT(REG_ESPACE);
1967 	  tnfa->tag_directions = tag_directions;
1968 	  memset(tag_directions, -1,
1969 		 sizeof(*tag_directions) * (tnfa->num_tags + 1));
1970 	}
1971       tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
1972 				   sizeof(tnfa->minimal_tags));
1973       if (tnfa->minimal_tags == NULL)
1974 	ERROR_EXIT(REG_ESPACE);
1975 
1976       submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
1977 			      sizeof(*submatch_data));
1978       if (submatch_data == NULL)
1979 	ERROR_EXIT(REG_ESPACE);
1980       tnfa->submatch_data = submatch_data;
1981 
1982       errcode = tre_add_tags(mem, stack, tree, tnfa);
1983       if (errcode != REG_OK)
1984 	ERROR_EXIT(errcode);
1985 
1986 #ifdef TRE_DEBUG
1987       for (i = 0; i < parse_ctx.submatch_id; i++)
1988 	DPRINT(("pmatch[%d] = {t%d, t%d}\n",
1989 		i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
1990       for (i = 0; i < tnfa->num_tags; i++)
1991 	DPRINT(("t%d is %s\n", i,
1992 		tag_directions[i] == TRE_TAG_MINIMIZE ?
1993 		"minimized" : "maximized"));
1994 #endif /* TRE_DEBUG */
1995     }
1996 
1997   /* Expand iteration nodes. */
1998   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
1999 			   tag_directions, &tnfa->params_depth);
2000   if (errcode != REG_OK)
2001     ERROR_EXIT(errcode);
2002 
2003   /* Add a dummy node for the final state.
2004      XXX - For certain patterns this dummy node can be optimized away,
2005 	   for example "a*" or "ab*".	Figure out a simple way to detect
2006 	   this possibility. */
2007   tmp_ast_l = tree;
2008   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2009   if (tmp_ast_r == NULL)
2010     ERROR_EXIT(REG_ESPACE);
2011 
2012   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2013   if (tree == NULL)
2014     ERROR_EXIT(REG_ESPACE);
2015 
2016 #ifdef TRE_DEBUG
2017   tre_ast_print(tree);
2018   DPRINT(("Number of states: %d\n", parse_ctx.position));
2019 #endif /* TRE_DEBUG */
2020 
2021   errcode = tre_compute_nfl(mem, stack, tree);
2022   if (errcode != REG_OK)
2023     ERROR_EXIT(errcode);
2024 
2025   counts = xmalloc(sizeof(int) * parse_ctx.position);
2026   if (counts == NULL)
2027     ERROR_EXIT(REG_ESPACE);
2028 
2029   offs = xmalloc(sizeof(int) * parse_ctx.position);
2030   if (offs == NULL)
2031     ERROR_EXIT(REG_ESPACE);
2032 
2033   for (i = 0; i < parse_ctx.position; i++)
2034     counts[i] = 0;
2035   tre_ast_to_tnfa(tree, NULL, counts, NULL);
2036 
2037   add = 0;
2038   for (i = 0; i < parse_ctx.position; i++)
2039     {
2040       offs[i] = add;
2041       add += counts[i] + 1;
2042       counts[i] = 0;
2043     }
2044   transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2045   if (transitions == NULL)
2046     ERROR_EXIT(REG_ESPACE);
2047   tnfa->transitions = transitions;
2048   tnfa->num_transitions = add;
2049 
2050   DPRINT(("Converting to TNFA:\n"));
2051   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2052   if (errcode != REG_OK)
2053     ERROR_EXIT(errcode);
2054 
2055   /* If in eight bit mode, compute a table of characters that can be the
2056      first character of a match. */
2057   tnfa->first_char = -1;
2058   if (TRE_MB_CUR_MAX == 1 && !tmp_ast_l->nullable)
2059     {
2060       int count = 0;
2061       tre_cint_t k;
2062       DPRINT(("Characters that can start a match:"));
2063       tnfa->firstpos_chars = xcalloc(256, sizeof(char));
2064       if (tnfa->firstpos_chars == NULL)
2065 	ERROR_EXIT(REG_ESPACE);
2066       for (p = tree->firstpos; p->position >= 0; p++)
2067 	{
2068 	  tre_tnfa_transition_t *j = transitions + offs[p->position];
2069 	  while (j->state != NULL)
2070 	    {
2071 	      for (k = j->code_min; k <= j->code_max && k < 256; k++)
2072 		{
2073 		  DPRINT((" %d", k));
2074 		  tnfa->firstpos_chars[k] = 1;
2075 		  count++;
2076 		}
2077 	      j++;
2078 	    }
2079 	}
2080       DPRINT(("\n"));
2081 #define TRE_OPTIMIZE_FIRST_CHAR 1
2082 #if TRE_OPTIMIZE_FIRST_CHAR
2083       if (count == 1)
2084 	{
2085 	  for (k = 0; k < 256; k++)
2086 	    if (tnfa->firstpos_chars[k])
2087 	      {
2088 		DPRINT(("first char must be %d\n", k));
2089 		tnfa->first_char = k;
2090 		xfree(tnfa->firstpos_chars);
2091 		tnfa->firstpos_chars = NULL;
2092 		break;
2093 	      }
2094 	}
2095 #endif
2096 
2097     }
2098   else
2099     tnfa->firstpos_chars = NULL;
2100 
2101 
2102   p = tree->firstpos;
2103   i = 0;
2104   while (p->position >= 0)
2105     {
2106       i++;
2107 
2108 #ifdef TRE_DEBUG
2109       {
2110 	int *tags;
2111 	DPRINT(("initial: %d", p->position));
2112 	tags = p->tags;
2113 	if (tags != NULL)
2114 	  {
2115 	    if (*tags >= 0)
2116 	      DPRINT(("/"));
2117 	    while (*tags >= 0)
2118 	      {
2119 		DPRINT(("%d", *tags));
2120 		tags++;
2121 		if (*tags >= 0)
2122 		  DPRINT((","));
2123 	      }
2124 	  }
2125 	DPRINT((", assert %d", p->assertions));
2126 	if (p->params)
2127 	  {
2128 	    DPRINT((", "));
2129 	    tre_print_params(p->params);
2130 	  }
2131 	DPRINT(("\n"));
2132       }
2133 #endif /* TRE_DEBUG */
2134 
2135       p++;
2136     }
2137 
2138   initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2139   if (initial == NULL)
2140     ERROR_EXIT(REG_ESPACE);
2141   tnfa->initial = initial;
2142 
2143   i = 0;
2144   for (p = tree->firstpos; p->position >= 0; p++)
2145     {
2146       initial[i].state = transitions + offs[p->position];
2147       initial[i].state_id = p->position;
2148       initial[i].tags = NULL;
2149       /* Copy the arrays p->tags, and p->params, they are allocated
2150 	 from a tre_mem object. */
2151       if (p->tags)
2152 	{
2153 	  int j;
2154 	  for (j = 0; p->tags[j] >= 0; j++);
2155 	  initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2156 	  if (!initial[i].tags)
2157 	    ERROR_EXIT(REG_ESPACE);
2158 	  memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2159 	}
2160       initial[i].params = NULL;
2161       if (p->params)
2162 	{
2163 	  initial[i].params = xmalloc(sizeof(*p->params) * TRE_PARAM_LAST);
2164 	  if (!initial[i].params)
2165 	    ERROR_EXIT(REG_ESPACE);
2166 	  memcpy(initial[i].params, p->params,
2167 		 sizeof(*p->params) * TRE_PARAM_LAST);
2168 	}
2169       initial[i].assertions = p->assertions;
2170       i++;
2171     }
2172   initial[i].state = NULL;
2173 
2174   tnfa->num_transitions = add;
2175   tnfa->final = transitions + offs[tree->lastpos[0].position];
2176   tnfa->num_states = parse_ctx.position;
2177   tnfa->cflags = cflags;
2178 
2179   DPRINT(("final state %p\n", (void *)tnfa->final));
2180 
2181   tre_mem_destroy(mem);
2182   tre_stack_destroy(stack);
2183   xfree(counts);
2184   xfree(offs);
2185 
2186   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2187   return REG_OK;
2188 
2189  error_exit:
2190   /* Free everything that was allocated and return the error code. */
2191   tre_mem_destroy(mem);
2192   if (stack != NULL)
2193     tre_stack_destroy(stack);
2194   if (counts != NULL)
2195     xfree(counts);
2196   if (offs != NULL)
2197     xfree(offs);
2198   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2199   tre_free(preg);
2200   return errcode;
2201 }
2202 
2203 
2204 
2205 
2206 void
tre_free(regex_t * preg)2207 tre_free(regex_t *preg)
2208 {
2209   tre_tnfa_t *tnfa;
2210   unsigned int i;
2211   tre_tnfa_transition_t *trans;
2212 
2213   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2214   if (!tnfa)
2215     return;
2216 
2217   for (i = 0; i < tnfa->num_transitions; i++)
2218     if (tnfa->transitions[i].state)
2219       {
2220 	if (tnfa->transitions[i].tags)
2221 	  xfree(tnfa->transitions[i].tags);
2222 	if (tnfa->transitions[i].neg_classes)
2223 	  xfree(tnfa->transitions[i].neg_classes);
2224 	if (tnfa->transitions[i].params)
2225 	  xfree(tnfa->transitions[i].params);
2226       }
2227   if (tnfa->transitions)
2228     xfree(tnfa->transitions);
2229 
2230   if (tnfa->initial)
2231     {
2232       for (trans = tnfa->initial; trans->state; trans++)
2233 	{
2234 	  if (trans->tags)
2235 	    xfree(trans->tags);
2236 	  if (trans->params)
2237 	    xfree(trans->params);
2238 	}
2239       xfree(tnfa->initial);
2240     }
2241 
2242   if (tnfa->submatch_data)
2243     {
2244       for (i = 0; i < tnfa->num_submatches; i++)
2245 	if (tnfa->submatch_data[i].parents)
2246 	  xfree(tnfa->submatch_data[i].parents);
2247       xfree(tnfa->submatch_data);
2248     }
2249 
2250   if (tnfa->tag_directions)
2251     xfree(tnfa->tag_directions);
2252   if (tnfa->firstpos_chars)
2253     xfree(tnfa->firstpos_chars);
2254   if (tnfa->minimal_tags)
2255     xfree(tnfa->minimal_tags);
2256   xfree(tnfa);
2257 }
2258 
2259 char *
tre_version(void)2260 tre_version(void)
2261 {
2262   static char str[256];
2263   char *version;
2264 
2265   if (str[0] == 0)
2266     {
2267       (void) tre_config(TRE_CONFIG_VERSION, &version);
2268 	  assert(strlen(version) < 200);
2269 #if defined(_MSC_VER)
2270 	  (void) _snprintf(str, sizeof(str), "TRE %s R_fixes (BSD)", version);
2271 #else
2272 	  (void) snprintf(str, sizeof(str), "TRE %s R_fixes (BSD)", version);
2273 #endif
2274     }
2275   return str;
2276 }
2277 
2278 int
tre_config(int query,void * result)2279 tre_config(int query, void *result)
2280 {
2281   int *int_result = result;
2282   const char **string_result = result;
2283 
2284   switch (query)
2285     {
2286     case TRE_CONFIG_APPROX:
2287 #ifdef TRE_APPROX
2288       *int_result = 1;
2289 #else /* !TRE_APPROX */
2290       *int_result = 0;
2291 #endif /* !TRE_APPROX */
2292       return REG_OK;
2293 
2294     case TRE_CONFIG_WCHAR:
2295 #ifdef TRE_WCHAR
2296       *int_result = 1;
2297 #else /* !TRE_WCHAR */
2298       *int_result = 0;
2299 #endif /* !TRE_WCHAR */
2300       return REG_OK;
2301 
2302     case TRE_CONFIG_MULTIBYTE:
2303 #ifdef TRE_MULTIBYTE
2304       *int_result = 1;
2305 #else /* !TRE_MULTIBYTE */
2306       *int_result = 0;
2307 #endif /* !TRE_MULTIBYTE */
2308       return REG_OK;
2309 
2310     case TRE_CONFIG_SYSTEM_ABI:
2311 #ifdef TRE_CONFIG_SYSTEM_ABI
2312       *int_result = 1;
2313 #else /* !TRE_CONFIG_SYSTEM_ABI */
2314       *int_result = 0;
2315 #endif /* !TRE_CONFIG_SYSTEM_ABI */
2316       return REG_OK;
2317 
2318     case TRE_CONFIG_VERSION:
2319       *string_result = TRE_VERSION;
2320       return REG_OK;
2321     }
2322 
2323   return REG_NOMATCH;
2324 }
2325 
2326 
2327 /* EOF */
2328