1 /*
2   tre-compile.c - TRE regex compiler
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   TODO:
11    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
12      function calls.
13 */
14 
15 
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <stdio.h>
20 #include <assert.h>
21 #include <string.h>
22 
23 #include "tre-internal.h"
24 #include "tre-mem.h"
25 #include "tre-stack.h"
26 #include "tre-ast.h"
27 #include "tre-parse.h"
28 #include "tre-compile.h"
29 #include "tre.h"
30 #include "xmalloc.h"
31 
32 /*
33   Algorithms to setup tags so that submatch addressing can be done.
34 */
35 
36 
37 /* Inserts a catenation node to the root of the tree given in `node'.
38    As the left child a new tag with number `tag_id' to `node' is added,
39    and the right child is the old root. */
40 static reg_errcode_t
tre_add_tag_left(tre_mem_t mem,tre_ast_node_t * node,int tag_id)41 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
42 {
43   tre_catenation_t *c;
44 
45   DPRINT(("add_tag_left: tag %d\n", tag_id));
46 
47   c = tre_mem_alloc(mem, sizeof(*c));
48   if (c == NULL)
49     return REG_ESPACE;
50   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
51   if (c->left == NULL)
52     return REG_ESPACE;
53   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
54   if (c->right == NULL)
55     return REG_ESPACE;
56 
57   c->right->obj = node->obj;
58   c->right->type = node->type;
59   c->right->nullable = -1;
60   c->right->submatch_id = -1;
61   c->right->firstpos = NULL;
62   c->right->lastpos = NULL;
63   c->right->num_tags = 0;
64   node->obj = c;
65   node->type = CATENATION;
66   return REG_OK;
67 }
68 
69 /* Inserts a catenation node to the root of the tree given in `node'.
70    As the right child a new tag with number `tag_id' to `node' is added,
71    and the left child is the old root. */
72 static reg_errcode_t
tre_add_tag_right(tre_mem_t mem,tre_ast_node_t * node,int tag_id)73 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
74 {
75   tre_catenation_t *c;
76 
77   DPRINT(("tre_add_tag_right: tag %d\n", tag_id));
78 
79   c = tre_mem_alloc(mem, sizeof(*c));
80   if (c == NULL)
81     return REG_ESPACE;
82   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
83   if (c->right == NULL)
84     return REG_ESPACE;
85   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
86   if (c->left == NULL)
87     return REG_ESPACE;
88 
89   c->left->obj = node->obj;
90   c->left->type = node->type;
91   c->left->nullable = -1;
92   c->left->submatch_id = -1;
93   c->left->firstpos = NULL;
94   c->left->lastpos = NULL;
95   c->left->num_tags = 0;
96   node->obj = c;
97   node->type = CATENATION;
98   return REG_OK;
99 }
100 
101 typedef enum {
102   ADDTAGS_RECURSE,
103   ADDTAGS_AFTER_ITERATION,
104   ADDTAGS_AFTER_UNION_LEFT,
105   ADDTAGS_AFTER_UNION_RIGHT,
106   ADDTAGS_AFTER_CAT_LEFT,
107   ADDTAGS_AFTER_CAT_RIGHT,
108   ADDTAGS_SET_SUBMATCH_END
109 } tre_addtags_symbol_t;
110 
111 
112 typedef struct {
113   int tag;
114   int next_tag;
115 } tre_tag_states_t;
116 
117 
118 /* Go through `regset' and set submatch data for submatches that are
119    using this tag. */
120 static void
tre_purge_regset(int * regset,tre_tnfa_t * tnfa,int tag)121 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
122 {
123   int i;
124 
125   for (i = 0; regset[i] >= 0; i++)
126     {
127       int id = regset[i] / 2;
128       int start = !(regset[i] % 2);
129       DPRINT(("  Using tag %d for %s offset of "
130 	      "submatch %d\n", tag,
131 	      start ? "start" : "end", id));
132       if (start)
133 	tnfa->submatch_data[id].so_tag = tag;
134       else
135 	tnfa->submatch_data[id].eo_tag = tag;
136     }
137   regset[0] = -1;
138 }
139 
140 
141 /* Adds tags to appropriate locations in the parse tree in `tree', so that
142    subexpressions marked for submatch addressing can be traced. */
143 static reg_errcode_t
tre_add_tags(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree,tre_tnfa_t * tnfa)144 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
145 	     tre_tnfa_t *tnfa)
146 {
147   reg_errcode_t status = REG_OK;
148   tre_addtags_symbol_t symbol;
149   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
150   int bottom = tre_stack_num_objects(stack);
151   /* True for first pass (counting number of needed tags) */
152   int first_pass = (mem == NULL || tnfa == NULL);
153   int *regset, *orig_regset;
154   int num_tags = 0; /* Total number of tags. */
155   int num_minimals = 0;	 /* Number of special minimal tags. */
156   int tag = 0;	    /* The tag that is to be added next. */
157   int next_tag = 1; /* Next tag to use after this one. */
158   int *parents;	    /* Stack of submatches the current submatch is
159 		       contained in. */
160   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
161   tre_tag_states_t *saved_states;
162 
163   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
164   if (!first_pass)
165     {
166       tnfa->end_tag = 0;
167       tnfa->minimal_tags[0] = -1;
168     }
169 
170   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
171   if (regset == NULL)
172     return REG_ESPACE;
173   regset[0] = -1;
174   orig_regset = regset;
175 
176   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
177   if (parents == NULL)
178     {
179       xfree(regset);
180       return REG_ESPACE;
181     }
182   parents[0] = -1;
183 
184   saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
185   if (saved_states == NULL)
186     {
187       xfree(regset);
188       xfree(parents);
189       return REG_ESPACE;
190     }
191   else
192     {
193       unsigned int i;
194       for (i = 0; i <= tnfa->num_submatches; i++)
195 	saved_states[i].tag = -1;
196     }
197 
198   STACK_PUSH(stack, voidptr, node);
199   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
200 
201   while (tre_stack_num_objects(stack) > bottom)
202     {
203       if (status != REG_OK)
204 	break;
205 
206       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
207       switch (symbol)
208 	{
209 
210 	case ADDTAGS_SET_SUBMATCH_END:
211 	  {
212 	    int id = tre_stack_pop_int(stack);
213 	    int i;
214 
215 	    /* Add end of this submatch to regset. */
216 	    for (i = 0; regset[i] >= 0; i++);
217 	    regset[i] = id * 2 + 1;
218 	    regset[i + 1] = -1;
219 
220 	    /* Pop this submatch from the parents stack. */
221 	    for (i = 0; parents[i] >= 0; i++);
222 	    parents[i - 1] = -1;
223 	    break;
224 	  }
225 
226 	case ADDTAGS_RECURSE:
227 	  node = tre_stack_pop_voidptr(stack);
228 
229 	  if (node->submatch_id >= 0)
230 	    {
231 	      int id = node->submatch_id;
232 	      int i;
233 
234 
235 	      /* Add start of this submatch to regset. */
236 	      for (i = 0; regset[i] >= 0; i++);
237 	      regset[i] = id * 2;
238 	      regset[i + 1] = -1;
239 
240 	      if (!first_pass)
241 		{
242 		  for (i = 0; parents[i] >= 0; i++);
243 		  tnfa->submatch_data[id].parents = NULL;
244 		  if (i > 0)
245 		    {
246 		      int *p = xmalloc(sizeof(*p) * (i + 1));
247 		      if (p == NULL)
248 			{
249 			  status = REG_ESPACE;
250 			  break;
251 			}
252 		      assert(tnfa->submatch_data[id].parents == NULL);
253 		      tnfa->submatch_data[id].parents = p;
254 		      for (i = 0; parents[i] >= 0; i++)
255 			p[i] = parents[i];
256 		      p[i] = -1;
257 		    }
258 		}
259 
260 	      /* Add end of this submatch to regset after processing this
261 		 node. */
262 	      STACK_PUSHX(stack, int, node->submatch_id);
263 	      STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
264 	    }
265 
266 	  switch (node->type)
267 	    {
268 	    case LITERAL:
269 	      {
270 		tre_literal_t *lit = node->obj;
271 
272 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
273 		  {
274 		    int i;
275 		    DPRINT(("Literal %d-%d\n",
276 			    (int)lit->code_min, (int)lit->code_max));
277 		    if (regset[0] >= 0)
278 		      {
279 			/* Regset is not empty, so add a tag before the
280 			   literal or backref. */
281 			if (!first_pass)
282 			  {
283 			    status = tre_add_tag_left(mem, node, tag);
284 			    tnfa->tag_directions[tag] = direction;
285 			    if (minimal_tag >= 0)
286 			      {
287 				DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
288 				for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
289 				tnfa->minimal_tags[i] = tag;
290 				tnfa->minimal_tags[i + 1] = minimal_tag;
291 				tnfa->minimal_tags[i + 2] = -1;
292 				minimal_tag = -1;
293 				num_minimals++;
294 			      }
295 			    tre_purge_regset(regset, tnfa, tag);
296 			  }
297 			else
298 			  {
299 			    DPRINT(("  num_tags = 1\n"));
300 			    node->num_tags = 1;
301 			  }
302 
303 			DPRINT(("  num_tags++\n"));
304 			regset[0] = -1;
305 			tag = next_tag;
306 			num_tags++;
307 			next_tag++;
308 		      }
309 		  }
310 		else
311 		  {
312 		    assert(!IS_TAG(lit));
313 		  }
314 		break;
315 	      }
316 	    case CATENATION:
317 	      {
318 		tre_catenation_t *cat = node->obj;
319 		tre_ast_node_t *left = cat->left;
320 		tre_ast_node_t *right = cat->right;
321 		int reserved_tag = -1;
322 		DPRINT(("Catenation, next_tag = %d\n", next_tag));
323 
324 
325 		/* After processing right child. */
326 		STACK_PUSHX(stack, voidptr, node);
327 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
328 
329 		/* Process right child. */
330 		STACK_PUSHX(stack, voidptr, right);
331 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
332 
333 		/* After processing left child. */
334 		STACK_PUSHX(stack, int, next_tag + left->num_tags);
335 		DPRINT(("  Pushing %d for after left\n",
336 			next_tag + left->num_tags));
337 		if (left->num_tags > 0 && right->num_tags > 0)
338 		  {
339 		    /* Reserve the next tag to the right child. */
340 		    DPRINT(("  Reserving next_tag %d to right child\n",
341 			    next_tag));
342 		    reserved_tag = next_tag;
343 		    next_tag++;
344 		  }
345 		STACK_PUSHX(stack, int, reserved_tag);
346 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
347 
348 		/* Process left child. */
349 		STACK_PUSHX(stack, voidptr, left);
350 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
351 
352 		}
353 	      break;
354 	    case ITERATION:
355 	      {
356 		tre_iteration_t *iter = node->obj;
357 		DPRINT(("Iteration\n"));
358 
359 		if (first_pass)
360 		  {
361 		    STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
362 		  }
363 		else
364 		  {
365 		    STACK_PUSHX(stack, int, tag);
366 		    STACK_PUSHX(stack, int, iter->minimal);
367 		  }
368 		STACK_PUSHX(stack, voidptr, node);
369 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
370 
371 		STACK_PUSHX(stack, voidptr, iter->arg);
372 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
373 
374 		/* Regset is not empty, so add a tag here. */
375 		if (regset[0] >= 0 || iter->minimal)
376 		  {
377 		    if (!first_pass)
378 		      {
379 			int i;
380 			status = tre_add_tag_left(mem, node, tag);
381 			if (iter->minimal)
382 			  tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
383 			else
384 			  tnfa->tag_directions[tag] = direction;
385 			if (minimal_tag >= 0)
386 			  {
387 			    DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
388 			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
389 			    tnfa->minimal_tags[i] = tag;
390 			    tnfa->minimal_tags[i + 1] = minimal_tag;
391 			    tnfa->minimal_tags[i + 2] = -1;
392 			    minimal_tag = -1;
393 			    num_minimals++;
394 			  }
395 			tre_purge_regset(regset, tnfa, tag);
396 		      }
397 
398 		    DPRINT(("  num_tags++\n"));
399 		    regset[0] = -1;
400 		    tag = next_tag;
401 		    num_tags++;
402 		    next_tag++;
403 		  }
404 		direction = TRE_TAG_MINIMIZE;
405 	      }
406 	      break;
407 	    case UNION:
408 	      {
409 		tre_union_t *uni = node->obj;
410 		tre_ast_node_t *left = uni->left;
411 		tre_ast_node_t *right = uni->right;
412 		int left_tag;
413 		int right_tag;
414 
415 		if (regset[0] >= 0)
416 		  {
417 		    left_tag = next_tag;
418 		    right_tag = next_tag + 1;
419 		  }
420 		else
421 		  {
422 		    left_tag = tag;
423 		    right_tag = next_tag;
424 		  }
425 
426 		DPRINT(("Union\n"));
427 
428 		/* After processing right child. */
429 		STACK_PUSHX(stack, int, right_tag);
430 		STACK_PUSHX(stack, int, left_tag);
431 		STACK_PUSHX(stack, voidptr, regset);
432 		STACK_PUSHX(stack, int, regset[0] >= 0);
433 		STACK_PUSHX(stack, voidptr, node);
434 		STACK_PUSHX(stack, voidptr, right);
435 		STACK_PUSHX(stack, voidptr, left);
436 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
437 
438 		/* Process right child. */
439 		STACK_PUSHX(stack, voidptr, right);
440 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
441 
442 		/* After processing left child. */
443 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
444 
445 		/* Process left child. */
446 		STACK_PUSHX(stack, voidptr, left);
447 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
448 
449 		/* Regset is not empty, so add a tag here. */
450 		if (regset[0] >= 0)
451 		  {
452 		    if (!first_pass)
453 		      {
454 			int i;
455 			status = tre_add_tag_left(mem, node, tag);
456 			tnfa->tag_directions[tag] = direction;
457 			if (minimal_tag >= 0)
458 			  {
459 			    DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
460 			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
461 			    tnfa->minimal_tags[i] = tag;
462 			    tnfa->minimal_tags[i + 1] = minimal_tag;
463 			    tnfa->minimal_tags[i + 2] = -1;
464 			    minimal_tag = -1;
465 			    num_minimals++;
466 			  }
467 			tre_purge_regset(regset, tnfa, tag);
468 		      }
469 
470 		    DPRINT(("  num_tags++\n"));
471 		    regset[0] = -1;
472 		    tag = next_tag;
473 		    num_tags++;
474 		    next_tag++;
475 		  }
476 
477 		if (node->num_submatches > 0)
478 		  {
479 		    /* The next two tags are reserved for markers. */
480 		    next_tag++;
481 		    tag = next_tag;
482 		    next_tag++;
483 		  }
484 
485 		break;
486 	      }
487 	    }
488 
489 	  if (node->submatch_id >= 0)
490 	    {
491 	      int i;
492 	      /* Push this submatch on the parents stack. */
493 	      for (i = 0; parents[i] >= 0; i++);
494 	      parents[i] = node->submatch_id;
495 	      parents[i + 1] = -1;
496 	    }
497 
498 	  break; /* end case: ADDTAGS_RECURSE */
499 
500 	case ADDTAGS_AFTER_ITERATION:
501 	  {
502 	    int minimal = 0;
503 	    int enter_tag;
504 	    node = tre_stack_pop_voidptr(stack);
505 	    if (first_pass)
506 	      {
507 		node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
508 		  + tre_stack_pop_int(stack);
509 		minimal_tag = -1;
510 	      }
511 	    else
512 	      {
513 		minimal = tre_stack_pop_int(stack);
514 		enter_tag = tre_stack_pop_int(stack);
515 		if (minimal)
516 		  minimal_tag = enter_tag;
517 	      }
518 
519 	    DPRINT(("After iteration\n"));
520 	    if (!first_pass)
521 	      {
522 		DPRINT(("  Setting direction to %s\n",
523 			minimal ? "minimize" : "maximize"));
524 		if (minimal)
525 		  direction = TRE_TAG_MINIMIZE;
526 		else
527 		  direction = TRE_TAG_MAXIMIZE;
528 	      }
529 	    break;
530 	  }
531 
532 	case ADDTAGS_AFTER_CAT_LEFT:
533 	  {
534 	    int new_tag = tre_stack_pop_int(stack);
535 	    next_tag = tre_stack_pop_int(stack);
536 	    DPRINT(("After cat left, tag = %d, next_tag = %d\n",
537 		    tag, next_tag));
538 	    if (new_tag >= 0)
539 	      {
540 		DPRINT(("  Setting tag to %d\n", new_tag));
541 		tag = new_tag;
542 	      }
543 	    break;
544 	  }
545 
546 	case ADDTAGS_AFTER_CAT_RIGHT:
547 	  DPRINT(("After cat right\n"));
548 	  node = tre_stack_pop_voidptr(stack);
549 	  if (first_pass)
550 	    node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
551 	      + ((tre_catenation_t *)node->obj)->right->num_tags;
552 	  break;
553 
554 	case ADDTAGS_AFTER_UNION_LEFT:
555 	  DPRINT(("After union left\n"));
556 	  /* Lift the bottom of the `regset' array so that when processing
557 	     the right operand the items currently in the array are
558 	     invisible.	 The original bottom was saved at ADDTAGS_UNION and
559 	     will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
560 	  while (*regset >= 0)
561 	    regset++;
562 	  break;
563 
564 	case ADDTAGS_AFTER_UNION_RIGHT:
565 	  {
566 	    int added_tags, tag_left, tag_right;
567 	    tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
568 	    tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
569 	    DPRINT(("After union right\n"));
570 	    node = tre_stack_pop_voidptr(stack);
571 	    added_tags = tre_stack_pop_int(stack);
572 	    if (first_pass)
573 	      {
574 		node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
575 		  + ((tre_union_t *)node->obj)->right->num_tags + added_tags
576 		  + ((node->num_submatches > 0) ? 2 : 0);
577 	      }
578 	    regset = tre_stack_pop_voidptr(stack);
579 	    tag_left = tre_stack_pop_int(stack);
580 	    tag_right = tre_stack_pop_int(stack);
581 
582 	    /* Add tags after both children, the left child gets a smaller
583 	       tag than the right child.  This guarantees that we prefer
584 	       the left child over the right child. */
585 	    /* XXX - This is not always necessary (if the children have
586 	       tags which must be seen for every match of that child). */
587 	    /* XXX - Check if this is the only place where tre_add_tag_right
588 	       is used.	 If so, use tre_add_tag_left (putting the tag before
589 	       the child as opposed after the child) and throw away
590 	       tre_add_tag_right. */
591 	    if (node->num_submatches > 0)
592 	      {
593 		if (!first_pass)
594 		  {
595 		    status = tre_add_tag_right(mem, left, tag_left);
596 		    tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
597 		    status = tre_add_tag_right(mem, right, tag_right);
598 		    tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
599 		  }
600 		DPRINT(("  num_tags += 2\n"));
601 		num_tags += 2;
602 	      }
603 	    direction = TRE_TAG_MAXIMIZE;
604 	    break;
605 	  }
606 
607 	default:
608 	  assert(0);
609 	  break;
610 
611 	} /* end switch(symbol) */
612     } /* end while(tre_stack_num_objects(stack) > bottom) */
613 
614   if (!first_pass)
615     tre_purge_regset(regset, tnfa, tag);
616 
617   if (!first_pass && minimal_tag >= 0)
618     {
619       int i;
620       DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
621       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
622       tnfa->minimal_tags[i] = tag;
623       tnfa->minimal_tags[i + 1] = minimal_tag;
624       tnfa->minimal_tags[i + 2] = -1;
625       minimal_tag = -1;
626       num_minimals++;
627     }
628 
629   DPRINT(("tre_add_tags: %s complete.  Number of tags %d.\n",
630 	  first_pass? "First pass" : "Second pass", num_tags));
631 
632   assert(tree->num_tags == num_tags);
633   tnfa->end_tag = num_tags;
634   tnfa->num_tags = num_tags;
635   tnfa->num_minimals = num_minimals;
636   xfree(orig_regset);
637   xfree(parents);
638   xfree(saved_states);
639   return status;
640 }
641 
642 
643 
644 /*
645   AST to TNFA compilation routines.
646 */
647 
648 typedef enum {
649   COPY_RECURSE,
650   COPY_SET_RESULT_PTR
651 } tre_copyast_symbol_t;
652 
653 /* Flags for tre_copy_ast(). */
654 #define COPY_REMOVE_TAGS	 1
655 #define COPY_MAXIMIZE_FIRST_TAG	 2
656 
657 static reg_errcode_t
tre_copy_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int flags,int * pos_add,tre_tag_direction_t * tag_directions,tre_ast_node_t ** copy,int * max_pos)658 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
659 	     int flags, int *pos_add, tre_tag_direction_t *tag_directions,
660 	     tre_ast_node_t **copy, int *max_pos)
661 {
662   reg_errcode_t status = REG_OK;
663   int bottom = tre_stack_num_objects(stack);
664   int num_copied = 0;
665   int first_tag = 1;
666   tre_ast_node_t **result = copy;
667   tre_copyast_symbol_t symbol;
668 
669   STACK_PUSH(stack, voidptr, ast);
670   STACK_PUSH(stack, int, COPY_RECURSE);
671 
672   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
673     {
674       tre_ast_node_t *node;
675       if (status != REG_OK)
676 	break;
677 
678       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
679       switch (symbol)
680 	{
681 	case COPY_SET_RESULT_PTR:
682 	  result = tre_stack_pop_voidptr(stack);
683 	  break;
684 	case COPY_RECURSE:
685 	  node = tre_stack_pop_voidptr(stack);
686 	  switch (node->type)
687 	    {
688 	    case LITERAL:
689 	      {
690 		tre_literal_t *lit = node->obj;
691 		int pos = lit->position;
692 		int min = lit->code_min;
693 		int max = lit->code_max;
694 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
695 		  {
696 		    /* XXX - e.g. [ab] has only one position but two
697 		       nodes, so we are creating holes in the state space
698 		       here.  Not fatal, just wastes memory. */
699 		    pos += *pos_add;
700 		    num_copied++;
701 		  }
702 		else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
703 		  {
704 		    /* Change this tag to empty. */
705 		    min = EMPTY;
706 		    max = pos = -1;
707 		  }
708 		else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
709 			 && first_tag)
710 		  {
711 		    /* Maximize the first tag. */
712 		    tag_directions[max] = TRE_TAG_MAXIMIZE;
713 		    first_tag = 0;
714 		  }
715 		*result = tre_ast_new_literal(mem, min, max, pos);
716 		if (*result == NULL)
717 		  status = REG_ESPACE;
718 
719 		if (pos > *max_pos)
720 		  *max_pos = pos;
721 		break;
722 	      }
723 	    case UNION:
724 	      {
725 		tre_union_t *uni = node->obj;
726 		tre_union_t *tmp;
727 		*result = tre_ast_new_union(mem, uni->left, uni->right);
728 		if (*result == NULL)
729 		  {
730 		    status = REG_ESPACE;
731 		    break;
732 		  }
733 		tmp = (*result)->obj;
734 		result = &tmp->left;
735 		STACK_PUSHX(stack, voidptr, uni->right);
736 		STACK_PUSHX(stack, int, COPY_RECURSE);
737 		STACK_PUSHX(stack, voidptr, &tmp->right);
738 		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
739 		STACK_PUSHX(stack, voidptr, uni->left);
740 		STACK_PUSHX(stack, int, COPY_RECURSE);
741 		break;
742 	      }
743 	    case CATENATION:
744 	      {
745 		tre_catenation_t *cat = node->obj;
746 		tre_catenation_t *tmp;
747 		*result = tre_ast_new_catenation(mem, cat->left, cat->right);
748 		if (*result == NULL)
749 		  {
750 		    status = REG_ESPACE;
751 		    break;
752 		  }
753 		tmp = (*result)->obj;
754 		tmp->left = NULL;
755 		tmp->right = NULL;
756 		result = &tmp->left;
757 
758 		STACK_PUSHX(stack, voidptr, cat->right);
759 		STACK_PUSHX(stack, int, COPY_RECURSE);
760 		STACK_PUSHX(stack, voidptr, &tmp->right);
761 		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
762 		STACK_PUSHX(stack, voidptr, cat->left);
763 		STACK_PUSHX(stack, int, COPY_RECURSE);
764 		break;
765 	      }
766 	    case ITERATION:
767 	      {
768 		tre_iteration_t *iter = node->obj;
769 		STACK_PUSHX(stack, voidptr, iter->arg);
770 		STACK_PUSHX(stack, int, COPY_RECURSE);
771 		*result = tre_ast_new_iter(mem, iter->arg, iter->min,
772 					   iter->max, iter->minimal);
773 		if (*result == NULL)
774 		  {
775 		    status = REG_ESPACE;
776 		    break;
777 		  }
778 		iter = (*result)->obj;
779 		result = &iter->arg;
780 		break;
781 	      }
782 	    default:
783 	      assert(0);
784 	      break;
785 	    }
786 	  break;
787 	}
788     }
789   *pos_add += num_copied;
790   return status;
791 }
792 
793 typedef enum {
794   EXPAND_RECURSE,
795   EXPAND_AFTER_ITER
796 } tre_expand_ast_symbol_t;
797 
798 /* Expands each iteration node that has a finite nonzero minimum or maximum
799    iteration count to a catenated sequence of copies of the node. */
800 static reg_errcode_t
tre_expand_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int * position,tre_tag_direction_t * tag_directions,int * max_depth)801 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
802 	       int *position, tre_tag_direction_t *tag_directions,
803 	       int *max_depth)
804 {
805   reg_errcode_t status = REG_OK;
806   int bottom = tre_stack_num_objects(stack);
807   int pos_add = 0;
808   int pos_add_total = 0;
809   int max_pos = 0;
810   /* Current approximate matching parameters. */
811   int params[TRE_PARAM_LAST];
812   /* Approximate parameter nesting level. */
813   int params_depth = 0;
814   int iter_depth = 0;
815   int i;
816 
817   for (i = 0; i < TRE_PARAM_LAST; i++)
818     params[i] = TRE_PARAM_DEFAULT;
819 
820   STACK_PUSHR(stack, voidptr, ast);
821   STACK_PUSHR(stack, int, EXPAND_RECURSE);
822   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
823     {
824       tre_ast_node_t *node;
825       tre_expand_ast_symbol_t symbol;
826 
827       if (status != REG_OK)
828 	break;
829 
830       DPRINT(("pos_add %d\n", pos_add));
831 
832       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
833       node = tre_stack_pop_voidptr(stack);
834       switch (symbol)
835 	{
836 	case EXPAND_RECURSE:
837 	  switch (node->type)
838 	    {
839 	    case LITERAL:
840 	      {
841 		tre_literal_t *lit= node->obj;
842 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
843 		  {
844 		    lit->position += pos_add;
845 		    if (lit->position > max_pos)
846 		      max_pos = lit->position;
847 		  }
848 		break;
849 	      }
850 	    case UNION:
851 	      {
852 		tre_union_t *uni = node->obj;
853 		STACK_PUSHX(stack, voidptr, uni->right);
854 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
855 		STACK_PUSHX(stack, voidptr, uni->left);
856 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
857 		break;
858 	      }
859 	    case CATENATION:
860 	      {
861 		tre_catenation_t *cat = node->obj;
862 		STACK_PUSHX(stack, voidptr, cat->right);
863 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
864 		STACK_PUSHX(stack, voidptr, cat->left);
865 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
866 		break;
867 	      }
868 	    case ITERATION:
869 	      {
870 		tre_iteration_t *iter = node->obj;
871 		STACK_PUSHX(stack, int, pos_add);
872 		STACK_PUSHX(stack, voidptr, node);
873 		STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
874 		STACK_PUSHX(stack, voidptr, iter->arg);
875 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
876 		/* If we are going to expand this node at EXPAND_AFTER_ITER
877 		   then don't increase the `pos' fields of the nodes now, it
878 		   will get done when expanding. */
879 		if (iter->min > 1 || iter->max > 1)
880 		  pos_add = 0;
881 		iter_depth++;
882 		DPRINT(("iter\n"));
883 		break;
884 	      }
885 	    default:
886 	      assert(0);
887 	      break;
888 	    }
889 	  break;
890 	case EXPAND_AFTER_ITER:
891 	  {
892 	    tre_iteration_t *iter = node->obj;
893 	    int pos_add_last;
894 	    pos_add = tre_stack_pop_int(stack);
895 	    pos_add_last = pos_add;
896 	    if (iter->min > 1 || iter->max > 1)
897 	      {
898 		tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
899 		int j;
900 		int pos_add_save = pos_add;
901 
902 		/* Create a catenated sequence of copies of the node. */
903 		for (j = 0; j < iter->min; j++)
904 		  {
905 		    tre_ast_node_t *copy;
906 		    /* Remove tags from all but the last copy. */
907 		    int flags = ((j + 1 < iter->min)
908 				 ? COPY_REMOVE_TAGS
909 				 : COPY_MAXIMIZE_FIRST_TAG);
910 		    DPRINT(("  pos_add %d\n", pos_add));
911 		    pos_add_save = pos_add;
912 		    status = tre_copy_ast(mem, stack, iter->arg, flags,
913 					  &pos_add, tag_directions, &copy,
914 					  &max_pos);
915 		    if (status != REG_OK)
916 		      return status;
917 		    if (seq1 != NULL)
918 		      seq1 = tre_ast_new_catenation(mem, seq1, copy);
919 		    else
920 		      seq1 = copy;
921 		    if (seq1 == NULL)
922 		      return REG_ESPACE;
923 		  }
924 
925 		if (iter->max == -1)
926 		  {
927 		    /* No upper limit. */
928 		    pos_add_save = pos_add;
929 		    status = tre_copy_ast(mem, stack, iter->arg, 0,
930 					  &pos_add, NULL, &seq2, &max_pos);
931 		    if (status != REG_OK)
932 		      return status;
933 		    seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
934 		    if (seq2 == NULL)
935 		      return REG_ESPACE;
936 		  }
937 		else
938 		  {
939 		    for (j = iter->min; j < iter->max; j++)
940 		      {
941 			tre_ast_node_t *tmp, *copy;
942 			pos_add_save = pos_add;
943 			status = tre_copy_ast(mem, stack, iter->arg, 0,
944 					      &pos_add, NULL, &copy, &max_pos);
945 			if (status != REG_OK)
946 			  return status;
947 			if (seq2 != NULL)
948 			  seq2 = tre_ast_new_catenation(mem, copy, seq2);
949 			else
950 			  seq2 = copy;
951 			if (seq2 == NULL)
952 			  return REG_ESPACE;
953 			tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
954 			if (tmp == NULL)
955 			  return REG_ESPACE;
956 			seq2 = tre_ast_new_union(mem, tmp, seq2);
957 			if (seq2 == NULL)
958 			  return REG_ESPACE;
959 		      }
960 		  }
961 
962 		pos_add = pos_add_save;
963 		if (seq1 == NULL)
964 		  seq1 = seq2;
965 		else if (seq2 != NULL)
966 		  seq1 = tre_ast_new_catenation(mem, seq1, seq2);
967 		if (seq1 == NULL)
968 		  return REG_ESPACE;
969 		node->obj = seq1->obj;
970 		node->type = seq1->type;
971 	      }
972 
973 	    iter_depth--;
974 	    pos_add_total += pos_add - pos_add_last;
975 	    if (iter_depth == 0)
976 	      pos_add = pos_add_total;
977 
978 	    /* If approximate parameters are specified, surround the result
979 	       with two parameter setting nodes.  The one on the left sets
980 	       the specified parameters, and the one on the right restores
981 	       the old parameters. */
982 	    if (iter->params)
983 	      {
984 		tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy;
985 		int *old_params;
986 
987 		tmp_l = tre_ast_new_literal(mem, PARAMETER, 0, -1);
988 		if (!tmp_l)
989 		  return REG_ESPACE;
990 		((tre_literal_t *)tmp_l->obj)->u.params = iter->params;
991 		iter->params[TRE_PARAM_DEPTH] = params_depth + 1;
992 		tmp_r = tre_ast_new_literal(mem, PARAMETER, 0, -1);
993 		if (!tmp_r)
994 		  return REG_ESPACE;
995 		old_params = tre_mem_alloc(mem, sizeof(*old_params)
996 					   * TRE_PARAM_LAST);
997 		if (!old_params)
998 		  return REG_ESPACE;
999 		for (i = 0; i < TRE_PARAM_LAST; i++)
1000 		  old_params[i] = params[i];
1001 		((tre_literal_t *)tmp_r->obj)->u.params = old_params;
1002 		old_params[TRE_PARAM_DEPTH] = params_depth;
1003 		/* XXX - this is the only place where ast_new_node is
1004 		   needed -- should be moved inside AST module. */
1005 		node_copy = tre_ast_new_node(mem, ITERATION,
1006 					     sizeof(tre_iteration_t));
1007 		if (!node_copy)
1008 		  return REG_ESPACE;
1009 		node_copy->obj = node->obj;
1010 		tmp_node = tre_ast_new_catenation(mem, tmp_l, node_copy);
1011 		if (!tmp_node)
1012 		  return REG_ESPACE;
1013 		tmp_node = tre_ast_new_catenation(mem, tmp_node, tmp_r);
1014 		if (!tmp_node)
1015 		  return REG_ESPACE;
1016 		/* Replace the contents of `node' with `tmp_node'. */
1017 		memcpy(node, tmp_node, sizeof(*node));
1018 		node->obj = tmp_node->obj;
1019 		node->type = tmp_node->type;
1020 		params_depth++;
1021 		if (params_depth > *max_depth)
1022 		  *max_depth = params_depth;
1023 	      }
1024 	    break;
1025 	  }
1026 	default:
1027 	  assert(0);
1028 	  break;
1029 	}
1030     }
1031 
1032   *position += pos_add_total;
1033 
1034   /* `max_pos' should never be larger than `*position' if the above
1035      code works, but just an extra safeguard let's make sure
1036      `*position' is set large enough so enough memory will be
1037      allocated for the transition table. */
1038   if (max_pos > *position)
1039     *position = max_pos;
1040 
1041 #ifdef TRE_DEBUG
1042   DPRINT(("Expanded AST:\n"));
1043   tre_ast_print(ast);
1044   DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
1045 #endif
1046 
1047   return status;
1048 }
1049 
1050 static tre_pos_and_tags_t *
tre_set_empty(tre_mem_t mem)1051 tre_set_empty(tre_mem_t mem)
1052 {
1053   tre_pos_and_tags_t *new_set;
1054 
1055   new_set = tre_mem_calloc(mem, sizeof(*new_set));
1056   if (new_set == NULL)
1057     return NULL;
1058 
1059   new_set[0].position = -1;
1060   new_set[0].code_min = -1;
1061   new_set[0].code_max = -1;
1062 
1063   return new_set;
1064 }
1065 
1066 static tre_pos_and_tags_t *
tre_set_one(tre_mem_t mem,int position,int code_min,int code_max,tre_ctype_t class,tre_ctype_t * neg_classes,int backref)1067 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
1068 	    tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
1069 {
1070   tre_pos_and_tags_t *new_set;
1071 
1072   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
1073   if (new_set == NULL)
1074     return NULL;
1075 
1076   new_set[0].position = position;
1077   new_set[0].code_min = code_min;
1078   new_set[0].code_max = code_max;
1079   new_set[0].class = class;
1080   new_set[0].neg_classes = neg_classes;
1081   new_set[0].backref = backref;
1082   new_set[1].position = -1;
1083   new_set[1].code_min = -1;
1084   new_set[1].code_max = -1;
1085 
1086   return new_set;
1087 }
1088 
1089 static tre_pos_and_tags_t *
tre_set_union(tre_mem_t mem,tre_pos_and_tags_t * set1,tre_pos_and_tags_t * set2,int * tags,int assertions,int * params)1090 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
1091 	      int *tags, int assertions, int *params)
1092 {
1093   int s1, s2, i, j;
1094   tre_pos_and_tags_t *new_set;
1095   int *new_tags;
1096   int num_tags;
1097 
1098   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
1099   for (s1 = 0; set1[s1].position >= 0; s1++);
1100   for (s2 = 0; set2[s2].position >= 0; s2++);
1101   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
1102   if (!new_set )
1103     return NULL;
1104 
1105   for (s1 = 0; set1[s1].position >= 0; s1++)
1106     {
1107       new_set[s1].position = set1[s1].position;
1108       new_set[s1].code_min = set1[s1].code_min;
1109       new_set[s1].code_max = set1[s1].code_max;
1110       new_set[s1].assertions = set1[s1].assertions | assertions;
1111       new_set[s1].class = set1[s1].class;
1112       new_set[s1].neg_classes = set1[s1].neg_classes;
1113       new_set[s1].backref = set1[s1].backref;
1114       if (set1[s1].tags == NULL && tags == NULL)
1115 	new_set[s1].tags = NULL;
1116       else
1117 	{
1118 	  for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
1119 	  new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
1120 					 * (i + num_tags + 1)));
1121 	  if (new_tags == NULL)
1122 	    return NULL;
1123 	  for (j = 0; j < i; j++)
1124 	    new_tags[j] = set1[s1].tags[j];
1125 	  for (i = 0; i < num_tags; i++)
1126 	    new_tags[j + i] = tags[i];
1127 	  new_tags[j + i] = -1;
1128 	  new_set[s1].tags = new_tags;
1129 	}
1130       if (set1[s1].params)
1131 	new_set[s1].params = set1[s1].params;
1132       if (params)
1133 	{
1134 	  if (!new_set[s1].params)
1135 	    new_set[s1].params = params;
1136 	  else
1137 	    {
1138 	      new_set[s1].params = tre_mem_alloc(mem, sizeof(*params) *
1139 						 TRE_PARAM_LAST);
1140 	      if (!new_set[s1].params)
1141 		return NULL;
1142 	      for (i = 0; i < TRE_PARAM_LAST; i++)
1143 		if (params[i] != TRE_PARAM_UNSET)
1144 		  new_set[s1].params[i] = params[i];
1145 	    }
1146 	}
1147     }
1148 
1149   for (s2 = 0; set2[s2].position >= 0; s2++)
1150     {
1151       new_set[s1 + s2].position = set2[s2].position;
1152       new_set[s1 + s2].code_min = set2[s2].code_min;
1153       new_set[s1 + s2].code_max = set2[s2].code_max;
1154       /* XXX - why not | assertions here as well? */
1155       new_set[s1 + s2].assertions = set2[s2].assertions;
1156       new_set[s1 + s2].class = set2[s2].class;
1157       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
1158       new_set[s1 + s2].backref = set2[s2].backref;
1159       if (set2[s2].tags == NULL)
1160 	new_set[s1 + s2].tags = NULL;
1161       else
1162 	{
1163 	  for (i = 0; set2[s2].tags[i] >= 0; i++);
1164 	  new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
1165 	  if (new_tags == NULL)
1166 	    return NULL;
1167 	  for (j = 0; j < i; j++)
1168 	    new_tags[j] = set2[s2].tags[j];
1169 	  new_tags[j] = -1;
1170 	  new_set[s1 + s2].tags = new_tags;
1171 	}
1172       if (set2[s2].params)
1173 	new_set[s1 + s2].params = set2[s2].params;
1174       if (params)
1175 	{
1176 	  if (!new_set[s1 + s2].params)
1177 	    new_set[s1 + s2].params = params;
1178 	  else
1179 	    {
1180 	      new_set[s1 + s2].params = tre_mem_alloc(mem, sizeof(*params) *
1181 						      TRE_PARAM_LAST);
1182 	      if (!new_set[s1 + s2].params)
1183 		return NULL;
1184 	      for (i = 0; i < TRE_PARAM_LAST; i++)
1185 		if (params[i] != TRE_PARAM_UNSET)
1186 		  new_set[s1 + s2].params[i] = params[i];
1187 	    }
1188 	}
1189     }
1190   new_set[s1 + s2].position = -1;
1191   return new_set;
1192 }
1193 
1194 /* Finds the empty path through `node' which is the one that should be
1195    taken according to POSIX.2 rules, and adds the tags on that path to
1196    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
1197    set to the number of tags seen on the path. */
1198 static reg_errcode_t
tre_match_empty(tre_stack_t * stack,tre_ast_node_t * node,int * tags,int * assertions,int * params,int * num_tags_seen,int * params_seen)1199 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
1200 		int *assertions, int *params, int *num_tags_seen,
1201 		int *params_seen)
1202 {
1203   tre_literal_t *lit;
1204   tre_union_t *uni;
1205   tre_catenation_t *cat;
1206   tre_iteration_t *iter;
1207   int i;
1208   int bottom = tre_stack_num_objects(stack);
1209   reg_errcode_t status = REG_OK;
1210   if (num_tags_seen)
1211     *num_tags_seen = 0;
1212   if (params_seen)
1213     *params_seen = 0;
1214 
1215   status = tre_stack_push_voidptr(stack, node);
1216 
1217   /* Walk through the tree recursively. */
1218   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1219     {
1220       node = tre_stack_pop_voidptr(stack);
1221 
1222       switch (node->type)
1223 	{
1224 	case LITERAL:
1225 	  lit = (tre_literal_t *)node->obj;
1226 	  switch (lit->code_min)
1227 	    {
1228 	    case TAG:
1229 	      if (lit->code_max >= 0)
1230 		{
1231 		  if (tags != NULL)
1232 		    {
1233 		      /* Add the tag to `tags'. */
1234 		      for (i = 0; tags[i] >= 0; i++)
1235 			if (tags[i] == lit->code_max)
1236 			  break;
1237 		      if (tags[i] < 0)
1238 			{
1239 			  tags[i] = lit->code_max;
1240 			  tags[i + 1] = -1;
1241 			}
1242 		    }
1243 		  if (num_tags_seen)
1244 		    (*num_tags_seen)++;
1245 		}
1246 	      break;
1247 	    case ASSERTION:
1248 	      assert(lit->code_max >= 1
1249 		     || lit->code_max <= ASSERT_LAST);
1250 	      if (assertions != NULL)
1251 		*assertions |= lit->code_max;
1252 	      break;
1253 	    case PARAMETER:
1254 	      if (params != NULL)
1255 		for (i = 0; i < TRE_PARAM_LAST; i++)
1256 		  params[i] = lit->u.params[i];
1257 	      if (params_seen != NULL)
1258 		*params_seen = 1;
1259 	      break;
1260 	    case EMPTY:
1261 	      break;
1262 	    default:
1263 	      assert(0);
1264 	      break;
1265 	    }
1266 	  break;
1267 
1268 	case UNION:
1269 	  /* Subexpressions starting earlier take priority over ones
1270 	     starting later, so we prefer the left subexpression over the
1271 	     right subexpression. */
1272 	  uni = (tre_union_t *)node->obj;
1273 	  if (uni->left->nullable)
1274 	    STACK_PUSHX(stack, voidptr, uni->left)
1275 	  else if (uni->right->nullable)
1276 	    STACK_PUSHX(stack, voidptr, uni->right)
1277 	  else
1278 	    assert(0);
1279 	  break;
1280 
1281 	case CATENATION:
1282 	  /* The path must go through both children. */
1283 	  cat = (tre_catenation_t *)node->obj;
1284 	  assert(cat->left->nullable);
1285 	  assert(cat->right->nullable);
1286 	  STACK_PUSHX(stack, voidptr, cat->left);
1287 	  STACK_PUSHX(stack, voidptr, cat->right);
1288 	  break;
1289 
1290 	case ITERATION:
1291 	  /* A match with an empty string is preferred over no match at
1292 	     all, so we go through the argument if possible. */
1293 	  iter = (tre_iteration_t *)node->obj;
1294 	  if (iter->arg->nullable)
1295 	    STACK_PUSHX(stack, voidptr, iter->arg);
1296 	  break;
1297 
1298 	default:
1299 	  assert(0);
1300 	  break;
1301 	}
1302     }
1303 
1304   return status;
1305 }
1306 
1307 
1308 typedef enum {
1309   NFL_RECURSE,
1310   NFL_POST_UNION,
1311   NFL_POST_CATENATION,
1312   NFL_POST_ITERATION
1313 } tre_nfl_stack_symbol_t;
1314 
1315 
1316 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
1317    the nodes of the AST `tree'. */
1318 static reg_errcode_t
tre_compute_nfl(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree)1319 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
1320 {
1321   int bottom = tre_stack_num_objects(stack);
1322 
1323   STACK_PUSHR(stack, voidptr, tree);
1324   STACK_PUSHR(stack, int, NFL_RECURSE);
1325 
1326   while (tre_stack_num_objects(stack) > bottom)
1327     {
1328       tre_nfl_stack_symbol_t symbol;
1329       tre_ast_node_t *node;
1330 
1331       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
1332       node = tre_stack_pop_voidptr(stack);
1333       switch (symbol)
1334 	{
1335 	case NFL_RECURSE:
1336 	  switch (node->type)
1337 	    {
1338 	    case LITERAL:
1339 	      {
1340 		tre_literal_t *lit = (tre_literal_t *)node->obj;
1341 		if (IS_BACKREF(lit))
1342 		  {
1343 		    /* Back references: nullable = false, firstpos = {i},
1344 		       lastpos = {i}. */
1345 		    node->nullable = 0;
1346 		    node->firstpos = tre_set_one(mem, lit->position, 0,
1347 					     TRE_CHAR_MAX, 0, NULL, -1);
1348 		    if (!node->firstpos)
1349 		      return REG_ESPACE;
1350 		    node->lastpos = tre_set_one(mem, lit->position, 0,
1351 						TRE_CHAR_MAX, 0, NULL,
1352 						(int)lit->code_max);
1353 		    if (!node->lastpos)
1354 		      return REG_ESPACE;
1355 		  }
1356 		else if (lit->code_min < 0)
1357 		  {
1358 		    /* Tags, empty strings, params, and zero width assertions:
1359 		       nullable = true, firstpos = {}, and lastpos = {}. */
1360 		    node->nullable = 1;
1361 		    node->firstpos = tre_set_empty(mem);
1362 		    if (!node->firstpos)
1363 		      return REG_ESPACE;
1364 		    node->lastpos = tre_set_empty(mem);
1365 		    if (!node->lastpos)
1366 		      return REG_ESPACE;
1367 		  }
1368 		else
1369 		  {
1370 		    /* Literal at position i: nullable = false, firstpos = {i},
1371 		       lastpos = {i}. */
1372 		    node->nullable = 0;
1373 		    node->firstpos =
1374 		      tre_set_one(mem, lit->position, (int)lit->code_min,
1375 				  (int)lit->code_max, 0, NULL, -1);
1376 		    if (!node->firstpos)
1377 		      return REG_ESPACE;
1378 		    node->lastpos = tre_set_one(mem, lit->position,
1379 						(int)lit->code_min,
1380 						(int)lit->code_max,
1381 						lit->u.class, lit->neg_classes,
1382 						-1);
1383 		    if (!node->lastpos)
1384 		      return REG_ESPACE;
1385 		  }
1386 		break;
1387 	      }
1388 
1389 	    case UNION:
1390 	      /* Compute the attributes for the two subtrees, and after that
1391 		 for this node. */
1392 	      STACK_PUSHR(stack, voidptr, node);
1393 	      STACK_PUSHR(stack, int, NFL_POST_UNION);
1394 	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
1395 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1396 	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
1397 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1398 	      break;
1399 
1400 	    case CATENATION:
1401 	      /* Compute the attributes for the two subtrees, and after that
1402 		 for this node. */
1403 	      STACK_PUSHR(stack, voidptr, node);
1404 	      STACK_PUSHR(stack, int, NFL_POST_CATENATION);
1405 	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
1406 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1407 	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
1408 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1409 	      break;
1410 
1411 	    case ITERATION:
1412 	      /* Compute the attributes for the subtree, and after that for
1413 		 this node. */
1414 	      STACK_PUSHR(stack, voidptr, node);
1415 	      STACK_PUSHR(stack, int, NFL_POST_ITERATION);
1416 	      STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
1417 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1418 	      break;
1419 	    }
1420 	  break; /* end case: NFL_RECURSE */
1421 
1422 	case NFL_POST_UNION:
1423 	  {
1424 	    tre_union_t *uni = (tre_union_t *)node->obj;
1425 	    node->nullable = uni->left->nullable || uni->right->nullable;
1426 	    node->firstpos = tre_set_union(mem, uni->left->firstpos,
1427 					   uni->right->firstpos, NULL, 0, NULL);
1428 	    if (!node->firstpos)
1429 	      return REG_ESPACE;
1430 	    node->lastpos = tre_set_union(mem, uni->left->lastpos,
1431 					  uni->right->lastpos, NULL, 0, NULL);
1432 	    if (!node->lastpos)
1433 	      return REG_ESPACE;
1434 	    break;
1435 	  }
1436 
1437 	case NFL_POST_ITERATION:
1438 	  {
1439 	    tre_iteration_t *iter = (tre_iteration_t *)node->obj;
1440 
1441 	    if (iter->min == 0 || iter->arg->nullable)
1442 	      node->nullable = 1;
1443 	    else
1444 	      node->nullable = 0;
1445 	    node->firstpos = iter->arg->firstpos;
1446 	    node->lastpos = iter->arg->lastpos;
1447 	    break;
1448 	  }
1449 
1450 	case NFL_POST_CATENATION:
1451 	  {
1452 	    int num_tags, *tags, assertions, params_seen;
1453 	    int *params;
1454 	    reg_errcode_t status;
1455 	    tre_catenation_t *cat = node->obj;
1456 	    node->nullable = cat->left->nullable && cat->right->nullable;
1457 
1458 	    /* Compute firstpos. */
1459 	    if (cat->left->nullable)
1460 	      {
1461 		/* The left side matches the empty string.  Make a first pass
1462 		   with tre_match_empty() to get the number of tags and
1463 		   parameters. */
1464 		status = tre_match_empty(stack, cat->left,
1465 					 NULL, NULL, NULL, &num_tags,
1466 					 &params_seen);
1467 		if (status != REG_OK)
1468 		  return status;
1469 		/* Allocate arrays for the tags and parameters. */
1470 		tags = xmalloc(sizeof(*tags) * (num_tags + 1));
1471 		if (!tags)
1472 		  return REG_ESPACE;
1473 		tags[0] = -1;
1474 		assertions = 0;
1475 		params = NULL;
1476 		if (params_seen)
1477 		  {
1478 		    params = tre_mem_alloc(mem, sizeof(*params)
1479 					   * TRE_PARAM_LAST);
1480 		    if (!params)
1481 		      {
1482 			xfree(tags);
1483 			return REG_ESPACE;
1484 		      }
1485 		  }
1486 		/* Second pass with tre_mach_empty() to get the list of
1487 		   tags and parameters. */
1488 		status = tre_match_empty(stack, cat->left, tags,
1489 					 &assertions, params, NULL, NULL);
1490 		if (status != REG_OK)
1491 		  {
1492 		    xfree(tags);
1493 		    return status;
1494 		  }
1495 		node->firstpos =
1496 		  tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
1497 				tags, assertions, params);
1498 		xfree(tags);
1499 		if (!node->firstpos)
1500 		  return REG_ESPACE;
1501 	      }
1502 	    else
1503 	      {
1504 		node->firstpos = cat->left->firstpos;
1505 	      }
1506 
1507 	    /* Compute lastpos. */
1508 	    if (cat->right->nullable)
1509 	      {
1510 		/* The right side matches the empty string.  Make a first pass
1511 		   with tre_match_empty() to get the number of tags and
1512 		   parameters. */
1513 		status = tre_match_empty(stack, cat->right,
1514 					 NULL, NULL, NULL, &num_tags,
1515 					 &params_seen);
1516 		if (status != REG_OK)
1517 		  return status;
1518 		/* Allocate arrays for the tags and parameters. */
1519 		tags = xmalloc(sizeof(int) * (num_tags + 1));
1520 		if (!tags)
1521 		  return REG_ESPACE;
1522 		tags[0] = -1;
1523 		assertions = 0;
1524 		params = NULL;
1525 		if (params_seen)
1526 		  {
1527 		    params = tre_mem_alloc(mem, sizeof(*params)
1528 					   * TRE_PARAM_LAST);
1529 		    if (!params)
1530 		      {
1531 			xfree(tags);
1532 			return REG_ESPACE;
1533 		      }
1534 		  }
1535 		/* Second pass with tre_mach_empty() to get the list of
1536 		   tags and parameters. */
1537 		status = tre_match_empty(stack, cat->right, tags,
1538 					 &assertions, params, NULL, NULL);
1539 		if (status != REG_OK)
1540 		  {
1541 		    xfree(tags);
1542 		    return status;
1543 		  }
1544 		node->lastpos =
1545 		  tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
1546 				tags, assertions, params);
1547 		xfree(tags);
1548 		if (!node->lastpos)
1549 		  return REG_ESPACE;
1550 	      }
1551 	    else
1552 	      {
1553 		node->lastpos = cat->right->lastpos;
1554 	      }
1555 	    break;
1556 	  }
1557 
1558 	default:
1559 	  assert(0);
1560 	  break;
1561 	}
1562     }
1563 
1564   return REG_OK;
1565 }
1566 
1567 
1568 /* Adds a transition from each position in `p1' to each position in `p2'. */
1569 static reg_errcode_t
tre_make_trans(tre_pos_and_tags_t * p1,tre_pos_and_tags_t * p2,tre_tnfa_transition_t * transitions,int * counts,int * offs)1570 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
1571 	       tre_tnfa_transition_t *transitions,
1572 	       int *counts, int *offs)
1573 {
1574   tre_pos_and_tags_t *orig_p2 = p2;
1575   tre_tnfa_transition_t *trans;
1576   int i, j, k, l, dup, prev_p2_pos;
1577 
1578   if (transitions != NULL)
1579     while (p1->position >= 0)
1580       {
1581 	p2 = orig_p2;
1582 	prev_p2_pos = -1;
1583 	while (p2->position >= 0)
1584 	  {
1585 	    /* Optimization: if this position was already handled, skip it. */
1586 	    if (p2->position == prev_p2_pos)
1587 	      {
1588 		p2++;
1589 		continue;
1590 	      }
1591 	    prev_p2_pos = p2->position;
1592 	    /* Set `trans' to point to the next unused transition from
1593 	       position `p1->position'. */
1594 	    trans = transitions + offs[p1->position];
1595 	    while (trans->state != NULL)
1596 	      {
1597 #if 0
1598 		/* If we find a previous transition from `p1->position' to
1599 		   `p2->position', it is overwritten.  This can happen only
1600 		   if there are nested loops in the regexp, like in "((a)*)*".
1601 		   In POSIX.2 repetition using the outer loop is always
1602 		   preferred over using the inner loop.	 Therefore the
1603 		   transition for the inner loop is useless and can be thrown
1604 		   away. */
1605 		/* XXX - The same position is used for all nodes in a bracket
1606 		   expression, so this optimization cannot be used (it will
1607 		   break bracket expressions) unless I figure out a way to
1608 		   detect it here. */
1609 		if (trans->state_id == p2->position)
1610 		  {
1611 		    DPRINT(("*"));
1612 		    break;
1613 		  }
1614 #endif
1615 		trans++;
1616 	      }
1617 
1618 	    if (trans->state == NULL)
1619 	      (trans + 1)->state = NULL;
1620 	    /* Use the character ranges, assertions, etc. from `p1' for
1621 	       the transition from `p1' to `p2'. */
1622 	    trans->code_min = p1->code_min;
1623 	    trans->code_max = p1->code_max;
1624 	    trans->state = transitions + offs[p2->position];
1625 	    trans->state_id = p2->position;
1626 	    trans->assertions = p1->assertions | p2->assertions
1627 	      | (p1->class ? ASSERT_CHAR_CLASS : 0)
1628 	      | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
1629 	    if (p1->backref >= 0)
1630 	      {
1631 		assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
1632 		assert(p2->backref < 0);
1633 		trans->u.backref = p1->backref;
1634 		trans->assertions |= ASSERT_BACKREF;
1635 	      }
1636 	    else
1637 	      trans->u.class = p1->class;
1638 	    if (p1->neg_classes != NULL)
1639 	      {
1640 		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
1641 		trans->neg_classes =
1642 		  xmalloc(sizeof(*trans->neg_classes) * (i + 1));
1643 		if (trans->neg_classes == NULL)
1644 		  return REG_ESPACE;
1645 		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
1646 		  trans->neg_classes[i] = p1->neg_classes[i];
1647 		trans->neg_classes[i] = (tre_ctype_t)0;
1648 	      }
1649 	    else
1650 	      trans->neg_classes = NULL;
1651 
1652 	    /* Find out how many tags this transition has. */
1653 	    i = 0;
1654 	    if (p1->tags != NULL)
1655 	      while(p1->tags[i] >= 0)
1656 		i++;
1657 	    j = 0;
1658 	    if (p2->tags != NULL)
1659 	      while(p2->tags[j] >= 0)
1660 		j++;
1661 
1662 	    /* If we are overwriting a transition, free the old tag array. */
1663 	    if (trans->tags != NULL)
1664 	      xfree(trans->tags);
1665 	    trans->tags = NULL;
1666 
1667 	    /* If there were any tags, allocate an array and fill it. */
1668 	    if (i + j > 0)
1669 	      {
1670 		trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
1671 		if (!trans->tags)
1672 		  return REG_ESPACE;
1673 		i = 0;
1674 		if (p1->tags != NULL)
1675 		  while(p1->tags[i] >= 0)
1676 		    {
1677 		      trans->tags[i] = p1->tags[i];
1678 		      i++;
1679 		    }
1680 		l = i;
1681 		j = 0;
1682 		if (p2->tags != NULL)
1683 		  while (p2->tags[j] >= 0)
1684 		    {
1685 		      /* Don't add duplicates. */
1686 		      dup = 0;
1687 		      for (k = 0; k < i; k++)
1688 			if (trans->tags[k] == p2->tags[j])
1689 			  {
1690 			    dup = 1;
1691 			    break;
1692 			  }
1693 		      if (!dup)
1694 			trans->tags[l++] = p2->tags[j];
1695 		      j++;
1696 		    }
1697 		trans->tags[l] = -1;
1698 	      }
1699 
1700 	    /* Set the parameter array.	 If both `p2' and `p1' have same
1701 	       parameters, the values in `p2' override those in `p1'. */
1702 	    if (p1->params || p2->params)
1703 	      {
1704 		if (!trans->params)
1705 		  trans->params = xmalloc(sizeof(*trans->params)
1706 					  * TRE_PARAM_LAST);
1707 		if (!trans->params)
1708 		  return REG_ESPACE;
1709 		for (i = 0; i < TRE_PARAM_LAST; i++)
1710 		  {
1711 		    trans->params[i] = TRE_PARAM_UNSET;
1712 		    if (p1->params && p1->params[i] != TRE_PARAM_UNSET)
1713 		      trans->params[i] = p1->params[i];
1714 		    if (p2->params && p2->params[i] != TRE_PARAM_UNSET)
1715 		      trans->params[i] = p2->params[i];
1716 		  }
1717 	      }
1718 	    else
1719 	      {
1720 		if (trans->params)
1721 		  xfree(trans->params);
1722 		trans->params = NULL;
1723 	      }
1724 
1725 
1726 #ifdef TRE_DEBUG
1727 	    {
1728 	      int *tags;
1729 
1730 	      DPRINT(("	 %2d -> %2d on %3d", p1->position, p2->position,
1731 		      p1->code_min));
1732 	      if (p1->code_max != p1->code_min)
1733 		DPRINT(("-%3d", p1->code_max));
1734 	      tags = trans->tags;
1735 	      if (tags)
1736 		{
1737 		  DPRINT((", tags ["));
1738 		  while (*tags >= 0)
1739 		    {
1740 		      DPRINT(("%d", *tags));
1741 		      tags++;
1742 		      if (*tags >= 0)
1743 			DPRINT((","));
1744 		    }
1745 		  DPRINT(("]"));
1746 		}
1747 	      if (trans->assertions)
1748 		DPRINT((", assert %d", trans->assertions));
1749 	      if (trans->assertions & ASSERT_BACKREF)
1750 		DPRINT((", backref %d", trans->u.backref));
1751 	      else if (trans->u.class)
1752 		DPRINT((", class %ld", (long)trans->u.class));
1753 	      if (trans->neg_classes)
1754 		DPRINT((", neg_classes %p", trans->neg_classes));
1755 	      if (trans->params)
1756 		{
1757 		  DPRINT((", "));
1758 		  tre_print_params(trans->params);
1759 		}
1760 	      DPRINT(("\n"));
1761 	    }
1762 #endif /* TRE_DEBUG */
1763 	    p2++;
1764 	  }
1765 	p1++;
1766       }
1767   else
1768     /* Compute a maximum limit for the number of transitions leaving
1769        from each state. */
1770     while (p1->position >= 0)
1771       {
1772 	p2 = orig_p2;
1773 	while (p2->position >= 0)
1774 	  {
1775 	    counts[p1->position]++;
1776 	    p2++;
1777 	  }
1778 	p1++;
1779       }
1780   return REG_OK;
1781 }
1782 
1783 /* Converts the syntax tree to a TNFA.	All the transitions in the TNFA are
1784    labelled with one character range (there are no transitions on empty
1785    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
1786    the regexp. */
1787 static reg_errcode_t
tre_ast_to_tnfa(tre_ast_node_t * node,tre_tnfa_transition_t * transitions,int * counts,int * offs)1788 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
1789 		int *counts, int *offs)
1790 {
1791   tre_union_t *uni;
1792   tre_catenation_t *cat;
1793   tre_iteration_t *iter;
1794   reg_errcode_t errcode = REG_OK;
1795 
1796   /* XXX - recurse using a stack!. */
1797   switch (node->type)
1798     {
1799     case LITERAL:
1800       break;
1801     case UNION:
1802       uni = (tre_union_t *)node->obj;
1803       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
1804       if (errcode != REG_OK)
1805 	return errcode;
1806       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
1807       break;
1808 
1809     case CATENATION:
1810       cat = (tre_catenation_t *)node->obj;
1811       /* Add a transition from each position in cat->left->lastpos
1812 	 to each position in cat->right->firstpos. */
1813       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
1814 			       transitions, counts, offs);
1815       if (errcode != REG_OK)
1816 	return errcode;
1817       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
1818       if (errcode != REG_OK)
1819 	return errcode;
1820       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
1821       break;
1822 
1823     case ITERATION:
1824       iter = (tre_iteration_t *)node->obj;
1825       assert(iter->max == -1 || iter->max == 1);
1826 
1827       if (iter->max == -1)
1828 	{
1829 	  assert(iter->min == 0 || iter->min == 1);
1830 	  /* Add a transition from each last position in the iterated
1831 	     expression to each first position. */
1832 	  errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
1833 				   transitions, counts, offs);
1834 	  if (errcode != REG_OK)
1835 	    return errcode;
1836 	}
1837       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
1838       break;
1839     }
1840   return errcode;
1841 }
1842 
1843 
1844 #define ERROR_EXIT(err)		  \
1845   do				  \
1846     {				  \
1847       errcode = err;		  \
1848       if (/*CONSTCOND*/1)	  \
1849       	goto error_exit;	  \
1850     }				  \
1851  while (/*CONSTCOND*/0)
1852 
1853 
1854 int
tre_compile(regex_t * preg,const tre_char_t * regex,size_t n,int cflags)1855 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
1856 {
1857   tre_stack_t *stack;
1858   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
1859   tre_pos_and_tags_t *p;
1860   int *counts = NULL, *offs = NULL;
1861   int i, add = 0;
1862   tre_tnfa_transition_t *transitions, *initial;
1863   tre_tnfa_t *tnfa = NULL;
1864   tre_submatch_data_t *submatch_data;
1865   tre_tag_direction_t *tag_directions = NULL;
1866   reg_errcode_t errcode;
1867   tre_mem_t mem;
1868 
1869   /* Parse context. */
1870   tre_parse_ctx_t parse_ctx;
1871 
1872   /* Allocate a stack used throughout the compilation process for various
1873      purposes. */
1874   stack = tre_stack_new(512, 10240, 128);
1875   if (!stack)
1876     return REG_ESPACE;
1877   /* Allocate a fast memory allocator. */
1878   mem = tre_mem_new();
1879   if (!mem)
1880     {
1881       tre_stack_destroy(stack);
1882       return REG_ESPACE;
1883     }
1884 
1885   /* Parse the regexp. */
1886   memset(&parse_ctx, 0, sizeof(parse_ctx));
1887   parse_ctx.mem = mem;
1888   parse_ctx.stack = stack;
1889   parse_ctx.re = regex;
1890   parse_ctx.len = n;
1891   parse_ctx.cflags = cflags;
1892   parse_ctx.max_backref = -1;
1893   DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex));
1894   errcode = tre_parse(&parse_ctx);
1895   if (errcode != REG_OK)
1896     ERROR_EXIT(errcode);
1897   preg->re_nsub = parse_ctx.submatch_id - 1;
1898   tree = parse_ctx.result;
1899 
1900   /* Back references and approximate matching cannot currently be used
1901      in the same regexp. */
1902   if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
1903     ERROR_EXIT(REG_BADPAT);
1904 
1905 #ifdef TRE_DEBUG
1906   tre_ast_print(tree);
1907 #endif /* TRE_DEBUG */
1908 
1909   /* Referring to nonexistent subexpressions is illegal. */
1910   if (parse_ctx.max_backref > (int)preg->re_nsub)
1911     ERROR_EXIT(REG_ESUBREG);
1912 
1913   /* Allocate the TNFA struct. */
1914   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
1915   if (tnfa == NULL)
1916     ERROR_EXIT(REG_ESPACE);
1917   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
1918   tnfa->have_approx = parse_ctx.have_approx;
1919   tnfa->num_submatches = parse_ctx.submatch_id;
1920 
1921   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
1922      regexp does not have back references, this can be skipped. */
1923   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
1924     {
1925       DPRINT(("tre_compile: setting up tags\n"));
1926 
1927       /* Figure out how many tags we will need. */
1928       errcode = tre_add_tags(NULL, stack, tree, tnfa);
1929       if (errcode != REG_OK)
1930 	ERROR_EXIT(errcode);
1931 #ifdef TRE_DEBUG
1932       tre_ast_print(tree);
1933 #endif /* TRE_DEBUG */
1934 
1935       if (tnfa->num_tags > 0)
1936 	{
1937 	  tag_directions = xmalloc(sizeof(*tag_directions)
1938 				   * (tnfa->num_tags + 1));
1939 	  if (tag_directions == NULL)
1940 	    ERROR_EXIT(REG_ESPACE);
1941 	  tnfa->tag_directions = tag_directions;
1942 	  memset(tag_directions, -1,
1943 		 sizeof(*tag_directions) * (tnfa->num_tags + 1));
1944 	}
1945       tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
1946 				   sizeof(tnfa->minimal_tags));
1947       if (tnfa->minimal_tags == NULL)
1948 	ERROR_EXIT(REG_ESPACE);
1949 
1950       submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
1951 			      sizeof(*submatch_data));
1952       if (submatch_data == NULL)
1953 	ERROR_EXIT(REG_ESPACE);
1954       tnfa->submatch_data = submatch_data;
1955 
1956       errcode = tre_add_tags(mem, stack, tree, tnfa);
1957       if (errcode != REG_OK)
1958 	ERROR_EXIT(errcode);
1959 
1960 #ifdef TRE_DEBUG
1961       for (i = 0; i < parse_ctx.submatch_id; i++)
1962 	DPRINT(("pmatch[%d] = {t%d, t%d}\n",
1963 		i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
1964       for (i = 0; i < tnfa->num_tags; i++)
1965 	DPRINT(("t%d is %s\n", i,
1966 		tag_directions[i] == TRE_TAG_MINIMIZE ?
1967 		"minimized" : "maximized"));
1968 #endif /* TRE_DEBUG */
1969     }
1970 
1971   /* Expand iteration nodes. */
1972   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
1973 			   tag_directions, &tnfa->params_depth);
1974   if (errcode != REG_OK)
1975     ERROR_EXIT(errcode);
1976 
1977   /* Add a dummy node for the final state.
1978      XXX - For certain patterns this dummy node can be optimized away,
1979 	   for example "a*" or "ab*".	Figure out a simple way to detect
1980 	   this possibility. */
1981   tmp_ast_l = tree;
1982   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
1983   if (tmp_ast_r == NULL)
1984     ERROR_EXIT(REG_ESPACE);
1985 
1986   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
1987   if (tree == NULL)
1988     ERROR_EXIT(REG_ESPACE);
1989 
1990 #ifdef TRE_DEBUG
1991   tre_ast_print(tree);
1992   DPRINT(("Number of states: %d\n", parse_ctx.position));
1993 #endif /* TRE_DEBUG */
1994 
1995   errcode = tre_compute_nfl(mem, stack, tree);
1996   if (errcode != REG_OK)
1997     ERROR_EXIT(errcode);
1998 
1999   counts = xmalloc(sizeof(int) * parse_ctx.position);
2000   if (counts == NULL)
2001     ERROR_EXIT(REG_ESPACE);
2002 
2003   offs = xmalloc(sizeof(int) * parse_ctx.position);
2004   if (offs == NULL)
2005     ERROR_EXIT(REG_ESPACE);
2006 
2007   for (i = 0; i < parse_ctx.position; i++)
2008     counts[i] = 0;
2009   tre_ast_to_tnfa(tree, NULL, counts, NULL);
2010 
2011   add = 0;
2012   for (i = 0; i < parse_ctx.position; i++)
2013     {
2014       offs[i] = add;
2015       add += counts[i] + 1;
2016       counts[i] = 0;
2017     }
2018   transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2019   if (transitions == NULL)
2020     ERROR_EXIT(REG_ESPACE);
2021   tnfa->transitions = transitions;
2022   tnfa->num_transitions = add;
2023 
2024   DPRINT(("Converting to TNFA:\n"));
2025   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2026   if (errcode != REG_OK)
2027     ERROR_EXIT(errcode);
2028 
2029   /* If in eight bit mode, compute a table of characters that can be the
2030      first character of a match. */
2031   tnfa->first_char = -1;
2032   if (TRE_MB_CUR_MAX == 1 && !tmp_ast_l->nullable)
2033     {
2034       int count = 0;
2035       tre_cint_t k;
2036       DPRINT(("Characters that can start a match:"));
2037       tnfa->firstpos_chars = xcalloc(256, sizeof(char));
2038       if (tnfa->firstpos_chars == NULL)
2039 	ERROR_EXIT(REG_ESPACE);
2040       for (p = tree->firstpos; p->position >= 0; p++)
2041 	{
2042 	  tre_tnfa_transition_t *j = transitions + offs[p->position];
2043 	  while (j->state != NULL)
2044 	    {
2045 	      for (k = j->code_min; k <= j->code_max && k < 256; k++)
2046 		{
2047 		  DPRINT((" %d", k));
2048 		  tnfa->firstpos_chars[k] = 1;
2049 		  count++;
2050 		}
2051 	      j++;
2052 	    }
2053 	}
2054       DPRINT(("\n"));
2055 #define TRE_OPTIMIZE_FIRST_CHAR 1
2056 #if TRE_OPTIMIZE_FIRST_CHAR
2057       if (count == 1)
2058 	{
2059 	  for (k = 0; k < 256; k++)
2060 	    if (tnfa->firstpos_chars[k])
2061 	      {
2062 		DPRINT(("first char must be %d\n", k));
2063 		tnfa->first_char = k;
2064 		xfree(tnfa->firstpos_chars);
2065 		tnfa->firstpos_chars = NULL;
2066 		break;
2067 	      }
2068 	}
2069 #endif
2070 
2071     }
2072   else
2073     tnfa->firstpos_chars = NULL;
2074 
2075 
2076   p = tree->firstpos;
2077   i = 0;
2078   while (p->position >= 0)
2079     {
2080       i++;
2081 
2082 #ifdef TRE_DEBUG
2083       {
2084 	int *tags;
2085 	DPRINT(("initial: %d", p->position));
2086 	tags = p->tags;
2087 	if (tags != NULL)
2088 	  {
2089 	    if (*tags >= 0)
2090 	      DPRINT(("/"));
2091 	    while (*tags >= 0)
2092 	      {
2093 		DPRINT(("%d", *tags));
2094 		tags++;
2095 		if (*tags >= 0)
2096 		  DPRINT((","));
2097 	      }
2098 	  }
2099 	DPRINT((", assert %d", p->assertions));
2100 	if (p->params)
2101 	  {
2102 	    DPRINT((", "));
2103 	    tre_print_params(p->params);
2104 	  }
2105 	DPRINT(("\n"));
2106       }
2107 #endif /* TRE_DEBUG */
2108 
2109       p++;
2110     }
2111 
2112   initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2113   if (initial == NULL)
2114     ERROR_EXIT(REG_ESPACE);
2115   tnfa->initial = initial;
2116 
2117   i = 0;
2118   for (p = tree->firstpos; p->position >= 0; p++)
2119     {
2120       initial[i].state = transitions + offs[p->position];
2121       initial[i].state_id = p->position;
2122       initial[i].tags = NULL;
2123       /* Copy the arrays p->tags, and p->params, they are allocated
2124 	 from a tre_mem object. */
2125       if (p->tags)
2126 	{
2127 	  int j;
2128 	  for (j = 0; p->tags[j] >= 0; j++);
2129 	  initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2130 	  if (!initial[i].tags)
2131 	    ERROR_EXIT(REG_ESPACE);
2132 	  memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2133 	}
2134       initial[i].params = NULL;
2135       if (p->params)
2136 	{
2137 	  initial[i].params = xmalloc(sizeof(*p->params) * TRE_PARAM_LAST);
2138 	  if (!initial[i].params)
2139 	    ERROR_EXIT(REG_ESPACE);
2140 	  memcpy(initial[i].params, p->params,
2141 		 sizeof(*p->params) * TRE_PARAM_LAST);
2142 	}
2143       initial[i].assertions = p->assertions;
2144       i++;
2145     }
2146   initial[i].state = NULL;
2147 
2148   tnfa->num_transitions = add;
2149   tnfa->final = transitions + offs[tree->lastpos[0].position];
2150   tnfa->num_states = parse_ctx.position;
2151   tnfa->cflags = cflags;
2152 
2153   DPRINT(("final state %p\n", (void *)tnfa->final));
2154 
2155   tre_mem_destroy(mem);
2156   tre_stack_destroy(stack);
2157   xfree(counts);
2158   xfree(offs);
2159 
2160   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2161   return REG_OK;
2162 
2163  error_exit:
2164   /* Free everything that was allocated and return the error code. */
2165   tre_mem_destroy(mem);
2166   if (stack != NULL)
2167     tre_stack_destroy(stack);
2168   if (counts != NULL)
2169     xfree(counts);
2170   if (offs != NULL)
2171     xfree(offs);
2172   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2173   tre_free(preg);
2174   return errcode;
2175 }
2176 
2177 
2178 
2179 
2180 void
tre_free(regex_t * preg)2181 tre_free(regex_t *preg)
2182 {
2183   tre_tnfa_t *tnfa;
2184   unsigned int i;
2185   tre_tnfa_transition_t *trans;
2186 
2187   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2188   if (!tnfa)
2189     return;
2190 
2191   for (i = 0; i < tnfa->num_transitions; i++)
2192     if (tnfa->transitions[i].state)
2193       {
2194 	if (tnfa->transitions[i].tags)
2195 	  xfree(tnfa->transitions[i].tags);
2196 	if (tnfa->transitions[i].neg_classes)
2197 	  xfree(tnfa->transitions[i].neg_classes);
2198 	if (tnfa->transitions[i].params)
2199 	  xfree(tnfa->transitions[i].params);
2200       }
2201   if (tnfa->transitions)
2202     xfree(tnfa->transitions);
2203 
2204   if (tnfa->initial)
2205     {
2206       for (trans = tnfa->initial; trans->state; trans++)
2207 	{
2208 	  if (trans->tags)
2209 	    xfree(trans->tags);
2210 	  if (trans->params)
2211 	    xfree(trans->params);
2212 	}
2213       xfree(tnfa->initial);
2214     }
2215 
2216   if (tnfa->submatch_data)
2217     {
2218       for (i = 0; i < tnfa->num_submatches; i++)
2219 	if (tnfa->submatch_data[i].parents)
2220 	  xfree(tnfa->submatch_data[i].parents);
2221       xfree(tnfa->submatch_data);
2222     }
2223 
2224   if (tnfa->tag_directions)
2225     xfree(tnfa->tag_directions);
2226   if (tnfa->firstpos_chars)
2227     xfree(tnfa->firstpos_chars);
2228   if (tnfa->minimal_tags)
2229     xfree(tnfa->minimal_tags);
2230   xfree(tnfa);
2231 }
2232 
2233 char *
tre_version(void)2234 tre_version(void)
2235 {
2236   static char str[256];
2237   char *version;
2238 
2239   if (str[0] == 0)
2240     {
2241       (void) tre_config(TRE_CONFIG_VERSION, &version);
2242       (void) snprintf(str, sizeof(str), "TRE %s (BSD)", version);
2243     }
2244   return str;
2245 }
2246 
2247 int
tre_config(int query,void * result)2248 tre_config(int query, void *result)
2249 {
2250   int *int_result = result;
2251   const char **string_result = result;
2252 
2253   switch (query)
2254     {
2255     case TRE_CONFIG_APPROX:
2256 #ifdef TRE_APPROX
2257       *int_result = 1;
2258 #else /* !TRE_APPROX */
2259       *int_result = 0;
2260 #endif /* !TRE_APPROX */
2261       return REG_OK;
2262 
2263     case TRE_CONFIG_WCHAR:
2264 #ifdef TRE_WCHAR
2265       *int_result = 1;
2266 #else /* !TRE_WCHAR */
2267       *int_result = 0;
2268 #endif /* !TRE_WCHAR */
2269       return REG_OK;
2270 
2271     case TRE_CONFIG_MULTIBYTE:
2272 #ifdef TRE_MULTIBYTE
2273       *int_result = 1;
2274 #else /* !TRE_MULTIBYTE */
2275       *int_result = 0;
2276 #endif /* !TRE_MULTIBYTE */
2277       return REG_OK;
2278 
2279     case TRE_CONFIG_SYSTEM_ABI:
2280 #ifdef TRE_CONFIG_SYSTEM_ABI
2281       *int_result = 1;
2282 #else /* !TRE_CONFIG_SYSTEM_ABI */
2283       *int_result = 0;
2284 #endif /* !TRE_CONFIG_SYSTEM_ABI */
2285       return REG_OK;
2286 
2287     case TRE_CONFIG_VERSION:
2288       *string_result = TRE_VERSION;
2289       return REG_OK;
2290     }
2291 
2292   return REG_NOMATCH;
2293 }
2294 
2295 
2296 /* EOF */
2297