1 /*
2   tre-compile.c - TRE regex compiler
3 
4   Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>
5 
6   This library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10 
11   This library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15 
16   You should have received a copy of the GNU Lesser General Public
17   License along with this library; if not, write to the Free Software
18   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19 
20 */
21 
22 /*
23   TODO:
24    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
25      function calls.
26 */
27 
28 
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif /* HAVE_CONFIG_H */
32 #include <stdio.h>
33 #include <assert.h>
34 #include <string.h>
35 
36 #include "tre-internal.h"
37 #include "tre-mem.h"
38 #include "tre-stack.h"
39 #include "tre-ast.h"
40 #include "tre-parse.h"
41 #include "tre-compile.h"
42 #include "regex.h"
43 #include "xmalloc.h"
44 
45 /*
46   Algorithms to setup tags so that submatch addressing can be done.
47 */
48 
49 
50 /* Inserts a catenation node to the root of the tree given in `node'.
51    As the left child a new tag with number `tag_id' to `node' is added,
52    and the right child is the old root. */
53 static reg_errcode_t
tre_add_tag_left(tre_mem_t mem,tre_ast_node_t * node,int tag_id)54 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
55 {
56   tre_catenation_t *c;
57 
58   DPRINT(("add_tag_left: tag %d\n", tag_id));
59 
60   c = tre_mem_alloc(mem, sizeof(*c));
61   if (c == NULL)
62     return REG_ESPACE;
63   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
64   if (c->left == NULL)
65     return REG_ESPACE;
66   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
67   if (c->right == NULL)
68     return REG_ESPACE;
69 
70   c->right->obj = node->obj;
71   c->right->type = node->type;
72   c->right->nullable = -1;
73   c->right->submatch_id = -1;
74   c->right->firstpos = NULL;
75   c->right->lastpos = NULL;
76   c->right->num_tags = 0;
77   node->obj = c;
78   node->type = CATENATION;
79   return REG_OK;
80 }
81 
82 /* Inserts a catenation node to the root of the tree given in `node'.
83    As the right child a new tag with number `tag_id' to `node' is added,
84    and the left child is the old root. */
85 static reg_errcode_t
tre_add_tag_right(tre_mem_t mem,tre_ast_node_t * node,int tag_id)86 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
87 {
88   tre_catenation_t *c;
89 
90   DPRINT(("tre_add_tag_right: tag %d\n", tag_id));
91 
92   c = tre_mem_alloc(mem, sizeof(*c));
93   if (c == NULL)
94     return REG_ESPACE;
95   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
96   if (c->right == NULL)
97     return REG_ESPACE;
98   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
99   if (c->left == NULL)
100     return REG_ESPACE;
101 
102   c->left->obj = node->obj;
103   c->left->type = node->type;
104   c->left->nullable = -1;
105   c->left->submatch_id = -1;
106   c->left->firstpos = NULL;
107   c->left->lastpos = NULL;
108   c->left->num_tags = 0;
109   node->obj = c;
110   node->type = CATENATION;
111   return REG_OK;
112 }
113 
114 typedef enum {
115   ADDTAGS_RECURSE,
116   ADDTAGS_AFTER_ITERATION,
117   ADDTAGS_AFTER_UNION_LEFT,
118   ADDTAGS_AFTER_UNION_RIGHT,
119   ADDTAGS_AFTER_CAT_LEFT,
120   ADDTAGS_AFTER_CAT_RIGHT,
121   ADDTAGS_SET_SUBMATCH_END
122 } tre_addtags_symbol_t;
123 
124 
125 typedef struct {
126   int tag;
127   int next_tag;
128 } tre_tag_states_t;
129 
130 /* Adds tags to appropriate locations in the parse tree in `tree', so that
131    subexpressions marked for submatch addressing can be traced. */
132 static reg_errcode_t
tre_add_tags(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree,tre_tnfa_t * tnfa)133 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
134 	     tre_tnfa_t *tnfa)
135 {
136   reg_errcode_t status = REG_OK;
137   tre_addtags_symbol_t symbol;
138   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
139   int bottom = tre_stack_num_objects(stack);
140   /* True for first pass (counting number of needed tags) */
141   int first_pass = (mem == NULL || tnfa == NULL);
142   int *regset, *orig_regset;
143   int num_tags = 0; /* Total number of tags. */
144   int num_minimals = 0;	 /* Number of special minimal tags. */
145   int tag = 0;	    /* The tag that is to be added next. */
146   int next_tag = 1; /* Next tag to use after this one. */
147   int *parents;	    /* Stack of submatches the current submatch is
148 		       contained in. */
149   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
150   tre_tag_states_t *saved_states;
151 
152   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
153   if (!first_pass)
154     {
155       tnfa->end_tag = 0;
156       tnfa->minimal_tags[0] = -1;
157     }
158 
159   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
160   if (regset == NULL)
161     return REG_ESPACE;
162   regset[0] = -1;
163   orig_regset = regset;
164 
165   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
166   if (parents == NULL)
167     {
168       xfree(regset);
169       return REG_ESPACE;
170     }
171   parents[0] = -1;
172 
173   saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
174   if (saved_states == NULL)
175     {
176       xfree(regset);
177       xfree(parents);
178       return REG_ESPACE;
179     }
180   else
181     {
182       unsigned int i;
183       for (i = 0; i <= tnfa->num_submatches; i++)
184 	saved_states[i].tag = -1;
185     }
186 
187   STACK_PUSH(stack, voidptr, node);
188   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
189 
190   while (tre_stack_num_objects(stack) > bottom)
191     {
192       if (status != REG_OK)
193 	break;
194 
195       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
196       switch (symbol)
197 	{
198 
199 	case ADDTAGS_SET_SUBMATCH_END:
200 	  {
201 	    int id = tre_stack_pop_int(stack);
202 	    int i;
203 
204 	    /* Add end of this submatch to regset. */
205 	    for (i = 0; regset[i] >= 0; i++);
206 	    regset[i] = id * 2 + 1;
207 	    regset[i + 1] = -1;
208 
209 	    /* Pop this submatch from the parents stack. */
210 	    for (i = 0; parents[i] >= 0; i++);
211 	    parents[i - 1] = -1;
212 	    break;
213 	  }
214 
215 	case ADDTAGS_RECURSE:
216 	  node = tre_stack_pop_voidptr(stack);
217 
218 	  if (node->submatch_id >= 0)
219 	    {
220 	      int id = node->submatch_id;
221 	      int i;
222 
223 
224 	      /* Add start of this submatch to regset. */
225 	      for (i = 0; regset[i] >= 0; i++);
226 	      regset[i] = id * 2;
227 	      regset[i + 1] = -1;
228 
229 	      if (!first_pass)
230 		{
231 		  for (i = 0; parents[i] >= 0; i++);
232 		  tnfa->submatch_data[id].parents = NULL;
233 		  if (i > 0)
234 		    {
235 		      int *p = xmalloc(sizeof(*p) * (i + 1));
236 		      if (p == NULL)
237 			{
238 			  status = REG_ESPACE;
239 			  break;
240 			}
241 		      assert(tnfa->submatch_data[id].parents == NULL);
242 		      tnfa->submatch_data[id].parents = p;
243 		      for (i = 0; parents[i] >= 0; i++)
244 			p[i] = parents[i];
245 		      p[i] = -1;
246 		    }
247 		}
248 
249 	      /* Add end of this submatch to regset after processing this
250 		 node. */
251 	      STACK_PUSHX(stack, int, node->submatch_id);
252 	      STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
253 	    }
254 
255 	  switch (node->type)
256 	    {
257 	    case LITERAL:
258 	      {
259 		tre_literal_t *lit = node->obj;
260 
261 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
262 		  {
263 		    int i;
264 		    DPRINT(("Literal %d-%d\n",
265 			    (int)lit->code_min, (int)lit->code_max));
266 		    if (regset[0] >= 0)
267 		      {
268 			/* Regset is not empty, so add a tag before the
269 			   literal or backref. */
270 			if (!first_pass)
271 			  {
272 			    status = tre_add_tag_left(mem, node, tag);
273 			    tnfa->tag_directions[tag] = direction;
274 			    if (minimal_tag >= 0)
275 			      {
276 				DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
277 				for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
278 				tnfa->minimal_tags[i] = tag;
279 				tnfa->minimal_tags[i + 1] = minimal_tag;
280 				tnfa->minimal_tags[i + 2] = -1;
281 				minimal_tag = -1;
282 				num_minimals++;
283 			      }
284 			    /* Go through the regset and set submatch data for
285 			       submatches that are using this tag. */
286 			    for (i = 0; regset[i] >= 0; i++)
287 			      {
288 				int id = regset[i] / 2;
289 				int start = !(regset[i] % 2);
290 				DPRINT(("  Using tag %d for %s offset of "
291 					"submatch %d\n", tag,
292 					start ? "start" : "end", id));
293 				if (start)
294 				  tnfa->submatch_data[id].so_tag = tag;
295 				else
296 				  tnfa->submatch_data[id].eo_tag = tag;
297 			      }
298 			  }
299 			else
300 			  {
301 			    DPRINT(("  num_tags = 1\n"));
302 			    node->num_tags = 1;
303 			  }
304 
305 			DPRINT(("  num_tags++\n"));
306 			regset[0] = -1;
307 			tag = next_tag;
308 			num_tags++;
309 			next_tag++;
310 		      }
311 		  }
312 		else
313 		  {
314 		    assert(!IS_TAG(lit));
315 		  }
316 		break;
317 	      }
318 	    case CATENATION:
319 	      {
320 		tre_catenation_t *cat = node->obj;
321 		tre_ast_node_t *left = cat->left;
322 		tre_ast_node_t *right = cat->right;
323 		int reserved_tag = -1;
324 		DPRINT(("Catenation, next_tag = %d\n", next_tag));
325 
326 
327 		/* After processing right child. */
328 		STACK_PUSHX(stack, voidptr, node);
329 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
330 
331 		/* Process right child. */
332 		STACK_PUSHX(stack, voidptr, right);
333 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
334 
335 		/* After processing left child. */
336 		STACK_PUSHX(stack, int, next_tag + left->num_tags);
337 		DPRINT(("  Pushing %d for after left\n",
338 			next_tag + left->num_tags));
339 		if (left->num_tags > 0 && right->num_tags > 0)
340 		  {
341 		    /* Reserve the next tag to the right child. */
342 		    DPRINT(("  Reserving next_tag %d to right child\n",
343 			    next_tag));
344 		    reserved_tag = next_tag;
345 		    next_tag++;
346 		  }
347 		STACK_PUSHX(stack, int, reserved_tag);
348 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
349 
350 		/* Process left child. */
351 		STACK_PUSHX(stack, voidptr, left);
352 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
353 
354 		}
355 	      break;
356 	    case ITERATION:
357 	      {
358 		tre_iteration_t *iter = node->obj;
359 		DPRINT(("Iteration\n"));
360 
361 		if (first_pass)
362 		  {
363 		    STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
364 		  }
365 		else
366 		  {
367 		    STACK_PUSHX(stack, int, tag);
368 		    STACK_PUSHX(stack, int, iter->minimal);
369 		  }
370 		STACK_PUSHX(stack, voidptr, node);
371 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
372 
373 		STACK_PUSHX(stack, voidptr, iter->arg);
374 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
375 
376 		/* Regset is not empty, so add a tag here. */
377 		if (regset[0] >= 0 || iter->minimal)
378 		  {
379 		    if (!first_pass)
380 		      {
381 			int i;
382 			status = tre_add_tag_left(mem, node, tag);
383 			if (iter->minimal)
384 			  tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
385 			else
386 			  tnfa->tag_directions[tag] = direction;
387 			if (minimal_tag >= 0)
388 			  {
389 			    DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
390 			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
391 			    tnfa->minimal_tags[i] = tag;
392 			    tnfa->minimal_tags[i + 1] = minimal_tag;
393 			    tnfa->minimal_tags[i + 2] = -1;
394 			    minimal_tag = -1;
395 			    num_minimals++;
396 			  }
397 			/* Go through the regset and set submatch data for
398 			   submatches that are using this tag. */
399 			for (i = 0; regset[i] >= 0; i++)
400 			  {
401 			    int id = regset[i] / 2;
402 			    int start = !(regset[i] % 2);
403 			    DPRINT(("  Using tag %d for %s offset of "
404 				    "submatch %d\n", tag,
405 				    start ? "start" : "end", id));
406 			    if (start)
407 			      tnfa->submatch_data[id].so_tag = tag;
408 			    else
409 			      tnfa->submatch_data[id].eo_tag = tag;
410 			  }
411 		      }
412 
413 		    DPRINT(("  num_tags++\n"));
414 		    regset[0] = -1;
415 		    tag = next_tag;
416 		    num_tags++;
417 		    next_tag++;
418 		  }
419 		direction = TRE_TAG_MINIMIZE;
420 	      }
421 	      break;
422 	    case UNION:
423 	      {
424 		tre_union_t *uni = node->obj;
425 		tre_ast_node_t *left = uni->left;
426 		tre_ast_node_t *right = uni->right;
427 		int left_tag;
428 		int right_tag;
429 
430 		if (regset[0] >= 0)
431 		  {
432 		    left_tag = next_tag;
433 		    right_tag = next_tag + 1;
434 		  }
435 		else
436 		  {
437 		    left_tag = tag;
438 		    right_tag = next_tag;
439 		  }
440 
441 		DPRINT(("Union\n"));
442 
443 		/* After processing right child. */
444 		STACK_PUSHX(stack, int, right_tag);
445 		STACK_PUSHX(stack, int, left_tag);
446 		STACK_PUSHX(stack, voidptr, regset);
447 		STACK_PUSHX(stack, int, regset[0] >= 0);
448 		STACK_PUSHX(stack, voidptr, node);
449 		STACK_PUSHX(stack, voidptr, right);
450 		STACK_PUSHX(stack, voidptr, left);
451 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
452 
453 		/* Process right child. */
454 		STACK_PUSHX(stack, voidptr, right);
455 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
456 
457 		/* After processing left child. */
458 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
459 
460 		/* Process left child. */
461 		STACK_PUSHX(stack, voidptr, left);
462 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
463 
464 		/* Regset is not empty, so add a tag here. */
465 		if (regset[0] >= 0)
466 		  {
467 		    if (!first_pass)
468 		      {
469 			int i;
470 			status = tre_add_tag_left(mem, node, tag);
471 			tnfa->tag_directions[tag] = direction;
472 			if (minimal_tag >= 0)
473 			  {
474 			    DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
475 			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
476 			    tnfa->minimal_tags[i] = tag;
477 			    tnfa->minimal_tags[i + 1] = minimal_tag;
478 			    tnfa->minimal_tags[i + 2] = -1;
479 			    minimal_tag = -1;
480 			    num_minimals++;
481 			  }
482 			/* Go through the regset and set submatch data for
483 			   submatches that are using this tag. */
484 			for (i = 0; regset[i] >= 0; i++)
485 			  {
486 			    int id = regset[i] / 2;
487 			    int start = !(regset[i] % 2);
488 			    DPRINT(("  Using tag %d for %s offset of "
489 				    "submatch %d\n", tag,
490 				    start ? "start" : "end", id));
491 			    if (start)
492 			      tnfa->submatch_data[id].so_tag = tag;
493 			    else
494 			      tnfa->submatch_data[id].eo_tag = tag;
495 			  }
496 		      }
497 
498 		    DPRINT(("  num_tags++\n"));
499 		    regset[0] = -1;
500 		    tag = next_tag;
501 		    num_tags++;
502 		    next_tag++;
503 		  }
504 
505 		if (node->num_submatches > 0)
506 		  {
507 		    /* The next two tags are reserved for markers. */
508 		    next_tag++;
509 		    tag = next_tag;
510 		    next_tag++;
511 		  }
512 
513 		break;
514 	      }
515 	    }
516 
517 	  if (node->submatch_id >= 0)
518 	    {
519 	      int i;
520 	      /* Push this submatch on the parents stack. */
521 	      for (i = 0; parents[i] >= 0; i++);
522 	      parents[i] = node->submatch_id;
523 	      parents[i + 1] = -1;
524 	    }
525 
526 	  break; /* end case: ADDTAGS_RECURSE */
527 
528 	case ADDTAGS_AFTER_ITERATION:
529 	  {
530 	    int minimal = 0;
531 	    int enter_tag;
532 	    node = tre_stack_pop_voidptr(stack);
533 	    if (first_pass)
534 	      {
535 		node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
536 		  + tre_stack_pop_int(stack);
537 		minimal_tag = -1;
538 	      }
539 	    else
540 	      {
541 		minimal = tre_stack_pop_int(stack);
542 		enter_tag = tre_stack_pop_int(stack);
543 		if (minimal)
544 		  minimal_tag = enter_tag;
545 	      }
546 
547 	    DPRINT(("After iteration\n"));
548 	    if (!first_pass)
549 	      {
550 		DPRINT(("  Setting direction to %s\n",
551 			minimal ? "minimize" : "maximize"));
552 		if (minimal)
553 		  direction = TRE_TAG_MINIMIZE;
554 		else
555 		  direction = TRE_TAG_MAXIMIZE;
556 	      }
557 	    break;
558 	  }
559 
560 	case ADDTAGS_AFTER_CAT_LEFT:
561 	  {
562 	    int new_tag = tre_stack_pop_int(stack);
563 	    next_tag = tre_stack_pop_int(stack);
564 	    DPRINT(("After cat left, tag = %d, next_tag = %d\n",
565 		    tag, next_tag));
566 	    if (new_tag >= 0)
567 	      {
568 		DPRINT(("  Setting tag to %d\n", new_tag));
569 		tag = new_tag;
570 	      }
571 	    break;
572 	  }
573 
574 	case ADDTAGS_AFTER_CAT_RIGHT:
575 	  DPRINT(("After cat right\n"));
576 	  node = tre_stack_pop_voidptr(stack);
577 	  if (first_pass)
578 	    node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
579 	      + ((tre_catenation_t *)node->obj)->right->num_tags;
580 	  break;
581 
582 	case ADDTAGS_AFTER_UNION_LEFT:
583 	  DPRINT(("After union left\n"));
584 	  /* Lift the bottom of the `regset' array so that when processing
585 	     the right operand the items currently in the array are
586 	     invisible.	 The original bottom was saved at ADDTAGS_UNION and
587 	     will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
588 	  while (*regset >= 0)
589 	    regset++;
590 	  break;
591 
592 	case ADDTAGS_AFTER_UNION_RIGHT:
593 	  {
594 	    int added_tags, tag_left, tag_right;
595 	    tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
596 	    tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
597 	    DPRINT(("After union right\n"));
598 	    node = tre_stack_pop_voidptr(stack);
599 	    added_tags = tre_stack_pop_int(stack);
600 	    if (first_pass)
601 	      {
602 		node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
603 		  + ((tre_union_t *)node->obj)->right->num_tags + added_tags
604 		  + ((node->num_submatches > 0) ? 2 : 0);
605 	      }
606 	    regset = tre_stack_pop_voidptr(stack);
607 	    tag_left = tre_stack_pop_int(stack);
608 	    tag_right = tre_stack_pop_int(stack);
609 
610 	    /* Add tags after both children, the left child gets a smaller
611 	       tag than the right child.  This guarantees that we prefer
612 	       the left child over the right child. */
613 	    /* XXX - This is not always necessary (if the children have
614 	       tags which must be seen for every match of that child). */
615 	    /* XXX - Check if this is the only place where tre_add_tag_right
616 	       is used.	 If so, use tre_add_tag_left (putting the tag before
617 	       the child as opposed after the child) and throw away
618 	       tre_add_tag_right. */
619 	    if (node->num_submatches > 0)
620 	      {
621 		if (!first_pass)
622 		  {
623 		    status = tre_add_tag_right(mem, left, tag_left);
624 		    tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
625 		    status = tre_add_tag_right(mem, right, tag_right);
626 		    tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
627 		  }
628 		DPRINT(("  num_tags += 2\n"));
629 		num_tags += 2;
630 	      }
631 	    direction = TRE_TAG_MAXIMIZE;
632 	    break;
633 	  }
634 
635 	default:
636 	  assert(0);
637 	  break;
638 
639 	} /* end switch(symbol) */
640     } /* end while(tre_stack_num_objects(stack) > bottom) */
641 
642   if (!first_pass)
643     {
644       int i;
645       /* Go through the regset and set submatch data for
646 	 submatches that are using this tag. */
647       for (i = 0; regset[i] >= 0; i++)
648 	{
649 	  int id = regset[i] / 2;
650 	  int start = !(regset[i] % 2);
651 	  DPRINT(("  Using tag %d for %s offset of "
652 		  "submatch %d\n", num_tags,
653 		  start ? "start" : "end", id));
654 	  if (start)
655 	    tnfa->submatch_data[id].so_tag = num_tags;
656 	  else
657 	    tnfa->submatch_data[id].eo_tag = num_tags;
658 	}
659     }
660 
661   if (!first_pass && minimal_tag >= 0)
662     {
663       int i;
664       DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
665       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
666       tnfa->minimal_tags[i] = tag;
667       tnfa->minimal_tags[i + 1] = minimal_tag;
668       tnfa->minimal_tags[i + 2] = -1;
669       minimal_tag = -1;
670       num_minimals++;
671     }
672 
673   DPRINT(("tre_add_tags: %s complete.  Number of tags %d.\n",
674 	  first_pass? "First pass" : "Second pass", num_tags));
675 
676   assert(tree->num_tags == num_tags);
677   tnfa->end_tag = num_tags;
678   tnfa->num_tags = num_tags;
679   tnfa->num_minimals = num_minimals;
680   xfree(orig_regset);
681   xfree(parents);
682   xfree(saved_states);
683   return status;
684 }
685 
686 
687 
688 /*
689   AST to TNFA compilation routines.
690 */
691 
692 typedef enum {
693   COPY_RECURSE,
694   COPY_SET_RESULT_PTR
695 } tre_copyast_symbol_t;
696 
697 /* Flags for tre_copy_ast(). */
698 #define COPY_REMOVE_TAGS	 1
699 #define COPY_MAXIMIZE_FIRST_TAG	 2
700 
701 static reg_errcode_t
tre_copy_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int flags,int * pos_add,tre_tag_direction_t * tag_directions,tre_ast_node_t ** copy,int * max_pos)702 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
703 	     int flags, int *pos_add, tre_tag_direction_t *tag_directions,
704 	     tre_ast_node_t **copy, int *max_pos)
705 {
706   reg_errcode_t status = REG_OK;
707   int bottom = tre_stack_num_objects(stack);
708   int num_copied = 0;
709   int first_tag = 1;
710   tre_ast_node_t **result = copy;
711   tre_copyast_symbol_t symbol;
712 
713   STACK_PUSH(stack, voidptr, ast);
714   STACK_PUSH(stack, int, COPY_RECURSE);
715 
716   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
717     {
718       tre_ast_node_t *node;
719       if (status != REG_OK)
720 	break;
721 
722       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
723       switch (symbol)
724 	{
725 	case COPY_SET_RESULT_PTR:
726 	  result = tre_stack_pop_voidptr(stack);
727 	  break;
728 	case COPY_RECURSE:
729 	  node = tre_stack_pop_voidptr(stack);
730 	  switch (node->type)
731 	    {
732 	    case LITERAL:
733 	      {
734 		tre_literal_t *lit = node->obj;
735 		int pos = lit->position;
736 		int min = lit->code_min;
737 		int max = lit->code_max;
738 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
739 		  {
740 		    /* XXX - e.g. [ab] has only one position but two
741 		       nodes, so we are creating holes in the state space
742 		       here.  Not fatal, just wastes memory. */
743 		    pos += *pos_add;
744 		    num_copied++;
745 		  }
746 		else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
747 		  {
748 		    /* Change this tag to empty. */
749 		    min = EMPTY;
750 		    max = pos = -1;
751 		  }
752 		else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
753 			 && first_tag)
754 		  {
755 		    /* Maximize the first tag. */
756 		    tag_directions[max] = TRE_TAG_MAXIMIZE;
757 		    first_tag = 0;
758 		  }
759 		*result = tre_ast_new_literal(mem, min, max, pos);
760 		if (*result == NULL)
761 		  status = REG_ESPACE;
762 
763 		if (pos > *max_pos)
764 		  *max_pos = pos;
765 		break;
766 	      }
767 	    case UNION:
768 	      {
769 		tre_union_t *uni = node->obj;
770 		tre_union_t *tmp;
771 		*result = tre_ast_new_union(mem, uni->left, uni->right);
772 		if (*result == NULL)
773 		  {
774 		    status = REG_ESPACE;
775 		    break;
776 		  }
777 		tmp = (*result)->obj;
778 		result = &tmp->left;
779 		STACK_PUSHX(stack, voidptr, uni->right);
780 		STACK_PUSHX(stack, int, COPY_RECURSE);
781 		STACK_PUSHX(stack, voidptr, &tmp->right);
782 		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
783 		STACK_PUSHX(stack, voidptr, uni->left);
784 		STACK_PUSHX(stack, int, COPY_RECURSE);
785 		break;
786 	      }
787 	    case CATENATION:
788 	      {
789 		tre_catenation_t *cat = node->obj;
790 		tre_catenation_t *tmp;
791 		*result = tre_ast_new_catenation(mem, cat->left, cat->right);
792 		if (*result == NULL)
793 		  {
794 		    status = REG_ESPACE;
795 		    break;
796 		  }
797 		tmp = (*result)->obj;
798 		tmp->left = NULL;
799 		tmp->right = NULL;
800 		result = &tmp->left;
801 
802 		STACK_PUSHX(stack, voidptr, cat->right);
803 		STACK_PUSHX(stack, int, COPY_RECURSE);
804 		STACK_PUSHX(stack, voidptr, &tmp->right);
805 		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
806 		STACK_PUSHX(stack, voidptr, cat->left);
807 		STACK_PUSHX(stack, int, COPY_RECURSE);
808 		break;
809 	      }
810 	    case ITERATION:
811 	      {
812 		tre_iteration_t *iter = node->obj;
813 		STACK_PUSHX(stack, voidptr, iter->arg);
814 		STACK_PUSHX(stack, int, COPY_RECURSE);
815 		*result = tre_ast_new_iter(mem, iter->arg, iter->min,
816 					   iter->max, iter->minimal);
817 		if (*result == NULL)
818 		  {
819 		    status = REG_ESPACE;
820 		    break;
821 		  }
822 		iter = (*result)->obj;
823 		result = &iter->arg;
824 		break;
825 	      }
826 	    default:
827 	      assert(0);
828 	      break;
829 	    }
830 	  break;
831 	}
832     }
833   *pos_add += num_copied;
834   return status;
835 }
836 
837 typedef enum {
838   EXPAND_RECURSE,
839   EXPAND_AFTER_ITER
840 } tre_expand_ast_symbol_t;
841 
842 /* Expands each iteration node that has a finite nonzero minimum or maximum
843    iteration count to a catenated sequence of copies of the node. */
844 static reg_errcode_t
tre_expand_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int * position,tre_tag_direction_t * tag_directions,int * max_depth)845 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
846 	       int *position, tre_tag_direction_t *tag_directions,
847 	       int *max_depth)
848 {
849   reg_errcode_t status = REG_OK;
850   int bottom = tre_stack_num_objects(stack);
851   int pos_add = 0;
852   int pos_add_total = 0;
853   int max_pos = 0;
854   /* Current approximate matching parameters. */
855   int params[TRE_PARAM_LAST];
856   /* Approximate parameter nesting level. */
857   int params_depth = 0;
858   int iter_depth = 0;
859   int i;
860 
861   for (i = 0; i < TRE_PARAM_LAST; i++)
862     params[i] = TRE_PARAM_DEFAULT;
863 
864   STACK_PUSHR(stack, voidptr, ast);
865   STACK_PUSHR(stack, int, EXPAND_RECURSE);
866   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
867     {
868       tre_ast_node_t *node;
869       tre_expand_ast_symbol_t symbol;
870 
871       if (status != REG_OK)
872 	break;
873 
874       DPRINT(("pos_add %d\n", pos_add));
875 
876       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
877       node = tre_stack_pop_voidptr(stack);
878       switch (symbol)
879 	{
880 	case EXPAND_RECURSE:
881 	  switch (node->type)
882 	    {
883 	    case LITERAL:
884 	      {
885 		tre_literal_t *lit= node->obj;
886 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
887 		  {
888 		    lit->position += pos_add;
889 		    if (lit->position > max_pos)
890 		      max_pos = lit->position;
891 		  }
892 		break;
893 	      }
894 	    case UNION:
895 	      {
896 		tre_union_t *uni = node->obj;
897 		STACK_PUSHX(stack, voidptr, uni->right);
898 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
899 		STACK_PUSHX(stack, voidptr, uni->left);
900 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
901 		break;
902 	      }
903 	    case CATENATION:
904 	      {
905 		tre_catenation_t *cat = node->obj;
906 		STACK_PUSHX(stack, voidptr, cat->right);
907 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
908 		STACK_PUSHX(stack, voidptr, cat->left);
909 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
910 		break;
911 	      }
912 	    case ITERATION:
913 	      {
914 		tre_iteration_t *iter = node->obj;
915 		STACK_PUSHX(stack, int, pos_add);
916 		STACK_PUSHX(stack, voidptr, node);
917 		STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
918 		STACK_PUSHX(stack, voidptr, iter->arg);
919 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
920 		/* If we are going to expand this node at EXPAND_AFTER_ITER
921 		   then don't increase the `pos' fields of the nodes now, it
922 		   will get done when expanding. */
923 		if (iter->min > 1 || iter->max > 1)
924 		  pos_add = 0;
925 		iter_depth++;
926 		DPRINT(("iter\n"));
927 		break;
928 	      }
929 	    default:
930 	      assert(0);
931 	      break;
932 	    }
933 	  break;
934 	case EXPAND_AFTER_ITER:
935 	  {
936 	    tre_iteration_t *iter = node->obj;
937 	    int pos_add_last;
938 	    pos_add = tre_stack_pop_int(stack);
939 	    pos_add_last = pos_add;
940 	    if (iter->min > 1 || iter->max > 1)
941 	      {
942 		tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
943 		int i;
944 		int pos_add_save = pos_add;
945 
946 		/* Create a catenated sequence of copies of the node. */
947 		for (i = 0; i < iter->min; i++)
948 		  {
949 		    tre_ast_node_t *copy;
950 		    /* Remove tags from all but the last copy. */
951 		    int flags = ((i + 1 < iter->min)
952 				 ? COPY_REMOVE_TAGS
953 				 : COPY_MAXIMIZE_FIRST_TAG);
954 		    DPRINT(("  pos_add %d\n", pos_add));
955 		    pos_add_save = pos_add;
956 		    status = tre_copy_ast(mem, stack, iter->arg, flags,
957 					  &pos_add, tag_directions, &copy,
958 					  &max_pos);
959 		    if (status != REG_OK)
960 		      return status;
961 		    if (seq1 != NULL)
962 		      seq1 = tre_ast_new_catenation(mem, seq1, copy);
963 		    else
964 		      seq1 = copy;
965 		    if (seq1 == NULL)
966 		      return REG_ESPACE;
967 		  }
968 
969 		if (iter->max == -1)
970 		  {
971 		    /* No upper limit. */
972 		    pos_add_save = pos_add;
973 		    status = tre_copy_ast(mem, stack, iter->arg, 0,
974 					  &pos_add, NULL, &seq2, &max_pos);
975 		    if (status != REG_OK)
976 		      return status;
977 		    seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
978 		    if (seq2 == NULL)
979 		      return REG_ESPACE;
980 		  }
981 		else
982 		  {
983 		    for (i = iter->min; i < iter->max; i++)
984 		      {
985 			tre_ast_node_t *tmp, *copy;
986 			pos_add_save = pos_add;
987 			status = tre_copy_ast(mem, stack, iter->arg, 0,
988 					      &pos_add, NULL, &copy, &max_pos);
989 			if (status != REG_OK)
990 			  return status;
991 			if (seq2 != NULL)
992 			  seq2 = tre_ast_new_catenation(mem, copy, seq2);
993 			else
994 			  seq2 = copy;
995 			if (seq2 == NULL)
996 			  return REG_ESPACE;
997 			tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
998 			if (tmp == NULL)
999 			  return REG_ESPACE;
1000 			seq2 = tre_ast_new_union(mem, tmp, seq2);
1001 			if (seq2 == NULL)
1002 			  return REG_ESPACE;
1003 		      }
1004 		  }
1005 
1006 		pos_add = pos_add_save;
1007 		if (seq1 == NULL)
1008 		  seq1 = seq2;
1009 		else if (seq2 != NULL)
1010 		  seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1011 		if (seq1 == NULL)
1012 		  return REG_ESPACE;
1013 		node->obj = seq1->obj;
1014 		node->type = seq1->type;
1015 	      }
1016 
1017 	    iter_depth--;
1018 	    pos_add_total += pos_add - pos_add_last;
1019 	    if (iter_depth == 0)
1020 	      pos_add = pos_add_total;
1021 
1022 	    /* If approximate parameters are specified, surround the result
1023 	       with two parameter setting nodes.  The one on the left sets
1024 	       the specified parameters, and the one on the right restores
1025 	       the old parameters. */
1026 	    if (iter->params)
1027 	      {
1028 		tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy;
1029 		int *old_params;
1030 
1031 		tmp_l = tre_ast_new_literal(mem, PARAMETER, 0, -1);
1032 		if (!tmp_l)
1033 		  return REG_ESPACE;
1034 		((tre_literal_t *)tmp_l->obj)->u.params = iter->params;
1035 		iter->params[TRE_PARAM_DEPTH] = params_depth + 1;
1036 		tmp_r = tre_ast_new_literal(mem, PARAMETER, 0, -1);
1037 		if (!tmp_r)
1038 		  return REG_ESPACE;
1039 		old_params = tre_mem_alloc(mem, sizeof(*old_params)
1040 					   * TRE_PARAM_LAST);
1041 		if (!old_params)
1042 		  return REG_ESPACE;
1043 		for (i = 0; i < TRE_PARAM_LAST; i++)
1044 		  old_params[i] = params[i];
1045 		((tre_literal_t *)tmp_r->obj)->u.params = old_params;
1046 		old_params[TRE_PARAM_DEPTH] = params_depth;
1047 		/* XXX - this is the only place where ast_new_node is
1048 		   needed -- should be moved inside AST module. */
1049 		node_copy = tre_ast_new_node(mem, ITERATION,
1050 					     sizeof(tre_iteration_t));
1051 		if (!node_copy)
1052 		  return REG_ESPACE;
1053 		node_copy->obj = node->obj;
1054 		tmp_node = tre_ast_new_catenation(mem, tmp_l, node_copy);
1055 		if (!tmp_node)
1056 		  return REG_ESPACE;
1057 		tmp_node = tre_ast_new_catenation(mem, tmp_node, tmp_r);
1058 		if (!tmp_node)
1059 		  return REG_ESPACE;
1060 		/* Replace the contents of `node' with `tmp_node'. */
1061 		memcpy(node, tmp_node, sizeof(*node));
1062 		node->obj = tmp_node->obj;
1063 		node->type = tmp_node->type;
1064 		params_depth++;
1065 		if (params_depth > *max_depth)
1066 		  *max_depth = params_depth;
1067 	      }
1068 	    break;
1069 	  }
1070 	default:
1071 	  assert(0);
1072 	  break;
1073 	}
1074     }
1075 
1076   *position += pos_add_total;
1077 
1078   /* `max_pos' should never be larger than `*position' if the above
1079      code works, but just an extra safeguard let's make sure
1080      `*position' is set large enough so enough memory will be
1081      allocated for the transition table. */
1082   if (max_pos > *position)
1083     *position = max_pos;
1084 
1085 #ifdef TRE_DEBUG
1086   DPRINT(("Expanded AST:\n"));
1087   tre_ast_print(ast);
1088   DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
1089 #endif
1090 
1091   return status;
1092 }
1093 
1094 static tre_pos_and_tags_t *
tre_set_empty(tre_mem_t mem)1095 tre_set_empty(tre_mem_t mem)
1096 {
1097   tre_pos_and_tags_t *new_set;
1098 
1099   new_set = tre_mem_calloc(mem, sizeof(*new_set));
1100   if (new_set == NULL)
1101     return NULL;
1102 
1103   new_set[0].position = -1;
1104   new_set[0].code_min = -1;
1105   new_set[0].code_max = -1;
1106 
1107   return new_set;
1108 }
1109 
1110 static tre_pos_and_tags_t *
tre_set_one(tre_mem_t mem,int position,int code_min,int code_max,tre_ctype_t class,tre_ctype_t * neg_classes,int backref)1111 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
1112 	    tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
1113 {
1114   tre_pos_and_tags_t *new_set;
1115 
1116   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
1117   if (new_set == NULL)
1118     return NULL;
1119 
1120   new_set[0].position = position;
1121   new_set[0].code_min = code_min;
1122   new_set[0].code_max = code_max;
1123   new_set[0].class = class;
1124   new_set[0].neg_classes = neg_classes;
1125   new_set[0].backref = backref;
1126   new_set[1].position = -1;
1127   new_set[1].code_min = -1;
1128   new_set[1].code_max = -1;
1129 
1130   return new_set;
1131 }
1132 
1133 static tre_pos_and_tags_t *
tre_set_union(tre_mem_t mem,tre_pos_and_tags_t * set1,tre_pos_and_tags_t * set2,int * tags,int assertions,int * params)1134 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
1135 	      int *tags, int assertions, int *params)
1136 {
1137   int s1, s2, i, j;
1138   tre_pos_and_tags_t *new_set;
1139   int *new_tags;
1140   int num_tags;
1141 
1142   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
1143   for (s1 = 0; set1[s1].position >= 0; s1++);
1144   for (s2 = 0; set2[s2].position >= 0; s2++);
1145   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
1146   if (!new_set )
1147     return NULL;
1148 
1149   for (s1 = 0; set1[s1].position >= 0; s1++)
1150     {
1151       new_set[s1].position = set1[s1].position;
1152       new_set[s1].code_min = set1[s1].code_min;
1153       new_set[s1].code_max = set1[s1].code_max;
1154       new_set[s1].assertions = set1[s1].assertions | assertions;
1155       new_set[s1].class = set1[s1].class;
1156       new_set[s1].neg_classes = set1[s1].neg_classes;
1157       new_set[s1].backref = set1[s1].backref;
1158       if (set1[s1].tags == NULL && tags == NULL)
1159 	new_set[s1].tags = NULL;
1160       else
1161 	{
1162 	  for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
1163 	  new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
1164 					 * (i + num_tags + 1)));
1165 	  if (new_tags == NULL)
1166 	    return NULL;
1167 	  for (j = 0; j < i; j++)
1168 	    new_tags[j] = set1[s1].tags[j];
1169 	  for (i = 0; i < num_tags; i++)
1170 	    new_tags[j + i] = tags[i];
1171 	  new_tags[j + i] = -1;
1172 	  new_set[s1].tags = new_tags;
1173 	}
1174       if (set1[s1].params)
1175 	new_set[s1].params = set1[s1].params;
1176       if (params)
1177 	{
1178 	  if (!new_set[s1].params)
1179 	    new_set[s1].params = params;
1180 	  else
1181 	    {
1182 	      new_set[s1].params = tre_mem_alloc(mem, sizeof(*params) *
1183 						 TRE_PARAM_LAST);
1184 	      if (!new_set[s1].params)
1185 		return NULL;
1186 	      for (i = 0; i < TRE_PARAM_LAST; i++)
1187 		if (params[i] != TRE_PARAM_UNSET)
1188 		  new_set[s1].params[i] = params[i];
1189 	    }
1190 	}
1191     }
1192 
1193   for (s2 = 0; set2[s2].position >= 0; s2++)
1194     {
1195       new_set[s1 + s2].position = set2[s2].position;
1196       new_set[s1 + s2].code_min = set2[s2].code_min;
1197       new_set[s1 + s2].code_max = set2[s2].code_max;
1198       /* XXX - why not | assertions here as well? */
1199       new_set[s1 + s2].assertions = set2[s2].assertions;
1200       new_set[s1 + s2].class = set2[s2].class;
1201       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
1202       new_set[s1 + s2].backref = set2[s2].backref;
1203       if (set2[s2].tags == NULL)
1204 	new_set[s1 + s2].tags = NULL;
1205       else
1206 	{
1207 	  for (i = 0; set2[s2].tags[i] >= 0; i++);
1208 	  new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
1209 	  if (new_tags == NULL)
1210 	    return NULL;
1211 	  for (j = 0; j < i; j++)
1212 	    new_tags[j] = set2[s2].tags[j];
1213 	  new_tags[j] = -1;
1214 	  new_set[s1 + s2].tags = new_tags;
1215 	}
1216       if (set2[s2].params)
1217 	new_set[s1 + s2].params = set2[s2].params;
1218       if (params)
1219 	{
1220 	  if (!new_set[s1 + s2].params)
1221 	    new_set[s1 + s2].params = params;
1222 	  else
1223 	    {
1224 	      new_set[s1 + s2].params = tre_mem_alloc(mem, sizeof(*params) *
1225 						      TRE_PARAM_LAST);
1226 	      if (!new_set[s1 + s2].params)
1227 		return NULL;
1228 	      for (i = 0; i < TRE_PARAM_LAST; i++)
1229 		if (params[i] != TRE_PARAM_UNSET)
1230 		  new_set[s1 + s2].params[i] = params[i];
1231 	    }
1232 	}
1233     }
1234   new_set[s1 + s2].position = -1;
1235   return new_set;
1236 }
1237 
1238 /* Finds the empty path through `node' which is the one that should be
1239    taken according to POSIX.2 rules, and adds the tags on that path to
1240    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
1241    set to the number of tags seen on the path. */
1242 static reg_errcode_t
tre_match_empty(tre_stack_t * stack,tre_ast_node_t * node,int * tags,int * assertions,int * params,int * num_tags_seen,int * params_seen)1243 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
1244 		int *assertions, int *params, int *num_tags_seen,
1245 		int *params_seen)
1246 {
1247   tre_literal_t *lit;
1248   tre_union_t *uni;
1249   tre_catenation_t *cat;
1250   tre_iteration_t *iter;
1251   int i;
1252   int bottom = tre_stack_num_objects(stack);
1253   reg_errcode_t status = REG_OK;
1254   if (num_tags_seen)
1255     *num_tags_seen = 0;
1256   if (params_seen)
1257     *params_seen = 0;
1258 
1259   status = tre_stack_push_voidptr(stack, node);
1260 
1261   /* Walk through the tree recursively. */
1262   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1263     {
1264       node = tre_stack_pop_voidptr(stack);
1265 
1266       switch (node->type)
1267 	{
1268 	case LITERAL:
1269 	  lit = (tre_literal_t *)node->obj;
1270 	  switch (lit->code_min)
1271 	    {
1272 	    case TAG:
1273 	      if (lit->code_max >= 0)
1274 		{
1275 		  if (tags != NULL)
1276 		    {
1277 		      /* Add the tag to `tags'. */
1278 		      for (i = 0; tags[i] >= 0; i++)
1279 			if (tags[i] == lit->code_max)
1280 			  break;
1281 		      if (tags[i] < 0)
1282 			{
1283 			  tags[i] = lit->code_max;
1284 			  tags[i + 1] = -1;
1285 			}
1286 		    }
1287 		  if (num_tags_seen)
1288 		    (*num_tags_seen)++;
1289 		}
1290 	      break;
1291 	    case ASSERTION:
1292 	      assert(lit->code_max >= 1
1293 		     || lit->code_max <= ASSERT_LAST);
1294 	      if (assertions != NULL)
1295 		*assertions |= lit->code_max;
1296 	      break;
1297 	    case PARAMETER:
1298 	      if (params != NULL)
1299 		for (i = 0; i < TRE_PARAM_LAST; i++)
1300 		  params[i] = lit->u.params[i];
1301 	      if (params_seen != NULL)
1302 		*params_seen = 1;
1303 	      break;
1304 	    case EMPTY:
1305 	      break;
1306 	    default:
1307 	      assert(0);
1308 	      break;
1309 	    }
1310 	  break;
1311 
1312 	case UNION:
1313 	  /* Subexpressions starting earlier take priority over ones
1314 	     starting later, so we prefer the left subexpression over the
1315 	     right subexpression. */
1316 	  uni = (tre_union_t *)node->obj;
1317 	  if (uni->left->nullable)
1318 	    STACK_PUSHX(stack, voidptr, uni->left)
1319 	  else if (uni->right->nullable)
1320 	    STACK_PUSHX(stack, voidptr, uni->right)
1321 	  else
1322 	    assert(0);
1323 	  break;
1324 
1325 	case CATENATION:
1326 	  /* The path must go through both children. */
1327 	  cat = (tre_catenation_t *)node->obj;
1328 	  assert(cat->left->nullable);
1329 	  assert(cat->right->nullable);
1330 	  STACK_PUSHX(stack, voidptr, cat->left);
1331 	  STACK_PUSHX(stack, voidptr, cat->right);
1332 	  break;
1333 
1334 	case ITERATION:
1335 	  /* A match with an empty string is preferred over no match at
1336 	     all, so we go through the argument if possible. */
1337 	  iter = (tre_iteration_t *)node->obj;
1338 	  if (iter->arg->nullable)
1339 	    STACK_PUSHX(stack, voidptr, iter->arg);
1340 	  break;
1341 
1342 	default:
1343 	  assert(0);
1344 	  break;
1345 	}
1346     }
1347 
1348   return status;
1349 }
1350 
1351 
1352 typedef enum {
1353   NFL_RECURSE,
1354   NFL_POST_UNION,
1355   NFL_POST_CATENATION,
1356   NFL_POST_ITERATION
1357 } tre_nfl_stack_symbol_t;
1358 
1359 
1360 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
1361    the nodes of the AST `tree'. */
1362 static reg_errcode_t
tre_compute_nfl(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree)1363 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
1364 {
1365   int bottom = tre_stack_num_objects(stack);
1366 
1367   STACK_PUSHR(stack, voidptr, tree);
1368   STACK_PUSHR(stack, int, NFL_RECURSE);
1369 
1370   while (tre_stack_num_objects(stack) > bottom)
1371     {
1372       tre_nfl_stack_symbol_t symbol;
1373       tre_ast_node_t *node;
1374 
1375       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
1376       node = tre_stack_pop_voidptr(stack);
1377       switch (symbol)
1378 	{
1379 	case NFL_RECURSE:
1380 	  switch (node->type)
1381 	    {
1382 	    case LITERAL:
1383 	      {
1384 		tre_literal_t *lit = (tre_literal_t *)node->obj;
1385 		if (IS_BACKREF(lit))
1386 		  {
1387 		    /* Back references: nullable = false, firstpos = {i},
1388 		       lastpos = {i}. */
1389 		    node->nullable = 0;
1390 		    node->firstpos = tre_set_one(mem, lit->position, 0,
1391 					     TRE_CHAR_MAX, 0, NULL, -1);
1392 		    if (!node->firstpos)
1393 		      return REG_ESPACE;
1394 		    node->lastpos = tre_set_one(mem, lit->position, 0,
1395 						TRE_CHAR_MAX, 0, NULL,
1396 						lit->code_max);
1397 		    if (!node->lastpos)
1398 		      return REG_ESPACE;
1399 		  }
1400 		else if (lit->code_min < 0)
1401 		  {
1402 		    /* Tags, empty strings, params, and zero width assertions:
1403 		       nullable = true, firstpos = {}, and lastpos = {}. */
1404 		    node->nullable = 1;
1405 		    node->firstpos = tre_set_empty(mem);
1406 		    if (!node->firstpos)
1407 		      return REG_ESPACE;
1408 		    node->lastpos = tre_set_empty(mem);
1409 		    if (!node->lastpos)
1410 		      return REG_ESPACE;
1411 		  }
1412 		else
1413 		  {
1414 		    /* Literal at position i: nullable = false, firstpos = {i},
1415 		       lastpos = {i}. */
1416 		    node->nullable = 0;
1417 		    node->firstpos =
1418 		      tre_set_one(mem, lit->position, lit->code_min,
1419 				  lit->code_max, 0, NULL, -1);
1420 		    if (!node->firstpos)
1421 		      return REG_ESPACE;
1422 		    node->lastpos = tre_set_one(mem, lit->position,
1423 						lit->code_min, lit->code_max,
1424 						lit->u.class, lit->neg_classes,
1425 						-1);
1426 		    if (!node->lastpos)
1427 		      return REG_ESPACE;
1428 		  }
1429 		break;
1430 	      }
1431 
1432 	    case UNION:
1433 	      /* Compute the attributes for the two subtrees, and after that
1434 		 for this node. */
1435 	      STACK_PUSHR(stack, voidptr, node);
1436 	      STACK_PUSHR(stack, int, NFL_POST_UNION);
1437 	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
1438 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1439 	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
1440 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1441 	      break;
1442 
1443 	    case CATENATION:
1444 	      /* Compute the attributes for the two subtrees, and after that
1445 		 for this node. */
1446 	      STACK_PUSHR(stack, voidptr, node);
1447 	      STACK_PUSHR(stack, int, NFL_POST_CATENATION);
1448 	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
1449 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1450 	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
1451 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1452 	      break;
1453 
1454 	    case ITERATION:
1455 	      /* Compute the attributes for the subtree, and after that for
1456 		 this node. */
1457 	      STACK_PUSHR(stack, voidptr, node);
1458 	      STACK_PUSHR(stack, int, NFL_POST_ITERATION);
1459 	      STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
1460 	      STACK_PUSHR(stack, int, NFL_RECURSE);
1461 	      break;
1462 	    }
1463 	  break; /* end case: NFL_RECURSE */
1464 
1465 	case NFL_POST_UNION:
1466 	  {
1467 	    tre_union_t *uni = (tre_union_t *)node->obj;
1468 	    node->nullable = uni->left->nullable || uni->right->nullable;
1469 	    node->firstpos = tre_set_union(mem, uni->left->firstpos,
1470 					   uni->right->firstpos, NULL, 0, NULL);
1471 	    if (!node->firstpos)
1472 	      return REG_ESPACE;
1473 	    node->lastpos = tre_set_union(mem, uni->left->lastpos,
1474 					  uni->right->lastpos, NULL, 0, NULL);
1475 	    if (!node->lastpos)
1476 	      return REG_ESPACE;
1477 	    break;
1478 	  }
1479 
1480 	case NFL_POST_ITERATION:
1481 	  {
1482 	    tre_iteration_t *iter = (tre_iteration_t *)node->obj;
1483 
1484 	    if (iter->min == 0 || iter->arg->nullable)
1485 	      node->nullable = 1;
1486 	    else
1487 	      node->nullable = 0;
1488 	    node->firstpos = iter->arg->firstpos;
1489 	    node->lastpos = iter->arg->lastpos;
1490 	    break;
1491 	  }
1492 
1493 	case NFL_POST_CATENATION:
1494 	  {
1495 	    int num_tags, *tags, assertions, params_seen;
1496 	    int *params;
1497 	    reg_errcode_t status;
1498 	    tre_catenation_t *cat = node->obj;
1499 	    node->nullable = cat->left->nullable && cat->right->nullable;
1500 
1501 	    /* Compute firstpos. */
1502 	    if (cat->left->nullable)
1503 	      {
1504 		/* The left side matches the empty string.  Make a first pass
1505 		   with tre_match_empty() to get the number of tags and
1506 		   parameters. */
1507 		status = tre_match_empty(stack, cat->left,
1508 					 NULL, NULL, NULL, &num_tags,
1509 					 &params_seen);
1510 		if (status != REG_OK)
1511 		  return status;
1512 		/* Allocate arrays for the tags and parameters. */
1513 		tags = xmalloc(sizeof(*tags) * (num_tags + 1));
1514 		if (!tags)
1515 		  return REG_ESPACE;
1516 		tags[0] = -1;
1517 		assertions = 0;
1518 		params = NULL;
1519 		if (params_seen)
1520 		  {
1521 		    params = tre_mem_alloc(mem, sizeof(*params)
1522 					   * TRE_PARAM_LAST);
1523 		    if (!params)
1524 		      {
1525 			xfree(tags);
1526 			return REG_ESPACE;
1527 		      }
1528 		  }
1529 		/* Second pass with tre_mach_empty() to get the list of
1530 		   tags and parameters. */
1531 		status = tre_match_empty(stack, cat->left, tags,
1532 					 &assertions, params, NULL, NULL);
1533 		if (status != REG_OK)
1534 		  {
1535 		    xfree(tags);
1536 		    return status;
1537 		  }
1538 		node->firstpos =
1539 		  tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
1540 				tags, assertions, params);
1541 		xfree(tags);
1542 		if (!node->firstpos)
1543 		  return REG_ESPACE;
1544 	      }
1545 	    else
1546 	      {
1547 		node->firstpos = cat->left->firstpos;
1548 	      }
1549 
1550 	    /* Compute lastpos. */
1551 	    if (cat->right->nullable)
1552 	      {
1553 		/* The right side matches the empty string.  Make a first pass
1554 		   with tre_match_empty() to get the number of tags and
1555 		   parameters. */
1556 		status = tre_match_empty(stack, cat->right,
1557 					 NULL, NULL, NULL, &num_tags,
1558 					 &params_seen);
1559 		if (status != REG_OK)
1560 		  return status;
1561 		/* Allocate arrays for the tags and parameters. */
1562 		tags = xmalloc(sizeof(int) * (num_tags + 1));
1563 		if (!tags)
1564 		  return REG_ESPACE;
1565 		tags[0] = -1;
1566 		assertions = 0;
1567 		params = NULL;
1568 		if (params_seen)
1569 		  {
1570 		    params = tre_mem_alloc(mem, sizeof(*params)
1571 					   * TRE_PARAM_LAST);
1572 		    if (!params)
1573 		      {
1574 			xfree(tags);
1575 			return REG_ESPACE;
1576 		      }
1577 		  }
1578 		/* Second pass with tre_mach_empty() to get the list of
1579 		   tags and parameters. */
1580 		status = tre_match_empty(stack, cat->right, tags,
1581 					 &assertions, params, NULL, NULL);
1582 		if (status != REG_OK)
1583 		  {
1584 		    xfree(tags);
1585 		    return status;
1586 		  }
1587 		node->lastpos =
1588 		  tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
1589 				tags, assertions, params);
1590 		xfree(tags);
1591 		if (!node->lastpos)
1592 		  return REG_ESPACE;
1593 	      }
1594 	    else
1595 	      {
1596 		node->lastpos = cat->right->lastpos;
1597 	      }
1598 	    break;
1599 	  }
1600 
1601 	default:
1602 	  assert(0);
1603 	  break;
1604 	}
1605     }
1606 
1607   return REG_OK;
1608 }
1609 
1610 
1611 /* Adds a transition from each position in `p1' to each position in `p2'. */
1612 static reg_errcode_t
tre_make_trans(tre_pos_and_tags_t * p1,tre_pos_and_tags_t * p2,tre_tnfa_transition_t * transitions,int * counts,int * offs)1613 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
1614 	       tre_tnfa_transition_t *transitions,
1615 	       int *counts, int *offs)
1616 {
1617   tre_pos_and_tags_t *orig_p2 = p2;
1618   tre_tnfa_transition_t *trans;
1619   int i, j, k, l, dup, prev_p2_pos;
1620 
1621   if (transitions != NULL)
1622     while (p1->position >= 0)
1623       {
1624 	p2 = orig_p2;
1625 	prev_p2_pos = -1;
1626 	while (p2->position >= 0)
1627 	  {
1628 	    /* Optimization: if this position was already handled, skip it. */
1629 	    if (p2->position == prev_p2_pos)
1630 	      {
1631 		p2++;
1632 		continue;
1633 	      }
1634 	    prev_p2_pos = p2->position;
1635 	    /* Set `trans' to point to the next unused transition from
1636 	       position `p1->position'. */
1637 	    trans = transitions + offs[p1->position];
1638 	    while (trans->state != NULL)
1639 	      {
1640 #if 0
1641 		/* If we find a previous transition from `p1->position' to
1642 		   `p2->position', it is overwritten.  This can happen only
1643 		   if there are nested loops in the regexp, like in "((a)*)*".
1644 		   In POSIX.2 repetition using the outer loop is always
1645 		   preferred over using the inner loop.	 Therefore the
1646 		   transition for the inner loop is useless and can be thrown
1647 		   away. */
1648 		/* XXX - The same position is used for all nodes in a bracket
1649 		   expression, so this optimization cannot be used (it will
1650 		   break bracket expressions) unless I figure out a way to
1651 		   detect it here. */
1652 		if (trans->state_id == p2->position)
1653 		  {
1654 		    DPRINT(("*"));
1655 		    break;
1656 		  }
1657 #endif
1658 		trans++;
1659 	      }
1660 
1661 	    if (trans->state == NULL)
1662 	      (trans + 1)->state = NULL;
1663 	    /* Use the character ranges, assertions, etc. from `p1' for
1664 	       the transition from `p1' to `p2'. */
1665 	    trans->code_min = p1->code_min;
1666 	    trans->code_max = p1->code_max;
1667 	    trans->state = transitions + offs[p2->position];
1668 	    trans->state_id = p2->position;
1669 	    trans->assertions = p1->assertions | p2->assertions
1670 	      | (p1->class ? ASSERT_CHAR_CLASS : 0)
1671 	      | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
1672 	    if (p1->backref >= 0)
1673 	      {
1674 		assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
1675 		assert(p2->backref < 0);
1676 		trans->u.backref = p1->backref;
1677 		trans->assertions |= ASSERT_BACKREF;
1678 	      }
1679 	    else
1680 	      trans->u.class = p1->class;
1681 	    if (p1->neg_classes != NULL)
1682 	      {
1683 		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
1684 		trans->neg_classes =
1685 		  xmalloc(sizeof(*trans->neg_classes) * (i + 1));
1686 		if (trans->neg_classes == NULL)
1687 		  return REG_ESPACE;
1688 		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
1689 		  trans->neg_classes[i] = p1->neg_classes[i];
1690 		trans->neg_classes[i] = (tre_ctype_t)0;
1691 	      }
1692 	    else
1693 	      trans->neg_classes = NULL;
1694 
1695 	    /* Find out how many tags this transition has. */
1696 	    i = 0;
1697 	    if (p1->tags != NULL)
1698 	      while(p1->tags[i] >= 0)
1699 		i++;
1700 	    j = 0;
1701 	    if (p2->tags != NULL)
1702 	      while(p2->tags[j] >= 0)
1703 		j++;
1704 
1705 	    /* If we are overwriting a transition, free the old tag array. */
1706 	    if (trans->tags != NULL)
1707 	      xfree(trans->tags);
1708 	    trans->tags = NULL;
1709 
1710 	    /* If there were any tags, allocate an array and fill it. */
1711 	    if (i + j > 0)
1712 	      {
1713 		trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
1714 		if (!trans->tags)
1715 		  return REG_ESPACE;
1716 		i = 0;
1717 		if (p1->tags != NULL)
1718 		  while(p1->tags[i] >= 0)
1719 		    {
1720 		      trans->tags[i] = p1->tags[i];
1721 		      i++;
1722 		    }
1723 		l = i;
1724 		j = 0;
1725 		if (p2->tags != NULL)
1726 		  while (p2->tags[j] >= 0)
1727 		    {
1728 		      /* Don't add duplicates. */
1729 		      dup = 0;
1730 		      for (k = 0; k < i; k++)
1731 			if (trans->tags[k] == p2->tags[j])
1732 			  {
1733 			    dup = 1;
1734 			    break;
1735 			  }
1736 		      if (!dup)
1737 			trans->tags[l++] = p2->tags[j];
1738 		      j++;
1739 		    }
1740 		trans->tags[l] = -1;
1741 	      }
1742 
1743 	    /* Set the parameter array.	 If both `p2' and `p1' have same
1744 	       parameters, the values in `p2' override those in `p1'. */
1745 	    if (p1->params || p2->params)
1746 	      {
1747 		if (!trans->params)
1748 		  trans->params = xmalloc(sizeof(*trans->params)
1749 					  * TRE_PARAM_LAST);
1750 		if (!trans->params)
1751 		  return REG_ESPACE;
1752 		for (i = 0; i < TRE_PARAM_LAST; i++)
1753 		  {
1754 		    trans->params[i] = TRE_PARAM_UNSET;
1755 		    if (p1->params && p1->params[i] != TRE_PARAM_UNSET)
1756 		      trans->params[i] = p1->params[i];
1757 		    if (p2->params && p2->params[i] != TRE_PARAM_UNSET)
1758 		      trans->params[i] = p2->params[i];
1759 		  }
1760 	      }
1761 	    else
1762 	      {
1763 		if (trans->params)
1764 		  xfree(trans->params);
1765 		trans->params = NULL;
1766 	      }
1767 
1768 
1769 #ifdef TRE_DEBUG
1770 	    {
1771 	      int *tags;
1772 
1773 	      DPRINT(("	 %2d -> %2d on %3d", p1->position, p2->position,
1774 		      p1->code_min));
1775 	      if (p1->code_max != p1->code_min)
1776 		DPRINT(("-%3d", p1->code_max));
1777 	      tags = trans->tags;
1778 	      if (tags)
1779 		{
1780 		  DPRINT((", tags ["));
1781 		  while (*tags >= 0)
1782 		    {
1783 		      DPRINT(("%d", *tags));
1784 		      tags++;
1785 		      if (*tags >= 0)
1786 			DPRINT((","));
1787 		    }
1788 		  DPRINT(("]"));
1789 		}
1790 	      if (trans->assertions)
1791 		DPRINT((", assert %d", trans->assertions));
1792 	      if (trans->assertions & ASSERT_BACKREF)
1793 		DPRINT((", backref %d", trans->u.backref));
1794 	      else if (trans->u.class)
1795 		DPRINT((", class %ld", (long)trans->u.class));
1796 	      if (trans->neg_classes)
1797 		DPRINT((", neg_classes %p", trans->neg_classes));
1798 	      if (trans->params)
1799 		{
1800 		  DPRINT((", "));
1801 		  tre_print_params(trans->params);
1802 		}
1803 	      DPRINT(("\n"));
1804 	    }
1805 #endif /* TRE_DEBUG */
1806 	    p2++;
1807 	  }
1808 	p1++;
1809       }
1810   else
1811     /* Compute a maximum limit for the number of transitions leaving
1812        from each state. */
1813     while (p1->position >= 0)
1814       {
1815 	p2 = orig_p2;
1816 	while (p2->position >= 0)
1817 	  {
1818 	    counts[p1->position]++;
1819 	    p2++;
1820 	  }
1821 	p1++;
1822       }
1823   return REG_OK;
1824 }
1825 
1826 /* Converts the syntax tree to a TNFA.	All the transitions in the TNFA are
1827    labelled with one character range (there are no transitions on empty
1828    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
1829    the regexp. */
1830 static reg_errcode_t
tre_ast_to_tnfa(tre_ast_node_t * node,tre_tnfa_transition_t * transitions,int * counts,int * offs)1831 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
1832 		int *counts, int *offs)
1833 {
1834   tre_union_t *uni;
1835   tre_catenation_t *cat;
1836   tre_iteration_t *iter;
1837   reg_errcode_t errcode = REG_OK;
1838 
1839   /* XXX - recurse using a stack!. */
1840   switch (node->type)
1841     {
1842     case LITERAL:
1843       break;
1844     case UNION:
1845       uni = (tre_union_t *)node->obj;
1846       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
1847       if (errcode != REG_OK)
1848 	return errcode;
1849       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
1850       break;
1851 
1852     case CATENATION:
1853       cat = (tre_catenation_t *)node->obj;
1854       /* Add a transition from each position in cat->left->lastpos
1855 	 to each position in cat->right->firstpos. */
1856       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
1857 			       transitions, counts, offs);
1858       if (errcode != REG_OK)
1859 	return errcode;
1860       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
1861       if (errcode != REG_OK)
1862 	return errcode;
1863       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
1864       break;
1865 
1866     case ITERATION:
1867       iter = (tre_iteration_t *)node->obj;
1868       assert(iter->max == -1 || iter->max == 1);
1869 
1870       if (iter->max == -1)
1871 	{
1872 	  assert(iter->min == 0 || iter->min == 1);
1873 	  /* Add a transition from each last position in the iterated
1874 	     expression to each first position. */
1875 	  errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
1876 				   transitions, counts, offs);
1877 	  if (errcode != REG_OK)
1878 	    return errcode;
1879 	}
1880       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
1881       break;
1882     }
1883   return errcode;
1884 }
1885 
1886 
1887 #define ERROR_EXIT(err)		  \
1888   do				  \
1889     {				  \
1890       errcode = err;		  \
1891       if (1) goto error_exit;	  \
1892     }				  \
1893  while (0)
1894 
1895 
1896 int
tre_compile(regex_t * preg,const tre_char_t * regex,size_t n,int cflags)1897 tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
1898 {
1899   tre_stack_t *stack;
1900   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
1901   tre_pos_and_tags_t *p;
1902   int *counts = NULL, *offs = NULL;
1903   int i, add = 0;
1904   tre_tnfa_transition_t *transitions, *initial;
1905   tre_tnfa_t *tnfa = NULL;
1906   tre_submatch_data_t *submatch_data;
1907   tre_tag_direction_t *tag_directions = NULL;
1908   reg_errcode_t errcode;
1909   tre_mem_t mem;
1910 
1911   /* Parse context. */
1912   tre_parse_ctx_t parse_ctx;
1913 
1914   /* Allocate a stack used throughout the compilation process for various
1915      purposes. */
1916   stack = tre_stack_new(512, 10240, 128);
1917   if (!stack)
1918     return REG_ESPACE;
1919   /* Allocate a fast memory allocator. */
1920   mem = tre_mem_new();
1921   if (!mem)
1922     {
1923       tre_stack_destroy(stack);
1924       return REG_ESPACE;
1925     }
1926 
1927   /* Parse the regexp. */
1928   memset(&parse_ctx, 0, sizeof(parse_ctx));
1929   parse_ctx.mem = mem;
1930   parse_ctx.stack = stack;
1931   parse_ctx.re = regex;
1932   parse_ctx.len = n;
1933   parse_ctx.cflags = cflags;
1934   parse_ctx.max_backref = -1;
1935   DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex));
1936   errcode = tre_parse(&parse_ctx);
1937   if (errcode != REG_OK)
1938     ERROR_EXIT(errcode);
1939   preg->re_nsub = parse_ctx.submatch_id - 1;
1940   tree = parse_ctx.result;
1941 
1942   /* Back references and approximate matching cannot currently be used
1943      in the same regexp. */
1944   if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
1945     ERROR_EXIT(REG_BADPAT);
1946 
1947 #ifdef TRE_DEBUG
1948   tre_ast_print(tree);
1949 #endif /* TRE_DEBUG */
1950 
1951   /* Referring to nonexistent subexpressions is illegal. */
1952   if (parse_ctx.max_backref > (int)preg->re_nsub)
1953     ERROR_EXIT(REG_ESUBREG);
1954 
1955   /* Allocate the TNFA struct. */
1956   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
1957   if (tnfa == NULL)
1958     ERROR_EXIT(REG_ESPACE);
1959   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
1960   tnfa->have_approx = parse_ctx.have_approx;
1961   tnfa->num_submatches = parse_ctx.submatch_id;
1962 
1963   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
1964      regexp does not have back references, this can be skipped. */
1965   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
1966     {
1967       DPRINT(("tre_compile: setting up tags\n"));
1968 
1969       /* Figure out how many tags we will need. */
1970       errcode = tre_add_tags(NULL, stack, tree, tnfa);
1971       if (errcode != REG_OK)
1972 	ERROR_EXIT(errcode);
1973 #ifdef TRE_DEBUG
1974       tre_ast_print(tree);
1975 #endif /* TRE_DEBUG */
1976 
1977       if (tnfa->num_tags > 0)
1978 	{
1979 	  tag_directions = xmalloc(sizeof(*tag_directions)
1980 				   * (tnfa->num_tags + 1));
1981 	  if (tag_directions == NULL)
1982 	    ERROR_EXIT(REG_ESPACE);
1983 	  tnfa->tag_directions = tag_directions;
1984 	  memset(tag_directions, -1,
1985 		 sizeof(*tag_directions) * (tnfa->num_tags + 1));
1986 	}
1987       tnfa->minimal_tags = xcalloc(tnfa->num_tags * 2 + 1,
1988 				   sizeof(tnfa->minimal_tags));
1989       if (tnfa->minimal_tags == NULL)
1990 	ERROR_EXIT(REG_ESPACE);
1991 
1992       submatch_data = xcalloc(parse_ctx.submatch_id, sizeof(*submatch_data));
1993       if (submatch_data == NULL)
1994 	ERROR_EXIT(REG_ESPACE);
1995       tnfa->submatch_data = submatch_data;
1996 
1997       errcode = tre_add_tags(mem, stack, tree, tnfa);
1998       if (errcode != REG_OK)
1999 	ERROR_EXIT(errcode);
2000 
2001 #ifdef TRE_DEBUG
2002       for (i = 0; i < parse_ctx.submatch_id; i++)
2003 	DPRINT(("pmatch[%d] = {t%d, t%d}\n",
2004 		i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
2005       for (i = 0; i < tnfa->num_tags; i++)
2006 	DPRINT(("t%d is %s\n", i,
2007 		tag_directions[i] == TRE_TAG_MINIMIZE ?
2008 		"minimized" : "maximized"));
2009 #endif /* TRE_DEBUG */
2010     }
2011 
2012   /* Expand iteration nodes. */
2013   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2014 			   tag_directions, &tnfa->params_depth);
2015   if (errcode != REG_OK)
2016     ERROR_EXIT(errcode);
2017 
2018   /* Add a dummy node for the final state.
2019      XXX - For certain patterns this dummy node can be optimized away,
2020 	   for example "a*" or "ab*".	Figure out a simple way to detect
2021 	   this possibility. */
2022   tmp_ast_l = tree;
2023   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2024   if (tmp_ast_r == NULL)
2025     ERROR_EXIT(REG_ESPACE);
2026 
2027   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2028   if (tree == NULL)
2029     ERROR_EXIT(REG_ESPACE);
2030 
2031 #ifdef TRE_DEBUG
2032   tre_ast_print(tree);
2033   DPRINT(("Number of states: %d\n", parse_ctx.position));
2034 #endif /* TRE_DEBUG */
2035 
2036   errcode = tre_compute_nfl(mem, stack, tree);
2037   if (errcode != REG_OK)
2038     ERROR_EXIT(errcode);
2039 
2040   counts = xmalloc(sizeof(int) * parse_ctx.position);
2041   if (counts == NULL)
2042     ERROR_EXIT(REG_ESPACE);
2043 
2044   offs = xmalloc(sizeof(int) * parse_ctx.position);
2045   if (offs == NULL)
2046     ERROR_EXIT(REG_ESPACE);
2047 
2048   for (i = 0; i < parse_ctx.position; i++)
2049     counts[i] = 0;
2050   tre_ast_to_tnfa(tree, NULL, counts, NULL);
2051 
2052   add = 0;
2053   for (i = 0; i < parse_ctx.position; i++)
2054     {
2055       offs[i] = add;
2056       add += counts[i] + 1;
2057       counts[i] = 0;
2058     }
2059   transitions = xcalloc(add + 1, sizeof(*transitions));
2060   if (transitions == NULL)
2061     ERROR_EXIT(REG_ESPACE);
2062   tnfa->transitions = transitions;
2063   tnfa->num_transitions = add;
2064 
2065   DPRINT(("Converting to TNFA:\n"));
2066   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2067   if (errcode != REG_OK)
2068     ERROR_EXIT(errcode);
2069 
2070   /* If in eight bit mode, compute a table of characters that can be the
2071      first character of a match. */
2072   tnfa->first_char = -1;
2073   if (TRE_MB_CUR_MAX == 1 && !tmp_ast_l->nullable)
2074     {
2075       int count = 0;
2076       int k;
2077       DPRINT(("Characters that can start a match:"));
2078       tnfa->firstpos_chars = xcalloc(256, sizeof(char));
2079       if (tnfa->firstpos_chars == NULL)
2080 	ERROR_EXIT(REG_ESPACE);
2081       for (p = tree->firstpos; p->position >= 0; p++)
2082 	{
2083 	  tre_tnfa_transition_t *j = transitions + offs[p->position];
2084 	  while (j->state != NULL)
2085 	    {
2086 	      for (k = j->code_min; k <= j->code_max && k < 256; k++)
2087 		{
2088 		  DPRINT((" %d", k));
2089 		  tnfa->firstpos_chars[k] = 1;
2090 		  count++;
2091 		}
2092 	      j++;
2093 	    }
2094 	}
2095       DPRINT(("\n"));
2096 #define TRE_OPTIMIZE_FIRST_CHAR 1
2097 #if TRE_OPTIMIZE_FIRST_CHAR
2098       if (count == 1)
2099 	{
2100 	  for (k = 0; k < 256; k++)
2101 	    if (tnfa->firstpos_chars[k])
2102 	      {
2103 		DPRINT(("first char must be %d\n", k));
2104 		tnfa->first_char = k;
2105 		xfree(tnfa->firstpos_chars);
2106 		tnfa->firstpos_chars = NULL;
2107 		break;
2108 	      }
2109 	}
2110 #endif
2111 
2112     }
2113   else
2114     tnfa->firstpos_chars = NULL;
2115 
2116 
2117   p = tree->firstpos;
2118   i = 0;
2119   while (p->position >= 0)
2120     {
2121       i++;
2122 
2123 #ifdef TRE_DEBUG
2124       {
2125 	int *tags;
2126 	DPRINT(("initial: %d", p->position));
2127 	tags = p->tags;
2128 	if (tags != NULL)
2129 	  {
2130 	    if (*tags >= 0)
2131 	      DPRINT(("/"));
2132 	    while (*tags >= 0)
2133 	      {
2134 		DPRINT(("%d", *tags));
2135 		tags++;
2136 		if (*tags >= 0)
2137 		  DPRINT((","));
2138 	      }
2139 	  }
2140 	DPRINT((", assert %d", p->assertions));
2141 	if (p->params)
2142 	  {
2143 	    DPRINT((", "));
2144 	    tre_print_params(p->params);
2145 	  }
2146 	DPRINT(("\n"));
2147       }
2148 #endif /* TRE_DEBUG */
2149 
2150       p++;
2151     }
2152 
2153   initial = xcalloc(i + 1, sizeof(tre_tnfa_transition_t));
2154   if (initial == NULL)
2155     ERROR_EXIT(REG_ESPACE);
2156   tnfa->initial = initial;
2157 
2158   i = 0;
2159   for (p = tree->firstpos; p->position >= 0; p++)
2160     {
2161       initial[i].state = transitions + offs[p->position];
2162       initial[i].state_id = p->position;
2163       initial[i].tags = NULL;
2164       /* Copy the arrays p->tags, and p->params, they are allocated
2165 	 from a tre_mem object. */
2166       if (p->tags)
2167 	{
2168 	  int j;
2169 	  for (j = 0; p->tags[j] >= 0; j++);
2170 	  initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2171 	  if (!initial[i].tags)
2172 	    ERROR_EXIT(REG_ESPACE);
2173 	  memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2174 	}
2175       initial[i].params = NULL;
2176       if (p->params)
2177 	{
2178 	  initial[i].params = xmalloc(sizeof(*p->params) * TRE_PARAM_LAST);
2179 	  if (!initial[i].params)
2180 	    ERROR_EXIT(REG_ESPACE);
2181 	  memcpy(initial[i].params, p->params,
2182 		 sizeof(*p->params) * TRE_PARAM_LAST);
2183 	}
2184       initial[i].assertions = p->assertions;
2185       i++;
2186     }
2187   initial[i].state = NULL;
2188 
2189   tnfa->num_transitions = add;
2190   tnfa->final = transitions + offs[tree->lastpos[0].position];
2191   tnfa->num_states = parse_ctx.position;
2192   tnfa->cflags = cflags;
2193 
2194   DPRINT(("final state %p\n", (void *)tnfa->final));
2195 
2196   tre_mem_destroy(mem);
2197   tre_stack_destroy(stack);
2198   xfree(counts);
2199   xfree(offs);
2200 
2201   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2202   return REG_OK;
2203 
2204  error_exit:
2205   /* Free everything that was allocated and return the error code. */
2206   tre_mem_destroy(mem);
2207   if (stack != NULL)
2208     tre_stack_destroy(stack);
2209   if (counts != NULL)
2210     xfree(counts);
2211   if (offs != NULL)
2212     xfree(offs);
2213   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2214   tre_free(preg);
2215   return errcode;
2216 }
2217 
2218 
2219 
2220 
2221 void
tre_free(regex_t * preg)2222 tre_free(regex_t *preg)
2223 {
2224   tre_tnfa_t *tnfa;
2225   unsigned int i;
2226   tre_tnfa_transition_t *trans;
2227 
2228   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2229   if (!tnfa)
2230     return;
2231 
2232   for (i = 0; i < tnfa->num_transitions; i++)
2233     if (tnfa->transitions[i].state)
2234       {
2235 	if (tnfa->transitions[i].tags)
2236 	  xfree(tnfa->transitions[i].tags);
2237 	if (tnfa->transitions[i].neg_classes)
2238 	  xfree(tnfa->transitions[i].neg_classes);
2239 	if (tnfa->transitions[i].params)
2240 	  xfree(tnfa->transitions[i].params);
2241       }
2242   if (tnfa->transitions)
2243     xfree(tnfa->transitions);
2244 
2245   if (tnfa->initial)
2246     {
2247       for (trans = tnfa->initial; trans->state; trans++)
2248 	{
2249 	  if (trans->tags)
2250 	    xfree(trans->tags);
2251 	  if (trans->params)
2252 	    xfree(trans->params);
2253 	}
2254       xfree(tnfa->initial);
2255     }
2256 
2257   if (tnfa->submatch_data)
2258     {
2259       for (i = 0; i < tnfa->num_submatches; i++)
2260 	if (tnfa->submatch_data[i].parents)
2261 	  xfree(tnfa->submatch_data[i].parents);
2262       xfree(tnfa->submatch_data);
2263     }
2264 
2265   if (tnfa->tag_directions)
2266     xfree(tnfa->tag_directions);
2267   if (tnfa->firstpos_chars)
2268     xfree(tnfa->firstpos_chars);
2269   if (tnfa->minimal_tags)
2270     xfree(tnfa->minimal_tags);
2271   xfree(tnfa);
2272 }
2273 
2274 char *
tre_version(void)2275 tre_version(void)
2276 {
2277   static char str[256];
2278   char *version;
2279 
2280   if (str[0] == 0)
2281     {
2282       tre_config(TRE_CONFIG_VERSION, &version);
2283       sprintf(str, "TRE %s (LGPL)", version);
2284     }
2285   return str;
2286 }
2287 
2288 int
tre_config(int query,void * result)2289 tre_config(int query, void *result)
2290 {
2291   int *int_result = result;
2292   char **string_result = result;
2293 
2294   switch (query)
2295     {
2296     case TRE_CONFIG_APPROX:
2297 #ifdef TRE_APPROX
2298       *int_result = 1;
2299 #else /* !TRE_APPROX */
2300       *int_result = 0;
2301 #endif /* !TRE_APPROX */
2302       return REG_OK;
2303 
2304     case TRE_CONFIG_WCHAR:
2305 #ifdef TRE_WCHAR
2306       *int_result = 1;
2307 #else /* !TRE_WCHAR */
2308       *int_result = 0;
2309 #endif /* !TRE_WCHAR */
2310       return REG_OK;
2311 
2312     case TRE_CONFIG_MULTIBYTE:
2313 #ifdef TRE_MULTIBYTE
2314       *int_result = 1;
2315 #else /* !TRE_MULTIBYTE */
2316       *int_result = 0;
2317 #endif /* !TRE_MULTIBYTE */
2318       return REG_OK;
2319 
2320     case TRE_CONFIG_SYSTEM_ABI:
2321 #ifdef TRE_CONFIG_SYSTEM_ABI
2322       *int_result = 1;
2323 #else /* !TRE_CONFIG_SYSTEM_ABI */
2324       *int_result = 0;
2325 #endif /* !TRE_CONFIG_SYSTEM_ABI */
2326       return REG_OK;
2327 
2328     case TRE_CONFIG_VERSION:
2329       *string_result = TRE_VERSION;
2330       return REG_OK;
2331     }
2332 
2333   return REG_NOMATCH;
2334 }
2335 
2336 
2337 /* EOF */
2338