1 /*
2   tre-match-backtrack.c - TRE backtracking regex matching engine
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   This matcher is for regexps that use back referencing.  Regexp matching
11   with back referencing is an NP-complete problem on the number of back
12   references.  The easiest way to match them is to use a backtracking
13   routine which basically goes through all possible paths in the TNFA
14   and chooses the one which results in the best (leftmost and longest)
15   match.  This can be spectacularly expensive and may run out of stack
16   space, but there really is no better known generic algorithm.	 Quoting
17   Henry Spencer from comp.compilers:
18   <URL: http://compilers.iecc.com/comparch/article/93-03-102>
19 
20     POSIX.2 REs require longest match, which is really exciting to
21     implement since the obsolete ("basic") variant also includes
22     \<digit>.  I haven't found a better way of tackling this than doing
23     a preliminary match using a DFA (or simulation) on a modified RE
24     that just replicates subREs for \<digit>, and then doing a
25     backtracking match to determine whether the subRE matches were
26     right.  This can be rather slow, but I console myself with the
27     thought that people who use \<digit> deserve very slow execution.
28     (Pun unintentional but very appropriate.)
29 
30 */
31 
32 
33 #ifdef HAVE_CONFIG_H
34 #include <config.h>
35 #endif /* HAVE_CONFIG_H */
36 
37 #include "tre-alloca.h"
38 
39 #include <assert.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #ifdef HAVE_WCHAR_H
43 #include <wchar.h>
44 #endif /* HAVE_WCHAR_H */
45 #ifdef HAVE_WCTYPE_H
46 #include <wctype.h>
47 #endif /* HAVE_WCTYPE_H */
48 #ifndef TRE_WCHAR
49 #include <ctype.h>
50 #endif /* !TRE_WCHAR */
51 #ifdef HAVE_MALLOC_H
52 #include <malloc.h>
53 #endif /* HAVE_MALLOC_H */
54 
55 #include "tre-internal.h"
56 #include "tre-mem.h"
57 #include "tre-match-utils.h"
58 #include "tre.h"
59 #include "xmalloc.h"
60 
61 typedef struct {
62   int pos;
63   const char *str_byte;
64 #ifdef TRE_WCHAR
65   const wchar_t *str_wide;
66 #endif /* TRE_WCHAR */
67   tre_tnfa_transition_t *state;
68   int state_id;
69   int next_c;
70   int *tags;
71 #ifdef TRE_MBSTATE
72   mbstate_t mbstate;
73 #endif /* TRE_MBSTATE */
74 } tre_backtrack_item_t;
75 
76 typedef struct tre_backtrack_struct {
77   tre_backtrack_item_t item;
78   struct tre_backtrack_struct *prev;
79   struct tre_backtrack_struct *next;
80 } *tre_backtrack_t;
81 
82 #ifdef TRE_WCHAR
83 #define BT_STACK_WIDE_IN(_str_wide)	stack->item.str_wide = (_str_wide)
84 #define BT_STACK_WIDE_OUT		(str_wide) = stack->item.str_wide
85 #else /* !TRE_WCHAR */
86 #define BT_STACK_WIDE_IN(_str_wide)
87 #define BT_STACK_WIDE_OUT
88 #endif /* !TRE_WCHAR */
89 
90 #ifdef TRE_MBSTATE
91 #define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
92 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
93 #else /* !TRE_MBSTATE */
94 #define BT_STACK_MBSTATE_IN
95 #define BT_STACK_MBSTATE_OUT
96 #endif /* !TRE_MBSTATE */
97 
98 
99 #ifdef TRE_USE_ALLOCA
100 #define tre_bt_mem_new		  tre_mem_newa
101 #define tre_bt_mem_alloc	  tre_mem_alloca
102 #define tre_bt_mem_destroy(obj)	  do { } while (/*CONSTCOND*/(void)0,0)
103 #else /* !TRE_USE_ALLOCA */
104 #define tre_bt_mem_new		  tre_mem_new
105 #define tre_bt_mem_alloc	  tre_mem_alloc
106 #define tre_bt_mem_destroy	  tre_mem_destroy
107 #endif /* !TRE_USE_ALLOCA */
108 
109 
110 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
111   do									      \
112     {									      \
113       size_t i;								      \
114       if (!stack->next)							      \
115 	{								      \
116 	  tre_backtrack_t s;						      \
117 	  s = tre_bt_mem_alloc(mem, sizeof(*s));			      \
118 	  if (!s)							      \
119 	    {								      \
120 	      tre_bt_mem_destroy(mem);					      \
121 	      if (tags)							      \
122 		xfree(tags);						      \
123 	      if (pmatch)						      \
124 		xfree(pmatch);						      \
125 	      if (states_seen)						      \
126 		xfree(states_seen);					      \
127 	      return REG_ESPACE;					      \
128 	    }								      \
129 	  s->prev = stack;						      \
130 	  s->next = NULL;						      \
131 	  s->item.tags = tre_bt_mem_alloc(mem,				      \
132 					  sizeof(*tags) * tnfa->num_tags);    \
133 	  if (!s->item.tags)						      \
134 	    {								      \
135 	      tre_bt_mem_destroy(mem);					      \
136 	      if (tags)							      \
137 		xfree(tags);						      \
138 	      if (pmatch)						      \
139 		xfree(pmatch);						      \
140 	      if (states_seen)						      \
141 		xfree(states_seen);					      \
142 	      return REG_ESPACE;					      \
143 	    }								      \
144 	  stack->next = s;						      \
145 	  stack = s;							      \
146 	}								      \
147       else								      \
148 	stack = stack->next;						      \
149       stack->item.pos = (_pos);						      \
150       stack->item.str_byte = (_str_byte);				      \
151       BT_STACK_WIDE_IN(_str_wide);					      \
152       stack->item.state = (_state);					      \
153       stack->item.state_id = (_state_id);				      \
154       stack->item.next_c = (_next_c);					      \
155       for (i = 0; i < tnfa->num_tags; i++)				      \
156 	stack->item.tags[i] = (_tags)[i];				      \
157       BT_STACK_MBSTATE_IN;						      \
158     }									      \
159   while (/*CONSTCOND*/(void)0,0)
160 
161 #define BT_STACK_POP()							      \
162   do									      \
163     {									      \
164       size_t i;								      \
165       assert(stack->prev);						      \
166       pos = stack->item.pos;						      \
167       if (type == STR_USER)                                                   \
168         str_source->rewind(pos + pos_add_next, str_source->context);          \
169       str_byte = stack->item.str_byte;					      \
170       BT_STACK_WIDE_OUT;						      \
171       state = stack->item.state;					      \
172       next_c = (tre_char_t) stack->item.next_c;					      \
173       for (i = 0; i < tnfa->num_tags; i++)				      \
174 	tags[i] = stack->item.tags[i];					      \
175       BT_STACK_MBSTATE_OUT;						      \
176       stack = stack->prev;						      \
177     }									      \
178   while (/*CONSTCOND*/(void)0,0)
179 
180 #undef MIN
181 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
182 
183 reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t * tnfa,const void * string,int len,tre_str_type_t type,int * match_tags,int eflags,int * match_end_ofs)184 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
185 		       int len, tre_str_type_t type, int *match_tags,
186 		       int eflags, int *match_end_ofs)
187 {
188   /* State variables required by GET_NEXT_WCHAR. */
189   tre_char_t prev_c = 0, next_c = 0;
190   const char *str_byte = string;
191   int pos = 0;
192   unsigned int pos_add_next = 1;
193 #ifdef TRE_WCHAR
194   const wchar_t *str_wide = string;
195 #ifdef TRE_MBSTATE
196   mbstate_t mbstate;
197 #endif /* TRE_MBSTATE */
198 #endif /* TRE_WCHAR */
199   int reg_notbol = eflags & REG_NOTBOL;
200   int reg_noteol = eflags & REG_NOTEOL;
201   int reg_newline = tnfa->cflags & REG_NEWLINE;
202   size_t str_user_end = 0;
203 
204   /* These are used to remember the necessary values of the above
205      variables to return to the position where the current search
206      started from. */
207   int next_c_start;
208   const char *str_byte_start;
209   int pos_start = -1;
210 #ifdef TRE_WCHAR
211   const wchar_t *str_wide_start;
212 #endif /* TRE_WCHAR */
213 #ifdef TRE_MBSTATE
214   mbstate_t mbstate_start;
215 #endif /* TRE_MBSTATE */
216 
217   /* End offset of best match so far, or -1 if no match found yet. */
218   int match_eo = -1;
219   /* Tag arrays. */
220   int *next_tags, *tags = NULL;
221   /* Current TNFA state. */
222   tre_tnfa_transition_t *state;
223   int *states_seen = NULL;
224 
225   /* Memory allocator to for allocating the backtracking stack. */
226   tre_mem_t mem = tre_bt_mem_new();
227 
228   /* The backtracking stack. */
229   tre_backtrack_t stack;
230 
231   tre_tnfa_transition_t *trans_i;
232   regmatch_t *pmatch = NULL;
233   reg_errcode_t ret;
234 
235 #ifdef TRE_MBSTATE
236   memset(&mbstate, '\0', sizeof(mbstate));
237 #endif /* TRE_MBSTATE */
238 
239   if (!mem)
240     return REG_ESPACE;
241   stack = tre_bt_mem_alloc(mem, sizeof(*stack));
242   if (!stack)
243     {
244       ret = REG_ESPACE;
245       goto error_exit;
246     }
247   stack->prev = NULL;
248   stack->next = NULL;
249 
250   DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
251   DPRINT(("len = %d\n", len));
252 
253 #ifdef TRE_USE_ALLOCA
254   tags = alloca(sizeof(*tags) * tnfa->num_tags);
255   pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
256   states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
257 #else /* !TRE_USE_ALLOCA */
258   if (tnfa->num_tags)
259     {
260       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
261       if (!tags)
262 	{
263 	  ret = REG_ESPACE;
264 	  goto error_exit;
265 	}
266     }
267   if (tnfa->num_submatches)
268     {
269       pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
270       if (!pmatch)
271 	{
272 	  ret = REG_ESPACE;
273 	  goto error_exit;
274 	}
275     }
276   if (tnfa->num_states)
277     {
278       states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
279       if (!states_seen)
280 	{
281 	  ret = REG_ESPACE;
282 	  goto error_exit;
283 	}
284     }
285 #endif /* !TRE_USE_ALLOCA */
286 
287  retry:
288   {
289     size_t i;
290     for (i = 0; i < tnfa->num_tags; i++)
291       {
292 	tags[i] = -1;
293 	if (match_tags)
294 	  match_tags[i] = -1;
295       }
296     for (i = 0; i < tnfa->num_states; i++)
297       states_seen[i] = 0;
298   }
299 
300   state = NULL;
301   pos = pos_start;
302   if (type == STR_USER)
303     str_source->rewind(pos + pos_add_next, str_source->context);
304   GET_NEXT_WCHAR();
305   pos_start = pos;
306   next_c_start = next_c;
307   str_byte_start = str_byte;
308 #ifdef TRE_WCHAR
309   str_wide_start = str_wide;
310 #endif /* TRE_WCHAR */
311 #ifdef TRE_MBSTATE
312   mbstate_start = mbstate;
313 #endif /* TRE_MBSTATE */
314 
315   /* Handle initial states. */
316   next_tags = NULL;
317   for (trans_i = tnfa->initial; trans_i->state; trans_i++)
318     {
319       DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
320       if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
321 	{
322 	  DPRINT(("assert failed\n"));
323 	  continue;
324 	}
325       if (state == NULL)
326 	{
327 	  /* Start from this state. */
328 	  state = trans_i->state;
329 	  next_tags = trans_i->tags;
330 	}
331       else
332 	{
333 	  /* Backtrack to this state. */
334 	  DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
335 	  BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
336 			trans_i->state_id, next_c, tags, mbstate);
337 	  {
338 	    int *tmp = trans_i->tags;
339 	    if (tmp)
340 	      while (*tmp >= 0)
341 		stack->item.tags[*tmp++] = pos;
342 	  }
343 	}
344     }
345 
346   if (next_tags)
347     for (; *next_tags >= 0; next_tags++)
348       tags[*next_tags] = pos;
349 
350 
351   DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
352   DPRINT(("pos:chr/code | state and tags\n"));
353   DPRINT(("-------------+------------------------------------------------\n"));
354 
355   if (state == NULL)
356     goto backtrack;
357 
358   while (/*CONSTCOND*/(void)1,1)
359     {
360       tre_tnfa_transition_t *next_state;
361       int empty_br_match;
362 
363       DPRINT(("start loop\n"));
364       if (state == tnfa->final)
365 	{
366 	  DPRINT(("  match found, %d %d\n", match_eo, pos));
367 	  if (match_eo < pos
368 	      || (match_eo == pos
369 		  && match_tags
370 		  && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
371 				   tags, match_tags)))
372 	    {
373 	      size_t i;
374 	      /* This match wins the previous match. */
375 	      DPRINT(("	 win previous\n"));
376 	      match_eo = pos;
377 	      if (match_tags)
378 		for (i = 0; i < tnfa->num_tags; i++)
379 		  match_tags[i] = tags[i];
380 	    }
381 	  /* Our TNFAs never have transitions leaving from the final state,
382 	     so we jump right to backtracking. */
383 	  goto backtrack;
384 	}
385 
386 #ifdef TRE_DEBUG
387       DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
388 	      state));
389       {
390 	int i;
391 	for (i = 0; i < tnfa->num_tags; i++)
392 	  DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
393 	DPRINT(("\n"));
394       }
395 #endif /* TRE_DEBUG */
396 
397       /* Go to the next character in the input string. */
398       empty_br_match = 0;
399       trans_i = state;
400       if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
401 	{
402 	  /* This is a back reference state.  All transitions leaving from
403 	     this state have the same back reference "assertion".  Instead
404 	     of reading the next character, we match the back reference. */
405 	  regoff_t so, eo, bt = trans_i->u.backref;
406 	  regoff_t bt_len;
407 	  regoff_t result;
408 
409 	  DPRINT(("  should match back reference %d\n", bt));
410 	  /* Get the substring we need to match against.  Remember to
411 	     turn off REG_NOSUB temporarily. */
412 	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
413 			  tnfa, tags, pos);
414 	  /* LINTED */so = pmatch[bt].rm_so;
415 	  /* LINTED */eo = pmatch[bt].rm_eo;
416 	  bt_len = eo - so;
417 
418 #ifdef TRE_DEBUG
419 	  {
420 	    int slen;
421 	    if (len < 0)
422 	      slen = bt_len;
423 	    else
424 	      slen = MIN(bt_len, len - pos);
425 
426 	    if (type == STR_BYTE)
427 	      {
428 		DPRINT(("  substring (len %d) is [%d, %d[: '%.*s'\n",
429 			bt_len, so, eo, bt_len, (char*)string + so));
430 		DPRINT(("  current string is '%.*s'\n", slen, str_byte - 1));
431 	      }
432 #ifdef TRE_WCHAR
433 	    else if (type == STR_WIDE)
434 	      {
435 		DPRINT(("  substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
436 			bt_len, so, eo, bt_len, (wchar_t*)string + so));
437 		DPRINT(("  current string is '%.*" STRF "'\n",
438 			slen, str_wide - 1));
439 	      }
440 #endif /* TRE_WCHAR */
441 	  }
442 #endif
443 
444 	  if (len < 0)
445 	    {
446 	      if (type == STR_USER)
447 		result = str_source->compare((unsigned)so, (unsigned)pos,
448 					     (unsigned)bt_len,
449 					     str_source->context);
450 #ifdef TRE_WCHAR
451 	      else if (type == STR_WIDE)
452 		result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
453 				 (size_t)bt_len);
454 #endif /* TRE_WCHAR */
455 	      else
456 		/* LINTED */result = strncmp((const char*)string + so, str_byte - 1,
457 				 (size_t)bt_len);
458 	    }
459 	  else if (len - pos < bt_len)
460 	    result = 1;
461 #ifdef TRE_WCHAR
462 	  else if (type == STR_WIDE)
463 	    result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
464 			     (size_t)bt_len);
465 #endif /* TRE_WCHAR */
466 	  else
467 	    /* LINTED */result = memcmp((const char*)string + so, str_byte - 1,
468 			    (size_t)bt_len);
469 
470 	  if (result == 0)
471 	    {
472 	      /* Back reference matched.  Check for infinite loop. */
473 	      if (bt_len == 0)
474 		empty_br_match = 1;
475 	      if (empty_br_match && states_seen[trans_i->state_id])
476 		{
477 		  DPRINT(("  avoid loop\n"));
478 		  goto backtrack;
479 		}
480 
481 	      states_seen[trans_i->state_id] = empty_br_match;
482 
483 	      /* Advance in input string and resync `prev_c', `next_c'
484 		 and pos. */
485 	      DPRINT(("	 back reference matched\n"));
486 	      /* LINTED */str_byte += bt_len - 1;
487 #ifdef TRE_WCHAR
488 	      /* LINTED */str_wide += bt_len - 1;
489 #endif /* TRE_WCHAR */
490 	      /* LINTED */pos += bt_len - 1;
491 	      GET_NEXT_WCHAR();
492 	      DPRINT(("	 pos now %d\n", pos));
493 	    }
494 	  else
495 	    {
496 	      DPRINT(("	 back reference did not match\n"));
497 	      goto backtrack;
498 	    }
499 	}
500       else
501 	{
502 	  /* Check for end of string. */
503 	  if (len < 0)
504 	    {
505 	      if (type == STR_USER)
506 		{
507 		  if (str_user_end)
508 		    goto backtrack;
509 		}
510 	      else if (next_c == L'\0')
511 		goto backtrack;
512 	    }
513 	  else
514 	    {
515 	      if (pos >= len)
516 		goto backtrack;
517 	    }
518 
519 	  /* Read the next character. */
520 	  GET_NEXT_WCHAR();
521 	}
522 
523       next_state = NULL;
524       for (trans_i = state; trans_i->state; trans_i++)
525 	{
526 	  DPRINT(("  transition %d-%d (%c-%c) %d to %d\n",
527 		  trans_i->code_min, trans_i->code_max,
528 		  trans_i->code_min, trans_i->code_max,
529 		  trans_i->assertions, trans_i->state_id));
530 	  if (trans_i->code_min <= (tre_cint_t)prev_c
531 	      && trans_i->code_max >= (tre_cint_t)prev_c)
532 	    {
533 	      if (trans_i->assertions
534 		  && (CHECK_ASSERTIONS(trans_i->assertions)
535 		      || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
536 		{
537 		  DPRINT(("  assertion failed\n"));
538 		  continue;
539 		}
540 
541 	      if (next_state == NULL)
542 		{
543 		  /* First matching transition. */
544 		  DPRINT(("  Next state is %d\n", trans_i->state_id));
545 		  next_state = trans_i->state;
546 		  next_tags = trans_i->tags;
547 		}
548 	      else
549 		{
550 		  /* Second matching transition.  We may need to backtrack here
551 		     to take this transition instead of the first one, so we
552 		     push this transition in the backtracking stack so we can
553 		     jump back here if needed. */
554 		  DPRINT(("  saving state %d for backtracking\n",
555 			  trans_i->state_id));
556 		  BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
557 				trans_i->state_id, next_c, tags, mbstate);
558 		  {
559 		    int *tmp;
560 		    for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
561 		      stack->item.tags[*tmp] = pos;
562 		  }
563 #if 0 /* XXX - it's important not to look at all transitions here to keep
564 	 the stack small! */
565 		  break;
566 #endif
567 		}
568 	    }
569 	}
570 
571       if (next_state != NULL)
572 	{
573 	  /* Matching transitions were found.  Take the first one. */
574 	  state = next_state;
575 
576 	  /* Update the tag values. */
577 	  if (next_tags)
578 	    while (*next_tags >= 0)
579 	      tags[*next_tags++] = pos;
580 	}
581       else
582 	{
583 	backtrack:
584 	  /* A matching transition was not found.  Try to backtrack. */
585 	  if (stack->prev)
586 	    {
587 	      DPRINT(("	 backtracking\n"));
588 #if ASSERT_BACKREF
589 	      if (stack->item.state->assertions)
590 		{
591 		  DPRINT(("  states_seen[%d] = 0\n",
592 			  stack->item.state_id));
593 		  states_seen[stack->item.state_id] = 0;
594 		}
595 #endif
596 
597 	      BT_STACK_POP();
598 	    }
599 	  else if (match_eo < 0)
600 	    {
601 	      /* Try starting from a later position in the input string. */
602 	      /* Check for end of string. */
603 	      if (len < 0)
604 		{
605 		  if (next_c == L'\0')
606 		    {
607 		      DPRINT(("end of string.\n"));
608 		      break;
609 		    }
610 		}
611 	      else
612 		{
613 		  if (pos >= len)
614 		    {
615 		      DPRINT(("end of string.\n"));
616 		      break;
617 		    }
618 		}
619 	      DPRINT(("restarting from next start position\n"));
620 	      next_c = (tre_char_t) next_c_start;
621 #ifdef TRE_MBSTATE
622 	      mbstate = mbstate_start;
623 #endif /* TRE_MBSTATE */
624 	      str_byte = str_byte_start;
625 #ifdef TRE_WCHAR
626 	      str_wide = str_wide_start;
627 #endif /* TRE_WCHAR */
628 	      goto retry;
629 	    }
630 	  else
631 	    {
632 	      DPRINT(("finished\n"));
633 	      break;
634 	    }
635 	}
636     }
637 
638   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
639   *match_end_ofs = match_eo;
640 
641  error_exit:
642   tre_bt_mem_destroy(mem);
643 #ifndef TRE_USE_ALLOCA
644   if (tags)
645     xfree(tags);
646   if (pmatch)
647     xfree(pmatch);
648   if (states_seen)
649     xfree(states_seen);
650 #endif /* !TRE_USE_ALLOCA */
651 
652   return ret;
653 }
654