1 /*
2   tre-match-backtrack.c - TRE backtracking regex matching engine
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   This matcher is for regexps that use back referencing.  Regexp matching
11   with back referencing is an NP-complete problem on the number of back
12   references.  The easiest way to match them is to use a backtracking
13   routine which basically goes through all possible paths in the TNFA
14   and chooses the one which results in the best (leftmost and longest)
15   match.  This can be spectacularly expensive and may run out of stack
16   space, but there really is no better known generic algorithm.	 Quoting
17   Henry Spencer from comp.compilers:
18   <URL: http://compilers.iecc.com/comparch/article/93-03-102>
19 
20     POSIX.2 REs require longest match, which is really exciting to
21     implement since the obsolete ("basic") variant also includes
22     \<digit>.  I haven't found a better way of tackling this than doing
23     a preliminary match using a DFA (or simulation) on a modified RE
24     that just replicates subREs for \<digit>, and then doing a
25     backtracking match to determine whether the subRE matches were
26     right.  This can be rather slow, but I console myself with the
27     thought that people who use \<digit> deserve very slow execution.
28     (Pun unintentional but very appropriate.)
29 
30 */
31 
32 
33 #ifdef HAVE_CONFIG_H
34 #include <config.h>
35 #endif /* HAVE_CONFIG_H */
36 
37 #ifdef TRE_USE_ALLOCA
38 /* AIX requires this to be the first thing in the file.	 */
39 #ifndef __GNUC__
40 # if HAVE_ALLOCA_H
41 #  include <alloca.h>
42 # else
43 #  ifdef _AIX
44  #pragma alloca
45 #  else
46 #   ifndef alloca /* predefined by HP cc +Olibcalls */
47 char *alloca ();
48 #   endif
49 #  endif
50 # endif
51 #endif
52 #endif /* TRE_USE_ALLOCA */
53 
54 //#include <assert.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #ifdef HAVE_WCHAR_H
58 #include <wchar.h>
59 #endif /* HAVE_WCHAR_H */
60 #ifdef HAVE_WCTYPE_H
61 #include <wctype.h>
62 #endif /* HAVE_WCTYPE_H */
63 #ifndef TRE_WCHAR
64 #include <ctype.h>
65 #endif /* !TRE_WCHAR */
66 #ifdef HAVE_MALLOC_H
67 #include <malloc.h>
68 #endif /* HAVE_MALLOC_H */
69 
70 #include "tre-internal.h"
71 #include "tre-mem.h"
72 #include "tre-match-utils.h"
73 #include "tre.h"
74 #include "xmalloc.h"
75 
76 #define assert(a) R_assert(a)
77 
78 typedef struct {
79   int pos;
80   const char *str_byte;
81 #ifdef TRE_WCHAR
82   const wchar_t *str_wide;
83 #endif /* TRE_WCHAR */
84   tre_tnfa_transition_t *state;
85   int state_id;
86   int next_c;
87   int *tags;
88 #ifdef TRE_MBSTATE
89   mbstate_t mbstate;
90 #endif /* TRE_MBSTATE */
91 } tre_backtrack_item_t;
92 
93 typedef struct tre_backtrack_struct {
94   tre_backtrack_item_t item;
95   struct tre_backtrack_struct *prev;
96   struct tre_backtrack_struct *next;
97 } *tre_backtrack_t;
98 
99 #ifdef TRE_WCHAR
100 #define BT_STACK_WIDE_IN(_str_wide)	stack->item.str_wide = (_str_wide)
101 #define BT_STACK_WIDE_OUT		(str_wide) = stack->item.str_wide
102 #else /* !TRE_WCHAR */
103 #define BT_STACK_WIDE_IN(_str_wide)
104 #define BT_STACK_WIDE_OUT
105 #endif /* !TRE_WCHAR */
106 
107 #ifdef TRE_MBSTATE
108 #define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
109 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
110 #else /* !TRE_MBSTATE */
111 #define BT_STACK_MBSTATE_IN
112 #define BT_STACK_MBSTATE_OUT
113 #endif /* !TRE_MBSTATE */
114 
115 
116 #ifdef TRE_USE_ALLOCA
117 #define tre_bt_mem_new		  tre_mem_newa
118 #define tre_bt_mem_alloc	  tre_mem_alloca
119 #define tre_bt_mem_destroy(obj)	  do { } while (0,0)
120 #else /* !TRE_USE_ALLOCA */
121 #define tre_bt_mem_new		  tre_mem_new
122 #define tre_bt_mem_alloc	  tre_mem_alloc
123 #define tre_bt_mem_destroy	  tre_mem_destroy
124 #endif /* !TRE_USE_ALLOCA */
125 
126 
127 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
128   do									      \
129     {									      \
130       int i;								      \
131       if (!stack->next)							      \
132 	{								      \
133 	  tre_backtrack_t s;						      \
134 	  s = tre_bt_mem_alloc(mem, sizeof(*s));			      \
135 	  if (!s)							      \
136 	    {								      \
137 	      tre_bt_mem_destroy(mem);					      \
138 	      if (tags)							      \
139 		xfree(tags);						      \
140 	      if (pmatch)						      \
141 		xfree(pmatch);						      \
142 	      if (states_seen)						      \
143 		xfree(states_seen);					      \
144 	      return REG_ESPACE;					      \
145 	    }								      \
146 	  s->prev = stack;						      \
147 	  s->next = NULL;						      \
148 	  s->item.tags = tre_bt_mem_alloc(mem,				      \
149 					  sizeof(*tags) * tnfa->num_tags);    \
150 	  if (!s->item.tags)						      \
151 	    {								      \
152 	      tre_bt_mem_destroy(mem);					      \
153 	      if (tags)							      \
154 		xfree(tags);						      \
155 	      if (pmatch)						      \
156 		xfree(pmatch);						      \
157 	      if (states_seen)						      \
158 		xfree(states_seen);					      \
159 	      return REG_ESPACE;					      \
160 	    }								      \
161 	  stack->next = s;						      \
162 	  stack = s;							      \
163 	}								      \
164       else								      \
165 	stack = stack->next;						      \
166       stack->item.pos = (_pos);						      \
167       stack->item.str_byte = (_str_byte);				      \
168       BT_STACK_WIDE_IN(_str_wide);					      \
169       stack->item.state = (_state);					      \
170       stack->item.state_id = (_state_id);				      \
171       stack->item.next_c = (_next_c);					      \
172       for (i = 0; i < tnfa->num_tags; i++)				      \
173 	stack->item.tags[i] = (_tags)[i];				      \
174       BT_STACK_MBSTATE_IN;						      \
175     }									      \
176   while (/*CONSTCOND*/(void)0,0)
177 
178 #define BT_STACK_POP()							      \
179   do									      \
180     {									      \
181       int i;								      \
182       assert(stack->prev);						      \
183       pos = stack->item.pos;						      \
184       if (type == STR_USER)                                                   \
185         str_source->rewind(pos + pos_add_next, str_source->context);          \
186       str_byte = stack->item.str_byte;					      \
187       BT_STACK_WIDE_OUT;						      \
188       state = stack->item.state;					      \
189       next_c = (tre_char_t) stack->item.next_c;					      \
190       for (i = 0; i < tnfa->num_tags; i++)				      \
191 	tags[i] = stack->item.tags[i];					      \
192       BT_STACK_MBSTATE_OUT;						      \
193       stack = stack->prev;						      \
194     }									      \
195   while (/*CONSTCOND*/(void)0,0)
196 
197 #undef MIN
198 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
199 
200 reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t * tnfa,const void * string,int len,tre_str_type_t type,int * match_tags,int eflags,int * match_end_ofs)201 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
202 		       int len, tre_str_type_t type, int *match_tags,
203 		       int eflags, int *match_end_ofs)
204 {
205   /* State variables required by GET_NEXT_WCHAR. */
206   tre_char_t prev_c = 0, next_c = 0;
207   const char *str_byte = string;
208   int pos = 0;
209   unsigned int pos_add_next = 1;
210 #ifdef TRE_WCHAR
211   const wchar_t *str_wide = string;
212 #ifdef TRE_MBSTATE
213   mbstate_t mbstate;
214 #endif /* TRE_MBSTATE */
215 #endif /* TRE_WCHAR */
216   int reg_notbol = eflags & REG_NOTBOL;
217   int reg_noteol = eflags & REG_NOTEOL;
218   int reg_newline = tnfa->cflags & REG_NEWLINE;
219   int str_user_end = 0;
220 
221   /* These are used to remember the necessary values of the above
222      variables to return to the position where the current search
223      started from. */
224   int next_c_start;
225   const char *str_byte_start;
226   int pos_start = -1;
227 #ifdef TRE_WCHAR
228   const wchar_t *str_wide_start;
229 #endif /* TRE_WCHAR */
230 #ifdef TRE_MBSTATE
231   mbstate_t mbstate_start;
232 #endif /* TRE_MBSTATE */
233 
234   /* End offset of best match so far, or -1 if no match found yet. */
235   int match_eo = -1;
236   /* Tag arrays. */
237   int *next_tags, *tags = NULL;
238   /* Current TNFA state. */
239   tre_tnfa_transition_t *state;
240   int *states_seen = NULL;
241 
242   /* Memory allocator to for allocating the backtracking stack. */
243   tre_mem_t mem = tre_bt_mem_new();
244 
245   /* The backtracking stack. */
246   tre_backtrack_t stack;
247 
248   tre_tnfa_transition_t *trans_i;
249   regmatch_t *pmatch = NULL;
250   int ret;
251 
252 #ifdef TRE_MBSTATE
253   memset(&mbstate, '\0', sizeof(mbstate));
254 #endif /* TRE_MBSTATE */
255 
256   if (!mem)
257     return REG_ESPACE;
258   stack = tre_bt_mem_alloc(mem, sizeof(*stack));
259   if (!stack)
260     {
261       ret = REG_ESPACE;
262       goto error_exit;
263     }
264   stack->prev = NULL;
265   stack->next = NULL;
266 
267   DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
268   DPRINT(("len = %d\n", len));
269 
270 #ifdef TRE_USE_ALLOCA
271   tags = alloca(sizeof(*tags) * tnfa->num_tags);
272   pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
273   states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
274 #else /* !TRE_USE_ALLOCA */
275   if (tnfa->num_tags)
276     {
277       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
278       if (!tags)
279 	{
280 	  ret = REG_ESPACE;
281 	  goto error_exit;
282 	}
283     }
284   if (tnfa->num_submatches)
285     {
286       pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
287       if (!pmatch)
288 	{
289 	  ret = REG_ESPACE;
290 	  goto error_exit;
291 	}
292     }
293   if (tnfa->num_states)
294     {
295       states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
296       if (!states_seen)
297 	{
298 	  ret = REG_ESPACE;
299 	  goto error_exit;
300 	}
301     }
302 #endif /* !TRE_USE_ALLOCA */
303 
304  retry:
305   {
306     int i;
307     for (i = 0; i < tnfa->num_tags; i++)
308       {
309 	tags[i] = -1;
310 	if (match_tags)
311 	  match_tags[i] = -1;
312       }
313     for (i = 0; i < tnfa->num_states; i++)
314       states_seen[i] = 0;
315   }
316 
317   state = NULL;
318   pos = pos_start;
319   if (type == STR_USER)
320     str_source->rewind(pos + pos_add_next, str_source->context);
321   GET_NEXT_WCHAR();
322   pos_start = pos;
323   next_c_start = next_c;
324   str_byte_start = str_byte;
325 #ifdef TRE_WCHAR
326   str_wide_start = str_wide;
327 #endif /* TRE_WCHAR */
328 #ifdef TRE_MBSTATE
329   mbstate_start = mbstate;
330 #endif /* TRE_MBSTATE */
331 
332   /* Handle initial states. */
333   next_tags = NULL;
334   for (trans_i = tnfa->initial; trans_i->state; trans_i++)
335     {
336       DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
337       if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
338 	{
339 	  DPRINT(("assert failed\n"));
340 	  continue;
341 	}
342       if (state == NULL)
343 	{
344 	  /* Start from this state. */
345 	  state = trans_i->state;
346 	  next_tags = trans_i->tags;
347 	}
348       else
349 	{
350 	  /* Backtrack to this state. */
351 	  DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
352 	  BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
353 			trans_i->state_id, next_c, tags, mbstate);
354 	  {
355 	    int *tmp = trans_i->tags;
356 	    if (tmp)
357 	      while (*tmp >= 0)
358 		stack->item.tags[*tmp++] = pos;
359 	  }
360 	}
361     }
362 
363   if (next_tags)
364     for (; *next_tags >= 0; next_tags++)
365       tags[*next_tags] = pos;
366 
367 
368   DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
369   DPRINT(("pos:chr/code | state and tags\n"));
370   DPRINT(("-------------+------------------------------------------------\n"));
371 
372   if (state == NULL)
373     goto backtrack;
374 
375   while (/*CONSTCOND*/(void)1,1)
376     {
377       tre_tnfa_transition_t *next_state;
378       int empty_br_match;
379 
380       DPRINT(("start loop\n"));
381       if (state == tnfa->final)
382 	{
383 	  DPRINT(("  match found, %d %d\n", match_eo, pos));
384 	  if (match_eo < pos
385 	      || (match_eo == pos
386 		  && match_tags
387 		  && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
388 				   tags, match_tags)))
389 	    {
390 	      int i;
391 	      /* This match wins the previous match. */
392 	      DPRINT(("	 win previous\n"));
393 	      match_eo = pos;
394 	      if (match_tags)
395 		for (i = 0; i < tnfa->num_tags; i++)
396 		  match_tags[i] = tags[i];
397 	    }
398 	  /* Our TNFAs never have transitions leaving from the final state,
399 	     so we jump right to backtracking. */
400 	  goto backtrack;
401 	}
402 
403 #ifdef TRE_DEBUG
404       DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
405 	      state));
406       {
407 	int i;
408 	for (i = 0; i < tnfa->num_tags; i++)
409 	  DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
410 	DPRINT(("\n"));
411       }
412 #endif /* TRE_DEBUG */
413 
414       /* Go to the next character in the input string. */
415       empty_br_match = 0;
416       trans_i = state;
417       if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
418 	{
419 	  /* This is a back reference state.  All transitions leaving from
420 	     this state have the same back reference "assertion".  Instead
421 	     of reading the next character, we match the back reference. */
422 	  int so, eo, bt = trans_i->u.backref;
423 	  int bt_len;
424 	  int result;
425 
426 	  DPRINT(("  should match back reference %d\n", bt));
427 	  /* Get the substring we need to match against.  Remember to
428 	     turn off REG_NOSUB temporarily. */
429 	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & /*LINTED*/!REG_NOSUB,
430 			  tnfa, tags, pos);
431 	  so = pmatch[bt].rm_so;
432 	  eo = pmatch[bt].rm_eo;
433 	  bt_len = eo - so;
434 
435 #ifdef TRE_DEBUG
436 	  {
437 	    int slen;
438 	    if (len < 0)
439 	      slen = bt_len;
440 	    else
441 	      slen = MIN(bt_len, len - pos);
442 
443 	    if (type == STR_BYTE)
444 	      {
445 		DPRINT(("  substring (len %d) is [%d, %d[: '%.*s'\n",
446 			bt_len, so, eo, bt_len, (char*)string + so));
447 		DPRINT(("  current string is '%.*s'\n", slen, str_byte - 1));
448 	      }
449 #ifdef TRE_WCHAR
450 	    else if (type == STR_WIDE)
451 	      {
452 		DPRINT(("  substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
453 			bt_len, so, eo, bt_len, (wchar_t*)string + so));
454 		DPRINT(("  current string is '%.*" STRF "'\n",
455 			slen, str_wide - 1));
456 	      }
457 #endif /* TRE_WCHAR */
458 	  }
459 #endif
460 
461 	  if (len < 0)
462 	    {
463 	      if (type == STR_USER)
464 		result = str_source->compare((unsigned)so, (unsigned)pos,
465 					     (unsigned)bt_len,
466 					     str_source->context);
467 #ifdef TRE_WCHAR
468 	      else if (type == STR_WIDE)
469 		result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
470 				 (size_t)bt_len);
471 #endif /* TRE_WCHAR */
472 	      else
473 		result = strncmp((const char*)string + so, str_byte - 1,
474 				 (size_t)bt_len);
475 	    }
476 	  else if (len - pos < bt_len)
477 	    result = 1;
478 #ifdef TRE_WCHAR
479 	  else if (type == STR_WIDE)
480 	    result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
481 			     (size_t)bt_len);
482 #endif /* TRE_WCHAR */
483 	  else
484 	    result = memcmp((const char*)string + so, str_byte - 1,
485 			    (size_t)bt_len);
486 
487 	  if (result == 0)
488 	    {
489 	      /* Back reference matched.  Check for infinite loop. */
490 	      if (bt_len == 0)
491 		empty_br_match = 1;
492 	      if (empty_br_match && states_seen[trans_i->state_id])
493 		{
494 		  DPRINT(("  avoid loop\n"));
495 		  goto backtrack;
496 		}
497 
498 	      states_seen[trans_i->state_id] = empty_br_match;
499 
500 	      /* Advance in input string and resync `prev_c', `next_c'
501 		 and pos. */
502 	      DPRINT(("	 back reference matched\n"));
503 	      str_byte += bt_len - 1;
504 #ifdef TRE_WCHAR
505 	      str_wide += bt_len - 1;
506 #endif /* TRE_WCHAR */
507 	      pos += bt_len - 1;
508 	      GET_NEXT_WCHAR();
509 	      DPRINT(("	 pos now %d\n", pos));
510 	    }
511 	  else
512 	    {
513 	      DPRINT(("	 back reference did not match\n"));
514 	      goto backtrack;
515 	    }
516 	}
517       else
518 	{
519 	  /* Check for end of string. */
520 	  if (len < 0)
521 	    {
522 	      if (type == STR_USER)
523 		{
524 		  if (str_user_end)
525 		    goto backtrack;
526 		}
527 	      else if (next_c == L'\0')
528 		goto backtrack;
529 	    }
530 	  else
531 	    {
532 	      if (pos >= len)
533 		goto backtrack;
534 	    }
535 
536 	  /* Read the next character. */
537 	  GET_NEXT_WCHAR();
538 	}
539 
540       next_state = NULL;
541       for (trans_i = state; trans_i->state; trans_i++)
542 	{
543 	  DPRINT(("  transition %d-%d (%c-%c) %d to %d\n",
544 		  trans_i->code_min, trans_i->code_max,
545 		  trans_i->code_min, trans_i->code_max,
546 		  trans_i->assertions, trans_i->state_id));
547 	  if (trans_i->code_min <= (tre_cint_t)prev_c
548 	      && trans_i->code_max >= (tre_cint_t)prev_c)
549 	    {
550 	      if (trans_i->assertions
551 		  && (CHECK_ASSERTIONS(trans_i->assertions)
552 		      || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
553 		{
554 		  DPRINT(("  assertion failed\n"));
555 		  continue;
556 		}
557 
558 	      if (next_state == NULL)
559 		{
560 		  /* First matching transition. */
561 		  DPRINT(("  Next state is %d\n", trans_i->state_id));
562 		  next_state = trans_i->state;
563 		  next_tags = trans_i->tags;
564 		}
565 	      else
566 		{
567 		  /* Second matching transition.  We may need to backtrack here
568 		     to take this transition instead of the first one, so we
569 		     push this transition in the backtracking stack so we can
570 		     jump back here if needed. */
571 		  DPRINT(("  saving state %d for backtracking\n",
572 			  trans_i->state_id));
573 		  BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
574 				trans_i->state_id, next_c, tags, mbstate);
575 		  {
576 		    int *tmp;
577 		    for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
578 		      stack->item.tags[*tmp] = pos;
579 		  }
580 #if 0 /* XXX - it's important not to look at all transitions here to keep
581 	 the stack small! */
582 		  break;
583 #endif
584 		}
585 	    }
586 	}
587 
588       if (next_state != NULL)
589 	{
590 	  /* Matching transitions were found.  Take the first one. */
591 	  state = next_state;
592 
593 	  /* Update the tag values. */
594 	  if (next_tags)
595 	    while (*next_tags >= 0)
596 	      tags[*next_tags++] = pos;
597 	}
598       else
599 	{
600 	backtrack:
601 	  /* A matching transition was not found.  Try to backtrack. */
602 	  if (stack->prev)
603 	    {
604 	      DPRINT(("	 backtracking\n"));
605 	      if (stack->item.state->assertions & ASSERT_BACKREF)
606 		{
607 		  DPRINT(("  states_seen[%d] = 0\n",
608 			  stack->item.state_id));
609 		  states_seen[stack->item.state_id] = 0;
610 		}
611 
612 	      BT_STACK_POP();
613 	    }
614 	  else if (match_eo < 0)
615 	    {
616 	      /* Try starting from a later position in the input string. */
617 	      /* Check for end of string. */
618 	      if (len < 0)
619 		{
620 		  if (next_c == L'\0')
621 		    {
622 		      DPRINT(("end of string.\n"));
623 		      break;
624 		    }
625 		}
626 	      else
627 		{
628 		  if (pos >= len)
629 		    {
630 		      DPRINT(("end of string.\n"));
631 		      break;
632 		    }
633 		}
634 	      DPRINT(("restarting from next start position\n"));
635 	      next_c = (tre_char_t) next_c_start;
636 #ifdef TRE_MBSTATE
637 	      mbstate = mbstate_start;
638 #endif /* TRE_MBSTATE */
639 	      str_byte = str_byte_start;
640 #ifdef TRE_WCHAR
641 	      str_wide = str_wide_start;
642 #endif /* TRE_WCHAR */
643 	      goto retry;
644 	    }
645 	  else
646 	    {
647 	      DPRINT(("finished\n"));
648 	      break;
649 	    }
650 	}
651     }
652 
653   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
654   *match_end_ofs = match_eo;
655 
656  error_exit:
657   tre_bt_mem_destroy(mem);
658 #ifndef TRE_USE_ALLOCA
659   if (tags)
660     xfree(tags);
661   if (pmatch)
662     xfree(pmatch);
663   if (states_seen)
664     xfree(states_seen);
665 #endif /* !TRE_USE_ALLOCA */
666 
667   return ret;
668 }
669