1 /*
2 tre-match-backtrack.c - TRE backtracking regex matching engine
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7 */
8
9 /*
10 This matcher is for regexps that use back referencing. Regexp matching
11 with back referencing is an NP-complete problem on the number of back
12 references. The easiest way to match them is to use a backtracking
13 routine which basically goes through all possible paths in the TNFA
14 and chooses the one which results in the best (leftmost and longest)
15 match. This can be spectacularly expensive and may run out of stack
16 space, but there really is no better known generic algorithm. Quoting
17 Henry Spencer from comp.compilers:
18 <URL: http://compilers.iecc.com/comparch/article/93-03-102>
19
20 POSIX.2 REs require longest match, which is really exciting to
21 implement since the obsolete ("basic") variant also includes
22 \<digit>. I haven't found a better way of tackling this than doing
23 a preliminary match using a DFA (or simulation) on a modified RE
24 that just replicates subREs for \<digit>, and then doing a
25 backtracking match to determine whether the subRE matches were
26 right. This can be rather slow, but I console myself with the
27 thought that people who use \<digit> deserve very slow execution.
28 (Pun unintentional but very appropriate.)
29
30 */
31
32
33 #ifdef HAVE_CONFIG_H
34 #include <config.h>
35 #endif /* HAVE_CONFIG_H */
36
37 #ifdef TRE_USE_ALLOCA
38 /* AIX requires this to be the first thing in the file. */
39 #ifndef __GNUC__
40 # if HAVE_ALLOCA_H
41 # include <alloca.h>
42 # else
43 # ifdef _AIX
44 #pragma alloca
45 # else
46 # ifndef alloca /* predefined by HP cc +Olibcalls */
47 char *alloca ();
48 # endif
49 # endif
50 # endif
51 #endif
52 #endif /* TRE_USE_ALLOCA */
53
54 //#include <assert.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #ifdef HAVE_WCHAR_H
58 #include <wchar.h>
59 #endif /* HAVE_WCHAR_H */
60 #ifdef HAVE_WCTYPE_H
61 #include <wctype.h>
62 #endif /* HAVE_WCTYPE_H */
63 #ifndef TRE_WCHAR
64 #include <ctype.h>
65 #endif /* !TRE_WCHAR */
66 #ifdef HAVE_MALLOC_H
67 #include <malloc.h>
68 #endif /* HAVE_MALLOC_H */
69
70 #include "tre-internal.h"
71 #include "tre-mem.h"
72 #include "tre-match-utils.h"
73 #include "tre.h"
74 #include "xmalloc.h"
75
76 #define assert(a) R_assert(a)
77
78 typedef struct {
79 int pos;
80 const char *str_byte;
81 #ifdef TRE_WCHAR
82 const wchar_t *str_wide;
83 #endif /* TRE_WCHAR */
84 tre_tnfa_transition_t *state;
85 int state_id;
86 int next_c;
87 int *tags;
88 #ifdef TRE_MBSTATE
89 mbstate_t mbstate;
90 #endif /* TRE_MBSTATE */
91 } tre_backtrack_item_t;
92
93 typedef struct tre_backtrack_struct {
94 tre_backtrack_item_t item;
95 struct tre_backtrack_struct *prev;
96 struct tre_backtrack_struct *next;
97 } *tre_backtrack_t;
98
99 #ifdef TRE_WCHAR
100 #define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide)
101 #define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide
102 #else /* !TRE_WCHAR */
103 #define BT_STACK_WIDE_IN(_str_wide)
104 #define BT_STACK_WIDE_OUT
105 #endif /* !TRE_WCHAR */
106
107 #ifdef TRE_MBSTATE
108 #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)
109 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
110 #else /* !TRE_MBSTATE */
111 #define BT_STACK_MBSTATE_IN
112 #define BT_STACK_MBSTATE_OUT
113 #endif /* !TRE_MBSTATE */
114
115
116 #ifdef TRE_USE_ALLOCA
117 #define tre_bt_mem_new tre_mem_newa
118 #define tre_bt_mem_alloc tre_mem_alloca
119 #define tre_bt_mem_destroy(obj) do { } while (0,0)
120 #else /* !TRE_USE_ALLOCA */
121 #define tre_bt_mem_new tre_mem_new
122 #define tre_bt_mem_alloc tre_mem_alloc
123 #define tre_bt_mem_destroy tre_mem_destroy
124 #endif /* !TRE_USE_ALLOCA */
125
126
127 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
128 do \
129 { \
130 int i; \
131 if (!stack->next) \
132 { \
133 tre_backtrack_t s; \
134 s = tre_bt_mem_alloc(mem, sizeof(*s)); \
135 if (!s) \
136 { \
137 tre_bt_mem_destroy(mem); \
138 if (tags) \
139 xfree(tags); \
140 if (pmatch) \
141 xfree(pmatch); \
142 if (states_seen) \
143 xfree(states_seen); \
144 return REG_ESPACE; \
145 } \
146 s->prev = stack; \
147 s->next = NULL; \
148 s->item.tags = tre_bt_mem_alloc(mem, \
149 sizeof(*tags) * tnfa->num_tags); \
150 if (!s->item.tags) \
151 { \
152 tre_bt_mem_destroy(mem); \
153 if (tags) \
154 xfree(tags); \
155 if (pmatch) \
156 xfree(pmatch); \
157 if (states_seen) \
158 xfree(states_seen); \
159 return REG_ESPACE; \
160 } \
161 stack->next = s; \
162 stack = s; \
163 } \
164 else \
165 stack = stack->next; \
166 stack->item.pos = (_pos); \
167 stack->item.str_byte = (_str_byte); \
168 BT_STACK_WIDE_IN(_str_wide); \
169 stack->item.state = (_state); \
170 stack->item.state_id = (_state_id); \
171 stack->item.next_c = (_next_c); \
172 for (i = 0; i < tnfa->num_tags; i++) \
173 stack->item.tags[i] = (_tags)[i]; \
174 BT_STACK_MBSTATE_IN; \
175 } \
176 while (/*CONSTCOND*/(void)0,0)
177
178 #define BT_STACK_POP() \
179 do \
180 { \
181 int i; \
182 assert(stack->prev); \
183 pos = stack->item.pos; \
184 if (type == STR_USER) \
185 str_source->rewind(pos + pos_add_next, str_source->context); \
186 str_byte = stack->item.str_byte; \
187 BT_STACK_WIDE_OUT; \
188 state = stack->item.state; \
189 next_c = (tre_char_t) stack->item.next_c; \
190 for (i = 0; i < tnfa->num_tags; i++) \
191 tags[i] = stack->item.tags[i]; \
192 BT_STACK_MBSTATE_OUT; \
193 stack = stack->prev; \
194 } \
195 while (/*CONSTCOND*/(void)0,0)
196
197 #undef MIN
198 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
199
200 reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t * tnfa,const void * string,int len,tre_str_type_t type,int * match_tags,int eflags,int * match_end_ofs)201 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
202 int len, tre_str_type_t type, int *match_tags,
203 int eflags, int *match_end_ofs)
204 {
205 /* State variables required by GET_NEXT_WCHAR. */
206 tre_char_t prev_c = 0, next_c = 0;
207 const char *str_byte = string;
208 int pos = 0;
209 unsigned int pos_add_next = 1;
210 #ifdef TRE_WCHAR
211 const wchar_t *str_wide = string;
212 #ifdef TRE_MBSTATE
213 mbstate_t mbstate;
214 #endif /* TRE_MBSTATE */
215 #endif /* TRE_WCHAR */
216 int reg_notbol = eflags & REG_NOTBOL;
217 int reg_noteol = eflags & REG_NOTEOL;
218 int reg_newline = tnfa->cflags & REG_NEWLINE;
219 int str_user_end = 0;
220
221 /* These are used to remember the necessary values of the above
222 variables to return to the position where the current search
223 started from. */
224 int next_c_start;
225 const char *str_byte_start;
226 int pos_start = -1;
227 #ifdef TRE_WCHAR
228 const wchar_t *str_wide_start;
229 #endif /* TRE_WCHAR */
230 #ifdef TRE_MBSTATE
231 mbstate_t mbstate_start;
232 #endif /* TRE_MBSTATE */
233
234 /* End offset of best match so far, or -1 if no match found yet. */
235 int match_eo = -1;
236 /* Tag arrays. */
237 int *next_tags, *tags = NULL;
238 /* Current TNFA state. */
239 tre_tnfa_transition_t *state;
240 int *states_seen = NULL;
241
242 /* Memory allocator to for allocating the backtracking stack. */
243 tre_mem_t mem = tre_bt_mem_new();
244
245 /* The backtracking stack. */
246 tre_backtrack_t stack;
247
248 tre_tnfa_transition_t *trans_i;
249 regmatch_t *pmatch = NULL;
250 int ret;
251
252 #ifdef TRE_MBSTATE
253 memset(&mbstate, '\0', sizeof(mbstate));
254 #endif /* TRE_MBSTATE */
255
256 if (!mem)
257 return REG_ESPACE;
258 stack = tre_bt_mem_alloc(mem, sizeof(*stack));
259 if (!stack)
260 {
261 ret = REG_ESPACE;
262 goto error_exit;
263 }
264 stack->prev = NULL;
265 stack->next = NULL;
266
267 DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
268 DPRINT(("len = %d\n", len));
269
270 #ifdef TRE_USE_ALLOCA
271 tags = alloca(sizeof(*tags) * tnfa->num_tags);
272 pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
273 states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
274 #else /* !TRE_USE_ALLOCA */
275 if (tnfa->num_tags)
276 {
277 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
278 if (!tags)
279 {
280 ret = REG_ESPACE;
281 goto error_exit;
282 }
283 }
284 if (tnfa->num_submatches)
285 {
286 pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
287 if (!pmatch)
288 {
289 ret = REG_ESPACE;
290 goto error_exit;
291 }
292 }
293 if (tnfa->num_states)
294 {
295 states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
296 if (!states_seen)
297 {
298 ret = REG_ESPACE;
299 goto error_exit;
300 }
301 }
302 #endif /* !TRE_USE_ALLOCA */
303
304 retry:
305 {
306 int i;
307 for (i = 0; i < tnfa->num_tags; i++)
308 {
309 tags[i] = -1;
310 if (match_tags)
311 match_tags[i] = -1;
312 }
313 for (i = 0; i < tnfa->num_states; i++)
314 states_seen[i] = 0;
315 }
316
317 state = NULL;
318 pos = pos_start;
319 if (type == STR_USER)
320 str_source->rewind(pos + pos_add_next, str_source->context);
321 GET_NEXT_WCHAR();
322 pos_start = pos;
323 next_c_start = next_c;
324 str_byte_start = str_byte;
325 #ifdef TRE_WCHAR
326 str_wide_start = str_wide;
327 #endif /* TRE_WCHAR */
328 #ifdef TRE_MBSTATE
329 mbstate_start = mbstate;
330 #endif /* TRE_MBSTATE */
331
332 /* Handle initial states. */
333 next_tags = NULL;
334 for (trans_i = tnfa->initial; trans_i->state; trans_i++)
335 {
336 DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
337 if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
338 {
339 DPRINT(("assert failed\n"));
340 continue;
341 }
342 if (state == NULL)
343 {
344 /* Start from this state. */
345 state = trans_i->state;
346 next_tags = trans_i->tags;
347 }
348 else
349 {
350 /* Backtrack to this state. */
351 DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
352 BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
353 trans_i->state_id, next_c, tags, mbstate);
354 {
355 int *tmp = trans_i->tags;
356 if (tmp)
357 while (*tmp >= 0)
358 stack->item.tags[*tmp++] = pos;
359 }
360 }
361 }
362
363 if (next_tags)
364 for (; *next_tags >= 0; next_tags++)
365 tags[*next_tags] = pos;
366
367
368 DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
369 DPRINT(("pos:chr/code | state and tags\n"));
370 DPRINT(("-------------+------------------------------------------------\n"));
371
372 if (state == NULL)
373 goto backtrack;
374
375 while (/*CONSTCOND*/(void)1,1)
376 {
377 tre_tnfa_transition_t *next_state;
378 int empty_br_match;
379
380 DPRINT(("start loop\n"));
381 if (state == tnfa->final)
382 {
383 DPRINT((" match found, %d %d\n", match_eo, pos));
384 if (match_eo < pos
385 || (match_eo == pos
386 && match_tags
387 && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
388 tags, match_tags)))
389 {
390 int i;
391 /* This match wins the previous match. */
392 DPRINT((" win previous\n"));
393 match_eo = pos;
394 if (match_tags)
395 for (i = 0; i < tnfa->num_tags; i++)
396 match_tags[i] = tags[i];
397 }
398 /* Our TNFAs never have transitions leaving from the final state,
399 so we jump right to backtracking. */
400 goto backtrack;
401 }
402
403 #ifdef TRE_DEBUG
404 DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
405 state));
406 {
407 int i;
408 for (i = 0; i < tnfa->num_tags; i++)
409 DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
410 DPRINT(("\n"));
411 }
412 #endif /* TRE_DEBUG */
413
414 /* Go to the next character in the input string. */
415 empty_br_match = 0;
416 trans_i = state;
417 if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
418 {
419 /* This is a back reference state. All transitions leaving from
420 this state have the same back reference "assertion". Instead
421 of reading the next character, we match the back reference. */
422 int so, eo, bt = trans_i->u.backref;
423 int bt_len;
424 int result;
425
426 DPRINT((" should match back reference %d\n", bt));
427 /* Get the substring we need to match against. Remember to
428 turn off REG_NOSUB temporarily. */
429 tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & /*LINTED*/!REG_NOSUB,
430 tnfa, tags, pos);
431 so = pmatch[bt].rm_so;
432 eo = pmatch[bt].rm_eo;
433 bt_len = eo - so;
434
435 #ifdef TRE_DEBUG
436 {
437 int slen;
438 if (len < 0)
439 slen = bt_len;
440 else
441 slen = MIN(bt_len, len - pos);
442
443 if (type == STR_BYTE)
444 {
445 DPRINT((" substring (len %d) is [%d, %d[: '%.*s'\n",
446 bt_len, so, eo, bt_len, (char*)string + so));
447 DPRINT((" current string is '%.*s'\n", slen, str_byte - 1));
448 }
449 #ifdef TRE_WCHAR
450 else if (type == STR_WIDE)
451 {
452 DPRINT((" substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
453 bt_len, so, eo, bt_len, (wchar_t*)string + so));
454 DPRINT((" current string is '%.*" STRF "'\n",
455 slen, str_wide - 1));
456 }
457 #endif /* TRE_WCHAR */
458 }
459 #endif
460
461 if (len < 0)
462 {
463 if (type == STR_USER)
464 result = str_source->compare((unsigned)so, (unsigned)pos,
465 (unsigned)bt_len,
466 str_source->context);
467 #ifdef TRE_WCHAR
468 else if (type == STR_WIDE)
469 result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
470 (size_t)bt_len);
471 #endif /* TRE_WCHAR */
472 else
473 result = strncmp((const char*)string + so, str_byte - 1,
474 (size_t)bt_len);
475 }
476 else if (len - pos < bt_len)
477 result = 1;
478 #ifdef TRE_WCHAR
479 else if (type == STR_WIDE)
480 result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
481 (size_t)bt_len);
482 #endif /* TRE_WCHAR */
483 else
484 result = memcmp((const char*)string + so, str_byte - 1,
485 (size_t)bt_len);
486
487 if (result == 0)
488 {
489 /* Back reference matched. Check for infinite loop. */
490 if (bt_len == 0)
491 empty_br_match = 1;
492 if (empty_br_match && states_seen[trans_i->state_id])
493 {
494 DPRINT((" avoid loop\n"));
495 goto backtrack;
496 }
497
498 states_seen[trans_i->state_id] = empty_br_match;
499
500 /* Advance in input string and resync `prev_c', `next_c'
501 and pos. */
502 DPRINT((" back reference matched\n"));
503 str_byte += bt_len - 1;
504 #ifdef TRE_WCHAR
505 str_wide += bt_len - 1;
506 #endif /* TRE_WCHAR */
507 pos += bt_len - 1;
508 GET_NEXT_WCHAR();
509 DPRINT((" pos now %d\n", pos));
510 }
511 else
512 {
513 DPRINT((" back reference did not match\n"));
514 goto backtrack;
515 }
516 }
517 else
518 {
519 /* Check for end of string. */
520 if (len < 0)
521 {
522 if (type == STR_USER)
523 {
524 if (str_user_end)
525 goto backtrack;
526 }
527 else if (next_c == L'\0')
528 goto backtrack;
529 }
530 else
531 {
532 if (pos >= len)
533 goto backtrack;
534 }
535
536 /* Read the next character. */
537 GET_NEXT_WCHAR();
538 }
539
540 next_state = NULL;
541 for (trans_i = state; trans_i->state; trans_i++)
542 {
543 DPRINT((" transition %d-%d (%c-%c) %d to %d\n",
544 trans_i->code_min, trans_i->code_max,
545 trans_i->code_min, trans_i->code_max,
546 trans_i->assertions, trans_i->state_id));
547 if (trans_i->code_min <= (tre_cint_t)prev_c
548 && trans_i->code_max >= (tre_cint_t)prev_c)
549 {
550 if (trans_i->assertions
551 && (CHECK_ASSERTIONS(trans_i->assertions)
552 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
553 {
554 DPRINT((" assertion failed\n"));
555 continue;
556 }
557
558 if (next_state == NULL)
559 {
560 /* First matching transition. */
561 DPRINT((" Next state is %d\n", trans_i->state_id));
562 next_state = trans_i->state;
563 next_tags = trans_i->tags;
564 }
565 else
566 {
567 /* Second matching transition. We may need to backtrack here
568 to take this transition instead of the first one, so we
569 push this transition in the backtracking stack so we can
570 jump back here if needed. */
571 DPRINT((" saving state %d for backtracking\n",
572 trans_i->state_id));
573 BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
574 trans_i->state_id, next_c, tags, mbstate);
575 {
576 int *tmp;
577 for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
578 stack->item.tags[*tmp] = pos;
579 }
580 #if 0 /* XXX - it's important not to look at all transitions here to keep
581 the stack small! */
582 break;
583 #endif
584 }
585 }
586 }
587
588 if (next_state != NULL)
589 {
590 /* Matching transitions were found. Take the first one. */
591 state = next_state;
592
593 /* Update the tag values. */
594 if (next_tags)
595 while (*next_tags >= 0)
596 tags[*next_tags++] = pos;
597 }
598 else
599 {
600 backtrack:
601 /* A matching transition was not found. Try to backtrack. */
602 if (stack->prev)
603 {
604 DPRINT((" backtracking\n"));
605 if (stack->item.state->assertions & ASSERT_BACKREF)
606 {
607 DPRINT((" states_seen[%d] = 0\n",
608 stack->item.state_id));
609 states_seen[stack->item.state_id] = 0;
610 }
611
612 BT_STACK_POP();
613 }
614 else if (match_eo < 0)
615 {
616 /* Try starting from a later position in the input string. */
617 /* Check for end of string. */
618 if (len < 0)
619 {
620 if (next_c == L'\0')
621 {
622 DPRINT(("end of string.\n"));
623 break;
624 }
625 }
626 else
627 {
628 if (pos >= len)
629 {
630 DPRINT(("end of string.\n"));
631 break;
632 }
633 }
634 DPRINT(("restarting from next start position\n"));
635 next_c = (tre_char_t) next_c_start;
636 #ifdef TRE_MBSTATE
637 mbstate = mbstate_start;
638 #endif /* TRE_MBSTATE */
639 str_byte = str_byte_start;
640 #ifdef TRE_WCHAR
641 str_wide = str_wide_start;
642 #endif /* TRE_WCHAR */
643 goto retry;
644 }
645 else
646 {
647 DPRINT(("finished\n"));
648 break;
649 }
650 }
651 }
652
653 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
654 *match_end_ofs = match_eo;
655
656 error_exit:
657 tre_bt_mem_destroy(mem);
658 #ifndef TRE_USE_ALLOCA
659 if (tags)
660 xfree(tags);
661 if (pmatch)
662 xfree(pmatch);
663 if (states_seen)
664 xfree(states_seen);
665 #endif /* !TRE_USE_ALLOCA */
666
667 return ret;
668 }
669