1 /*
2 tre-match-backtrack.c - TRE backtracking regex matching engine
3
4 This software is released under a BSD-style license.
5 See the file LICENSE for details and copyright.
6
7 */
8
9 /*
10 This matcher is for regexps that use back referencing. Regexp matching
11 with back referencing is an NP-complete problem on the number of back
12 references. The easiest way to match them is to use a backtracking
13 routine which basically goes through all possible paths in the TNFA
14 and chooses the one which results in the best (leftmost and longest)
15 match. This can be spectacularly expensive and may run out of stack
16 space, but there really is no better known generic algorithm. Quoting
17 Henry Spencer from comp.compilers:
18 <URL: http://compilers.iecc.com/comparch/article/93-03-102>
19
20 POSIX.2 REs require longest match, which is really exciting to
21 implement since the obsolete ("basic") variant also includes
22 \<digit>. I haven't found a better way of tackling this than doing
23 a preliminary match using a DFA (or simulation) on a modified RE
24 that just replicates subREs for \<digit>, and then doing a
25 backtracking match to determine whether the subRE matches were
26 right. This can be rather slow, but I console myself with the
27 thought that people who use \<digit> deserve very slow execution.
28 (Pun unintentional but very appropriate.)
29
30 */
31
32
33 #ifdef HAVE_CONFIG_H
34 #include <config.h>
35 #endif /* HAVE_CONFIG_H */
36
37 #include "tre-alloca.h"
38
39 #include <assert.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #ifdef HAVE_WCHAR_H
43 #include <wchar.h>
44 #endif /* HAVE_WCHAR_H */
45 #ifdef HAVE_WCTYPE_H
46 #include <wctype.h>
47 #endif /* HAVE_WCTYPE_H */
48 #ifndef TRE_WCHAR
49 #include <ctype.h>
50 #endif /* !TRE_WCHAR */
51 #ifdef HAVE_MALLOC_H
52 #include <malloc.h>
53 #endif /* HAVE_MALLOC_H */
54
55 #include "tre-internal.h"
56 #include "tre-mem.h"
57 #include "tre-match-utils.h"
58 #include "tre.h"
59 #include "xmalloc.h"
60
61 typedef struct {
62 int pos;
63 const char *str_byte;
64 #ifdef TRE_WCHAR
65 const wchar_t *str_wide;
66 #endif /* TRE_WCHAR */
67 tre_tnfa_transition_t *state;
68 int state_id;
69 int next_c;
70 int *tags;
71 #ifdef TRE_MBSTATE
72 mbstate_t mbstate;
73 #endif /* TRE_MBSTATE */
74 } tre_backtrack_item_t;
75
76 typedef struct tre_backtrack_struct {
77 tre_backtrack_item_t item;
78 struct tre_backtrack_struct *prev;
79 struct tre_backtrack_struct *next;
80 } *tre_backtrack_t;
81
82 #ifdef TRE_WCHAR
83 #define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide)
84 #define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide
85 #else /* !TRE_WCHAR */
86 #define BT_STACK_WIDE_IN(_str_wide)
87 #define BT_STACK_WIDE_OUT
88 #endif /* !TRE_WCHAR */
89
90 #ifdef TRE_MBSTATE
91 #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)
92 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
93 #else /* !TRE_MBSTATE */
94 #define BT_STACK_MBSTATE_IN
95 #define BT_STACK_MBSTATE_OUT
96 #endif /* !TRE_MBSTATE */
97
98
99 #ifdef TRE_USE_ALLOCA
100 #define tre_bt_mem_new tre_mem_newa
101 #define tre_bt_mem_alloc tre_mem_alloca
102 #define tre_bt_mem_destroy(obj) do { } while (/*CONSTCOND*/(void)0,0)
103 #else /* !TRE_USE_ALLOCA */
104 #define tre_bt_mem_new tre_mem_new
105 #define tre_bt_mem_alloc tre_mem_alloc
106 #define tre_bt_mem_destroy tre_mem_destroy
107 #endif /* !TRE_USE_ALLOCA */
108
109
110 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
111 do \
112 { \
113 size_t i; \
114 if (!stack->next) \
115 { \
116 tre_backtrack_t s; \
117 s = tre_bt_mem_alloc(mem, sizeof(*s)); \
118 if (!s) \
119 { \
120 tre_bt_mem_destroy(mem); \
121 if (tags) \
122 xfree(tags); \
123 if (pmatch) \
124 xfree(pmatch); \
125 if (states_seen) \
126 xfree(states_seen); \
127 return REG_ESPACE; \
128 } \
129 s->prev = stack; \
130 s->next = NULL; \
131 s->item.tags = tre_bt_mem_alloc(mem, \
132 sizeof(*tags) * tnfa->num_tags); \
133 if (!s->item.tags) \
134 { \
135 tre_bt_mem_destroy(mem); \
136 if (tags) \
137 xfree(tags); \
138 if (pmatch) \
139 xfree(pmatch); \
140 if (states_seen) \
141 xfree(states_seen); \
142 return REG_ESPACE; \
143 } \
144 stack->next = s; \
145 stack = s; \
146 } \
147 else \
148 stack = stack->next; \
149 stack->item.pos = (_pos); \
150 stack->item.str_byte = (_str_byte); \
151 BT_STACK_WIDE_IN(_str_wide); \
152 stack->item.state = (_state); \
153 stack->item.state_id = (_state_id); \
154 stack->item.next_c = (_next_c); \
155 for (i = 0; i < tnfa->num_tags; i++) \
156 stack->item.tags[i] = (_tags)[i]; \
157 BT_STACK_MBSTATE_IN; \
158 } \
159 while (/*CONSTCOND*/(void)0,0)
160
161 #define BT_STACK_POP() \
162 do \
163 { \
164 size_t i; \
165 assert(stack->prev); \
166 pos = stack->item.pos; \
167 if (type == STR_USER) \
168 str_source->rewind(pos + pos_add_next, str_source->context); \
169 str_byte = stack->item.str_byte; \
170 BT_STACK_WIDE_OUT; \
171 state = stack->item.state; \
172 next_c = (tre_char_t) stack->item.next_c; \
173 for (i = 0; i < tnfa->num_tags; i++) \
174 tags[i] = stack->item.tags[i]; \
175 BT_STACK_MBSTATE_OUT; \
176 stack = stack->prev; \
177 } \
178 while (/*CONSTCOND*/(void)0,0)
179
180 #undef MIN
181 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
182
183 reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t * tnfa,const void * string,int len,tre_str_type_t type,int * match_tags,int eflags,int * match_end_ofs)184 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
185 int len, tre_str_type_t type, int *match_tags,
186 int eflags, int *match_end_ofs)
187 {
188 /* State variables required by GET_NEXT_WCHAR. */
189 tre_char_t prev_c = 0, next_c = 0;
190 const char *str_byte = string;
191 int pos = 0;
192 unsigned int pos_add_next = 1;
193 #ifdef TRE_WCHAR
194 const wchar_t *str_wide = string;
195 #ifdef TRE_MBSTATE
196 mbstate_t mbstate;
197 #endif /* TRE_MBSTATE */
198 #endif /* TRE_WCHAR */
199 int reg_notbol = eflags & REG_NOTBOL;
200 int reg_noteol = eflags & REG_NOTEOL;
201 int reg_newline = tnfa->cflags & REG_NEWLINE;
202 size_t str_user_end = 0;
203
204 /* These are used to remember the necessary values of the above
205 variables to return to the position where the current search
206 started from. */
207 int next_c_start;
208 const char *str_byte_start;
209 int pos_start = -1;
210 #ifdef TRE_WCHAR
211 const wchar_t *str_wide_start;
212 #endif /* TRE_WCHAR */
213 #ifdef TRE_MBSTATE
214 mbstate_t mbstate_start;
215 #endif /* TRE_MBSTATE */
216
217 /* End offset of best match so far, or -1 if no match found yet. */
218 int match_eo = -1;
219 /* Tag arrays. */
220 int *next_tags, *tags = NULL;
221 /* Current TNFA state. */
222 tre_tnfa_transition_t *state;
223 int *states_seen = NULL;
224
225 /* Memory allocator to for allocating the backtracking stack. */
226 tre_mem_t mem = tre_bt_mem_new();
227
228 /* The backtracking stack. */
229 tre_backtrack_t stack;
230
231 tre_tnfa_transition_t *trans_i;
232 regmatch_t *pmatch = NULL;
233 reg_errcode_t ret;
234
235 #ifdef TRE_MBSTATE
236 memset(&mbstate, '\0', sizeof(mbstate));
237 #endif /* TRE_MBSTATE */
238
239 if (!mem)
240 return REG_ESPACE;
241 stack = tre_bt_mem_alloc(mem, sizeof(*stack));
242 if (!stack)
243 {
244 ret = REG_ESPACE;
245 goto error_exit;
246 }
247 stack->prev = NULL;
248 stack->next = NULL;
249
250 DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
251 DPRINT(("len = %d\n", len));
252
253 #ifdef TRE_USE_ALLOCA
254 tags = alloca(sizeof(*tags) * tnfa->num_tags);
255 pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
256 states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
257 #else /* !TRE_USE_ALLOCA */
258 if (tnfa->num_tags)
259 {
260 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
261 if (!tags)
262 {
263 ret = REG_ESPACE;
264 goto error_exit;
265 }
266 }
267 if (tnfa->num_submatches)
268 {
269 pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
270 if (!pmatch)
271 {
272 ret = REG_ESPACE;
273 goto error_exit;
274 }
275 }
276 if (tnfa->num_states)
277 {
278 states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
279 if (!states_seen)
280 {
281 ret = REG_ESPACE;
282 goto error_exit;
283 }
284 }
285 #endif /* !TRE_USE_ALLOCA */
286
287 retry:
288 {
289 size_t i;
290 for (i = 0; i < tnfa->num_tags; i++)
291 {
292 tags[i] = -1;
293 if (match_tags)
294 match_tags[i] = -1;
295 }
296 for (i = 0; i < tnfa->num_states; i++)
297 states_seen[i] = 0;
298 }
299
300 state = NULL;
301 pos = pos_start;
302 if (type == STR_USER)
303 str_source->rewind(pos + pos_add_next, str_source->context);
304 GET_NEXT_WCHAR();
305 pos_start = pos;
306 next_c_start = next_c;
307 str_byte_start = str_byte;
308 #ifdef TRE_WCHAR
309 str_wide_start = str_wide;
310 #endif /* TRE_WCHAR */
311 #ifdef TRE_MBSTATE
312 mbstate_start = mbstate;
313 #endif /* TRE_MBSTATE */
314
315 /* Handle initial states. */
316 next_tags = NULL;
317 for (trans_i = tnfa->initial; trans_i->state; trans_i++)
318 {
319 DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
320 if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
321 {
322 DPRINT(("assert failed\n"));
323 continue;
324 }
325 if (state == NULL)
326 {
327 /* Start from this state. */
328 state = trans_i->state;
329 next_tags = trans_i->tags;
330 }
331 else
332 {
333 /* Backtrack to this state. */
334 DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
335 BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
336 trans_i->state_id, next_c, tags, mbstate);
337 {
338 int *tmp = trans_i->tags;
339 if (tmp)
340 while (*tmp >= 0)
341 stack->item.tags[*tmp++] = pos;
342 }
343 }
344 }
345
346 if (next_tags)
347 for (; *next_tags >= 0; next_tags++)
348 tags[*next_tags] = pos;
349
350
351 DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
352 DPRINT(("pos:chr/code | state and tags\n"));
353 DPRINT(("-------------+------------------------------------------------\n"));
354
355 if (state == NULL)
356 goto backtrack;
357
358 while (/*CONSTCOND*/(void)1,1)
359 {
360 tre_tnfa_transition_t *next_state;
361 int empty_br_match;
362
363 DPRINT(("start loop\n"));
364 if (state == tnfa->final)
365 {
366 DPRINT((" match found, %d %d\n", match_eo, pos));
367 if (match_eo < pos
368 || (match_eo == pos
369 && match_tags
370 && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
371 tags, match_tags)))
372 {
373 size_t i;
374 /* This match wins the previous match. */
375 DPRINT((" win previous\n"));
376 match_eo = pos;
377 if (match_tags)
378 for (i = 0; i < tnfa->num_tags; i++)
379 match_tags[i] = tags[i];
380 }
381 /* Our TNFAs never have transitions leaving from the final state,
382 so we jump right to backtracking. */
383 goto backtrack;
384 }
385
386 #ifdef TRE_DEBUG
387 DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
388 state));
389 {
390 int i;
391 for (i = 0; i < tnfa->num_tags; i++)
392 DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
393 DPRINT(("\n"));
394 }
395 #endif /* TRE_DEBUG */
396
397 /* Go to the next character in the input string. */
398 empty_br_match = 0;
399 trans_i = state;
400 if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
401 {
402 /* This is a back reference state. All transitions leaving from
403 this state have the same back reference "assertion". Instead
404 of reading the next character, we match the back reference. */
405 regoff_t so, eo, bt = trans_i->u.backref;
406 regoff_t bt_len;
407 regoff_t result;
408
409 DPRINT((" should match back reference %d\n", bt));
410 /* Get the substring we need to match against. Remember to
411 turn off REG_NOSUB temporarily. */
412 tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
413 tnfa, tags, pos);
414 /* LINTED */so = pmatch[bt].rm_so;
415 /* LINTED */eo = pmatch[bt].rm_eo;
416 bt_len = eo - so;
417
418 #ifdef TRE_DEBUG
419 {
420 int slen;
421 if (len < 0)
422 slen = bt_len;
423 else
424 slen = MIN(bt_len, len - pos);
425
426 if (type == STR_BYTE)
427 {
428 DPRINT((" substring (len %d) is [%d, %d[: '%.*s'\n",
429 bt_len, so, eo, bt_len, (char*)string + so));
430 DPRINT((" current string is '%.*s'\n", slen, str_byte - 1));
431 }
432 #ifdef TRE_WCHAR
433 else if (type == STR_WIDE)
434 {
435 DPRINT((" substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
436 bt_len, so, eo, bt_len, (wchar_t*)string + so));
437 DPRINT((" current string is '%.*" STRF "'\n",
438 slen, str_wide - 1));
439 }
440 #endif /* TRE_WCHAR */
441 }
442 #endif
443
444 if (len < 0)
445 {
446 if (type == STR_USER)
447 result = str_source->compare((unsigned)so, (unsigned)pos,
448 (unsigned)bt_len,
449 str_source->context);
450 #ifdef TRE_WCHAR
451 else if (type == STR_WIDE)
452 result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
453 (size_t)bt_len);
454 #endif /* TRE_WCHAR */
455 else
456 /* LINTED */result = strncmp((const char*)string + so, str_byte - 1,
457 (size_t)bt_len);
458 }
459 else if (len - pos < bt_len)
460 result = 1;
461 #ifdef TRE_WCHAR
462 else if (type == STR_WIDE)
463 result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
464 (size_t)bt_len);
465 #endif /* TRE_WCHAR */
466 else
467 /* LINTED */result = memcmp((const char*)string + so, str_byte - 1,
468 (size_t)bt_len);
469
470 if (result == 0)
471 {
472 /* Back reference matched. Check for infinite loop. */
473 if (bt_len == 0)
474 empty_br_match = 1;
475 if (empty_br_match && states_seen[trans_i->state_id])
476 {
477 DPRINT((" avoid loop\n"));
478 goto backtrack;
479 }
480
481 states_seen[trans_i->state_id] = empty_br_match;
482
483 /* Advance in input string and resync `prev_c', `next_c'
484 and pos. */
485 DPRINT((" back reference matched\n"));
486 /* LINTED */str_byte += bt_len - 1;
487 #ifdef TRE_WCHAR
488 /* LINTED */str_wide += bt_len - 1;
489 #endif /* TRE_WCHAR */
490 /* LINTED */pos += bt_len - 1;
491 GET_NEXT_WCHAR();
492 DPRINT((" pos now %d\n", pos));
493 }
494 else
495 {
496 DPRINT((" back reference did not match\n"));
497 goto backtrack;
498 }
499 }
500 else
501 {
502 /* Check for end of string. */
503 if (len < 0)
504 {
505 if (type == STR_USER)
506 {
507 if (str_user_end)
508 goto backtrack;
509 }
510 else if (next_c == L'\0')
511 goto backtrack;
512 }
513 else
514 {
515 if (pos >= len)
516 goto backtrack;
517 }
518
519 /* Read the next character. */
520 GET_NEXT_WCHAR();
521 }
522
523 next_state = NULL;
524 for (trans_i = state; trans_i->state; trans_i++)
525 {
526 DPRINT((" transition %d-%d (%c-%c) %d to %d\n",
527 trans_i->code_min, trans_i->code_max,
528 trans_i->code_min, trans_i->code_max,
529 trans_i->assertions, trans_i->state_id));
530 if (trans_i->code_min <= (tre_cint_t)prev_c
531 && trans_i->code_max >= (tre_cint_t)prev_c)
532 {
533 if (trans_i->assertions
534 && (CHECK_ASSERTIONS(trans_i->assertions)
535 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
536 {
537 DPRINT((" assertion failed\n"));
538 continue;
539 }
540
541 if (next_state == NULL)
542 {
543 /* First matching transition. */
544 DPRINT((" Next state is %d\n", trans_i->state_id));
545 next_state = trans_i->state;
546 next_tags = trans_i->tags;
547 }
548 else
549 {
550 /* Second matching transition. We may need to backtrack here
551 to take this transition instead of the first one, so we
552 push this transition in the backtracking stack so we can
553 jump back here if needed. */
554 DPRINT((" saving state %d for backtracking\n",
555 trans_i->state_id));
556 BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
557 trans_i->state_id, next_c, tags, mbstate);
558 {
559 int *tmp;
560 for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
561 stack->item.tags[*tmp] = pos;
562 }
563 #if 0 /* XXX - it's important not to look at all transitions here to keep
564 the stack small! */
565 break;
566 #endif
567 }
568 }
569 }
570
571 if (next_state != NULL)
572 {
573 /* Matching transitions were found. Take the first one. */
574 state = next_state;
575
576 /* Update the tag values. */
577 if (next_tags)
578 while (*next_tags >= 0)
579 tags[*next_tags++] = pos;
580 }
581 else
582 {
583 backtrack:
584 /* A matching transition was not found. Try to backtrack. */
585 if (stack->prev)
586 {
587 DPRINT((" backtracking\n"));
588 #if ASSERT_BACKREF
589 if (stack->item.state->assertions)
590 {
591 DPRINT((" states_seen[%d] = 0\n",
592 stack->item.state_id));
593 states_seen[stack->item.state_id] = 0;
594 }
595 #endif
596
597 BT_STACK_POP();
598 }
599 else if (match_eo < 0)
600 {
601 /* Try starting from a later position in the input string. */
602 /* Check for end of string. */
603 if (len < 0)
604 {
605 if (next_c == L'\0')
606 {
607 DPRINT(("end of string.\n"));
608 break;
609 }
610 }
611 else
612 {
613 if (pos >= len)
614 {
615 DPRINT(("end of string.\n"));
616 break;
617 }
618 }
619 DPRINT(("restarting from next start position\n"));
620 next_c = (tre_char_t) next_c_start;
621 #ifdef TRE_MBSTATE
622 mbstate = mbstate_start;
623 #endif /* TRE_MBSTATE */
624 str_byte = str_byte_start;
625 #ifdef TRE_WCHAR
626 str_wide = str_wide_start;
627 #endif /* TRE_WCHAR */
628 goto retry;
629 }
630 else
631 {
632 DPRINT(("finished\n"));
633 break;
634 }
635 }
636 }
637
638 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
639 *match_end_ofs = match_eo;
640
641 error_exit:
642 tre_bt_mem_destroy(mem);
643 #ifndef TRE_USE_ALLOCA
644 if (tags)
645 xfree(tags);
646 if (pmatch)
647 xfree(pmatch);
648 if (states_seen)
649 xfree(states_seen);
650 #endif /* !TRE_USE_ALLOCA */
651
652 return ret;
653 }
654