15f2eab64SJohn Marino /*
25f2eab64SJohn Marino tre-match-backtrack.c - TRE backtracking regex matching engine
35f2eab64SJohn Marino
45f2eab64SJohn Marino This software is released under a BSD-style license.
55f2eab64SJohn Marino See the file LICENSE for details and copyright.
65f2eab64SJohn Marino
75f2eab64SJohn Marino */
85f2eab64SJohn Marino
95f2eab64SJohn Marino /*
105f2eab64SJohn Marino This matcher is for regexps that use back referencing. Regexp matching
115f2eab64SJohn Marino with back referencing is an NP-complete problem on the number of back
125f2eab64SJohn Marino references. The easiest way to match them is to use a backtracking
135f2eab64SJohn Marino routine which basically goes through all possible paths in the TNFA
145f2eab64SJohn Marino and chooses the one which results in the best (leftmost and longest)
155f2eab64SJohn Marino match. This can be spectacularly expensive and may run out of stack
165f2eab64SJohn Marino space, but there really is no better known generic algorithm. Quoting
175f2eab64SJohn Marino Henry Spencer from comp.compilers:
185f2eab64SJohn Marino <URL: http://compilers.iecc.com/comparch/article/93-03-102>
195f2eab64SJohn Marino
205f2eab64SJohn Marino POSIX.2 REs require longest match, which is really exciting to
215f2eab64SJohn Marino implement since the obsolete ("basic") variant also includes
225f2eab64SJohn Marino \<digit>. I haven't found a better way of tackling this than doing
235f2eab64SJohn Marino a preliminary match using a DFA (or simulation) on a modified RE
245f2eab64SJohn Marino that just replicates subREs for \<digit>, and then doing a
255f2eab64SJohn Marino backtracking match to determine whether the subRE matches were
265f2eab64SJohn Marino right. This can be rather slow, but I console myself with the
275f2eab64SJohn Marino thought that people who use \<digit> deserve very slow execution.
285f2eab64SJohn Marino (Pun unintentional but very appropriate.)
295f2eab64SJohn Marino
305f2eab64SJohn Marino */
315f2eab64SJohn Marino
325f2eab64SJohn Marino
335f2eab64SJohn Marino #ifdef HAVE_CONFIG_H
345f2eab64SJohn Marino #include <config.h>
355f2eab64SJohn Marino #endif /* HAVE_CONFIG_H */
365f2eab64SJohn Marino
37d5f8dde1SJohn Marino /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
38d5f8dde1SJohn Marino info while running */
39d5f8dde1SJohn Marino #undef TRE_USE_ALLOCA
40d5f8dde1SJohn Marino
415f2eab64SJohn Marino #ifdef TRE_USE_ALLOCA
425f2eab64SJohn Marino /* AIX requires this to be the first thing in the file. */
435f2eab64SJohn Marino #ifndef __GNUC__
445f2eab64SJohn Marino # if HAVE_ALLOCA_H
455f2eab64SJohn Marino # include <alloca.h>
465f2eab64SJohn Marino # else
475f2eab64SJohn Marino # ifdef _AIX
485f2eab64SJohn Marino #pragma alloca
495f2eab64SJohn Marino # else
505f2eab64SJohn Marino # ifndef alloca /* predefined by HP cc +Olibcalls */
515f2eab64SJohn Marino char *alloca ();
525f2eab64SJohn Marino # endif
535f2eab64SJohn Marino # endif
545f2eab64SJohn Marino # endif
555f2eab64SJohn Marino #endif
565f2eab64SJohn Marino #endif /* TRE_USE_ALLOCA */
575f2eab64SJohn Marino
585f2eab64SJohn Marino #include <assert.h>
595f2eab64SJohn Marino #include <stdlib.h>
605f2eab64SJohn Marino #include <string.h>
615f2eab64SJohn Marino #ifdef HAVE_WCHAR_H
625f2eab64SJohn Marino #include <wchar.h>
635f2eab64SJohn Marino #endif /* HAVE_WCHAR_H */
645f2eab64SJohn Marino #ifdef HAVE_WCTYPE_H
655f2eab64SJohn Marino #include <wctype.h>
665f2eab64SJohn Marino #endif /* HAVE_WCTYPE_H */
675f2eab64SJohn Marino #ifndef TRE_WCHAR
685f2eab64SJohn Marino #include <ctype.h>
695f2eab64SJohn Marino #endif /* !TRE_WCHAR */
705f2eab64SJohn Marino #ifdef HAVE_MALLOC_H
715f2eab64SJohn Marino #include <malloc.h>
725f2eab64SJohn Marino #endif /* HAVE_MALLOC_H */
735f2eab64SJohn Marino
745f2eab64SJohn Marino #include "tre-internal.h"
755f2eab64SJohn Marino #include "tre-mem.h"
765f2eab64SJohn Marino #include "tre-match-utils.h"
775f2eab64SJohn Marino #include "tre.h"
785f2eab64SJohn Marino #include "xmalloc.h"
795f2eab64SJohn Marino
805f2eab64SJohn Marino typedef struct {
815f2eab64SJohn Marino int pos;
82d5f8dde1SJohn Marino unsigned int pos_add_next;
835f2eab64SJohn Marino const char *str_byte;
845f2eab64SJohn Marino #ifdef TRE_WCHAR
855f2eab64SJohn Marino const wchar_t *str_wide;
865f2eab64SJohn Marino #endif /* TRE_WCHAR */
875f2eab64SJohn Marino tre_tnfa_transition_t *state;
885f2eab64SJohn Marino int state_id;
895f2eab64SJohn Marino int next_c;
90d5f8dde1SJohn Marino tre_tag_t *tags;
915f2eab64SJohn Marino #ifdef TRE_MBSTATE
925f2eab64SJohn Marino mbstate_t mbstate;
935f2eab64SJohn Marino #endif /* TRE_MBSTATE */
945f2eab64SJohn Marino } tre_backtrack_item_t;
955f2eab64SJohn Marino
965f2eab64SJohn Marino typedef struct tre_backtrack_struct {
975f2eab64SJohn Marino tre_backtrack_item_t item;
985f2eab64SJohn Marino struct tre_backtrack_struct *prev;
995f2eab64SJohn Marino struct tre_backtrack_struct *next;
1005f2eab64SJohn Marino } *tre_backtrack_t;
1015f2eab64SJohn Marino
1025f2eab64SJohn Marino #ifdef TRE_WCHAR
1035f2eab64SJohn Marino #define BT_STACK_WIDE_IN(_str_wide) stack->item.str_wide = (_str_wide)
1045f2eab64SJohn Marino #define BT_STACK_WIDE_OUT (str_wide) = stack->item.str_wide
1055f2eab64SJohn Marino #else /* !TRE_WCHAR */
1065f2eab64SJohn Marino #define BT_STACK_WIDE_IN(_str_wide)
1075f2eab64SJohn Marino #define BT_STACK_WIDE_OUT
1085f2eab64SJohn Marino #endif /* !TRE_WCHAR */
1095f2eab64SJohn Marino
1105f2eab64SJohn Marino #ifdef TRE_MBSTATE
1115f2eab64SJohn Marino #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)
1125f2eab64SJohn Marino #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
1135f2eab64SJohn Marino #else /* !TRE_MBSTATE */
1145f2eab64SJohn Marino #define BT_STACK_MBSTATE_IN
1155f2eab64SJohn Marino #define BT_STACK_MBSTATE_OUT
1165f2eab64SJohn Marino #endif /* !TRE_MBSTATE */
1175f2eab64SJohn Marino
1185f2eab64SJohn Marino
1195f2eab64SJohn Marino #ifdef TRE_USE_ALLOCA
1205f2eab64SJohn Marino #define tre_bt_mem_new tre_mem_newa
1215f2eab64SJohn Marino #define tre_bt_mem_alloc tre_mem_alloca
1225f2eab64SJohn Marino #define tre_bt_mem_destroy(obj) do { } while (0)
1235f2eab64SJohn Marino #else /* !TRE_USE_ALLOCA */
1245f2eab64SJohn Marino #define tre_bt_mem_new tre_mem_new
1255f2eab64SJohn Marino #define tre_bt_mem_alloc tre_mem_alloc
1265f2eab64SJohn Marino #define tre_bt_mem_destroy tre_mem_destroy
1275f2eab64SJohn Marino #endif /* !TRE_USE_ALLOCA */
1285f2eab64SJohn Marino
1295f2eab64SJohn Marino
130d5f8dde1SJohn Marino #define BT_STACK_PUSH(_pos, _pos_add_next, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
1315f2eab64SJohn Marino do \
1325f2eab64SJohn Marino { \
1335f2eab64SJohn Marino if (!stack->next) \
1345f2eab64SJohn Marino { \
1355f2eab64SJohn Marino tre_backtrack_t s; \
1365f2eab64SJohn Marino s = tre_bt_mem_alloc(mem, sizeof(*s)); \
1375f2eab64SJohn Marino if (!s) \
1385f2eab64SJohn Marino { \
1395f2eab64SJohn Marino tre_bt_mem_destroy(mem); \
1405f2eab64SJohn Marino if (tags) \
1415f2eab64SJohn Marino xfree(tags); \
1425f2eab64SJohn Marino if (pmatch) \
1435f2eab64SJohn Marino xfree(pmatch); \
1445f2eab64SJohn Marino if (states_seen) \
1455f2eab64SJohn Marino xfree(states_seen); \
1465f2eab64SJohn Marino return REG_ESPACE; \
1475f2eab64SJohn Marino } \
1485f2eab64SJohn Marino s->prev = stack; \
1495f2eab64SJohn Marino s->next = NULL; \
1505f2eab64SJohn Marino s->item.tags = tre_bt_mem_alloc(mem, \
151d5f8dde1SJohn Marino num_tags * sizeof(*tags)); \
1525f2eab64SJohn Marino if (!s->item.tags) \
1535f2eab64SJohn Marino { \
1545f2eab64SJohn Marino tre_bt_mem_destroy(mem); \
1555f2eab64SJohn Marino if (tags) \
1565f2eab64SJohn Marino xfree(tags); \
1575f2eab64SJohn Marino if (pmatch) \
1585f2eab64SJohn Marino xfree(pmatch); \
1595f2eab64SJohn Marino if (states_seen) \
1605f2eab64SJohn Marino xfree(states_seen); \
1615f2eab64SJohn Marino return REG_ESPACE; \
1625f2eab64SJohn Marino } \
1635f2eab64SJohn Marino stack->next = s; \
1645f2eab64SJohn Marino stack = s; \
1655f2eab64SJohn Marino } \
1665f2eab64SJohn Marino else \
1675f2eab64SJohn Marino stack = stack->next; \
1685f2eab64SJohn Marino stack->item.pos = (_pos); \
169d5f8dde1SJohn Marino stack->item.pos_add_next = (_pos_add_next); \
1705f2eab64SJohn Marino stack->item.str_byte = (_str_byte); \
1715f2eab64SJohn Marino BT_STACK_WIDE_IN(_str_wide); \
1725f2eab64SJohn Marino stack->item.state = (_state); \
1735f2eab64SJohn Marino stack->item.state_id = (_state_id); \
1745f2eab64SJohn Marino stack->item.next_c = (_next_c); \
175d5f8dde1SJohn Marino memcpy(stack->item.tags, (_tags), num_tags * sizeof(*(_tags))); \
1765f2eab64SJohn Marino BT_STACK_MBSTATE_IN; \
1775f2eab64SJohn Marino } \
1785f2eab64SJohn Marino while (/*CONSTCOND*/0)
1795f2eab64SJohn Marino
1805f2eab64SJohn Marino #define BT_STACK_POP() \
1815f2eab64SJohn Marino do \
1825f2eab64SJohn Marino { \
1835f2eab64SJohn Marino assert(stack->prev); \
1845f2eab64SJohn Marino pos = stack->item.pos; \
185d5f8dde1SJohn Marino pos_add_next = stack->item.pos_add_next; \
1865f2eab64SJohn Marino str_byte = stack->item.str_byte; \
1875f2eab64SJohn Marino BT_STACK_WIDE_OUT; \
1885f2eab64SJohn Marino state = stack->item.state; \
1895f2eab64SJohn Marino next_c = stack->item.next_c; \
190d5f8dde1SJohn Marino memcpy(tags, stack->item.tags, num_tags * sizeof(*tags)); \
1915f2eab64SJohn Marino BT_STACK_MBSTATE_OUT; \
1925f2eab64SJohn Marino stack = stack->prev; \
1935f2eab64SJohn Marino } \
1945f2eab64SJohn Marino while (/*CONSTCOND*/0)
1955f2eab64SJohn Marino
1965f2eab64SJohn Marino #undef MIN
1975f2eab64SJohn Marino #define MIN(a, b) ((a) <= (b) ? (a) : (b))
1985f2eab64SJohn Marino
1995f2eab64SJohn Marino reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t * tnfa,const void * string,int len,tre_str_type_t type,tre_tag_t * match_tags,int eflags,int * match_end_ofs)2005f2eab64SJohn Marino tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
201d5f8dde1SJohn Marino int len, tre_str_type_t type, tre_tag_t *match_tags,
2025f2eab64SJohn Marino int eflags, int *match_end_ofs)
2035f2eab64SJohn Marino {
2045f2eab64SJohn Marino /* State variables required by GET_NEXT_WCHAR. */
2055f2eab64SJohn Marino tre_char_t prev_c = 0, next_c = 0;
2065f2eab64SJohn Marino const char *str_byte = string;
2075f2eab64SJohn Marino int pos = 0;
2085f2eab64SJohn Marino unsigned int pos_add_next = 1;
2095f2eab64SJohn Marino #ifdef TRE_WCHAR
2105f2eab64SJohn Marino const wchar_t *str_wide = string;
2115f2eab64SJohn Marino #ifdef TRE_MBSTATE
2125f2eab64SJohn Marino mbstate_t mbstate;
2135f2eab64SJohn Marino #endif /* TRE_MBSTATE */
2145f2eab64SJohn Marino #endif /* TRE_WCHAR */
2155f2eab64SJohn Marino int reg_notbol = eflags & REG_NOTBOL;
2165f2eab64SJohn Marino int reg_noteol = eflags & REG_NOTEOL;
2175f2eab64SJohn Marino int reg_newline = tnfa->cflags & REG_NEWLINE;
218d5f8dde1SJohn Marino int i;
2195f2eab64SJohn Marino
2205f2eab64SJohn Marino /* These are used to remember the necessary values of the above
2215f2eab64SJohn Marino variables to return to the position where the current search
2225f2eab64SJohn Marino started from. */
2235f2eab64SJohn Marino int next_c_start;
2245f2eab64SJohn Marino const char *str_byte_start;
2255f2eab64SJohn Marino int pos_start = -1;
2265f2eab64SJohn Marino #ifdef TRE_WCHAR
2275f2eab64SJohn Marino const wchar_t *str_wide_start;
2285f2eab64SJohn Marino #endif /* TRE_WCHAR */
2295f2eab64SJohn Marino #ifdef TRE_MBSTATE
2305f2eab64SJohn Marino mbstate_t mbstate_start;
2315f2eab64SJohn Marino #endif /* TRE_MBSTATE */
2325f2eab64SJohn Marino
2335f2eab64SJohn Marino /* End offset of best match so far, or -1 if no match found yet. */
2345f2eab64SJohn Marino int match_eo = -1;
2355f2eab64SJohn Marino /* Tag arrays. */
236d5f8dde1SJohn Marino int *next_tags;
237d5f8dde1SJohn Marino tre_tag_t *tags = NULL;
2385f2eab64SJohn Marino /* Current TNFA state. */
2395f2eab64SJohn Marino tre_tnfa_transition_t *state;
2405f2eab64SJohn Marino int *states_seen = NULL;
2415f2eab64SJohn Marino
2425f2eab64SJohn Marino /* Memory allocator to for allocating the backtracking stack. */
2435f2eab64SJohn Marino tre_mem_t mem = tre_bt_mem_new();
2445f2eab64SJohn Marino
2455f2eab64SJohn Marino /* The backtracking stack. */
2465f2eab64SJohn Marino tre_backtrack_t stack;
2475f2eab64SJohn Marino
2485f2eab64SJohn Marino tre_tnfa_transition_t *trans_i;
2495f2eab64SJohn Marino regmatch_t *pmatch = NULL;
250d5f8dde1SJohn Marino reg_errcode_t ret;
251d5f8dde1SJohn Marino
252d5f8dde1SJohn Marino int num_tags = tnfa->num_tags;
253d5f8dde1SJohn Marino int touch = 1;
254d5f8dde1SJohn Marino char *buf;
255d5f8dde1SJohn Marino int tbytes;
2565f2eab64SJohn Marino
2575f2eab64SJohn Marino #ifdef TRE_MBSTATE
2585f2eab64SJohn Marino memset(&mbstate, '\0', sizeof(mbstate));
2595f2eab64SJohn Marino #endif /* TRE_MBSTATE */
260*ec01c407SSascha Wildner #ifndef TRE_USE_ALLOCA
261*ec01c407SSascha Wildner buf = NULL;
262*ec01c407SSascha Wildner #endif
2635f2eab64SJohn Marino
2645f2eab64SJohn Marino if (!mem)
2655f2eab64SJohn Marino return REG_ESPACE;
2665f2eab64SJohn Marino stack = tre_bt_mem_alloc(mem, sizeof(*stack));
2675f2eab64SJohn Marino if (!stack)
2685f2eab64SJohn Marino {
2695f2eab64SJohn Marino ret = REG_ESPACE;
2705f2eab64SJohn Marino goto error_exit;
2715f2eab64SJohn Marino }
2725f2eab64SJohn Marino stack->prev = NULL;
2735f2eab64SJohn Marino stack->next = NULL;
2745f2eab64SJohn Marino
2755f2eab64SJohn Marino DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
2765f2eab64SJohn Marino DPRINT(("len = %d\n", len));
2775f2eab64SJohn Marino
278d5f8dde1SJohn Marino {
279d5f8dde1SJohn Marino int pbytes, sbytes, total_bytes;
280d5f8dde1SJohn Marino char *tmp_buf;
281d5f8dde1SJohn Marino /* Compute the length of the block we need. */
282d5f8dde1SJohn Marino tbytes = sizeof(*tags) * num_tags;
283d5f8dde1SJohn Marino pbytes = sizeof(*pmatch) * tnfa->num_submatches;
284d5f8dde1SJohn Marino sbytes = sizeof(*states_seen) * tnfa->num_states;
285d5f8dde1SJohn Marino total_bytes =
286d5f8dde1SJohn Marino (sizeof(long) - 1) * 2 /* for alignment paddings */
287d5f8dde1SJohn Marino + tbytes + pbytes + sbytes;
288d5f8dde1SJohn Marino
289d5f8dde1SJohn Marino DPRINT(("tre_tnfa_run_backtrack, allocate %d bytes\n", total_bytes));
290d5f8dde1SJohn Marino /* Allocate the memory. */
2915f2eab64SJohn Marino #ifdef TRE_USE_ALLOCA
292d5f8dde1SJohn Marino buf = alloca(total_bytes);
2935f2eab64SJohn Marino #else /* !TRE_USE_ALLOCA */
294d5f8dde1SJohn Marino buf = xmalloc((unsigned)total_bytes);
2955f2eab64SJohn Marino #endif /* !TRE_USE_ALLOCA */
296d5f8dde1SJohn Marino if (buf == NULL)
297d5f8dde1SJohn Marino return REG_ESPACE;
298d5f8dde1SJohn Marino
299d5f8dde1SJohn Marino /* Get the various pointers within tmp_buf (properly aligned). */
300d5f8dde1SJohn Marino tags = (void *)buf;
301d5f8dde1SJohn Marino tmp_buf = buf + tbytes;
302d5f8dde1SJohn Marino tmp_buf += ALIGN(tmp_buf, long);
303d5f8dde1SJohn Marino pmatch = (void *)tmp_buf;
304d5f8dde1SJohn Marino tmp_buf += pbytes;
305d5f8dde1SJohn Marino tmp_buf += ALIGN(tmp_buf, long);
306d5f8dde1SJohn Marino states_seen = (void *)tmp_buf;
307d5f8dde1SJohn Marino }
3085f2eab64SJohn Marino
3095f2eab64SJohn Marino retry:
3105f2eab64SJohn Marino {
311d5f8dde1SJohn Marino memset(tags, 0, num_tags * sizeof(*tags));
3125f2eab64SJohn Marino if (match_tags)
313d5f8dde1SJohn Marino memset(match_tags, 0, num_tags * sizeof(*match_tags));
3145f2eab64SJohn Marino for (i = 0; i < tnfa->num_states; i++)
3155f2eab64SJohn Marino states_seen[i] = 0;
3165f2eab64SJohn Marino }
3175f2eab64SJohn Marino
3185f2eab64SJohn Marino state = NULL;
3195f2eab64SJohn Marino pos = pos_start;
3205f2eab64SJohn Marino GET_NEXT_WCHAR();
3215f2eab64SJohn Marino pos_start = pos;
3225f2eab64SJohn Marino next_c_start = next_c;
3235f2eab64SJohn Marino str_byte_start = str_byte;
3245f2eab64SJohn Marino #ifdef TRE_WCHAR
3255f2eab64SJohn Marino str_wide_start = str_wide;
3265f2eab64SJohn Marino #endif /* TRE_WCHAR */
3275f2eab64SJohn Marino #ifdef TRE_MBSTATE
3285f2eab64SJohn Marino mbstate_start = mbstate;
3295f2eab64SJohn Marino #endif /* TRE_MBSTATE */
3305f2eab64SJohn Marino
3315f2eab64SJohn Marino /* Handle initial states. */
3325f2eab64SJohn Marino next_tags = NULL;
3335f2eab64SJohn Marino for (trans_i = tnfa->initial; trans_i->state; trans_i++)
3345f2eab64SJohn Marino {
3355f2eab64SJohn Marino DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
3365f2eab64SJohn Marino if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
3375f2eab64SJohn Marino {
3385f2eab64SJohn Marino DPRINT(("assert failed\n"));
3395f2eab64SJohn Marino continue;
3405f2eab64SJohn Marino }
3415f2eab64SJohn Marino if (state == NULL)
3425f2eab64SJohn Marino {
3435f2eab64SJohn Marino /* Start from this state. */
3445f2eab64SJohn Marino state = trans_i->state;
3455f2eab64SJohn Marino next_tags = trans_i->tags;
3465f2eab64SJohn Marino }
3475f2eab64SJohn Marino else
3485f2eab64SJohn Marino {
3495f2eab64SJohn Marino /* Backtrack to this state. */
3505f2eab64SJohn Marino DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
351d5f8dde1SJohn Marino BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide, trans_i->state,
3525f2eab64SJohn Marino trans_i->state_id, next_c, tags, mbstate);
3535f2eab64SJohn Marino {
3545f2eab64SJohn Marino int *tmp = trans_i->tags;
3555f2eab64SJohn Marino if (tmp)
356d5f8dde1SJohn Marino {
3575f2eab64SJohn Marino while (*tmp >= 0)
358d5f8dde1SJohn Marino tre_tag_set(stack->item.tags, *tmp++, pos, touch);
359d5f8dde1SJohn Marino touch++;
360d5f8dde1SJohn Marino }
3615f2eab64SJohn Marino }
3625f2eab64SJohn Marino }
3635f2eab64SJohn Marino }
3645f2eab64SJohn Marino
3655f2eab64SJohn Marino if (next_tags)
366d5f8dde1SJohn Marino {
3675f2eab64SJohn Marino for (; *next_tags >= 0; next_tags++)
368d5f8dde1SJohn Marino tre_tag_set(tags, *next_tags, pos, touch);
369d5f8dde1SJohn Marino touch++;
370d5f8dde1SJohn Marino }
3715f2eab64SJohn Marino
3725f2eab64SJohn Marino
3735f2eab64SJohn Marino DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
3745f2eab64SJohn Marino DPRINT(("pos:chr/code | state and tags\n"));
3755f2eab64SJohn Marino DPRINT(("-------------+------------------------------------------------\n"));
3765f2eab64SJohn Marino
3775f2eab64SJohn Marino if (state == NULL)
3785f2eab64SJohn Marino goto backtrack;
3795f2eab64SJohn Marino
3805f2eab64SJohn Marino while (/*CONSTCOND*/1)
3815f2eab64SJohn Marino {
3825f2eab64SJohn Marino tre_tnfa_transition_t *next_state;
3835f2eab64SJohn Marino int empty_br_match;
3845f2eab64SJohn Marino
3855f2eab64SJohn Marino DPRINT(("start loop\n"));
386d5f8dde1SJohn Marino
387d5f8dde1SJohn Marino if (match_eo >= 0 && tnfa->num_minimals)
388d5f8dde1SJohn Marino {
389d5f8dde1SJohn Marino int skip = 0;
390d5f8dde1SJohn Marino #ifdef TRE_DEBUG
391d5f8dde1SJohn Marino DPRINT(("Checking minimal conditions: match_eo=%d match_tags=",
392d5f8dde1SJohn Marino match_eo));
393d5f8dde1SJohn Marino tre_print_tags(match_tags, tnfa->num_tags);
394d5f8dde1SJohn Marino DPRINT(("\n"));
395d5f8dde1SJohn Marino #endif /* TRE_DEBUG */
396d5f8dde1SJohn Marino for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
397d5f8dde1SJohn Marino {
398d5f8dde1SJohn Marino int end = tnfa->minimal_tags[i];
399d5f8dde1SJohn Marino int start = tnfa->minimal_tags[i + 1];
400d5f8dde1SJohn Marino DPRINT((" Minimal start %d, end %d\n", start, end));
401d5f8dde1SJohn Marino if (tre_minimal_tag_order(start, end, match_tags, tags) > 0)
402d5f8dde1SJohn Marino {
403d5f8dde1SJohn Marino skip = 1;
404d5f8dde1SJohn Marino break;
405d5f8dde1SJohn Marino }
406d5f8dde1SJohn Marino }
407d5f8dde1SJohn Marino if (!skip)
408d5f8dde1SJohn Marino {
409d5f8dde1SJohn Marino #ifdef TRE_DEBUG
410d5f8dde1SJohn Marino DPRINT((" Keeping tags="));
411d5f8dde1SJohn Marino tre_print_tags(tags, tnfa->num_tags);
412d5f8dde1SJohn Marino DPRINT(("\n"));
413d5f8dde1SJohn Marino #endif /* TRE_DEBUG */
414d5f8dde1SJohn Marino }
415d5f8dde1SJohn Marino else
416d5f8dde1SJohn Marino {
417d5f8dde1SJohn Marino #ifdef TRE_DEBUG
418d5f8dde1SJohn Marino DPRINT((" Throwing out tags="));
419d5f8dde1SJohn Marino tre_print_tags(tags, tnfa->num_tags);
420d5f8dde1SJohn Marino DPRINT(("\n"));
421d5f8dde1SJohn Marino #endif /* TRE_DEBUG */
422d5f8dde1SJohn Marino goto backtrack;
423d5f8dde1SJohn Marino }
424d5f8dde1SJohn Marino }
425d5f8dde1SJohn Marino
4265f2eab64SJohn Marino if (state == tnfa->final)
4275f2eab64SJohn Marino {
428d5f8dde1SJohn Marino DPRINT((" match found, match_eo=%d pos=%d\n", match_eo, pos));
429d5f8dde1SJohn Marino
430d5f8dde1SJohn Marino if (match_eo >= 0 && tnfa->num_minimals)
431d5f8dde1SJohn Marino {
432d5f8dde1SJohn Marino int compare = 0;
433d5f8dde1SJohn Marino #ifdef TRE_DEBUG
434d5f8dde1SJohn Marino DPRINT(("Checking minimal conditions: match_eo=%d "
435d5f8dde1SJohn Marino "match_tags=", match_eo));
436d5f8dde1SJohn Marino tre_print_tags(match_tags, tnfa->num_tags);
437d5f8dde1SJohn Marino DPRINT(("\n"));
438d5f8dde1SJohn Marino #endif /* TRE_DEBUG */
439d5f8dde1SJohn Marino for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
440d5f8dde1SJohn Marino {
441d5f8dde1SJohn Marino int end = tnfa->minimal_tags[i];
442d5f8dde1SJohn Marino int start = tnfa->minimal_tags[i + 1];
443d5f8dde1SJohn Marino DPRINT((" Minimal start %d, end %d\n", start, end));
444d5f8dde1SJohn Marino if ((compare = tre_minimal_tag_order(start, end,
445d5f8dde1SJohn Marino match_tags, tags)) != 0)
446d5f8dde1SJohn Marino break;
447d5f8dde1SJohn Marino }
448d5f8dde1SJohn Marino if (compare > 0)
449d5f8dde1SJohn Marino {
450d5f8dde1SJohn Marino #ifdef TRE_DEBUG
451d5f8dde1SJohn Marino DPRINT((" Throwing out new match, tags="));
452d5f8dde1SJohn Marino tre_print_tags(tags, tnfa->num_tags);
453d5f8dde1SJohn Marino DPRINT(("\n"));
454d5f8dde1SJohn Marino #endif /* TRE_DEBUG */
455d5f8dde1SJohn Marino goto backtrack;
456d5f8dde1SJohn Marino }
457d5f8dde1SJohn Marino else if (compare < 0)
458d5f8dde1SJohn Marino {
459d5f8dde1SJohn Marino #ifdef TRE_DEBUG
460d5f8dde1SJohn Marino DPRINT((" Throwing out old match, tags="));
461d5f8dde1SJohn Marino tre_print_tags(match_tags, tnfa->num_tags);
462d5f8dde1SJohn Marino DPRINT(("\n"));
463d5f8dde1SJohn Marino #endif /* TRE_DEBUG */
464d5f8dde1SJohn Marino match_eo = -1;
465d5f8dde1SJohn Marino }
466d5f8dde1SJohn Marino }
467d5f8dde1SJohn Marino
4685f2eab64SJohn Marino if (match_eo < pos
4695f2eab64SJohn Marino || (match_eo == pos
4705f2eab64SJohn Marino && match_tags
4715f2eab64SJohn Marino && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
4725f2eab64SJohn Marino tags, match_tags)))
4735f2eab64SJohn Marino {
4745f2eab64SJohn Marino /* This match wins the previous match. */
475d5f8dde1SJohn Marino #ifdef TRE_DEBUG
476d5f8dde1SJohn Marino DPRINT((" win previous tags="));
477d5f8dde1SJohn Marino tre_print_tags(tags, tnfa->num_tags);
478d5f8dde1SJohn Marino DPRINT(("\n"));
479d5f8dde1SJohn Marino #endif /* TRE_DEBUG */
4805f2eab64SJohn Marino match_eo = pos;
4815f2eab64SJohn Marino if (match_tags)
482d5f8dde1SJohn Marino memcpy(match_tags, tags, num_tags * sizeof(*tags));
4835f2eab64SJohn Marino }
4845f2eab64SJohn Marino /* Our TNFAs never have transitions leaving from the final state,
4855f2eab64SJohn Marino so we jump right to backtracking. */
4865f2eab64SJohn Marino goto backtrack;
4875f2eab64SJohn Marino }
4885f2eab64SJohn Marino
4895f2eab64SJohn Marino #ifdef TRE_DEBUG
4905f2eab64SJohn Marino DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
4915f2eab64SJohn Marino state));
492d5f8dde1SJohn Marino tre_print_tags(tags, tnfa->num_tags);
4935f2eab64SJohn Marino DPRINT(("\n"));
4945f2eab64SJohn Marino #endif /* TRE_DEBUG */
4955f2eab64SJohn Marino
4965f2eab64SJohn Marino /* Go to the next character in the input string. */
4975f2eab64SJohn Marino empty_br_match = 0;
4985f2eab64SJohn Marino trans_i = state;
4995f2eab64SJohn Marino if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
5005f2eab64SJohn Marino {
5015f2eab64SJohn Marino /* This is a back reference state. All transitions leaving from
5025f2eab64SJohn Marino this state have the same back reference "assertion". Instead
5035f2eab64SJohn Marino of reading the next character, we match the back reference. */
5045f2eab64SJohn Marino int so, eo, bt = trans_i->u.backref;
5055f2eab64SJohn Marino int bt_len;
5065f2eab64SJohn Marino int result;
5075f2eab64SJohn Marino
5085f2eab64SJohn Marino DPRINT((" should match back reference %d\n", bt));
5095f2eab64SJohn Marino /* Get the substring we need to match against. Remember to
5105f2eab64SJohn Marino turn off REG_NOSUB temporarily. */
511d5f8dde1SJohn Marino ret = tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
5125f2eab64SJohn Marino tnfa, tags, pos);
513d5f8dde1SJohn Marino if (ret != REG_OK) goto error_exit;
5145f2eab64SJohn Marino so = pmatch[bt].rm_so;
5155f2eab64SJohn Marino eo = pmatch[bt].rm_eo;
5165f2eab64SJohn Marino bt_len = eo - so;
5175f2eab64SJohn Marino
5185f2eab64SJohn Marino #ifdef TRE_DEBUG
5195f2eab64SJohn Marino {
5205f2eab64SJohn Marino int slen;
5215f2eab64SJohn Marino if (len < 0)
5225f2eab64SJohn Marino slen = bt_len;
5235f2eab64SJohn Marino else
5245f2eab64SJohn Marino slen = MIN(bt_len, len - pos);
5255f2eab64SJohn Marino
5265f2eab64SJohn Marino if (type == STR_BYTE)
5275f2eab64SJohn Marino {
528d5f8dde1SJohn Marino DPRINT((" substring (len %d) is [%d, %d]: '%.*s'\n",
5295f2eab64SJohn Marino bt_len, so, eo, bt_len, (char*)string + so));
5305f2eab64SJohn Marino DPRINT((" current string is '%.*s'\n", slen, str_byte - 1));
5315f2eab64SJohn Marino }
5325f2eab64SJohn Marino #ifdef TRE_WCHAR
5335f2eab64SJohn Marino else if (type == STR_WIDE)
5345f2eab64SJohn Marino {
535d5f8dde1SJohn Marino DPRINT((" substring (len %d) is [%d, %d]: '%.*" STRF "'\n",
5365f2eab64SJohn Marino bt_len, so, eo, bt_len, (wchar_t*)string + so));
5375f2eab64SJohn Marino DPRINT((" current string is '%.*" STRF "'\n",
5385f2eab64SJohn Marino slen, str_wide - 1));
5395f2eab64SJohn Marino }
5405f2eab64SJohn Marino #endif /* TRE_WCHAR */
5415f2eab64SJohn Marino }
5425f2eab64SJohn Marino #endif
5435f2eab64SJohn Marino
544d5f8dde1SJohn Marino if (so < 0)
5455f2eab64SJohn Marino {
546d5f8dde1SJohn Marino result = 1; /* Back reference of nomatch doesn't match */
547d5f8dde1SJohn Marino }
548d5f8dde1SJohn Marino else if (len < 0)
549d5f8dde1SJohn Marino {
5505f2eab64SJohn Marino #ifdef TRE_WCHAR
551d5f8dde1SJohn Marino if (type == STR_WIDE)
5525f2eab64SJohn Marino result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
5535f2eab64SJohn Marino (size_t)bt_len);
5545f2eab64SJohn Marino else
555d5f8dde1SJohn Marino #endif /* TRE_WCHAR */
5565f2eab64SJohn Marino result = strncmp((const char*)string + so, str_byte - 1,
5575f2eab64SJohn Marino (size_t)bt_len);
5585f2eab64SJohn Marino }
5595f2eab64SJohn Marino else if (len - pos < bt_len)
5605f2eab64SJohn Marino result = 1;
5615f2eab64SJohn Marino #ifdef TRE_WCHAR
5625f2eab64SJohn Marino else if (type == STR_WIDE)
5635f2eab64SJohn Marino result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
5645f2eab64SJohn Marino (size_t)bt_len);
5655f2eab64SJohn Marino #endif /* TRE_WCHAR */
5665f2eab64SJohn Marino else
5675f2eab64SJohn Marino result = memcmp((const char*)string + so, str_byte - 1,
5685f2eab64SJohn Marino (size_t)bt_len);
5695f2eab64SJohn Marino
5705f2eab64SJohn Marino if (result == 0)
5715f2eab64SJohn Marino {
5725f2eab64SJohn Marino /* Back reference matched. Check for infinite loop. */
5735f2eab64SJohn Marino if (bt_len == 0)
5745f2eab64SJohn Marino empty_br_match = 1;
5755f2eab64SJohn Marino if (empty_br_match && states_seen[trans_i->state_id])
5765f2eab64SJohn Marino {
5775f2eab64SJohn Marino DPRINT((" avoid loop\n"));
5785f2eab64SJohn Marino goto backtrack;
5795f2eab64SJohn Marino }
5805f2eab64SJohn Marino
5815f2eab64SJohn Marino states_seen[trans_i->state_id] = empty_br_match;
5825f2eab64SJohn Marino
5835f2eab64SJohn Marino /* Advance in input string and resync `prev_c', `next_c'
5845f2eab64SJohn Marino and pos. */
5855f2eab64SJohn Marino DPRINT((" back reference matched\n"));
5865f2eab64SJohn Marino str_byte += bt_len - 1;
5875f2eab64SJohn Marino #ifdef TRE_WCHAR
5885f2eab64SJohn Marino str_wide += bt_len - 1;
5895f2eab64SJohn Marino #endif /* TRE_WCHAR */
5905f2eab64SJohn Marino pos += bt_len - 1;
5915f2eab64SJohn Marino GET_NEXT_WCHAR();
5925f2eab64SJohn Marino DPRINT((" pos now %d\n", pos));
5935f2eab64SJohn Marino }
5945f2eab64SJohn Marino else
5955f2eab64SJohn Marino {
5965f2eab64SJohn Marino DPRINT((" back reference did not match\n"));
5975f2eab64SJohn Marino goto backtrack;
5985f2eab64SJohn Marino }
5995f2eab64SJohn Marino }
6005f2eab64SJohn Marino else
6015f2eab64SJohn Marino {
6025f2eab64SJohn Marino /* Check for end of string. */
6035f2eab64SJohn Marino if (len < 0)
6045f2eab64SJohn Marino {
605d5f8dde1SJohn Marino if (next_c == L'\0')
6065f2eab64SJohn Marino goto backtrack;
6075f2eab64SJohn Marino }
6085f2eab64SJohn Marino else
6095f2eab64SJohn Marino {
6105f2eab64SJohn Marino if (pos >= len)
6115f2eab64SJohn Marino goto backtrack;
6125f2eab64SJohn Marino }
6135f2eab64SJohn Marino
6145f2eab64SJohn Marino /* Read the next character. */
6155f2eab64SJohn Marino GET_NEXT_WCHAR();
6165f2eab64SJohn Marino }
6175f2eab64SJohn Marino
6185f2eab64SJohn Marino next_state = NULL;
6195f2eab64SJohn Marino for (trans_i = state; trans_i->state; trans_i++)
6205f2eab64SJohn Marino {
6215f2eab64SJohn Marino DPRINT((" transition %d-%d (%c-%c) %d to %d\n",
6225f2eab64SJohn Marino trans_i->code_min, trans_i->code_max,
6235f2eab64SJohn Marino trans_i->code_min, trans_i->code_max,
6245f2eab64SJohn Marino trans_i->assertions, trans_i->state_id));
6255f2eab64SJohn Marino if (trans_i->code_min <= (tre_cint_t)prev_c
6265f2eab64SJohn Marino && trans_i->code_max >= (tre_cint_t)prev_c)
6275f2eab64SJohn Marino {
6285f2eab64SJohn Marino if (trans_i->assertions
6295f2eab64SJohn Marino && (CHECK_ASSERTIONS(trans_i->assertions)
6305f2eab64SJohn Marino || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
6315f2eab64SJohn Marino {
6325f2eab64SJohn Marino DPRINT((" assertion failed\n"));
6335f2eab64SJohn Marino continue;
6345f2eab64SJohn Marino }
6355f2eab64SJohn Marino
6365f2eab64SJohn Marino if (next_state == NULL)
6375f2eab64SJohn Marino {
6385f2eab64SJohn Marino /* First matching transition. */
6395f2eab64SJohn Marino DPRINT((" Next state is %d\n", trans_i->state_id));
6405f2eab64SJohn Marino next_state = trans_i->state;
6415f2eab64SJohn Marino next_tags = trans_i->tags;
6425f2eab64SJohn Marino }
6435f2eab64SJohn Marino else
6445f2eab64SJohn Marino {
6455f2eab64SJohn Marino /* Second matching transition. We may need to backtrack here
6465f2eab64SJohn Marino to take this transition instead of the first one, so we
6475f2eab64SJohn Marino push this transition in the backtracking stack so we can
6485f2eab64SJohn Marino jump back here if needed. */
6495f2eab64SJohn Marino DPRINT((" saving state %d for backtracking\n",
6505f2eab64SJohn Marino trans_i->state_id));
651d5f8dde1SJohn Marino BT_STACK_PUSH(pos, pos_add_next, str_byte, str_wide,
652d5f8dde1SJohn Marino trans_i->state, trans_i->state_id, next_c,
653d5f8dde1SJohn Marino tags, mbstate);
6545f2eab64SJohn Marino {
6555f2eab64SJohn Marino int *tmp;
6565f2eab64SJohn Marino for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
657d5f8dde1SJohn Marino tre_tag_set(stack->item.tags, *tmp, pos, touch);
658d5f8dde1SJohn Marino touch++;
6595f2eab64SJohn Marino }
6605f2eab64SJohn Marino #if 0 /* XXX - it's important not to look at all transitions here to keep
6615f2eab64SJohn Marino the stack small! */
6625f2eab64SJohn Marino break;
6635f2eab64SJohn Marino #endif
6645f2eab64SJohn Marino }
6655f2eab64SJohn Marino }
6665f2eab64SJohn Marino }
6675f2eab64SJohn Marino
6685f2eab64SJohn Marino if (next_state != NULL)
6695f2eab64SJohn Marino {
6705f2eab64SJohn Marino /* Matching transitions were found. Take the first one. */
6715f2eab64SJohn Marino state = next_state;
6725f2eab64SJohn Marino
6735f2eab64SJohn Marino /* Update the tag values. */
6745f2eab64SJohn Marino if (next_tags)
675d5f8dde1SJohn Marino {
6765f2eab64SJohn Marino while (*next_tags >= 0)
677d5f8dde1SJohn Marino tre_tag_set(tags, *next_tags++, pos, touch);
678d5f8dde1SJohn Marino touch++;
679d5f8dde1SJohn Marino }
6805f2eab64SJohn Marino }
6815f2eab64SJohn Marino else
6825f2eab64SJohn Marino {
6835f2eab64SJohn Marino backtrack:
6845f2eab64SJohn Marino /* A matching transition was not found. Try to backtrack. */
6855f2eab64SJohn Marino if (stack->prev)
6865f2eab64SJohn Marino {
6875f2eab64SJohn Marino DPRINT((" backtracking\n"));
688d5f8dde1SJohn Marino if (stack->item.state->assertions & ASSERT_BACKREF)
6895f2eab64SJohn Marino {
6905f2eab64SJohn Marino DPRINT((" states_seen[%d] = 0\n",
6915f2eab64SJohn Marino stack->item.state_id));
6925f2eab64SJohn Marino states_seen[stack->item.state_id] = 0;
6935f2eab64SJohn Marino }
6945f2eab64SJohn Marino
6955f2eab64SJohn Marino BT_STACK_POP();
6965f2eab64SJohn Marino }
6975f2eab64SJohn Marino else if (match_eo < 0)
6985f2eab64SJohn Marino {
6995f2eab64SJohn Marino /* Try starting from a later position in the input string. */
7005f2eab64SJohn Marino /* Check for end of string. */
701d5f8dde1SJohn Marino if (pos == pos_start)
702d5f8dde1SJohn Marino {
7035f2eab64SJohn Marino if (len < 0)
7045f2eab64SJohn Marino {
7055f2eab64SJohn Marino if (next_c == L'\0')
7065f2eab64SJohn Marino {
7075f2eab64SJohn Marino DPRINT(("end of string.\n"));
7085f2eab64SJohn Marino break;
7095f2eab64SJohn Marino }
7105f2eab64SJohn Marino }
7115f2eab64SJohn Marino else
7125f2eab64SJohn Marino {
7135f2eab64SJohn Marino if (pos >= len)
7145f2eab64SJohn Marino {
7155f2eab64SJohn Marino DPRINT(("end of string.\n"));
7165f2eab64SJohn Marino break;
7175f2eab64SJohn Marino }
7185f2eab64SJohn Marino }
719d5f8dde1SJohn Marino }
7205f2eab64SJohn Marino DPRINT(("restarting from next start position\n"));
7215f2eab64SJohn Marino next_c = next_c_start;
7225f2eab64SJohn Marino #ifdef TRE_MBSTATE
7235f2eab64SJohn Marino mbstate = mbstate_start;
7245f2eab64SJohn Marino #endif /* TRE_MBSTATE */
7255f2eab64SJohn Marino str_byte = str_byte_start;
7265f2eab64SJohn Marino #ifdef TRE_WCHAR
7275f2eab64SJohn Marino str_wide = str_wide_start;
7285f2eab64SJohn Marino #endif /* TRE_WCHAR */
7295f2eab64SJohn Marino goto retry;
7305f2eab64SJohn Marino }
7315f2eab64SJohn Marino else
7325f2eab64SJohn Marino {
7335f2eab64SJohn Marino DPRINT(("finished\n"));
7345f2eab64SJohn Marino break;
7355f2eab64SJohn Marino }
7365f2eab64SJohn Marino }
7375f2eab64SJohn Marino }
7385f2eab64SJohn Marino
7395f2eab64SJohn Marino ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
7405f2eab64SJohn Marino *match_end_ofs = match_eo;
7415f2eab64SJohn Marino
7425f2eab64SJohn Marino error_exit:
7435f2eab64SJohn Marino tre_bt_mem_destroy(mem);
7445f2eab64SJohn Marino #ifndef TRE_USE_ALLOCA
745d5f8dde1SJohn Marino if (buf)
746d5f8dde1SJohn Marino xfree(buf);
7475f2eab64SJohn Marino #endif /* !TRE_USE_ALLOCA */
7485f2eab64SJohn Marino
7495f2eab64SJohn Marino return ret;
7505f2eab64SJohn Marino }
751