144b87433SJohn Marino /* Extended regular expression matching and search library.
2*6ea1f93eSDaniel Fojt Copyright (C) 2002-2018 Free Software Foundation, Inc.
344b87433SJohn Marino This file is part of the GNU C Library.
444b87433SJohn Marino Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
544b87433SJohn Marino
64536c563SJohn Marino The GNU C Library is free software; you can redistribute it and/or
74536c563SJohn Marino modify it under the terms of the GNU General Public
84536c563SJohn Marino License as published by the Free Software Foundation; either
94536c563SJohn Marino version 3 of the License, or (at your option) any later version.
1044b87433SJohn Marino
114536c563SJohn Marino The GNU C Library is distributed in the hope that it will be useful,
1244b87433SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
134536c563SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
144536c563SJohn Marino General Public License for more details.
1544b87433SJohn Marino
164536c563SJohn Marino You should have received a copy of the GNU General Public
174536c563SJohn Marino License along with the GNU C Library; if not, see
18*6ea1f93eSDaniel Fojt <https://www.gnu.org/licenses/>. */
1944b87433SJohn Marino
2044b87433SJohn Marino static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21*6ea1f93eSDaniel Fojt Idx n);
22*6ea1f93eSDaniel Fojt static void match_ctx_clean (re_match_context_t *mctx);
23*6ea1f93eSDaniel Fojt static void match_ctx_free (re_match_context_t *cache);
2444b87433SJohn Marino static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25*6ea1f93eSDaniel Fojt Idx str_idx, Idx from, Idx to);
26*6ea1f93eSDaniel Fojt static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
2744b87433SJohn Marino static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28*6ea1f93eSDaniel Fojt Idx str_idx);
2944b87433SJohn Marino static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30*6ea1f93eSDaniel Fojt Idx node, Idx str_idx);
3144b87433SJohn Marino static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
3244b87433SJohn Marino re_dfastate_t **limited_sts, Idx last_node,
33*6ea1f93eSDaniel Fojt Idx last_str_idx);
3444b87433SJohn Marino static reg_errcode_t re_search_internal (const regex_t *preg,
3544b87433SJohn Marino const char *string, Idx length,
3644b87433SJohn Marino Idx start, Idx last_start, Idx stop,
3744b87433SJohn Marino size_t nmatch, regmatch_t pmatch[],
38*6ea1f93eSDaniel Fojt int eflags);
3944b87433SJohn Marino static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
4044b87433SJohn Marino const char *string1, Idx length1,
4144b87433SJohn Marino const char *string2, Idx length2,
4244b87433SJohn Marino Idx start, regoff_t range,
4344b87433SJohn Marino struct re_registers *regs,
44*6ea1f93eSDaniel Fojt Idx stop, bool ret_len);
4544b87433SJohn Marino static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
4644b87433SJohn Marino const char *string, Idx length, Idx start,
4744b87433SJohn Marino regoff_t range, Idx stop,
4844b87433SJohn Marino struct re_registers *regs,
49*6ea1f93eSDaniel Fojt bool ret_len);
504536c563SJohn Marino static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51*6ea1f93eSDaniel Fojt Idx nregs, int regs_allocated);
52*6ea1f93eSDaniel Fojt static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
5344b87433SJohn Marino static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54*6ea1f93eSDaniel Fojt Idx *p_match_first);
5544b87433SJohn Marino static Idx check_halt_state_context (const re_match_context_t *mctx,
56*6ea1f93eSDaniel Fojt const re_dfastate_t *state, Idx idx);
5744b87433SJohn Marino static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
5844b87433SJohn Marino regmatch_t *prev_idx_match, Idx cur_node,
59*6ea1f93eSDaniel Fojt Idx cur_idx, Idx nmatch);
6044b87433SJohn Marino static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
6144b87433SJohn Marino Idx str_idx, Idx dest_node, Idx nregs,
6244b87433SJohn Marino regmatch_t *regs,
63*6ea1f93eSDaniel Fojt re_node_set *eps_via_nodes);
6444b87433SJohn Marino static reg_errcode_t set_regs (const regex_t *preg,
6544b87433SJohn Marino const re_match_context_t *mctx,
6644b87433SJohn Marino size_t nmatch, regmatch_t *pmatch,
67*6ea1f93eSDaniel Fojt bool fl_backtrack);
68*6ea1f93eSDaniel Fojt static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
6944b87433SJohn Marino
7044b87433SJohn Marino #ifdef RE_ENABLE_I18N
7144b87433SJohn Marino static int sift_states_iter_mb (const re_match_context_t *mctx,
7244b87433SJohn Marino re_sift_context_t *sctx,
73*6ea1f93eSDaniel Fojt Idx node_idx, Idx str_idx, Idx max_str_idx);
7444b87433SJohn Marino #endif /* RE_ENABLE_I18N */
7544b87433SJohn Marino static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
76*6ea1f93eSDaniel Fojt re_sift_context_t *sctx);
7744b87433SJohn Marino static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
7844b87433SJohn Marino re_sift_context_t *sctx, Idx str_idx,
79*6ea1f93eSDaniel Fojt re_node_set *cur_dest);
8044b87433SJohn Marino static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
8144b87433SJohn Marino re_sift_context_t *sctx,
8244b87433SJohn Marino Idx str_idx,
83*6ea1f93eSDaniel Fojt re_node_set *dest_nodes);
8444b87433SJohn Marino static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
8544b87433SJohn Marino re_node_set *dest_nodes,
86*6ea1f93eSDaniel Fojt const re_node_set *candidates);
8744b87433SJohn Marino static bool check_dst_limits (const re_match_context_t *mctx,
8844b87433SJohn Marino const re_node_set *limits,
8944b87433SJohn Marino Idx dst_node, Idx dst_idx, Idx src_node,
90*6ea1f93eSDaniel Fojt Idx src_idx);
9144b87433SJohn Marino static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
9244b87433SJohn Marino int boundaries, Idx subexp_idx,
93*6ea1f93eSDaniel Fojt Idx from_node, Idx bkref_idx);
9444b87433SJohn Marino static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
9544b87433SJohn Marino Idx limit, Idx subexp_idx,
9644b87433SJohn Marino Idx node, Idx str_idx,
97*6ea1f93eSDaniel Fojt Idx bkref_idx);
9844b87433SJohn Marino static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
9944b87433SJohn Marino re_node_set *dest_nodes,
10044b87433SJohn Marino const re_node_set *candidates,
10144b87433SJohn Marino re_node_set *limits,
10244b87433SJohn Marino struct re_backref_cache_entry *bkref_ents,
103*6ea1f93eSDaniel Fojt Idx str_idx);
10444b87433SJohn Marino static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
10544b87433SJohn Marino re_sift_context_t *sctx,
106*6ea1f93eSDaniel Fojt Idx str_idx, const re_node_set *candidates);
10744b87433SJohn Marino static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
10844b87433SJohn Marino re_dfastate_t **dst,
109*6ea1f93eSDaniel Fojt re_dfastate_t **src, Idx num);
11044b87433SJohn Marino static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111*6ea1f93eSDaniel Fojt re_match_context_t *mctx);
11244b87433SJohn Marino static re_dfastate_t *transit_state (reg_errcode_t *err,
11344b87433SJohn Marino re_match_context_t *mctx,
114*6ea1f93eSDaniel Fojt re_dfastate_t *state);
11544b87433SJohn Marino static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
11644b87433SJohn Marino re_match_context_t *mctx,
117*6ea1f93eSDaniel Fojt re_dfastate_t *next_state);
11844b87433SJohn Marino static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
11944b87433SJohn Marino re_node_set *cur_nodes,
120*6ea1f93eSDaniel Fojt Idx str_idx);
12144b87433SJohn Marino #if 0
12244b87433SJohn Marino static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
12344b87433SJohn Marino re_match_context_t *mctx,
124*6ea1f93eSDaniel Fojt re_dfastate_t *pstate);
12544b87433SJohn Marino #endif
12644b87433SJohn Marino #ifdef RE_ENABLE_I18N
12744b87433SJohn Marino static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128*6ea1f93eSDaniel Fojt re_dfastate_t *pstate);
12944b87433SJohn Marino #endif /* RE_ENABLE_I18N */
13044b87433SJohn Marino static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131*6ea1f93eSDaniel Fojt const re_node_set *nodes);
13244b87433SJohn Marino static reg_errcode_t get_subexp (re_match_context_t *mctx,
133*6ea1f93eSDaniel Fojt Idx bkref_node, Idx bkref_str_idx);
13444b87433SJohn Marino static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
13544b87433SJohn Marino const re_sub_match_top_t *sub_top,
13644b87433SJohn Marino re_sub_match_last_t *sub_last,
137*6ea1f93eSDaniel Fojt Idx bkref_node, Idx bkref_str);
13844b87433SJohn Marino static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139*6ea1f93eSDaniel Fojt Idx subexp_idx, int type);
14044b87433SJohn Marino static reg_errcode_t check_arrival (re_match_context_t *mctx,
14144b87433SJohn Marino state_array_t *path, Idx top_node,
14244b87433SJohn Marino Idx top_str, Idx last_node, Idx last_str,
143*6ea1f93eSDaniel Fojt int type);
14444b87433SJohn Marino static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
14544b87433SJohn Marino Idx str_idx,
14644b87433SJohn Marino re_node_set *cur_nodes,
147*6ea1f93eSDaniel Fojt re_node_set *next_nodes);
14844b87433SJohn Marino static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
14944b87433SJohn Marino re_node_set *cur_nodes,
150*6ea1f93eSDaniel Fojt Idx ex_subexp, int type);
15144b87433SJohn Marino static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
15244b87433SJohn Marino re_node_set *dst_nodes,
15344b87433SJohn Marino Idx target, Idx ex_subexp,
154*6ea1f93eSDaniel Fojt int type);
15544b87433SJohn Marino static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
15644b87433SJohn Marino re_node_set *cur_nodes, Idx cur_str,
157*6ea1f93eSDaniel Fojt Idx subexp_num, int type);
158*6ea1f93eSDaniel Fojt static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
15944b87433SJohn Marino #ifdef RE_ENABLE_I18N
16044b87433SJohn Marino static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161*6ea1f93eSDaniel Fojt const re_string_t *input, Idx idx);
16244b87433SJohn Marino # ifdef _LIBC
16344b87433SJohn Marino static unsigned int find_collation_sequence_value (const unsigned char *mbs,
164*6ea1f93eSDaniel Fojt size_t name_len);
16544b87433SJohn Marino # endif /* _LIBC */
16644b87433SJohn Marino #endif /* RE_ENABLE_I18N */
16744b87433SJohn Marino static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
16844b87433SJohn Marino const re_dfastate_t *state,
16944b87433SJohn Marino re_node_set *states_node,
170*6ea1f93eSDaniel Fojt bitset_t *states_ch);
17144b87433SJohn Marino static bool check_node_accept (const re_match_context_t *mctx,
172*6ea1f93eSDaniel Fojt const re_token_t *node, Idx idx);
173*6ea1f93eSDaniel Fojt static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
17444b87433SJohn Marino
17544b87433SJohn Marino /* Entry point for POSIX code. */
17644b87433SJohn Marino
17744b87433SJohn Marino /* regexec searches for a given pattern, specified by PREG, in the
17844b87433SJohn Marino string STRING.
17944b87433SJohn Marino
18044b87433SJohn Marino If NMATCH is zero or REG_NOSUB was set in the cflags argument to
1814536c563SJohn Marino 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
18244b87433SJohn Marino least NMATCH elements, and we set them to the offsets of the
18344b87433SJohn Marino corresponding matched substrings.
18444b87433SJohn Marino
1854536c563SJohn Marino EFLAGS specifies "execution flags" which affect matching: if
18644b87433SJohn Marino REG_NOTBOL is set, then ^ does not match at the beginning of the
18744b87433SJohn Marino string; if REG_NOTEOL is set, then $ does not match at the end.
18844b87433SJohn Marino
18944b87433SJohn Marino We return 0 if we find a match and REG_NOMATCH if not. */
19044b87433SJohn Marino
19144b87433SJohn Marino int
regexec(const regex_t * _Restrict_ preg,const char * _Restrict_ string,size_t nmatch,regmatch_t pmatch[],int eflags)192*6ea1f93eSDaniel Fojt regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
193*6ea1f93eSDaniel Fojt size_t nmatch, regmatch_t pmatch[], int eflags)
19444b87433SJohn Marino {
19544b87433SJohn Marino reg_errcode_t err;
19644b87433SJohn Marino Idx start, length;
1974536c563SJohn Marino re_dfa_t *dfa = preg->buffer;
19844b87433SJohn Marino
19944b87433SJohn Marino if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
20044b87433SJohn Marino return REG_BADPAT;
20144b87433SJohn Marino
20244b87433SJohn Marino if (eflags & REG_STARTEND)
20344b87433SJohn Marino {
20444b87433SJohn Marino start = pmatch[0].rm_so;
20544b87433SJohn Marino length = pmatch[0].rm_eo;
20644b87433SJohn Marino }
20744b87433SJohn Marino else
20844b87433SJohn Marino {
20944b87433SJohn Marino start = 0;
21044b87433SJohn Marino length = strlen (string);
21144b87433SJohn Marino }
21244b87433SJohn Marino
213*6ea1f93eSDaniel Fojt lock_lock (dfa->lock);
21444b87433SJohn Marino if (preg->no_sub)
21544b87433SJohn Marino err = re_search_internal (preg, string, length, start, length,
21644b87433SJohn Marino length, 0, NULL, eflags);
21744b87433SJohn Marino else
21844b87433SJohn Marino err = re_search_internal (preg, string, length, start, length,
21944b87433SJohn Marino length, nmatch, pmatch, eflags);
220*6ea1f93eSDaniel Fojt lock_unlock (dfa->lock);
22144b87433SJohn Marino return err != REG_NOERROR;
22244b87433SJohn Marino }
22344b87433SJohn Marino
22444b87433SJohn Marino #ifdef _LIBC
225*6ea1f93eSDaniel Fojt libc_hidden_def (__regexec)
226*6ea1f93eSDaniel Fojt
22744b87433SJohn Marino # include <shlib-compat.h>
22844b87433SJohn Marino versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
22944b87433SJohn Marino
23044b87433SJohn Marino # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
23144b87433SJohn Marino __typeof__ (__regexec) __compat_regexec;
23244b87433SJohn Marino
23344b87433SJohn Marino int
23444b87433SJohn Marino attribute_compat_text_section
__compat_regexec(const regex_t * _Restrict_ preg,const char * _Restrict_ string,size_t nmatch,regmatch_t pmatch[],int eflags)23544b87433SJohn Marino __compat_regexec (const regex_t *_Restrict_ preg,
23644b87433SJohn Marino const char *_Restrict_ string, size_t nmatch,
23744b87433SJohn Marino regmatch_t pmatch[], int eflags)
23844b87433SJohn Marino {
23944b87433SJohn Marino return regexec (preg, string, nmatch, pmatch,
24044b87433SJohn Marino eflags & (REG_NOTBOL | REG_NOTEOL));
24144b87433SJohn Marino }
24244b87433SJohn Marino compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
24344b87433SJohn Marino # endif
24444b87433SJohn Marino #endif
24544b87433SJohn Marino
24644b87433SJohn Marino /* Entry points for GNU code. */
24744b87433SJohn Marino
24844b87433SJohn Marino /* re_match, re_search, re_match_2, re_search_2
24944b87433SJohn Marino
25044b87433SJohn Marino The former two functions operate on STRING with length LENGTH,
25144b87433SJohn Marino while the later two operate on concatenation of STRING1 and STRING2
25244b87433SJohn Marino with lengths LENGTH1 and LENGTH2, respectively.
25344b87433SJohn Marino
25444b87433SJohn Marino re_match() matches the compiled pattern in BUFP against the string,
25544b87433SJohn Marino starting at index START.
25644b87433SJohn Marino
25744b87433SJohn Marino re_search() first tries matching at index START, then it tries to match
25844b87433SJohn Marino starting from index START + 1, and so on. The last start position tried
25944b87433SJohn Marino is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
26044b87433SJohn Marino way as re_match().)
26144b87433SJohn Marino
26244b87433SJohn Marino The parameter STOP of re_{match,search}_2 specifies that no match exceeding
26344b87433SJohn Marino the first STOP characters of the concatenation of the strings should be
26444b87433SJohn Marino concerned.
26544b87433SJohn Marino
26644b87433SJohn Marino If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
26744b87433SJohn Marino and all groups is stored in REGS. (For the "_2" variants, the offsets are
26844b87433SJohn Marino computed relative to the concatenation, not relative to the individual
26944b87433SJohn Marino strings.)
27044b87433SJohn Marino
27144b87433SJohn Marino On success, re_match* functions return the length of the match, re_search*
27244b87433SJohn Marino return the position of the start of the match. Return value -1 means no
27344b87433SJohn Marino match was found and -2 indicates an internal error. */
27444b87433SJohn Marino
27544b87433SJohn Marino regoff_t
re_match(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,struct re_registers * regs)276*6ea1f93eSDaniel Fojt re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
277*6ea1f93eSDaniel Fojt Idx start, struct re_registers *regs)
27844b87433SJohn Marino {
27944b87433SJohn Marino return re_search_stub (bufp, string, length, start, 0, length, regs, true);
28044b87433SJohn Marino }
28144b87433SJohn Marino #ifdef _LIBC
weak_alias(__re_match,re_match)28244b87433SJohn Marino weak_alias (__re_match, re_match)
28344b87433SJohn Marino #endif
28444b87433SJohn Marino
28544b87433SJohn Marino regoff_t
286*6ea1f93eSDaniel Fojt re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
287*6ea1f93eSDaniel Fojt Idx start, regoff_t range, struct re_registers *regs)
28844b87433SJohn Marino {
28944b87433SJohn Marino return re_search_stub (bufp, string, length, start, range, length, regs,
29044b87433SJohn Marino false);
29144b87433SJohn Marino }
29244b87433SJohn Marino #ifdef _LIBC
weak_alias(__re_search,re_search)29344b87433SJohn Marino weak_alias (__re_search, re_search)
29444b87433SJohn Marino #endif
29544b87433SJohn Marino
29644b87433SJohn Marino regoff_t
297*6ea1f93eSDaniel Fojt re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
298*6ea1f93eSDaniel Fojt const char *string2, Idx length2, Idx start,
299*6ea1f93eSDaniel Fojt struct re_registers *regs, Idx stop)
30044b87433SJohn Marino {
30144b87433SJohn Marino return re_search_2_stub (bufp, string1, length1, string2, length2,
30244b87433SJohn Marino start, 0, regs, stop, true);
30344b87433SJohn Marino }
30444b87433SJohn Marino #ifdef _LIBC
weak_alias(__re_match_2,re_match_2)30544b87433SJohn Marino weak_alias (__re_match_2, re_match_2)
30644b87433SJohn Marino #endif
30744b87433SJohn Marino
30844b87433SJohn Marino regoff_t
309*6ea1f93eSDaniel Fojt re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
310*6ea1f93eSDaniel Fojt const char *string2, Idx length2, Idx start, regoff_t range,
311*6ea1f93eSDaniel Fojt struct re_registers *regs, Idx stop)
31244b87433SJohn Marino {
31344b87433SJohn Marino return re_search_2_stub (bufp, string1, length1, string2, length2,
31444b87433SJohn Marino start, range, regs, stop, false);
31544b87433SJohn Marino }
31644b87433SJohn Marino #ifdef _LIBC
weak_alias(__re_search_2,re_search_2)31744b87433SJohn Marino weak_alias (__re_search_2, re_search_2)
31844b87433SJohn Marino #endif
31944b87433SJohn Marino
32044b87433SJohn Marino static regoff_t
321*6ea1f93eSDaniel Fojt re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
322*6ea1f93eSDaniel Fojt Idx length1, const char *string2, Idx length2, Idx start,
323*6ea1f93eSDaniel Fojt regoff_t range, struct re_registers *regs,
32444b87433SJohn Marino Idx stop, bool ret_len)
32544b87433SJohn Marino {
32644b87433SJohn Marino const char *str;
32744b87433SJohn Marino regoff_t rval;
328*6ea1f93eSDaniel Fojt Idx len;
32944b87433SJohn Marino char *s = NULL;
33044b87433SJohn Marino
331*6ea1f93eSDaniel Fojt if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
332*6ea1f93eSDaniel Fojt || INT_ADD_WRAPV (length1, length2, &len))))
33344b87433SJohn Marino return -2;
33444b87433SJohn Marino
33544b87433SJohn Marino /* Concatenate the strings. */
33644b87433SJohn Marino if (length2 > 0)
33744b87433SJohn Marino if (length1 > 0)
33844b87433SJohn Marino {
33944b87433SJohn Marino s = re_malloc (char, len);
34044b87433SJohn Marino
341*6ea1f93eSDaniel Fojt if (__glibc_unlikely (s == NULL))
34244b87433SJohn Marino return -2;
34344b87433SJohn Marino #ifdef _LIBC
34444b87433SJohn Marino memcpy (__mempcpy (s, string1, length1), string2, length2);
34544b87433SJohn Marino #else
34644b87433SJohn Marino memcpy (s, string1, length1);
34744b87433SJohn Marino memcpy (s + length1, string2, length2);
34844b87433SJohn Marino #endif
34944b87433SJohn Marino str = s;
35044b87433SJohn Marino }
35144b87433SJohn Marino else
35244b87433SJohn Marino str = string2;
35344b87433SJohn Marino else
35444b87433SJohn Marino str = string1;
35544b87433SJohn Marino
35644b87433SJohn Marino rval = re_search_stub (bufp, str, len, start, range, stop, regs,
35744b87433SJohn Marino ret_len);
35844b87433SJohn Marino re_free (s);
35944b87433SJohn Marino return rval;
36044b87433SJohn Marino }
36144b87433SJohn Marino
36244b87433SJohn Marino /* The parameters have the same meaning as those of re_search.
36344b87433SJohn Marino Additional parameters:
36444b87433SJohn Marino If RET_LEN is true the length of the match is returned (re_match style);
36544b87433SJohn Marino otherwise the position of the match is returned. */
36644b87433SJohn Marino
36744b87433SJohn Marino static regoff_t
re_search_stub(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,regoff_t range,Idx stop,struct re_registers * regs,bool ret_len)368*6ea1f93eSDaniel Fojt re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
36944b87433SJohn Marino Idx start, regoff_t range, Idx stop, struct re_registers *regs,
37044b87433SJohn Marino bool ret_len)
37144b87433SJohn Marino {
37244b87433SJohn Marino reg_errcode_t result;
37344b87433SJohn Marino regmatch_t *pmatch;
37444b87433SJohn Marino Idx nregs;
37544b87433SJohn Marino regoff_t rval;
37644b87433SJohn Marino int eflags = 0;
3774536c563SJohn Marino re_dfa_t *dfa = bufp->buffer;
37844b87433SJohn Marino Idx last_start = start + range;
37944b87433SJohn Marino
38044b87433SJohn Marino /* Check for out-of-range. */
381*6ea1f93eSDaniel Fojt if (__glibc_unlikely (start < 0 || start > length))
38244b87433SJohn Marino return -1;
383*6ea1f93eSDaniel Fojt if (__glibc_unlikely (length < last_start
384*6ea1f93eSDaniel Fojt || (0 <= range && last_start < start)))
38544b87433SJohn Marino last_start = length;
386*6ea1f93eSDaniel Fojt else if (__glibc_unlikely (last_start < 0
387*6ea1f93eSDaniel Fojt || (range < 0 && start <= last_start)))
38844b87433SJohn Marino last_start = 0;
38944b87433SJohn Marino
390*6ea1f93eSDaniel Fojt lock_lock (dfa->lock);
39144b87433SJohn Marino
39244b87433SJohn Marino eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
39344b87433SJohn Marino eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
39444b87433SJohn Marino
39544b87433SJohn Marino /* Compile fastmap if we haven't yet. */
39644b87433SJohn Marino if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
39744b87433SJohn Marino re_compile_fastmap (bufp);
39844b87433SJohn Marino
399*6ea1f93eSDaniel Fojt if (__glibc_unlikely (bufp->no_sub))
40044b87433SJohn Marino regs = NULL;
40144b87433SJohn Marino
40244b87433SJohn Marino /* We need at least 1 register. */
40344b87433SJohn Marino if (regs == NULL)
40444b87433SJohn Marino nregs = 1;
405*6ea1f93eSDaniel Fojt else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
406*6ea1f93eSDaniel Fojt && regs->num_regs <= bufp->re_nsub))
40744b87433SJohn Marino {
40844b87433SJohn Marino nregs = regs->num_regs;
409*6ea1f93eSDaniel Fojt if (__glibc_unlikely (nregs < 1))
41044b87433SJohn Marino {
41144b87433SJohn Marino /* Nothing can be copied to regs. */
41244b87433SJohn Marino regs = NULL;
41344b87433SJohn Marino nregs = 1;
41444b87433SJohn Marino }
41544b87433SJohn Marino }
41644b87433SJohn Marino else
41744b87433SJohn Marino nregs = bufp->re_nsub + 1;
41844b87433SJohn Marino pmatch = re_malloc (regmatch_t, nregs);
419*6ea1f93eSDaniel Fojt if (__glibc_unlikely (pmatch == NULL))
42044b87433SJohn Marino {
42144b87433SJohn Marino rval = -2;
42244b87433SJohn Marino goto out;
42344b87433SJohn Marino }
42444b87433SJohn Marino
42544b87433SJohn Marino result = re_search_internal (bufp, string, length, start, last_start, stop,
42644b87433SJohn Marino nregs, pmatch, eflags);
42744b87433SJohn Marino
42844b87433SJohn Marino rval = 0;
42944b87433SJohn Marino
4304536c563SJohn Marino /* I hope we needn't fill their regs with -1's when no match was found. */
43144b87433SJohn Marino if (result != REG_NOERROR)
4324536c563SJohn Marino rval = result == REG_NOMATCH ? -1 : -2;
43344b87433SJohn Marino else if (regs != NULL)
43444b87433SJohn Marino {
43544b87433SJohn Marino /* If caller wants register contents data back, copy them. */
43644b87433SJohn Marino bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
43744b87433SJohn Marino bufp->regs_allocated);
438*6ea1f93eSDaniel Fojt if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
43944b87433SJohn Marino rval = -2;
44044b87433SJohn Marino }
44144b87433SJohn Marino
442*6ea1f93eSDaniel Fojt if (__glibc_likely (rval == 0))
44344b87433SJohn Marino {
44444b87433SJohn Marino if (ret_len)
44544b87433SJohn Marino {
44644b87433SJohn Marino assert (pmatch[0].rm_so == start);
44744b87433SJohn Marino rval = pmatch[0].rm_eo - start;
44844b87433SJohn Marino }
44944b87433SJohn Marino else
45044b87433SJohn Marino rval = pmatch[0].rm_so;
45144b87433SJohn Marino }
45244b87433SJohn Marino re_free (pmatch);
45344b87433SJohn Marino out:
454*6ea1f93eSDaniel Fojt lock_unlock (dfa->lock);
45544b87433SJohn Marino return rval;
45644b87433SJohn Marino }
45744b87433SJohn Marino
4584536c563SJohn Marino static unsigned
re_copy_regs(struct re_registers * regs,regmatch_t * pmatch,Idx nregs,int regs_allocated)45944b87433SJohn Marino re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
46044b87433SJohn Marino int regs_allocated)
46144b87433SJohn Marino {
46244b87433SJohn Marino int rval = REGS_REALLOCATE;
46344b87433SJohn Marino Idx i;
46444b87433SJohn Marino Idx need_regs = nregs + 1;
4654536c563SJohn Marino /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
46644b87433SJohn Marino uses. */
46744b87433SJohn Marino
46844b87433SJohn Marino /* Have the register data arrays been allocated? */
46944b87433SJohn Marino if (regs_allocated == REGS_UNALLOCATED)
47044b87433SJohn Marino { /* No. So allocate them with malloc. */
47144b87433SJohn Marino regs->start = re_malloc (regoff_t, need_regs);
472*6ea1f93eSDaniel Fojt if (__glibc_unlikely (regs->start == NULL))
47344b87433SJohn Marino return REGS_UNALLOCATED;
47444b87433SJohn Marino regs->end = re_malloc (regoff_t, need_regs);
475*6ea1f93eSDaniel Fojt if (__glibc_unlikely (regs->end == NULL))
47644b87433SJohn Marino {
47744b87433SJohn Marino re_free (regs->start);
47844b87433SJohn Marino return REGS_UNALLOCATED;
47944b87433SJohn Marino }
48044b87433SJohn Marino regs->num_regs = need_regs;
48144b87433SJohn Marino }
48244b87433SJohn Marino else if (regs_allocated == REGS_REALLOCATE)
48344b87433SJohn Marino { /* Yes. If we need more elements than were already
48444b87433SJohn Marino allocated, reallocate them. If we need fewer, just
48544b87433SJohn Marino leave it alone. */
486*6ea1f93eSDaniel Fojt if (__glibc_unlikely (need_regs > regs->num_regs))
48744b87433SJohn Marino {
48844b87433SJohn Marino regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
48944b87433SJohn Marino regoff_t *new_end;
490*6ea1f93eSDaniel Fojt if (__glibc_unlikely (new_start == NULL))
49144b87433SJohn Marino return REGS_UNALLOCATED;
49244b87433SJohn Marino new_end = re_realloc (regs->end, regoff_t, need_regs);
493*6ea1f93eSDaniel Fojt if (__glibc_unlikely (new_end == NULL))
49444b87433SJohn Marino {
49544b87433SJohn Marino re_free (new_start);
49644b87433SJohn Marino return REGS_UNALLOCATED;
49744b87433SJohn Marino }
49844b87433SJohn Marino regs->start = new_start;
49944b87433SJohn Marino regs->end = new_end;
50044b87433SJohn Marino regs->num_regs = need_regs;
50144b87433SJohn Marino }
50244b87433SJohn Marino }
50344b87433SJohn Marino else
50444b87433SJohn Marino {
50544b87433SJohn Marino assert (regs_allocated == REGS_FIXED);
50644b87433SJohn Marino /* This function may not be called with REGS_FIXED and nregs too big. */
50744b87433SJohn Marino assert (regs->num_regs >= nregs);
50844b87433SJohn Marino rval = REGS_FIXED;
50944b87433SJohn Marino }
51044b87433SJohn Marino
51144b87433SJohn Marino /* Copy the regs. */
51244b87433SJohn Marino for (i = 0; i < nregs; ++i)
51344b87433SJohn Marino {
51444b87433SJohn Marino regs->start[i] = pmatch[i].rm_so;
51544b87433SJohn Marino regs->end[i] = pmatch[i].rm_eo;
51644b87433SJohn Marino }
51744b87433SJohn Marino for ( ; i < regs->num_regs; ++i)
51844b87433SJohn Marino regs->start[i] = regs->end[i] = -1;
51944b87433SJohn Marino
52044b87433SJohn Marino return rval;
52144b87433SJohn Marino }
52244b87433SJohn Marino
52344b87433SJohn Marino /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
52444b87433SJohn Marino ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
52544b87433SJohn Marino this memory for recording register information. STARTS and ENDS
52644b87433SJohn Marino must be allocated using the malloc library routine, and must each
52744b87433SJohn Marino be at least NUM_REGS * sizeof (regoff_t) bytes long.
52844b87433SJohn Marino
52944b87433SJohn Marino If NUM_REGS == 0, then subsequent matches should allocate their own
53044b87433SJohn Marino register data.
53144b87433SJohn Marino
53244b87433SJohn Marino Unless this function is called, the first search or match using
53344b87433SJohn Marino PATTERN_BUFFER will allocate its own register data, without
53444b87433SJohn Marino freeing the old data. */
53544b87433SJohn Marino
53644b87433SJohn Marino void
re_set_registers(struct re_pattern_buffer * bufp,struct re_registers * regs,__re_size_t num_regs,regoff_t * starts,regoff_t * ends)537*6ea1f93eSDaniel Fojt re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
538*6ea1f93eSDaniel Fojt __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
53944b87433SJohn Marino {
54044b87433SJohn Marino if (num_regs)
54144b87433SJohn Marino {
54244b87433SJohn Marino bufp->regs_allocated = REGS_REALLOCATE;
54344b87433SJohn Marino regs->num_regs = num_regs;
54444b87433SJohn Marino regs->start = starts;
54544b87433SJohn Marino regs->end = ends;
54644b87433SJohn Marino }
54744b87433SJohn Marino else
54844b87433SJohn Marino {
54944b87433SJohn Marino bufp->regs_allocated = REGS_UNALLOCATED;
55044b87433SJohn Marino regs->num_regs = 0;
55144b87433SJohn Marino regs->start = regs->end = NULL;
55244b87433SJohn Marino }
55344b87433SJohn Marino }
55444b87433SJohn Marino #ifdef _LIBC
weak_alias(__re_set_registers,re_set_registers)55544b87433SJohn Marino weak_alias (__re_set_registers, re_set_registers)
55644b87433SJohn Marino #endif
55744b87433SJohn Marino
55844b87433SJohn Marino /* Entry points compatible with 4.2 BSD regex library. We don't define
55944b87433SJohn Marino them unless specifically requested. */
56044b87433SJohn Marino
56144b87433SJohn Marino #if defined _REGEX_RE_COMP || defined _LIBC
56244b87433SJohn Marino int
56344b87433SJohn Marino # ifdef _LIBC
56444b87433SJohn Marino weak_function
56544b87433SJohn Marino # endif
566*6ea1f93eSDaniel Fojt re_exec (const char *s)
56744b87433SJohn Marino {
56844b87433SJohn Marino return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
56944b87433SJohn Marino }
57044b87433SJohn Marino #endif /* _REGEX_RE_COMP */
57144b87433SJohn Marino
57244b87433SJohn Marino /* Internal entry point. */
57344b87433SJohn Marino
57444b87433SJohn Marino /* Searches for a compiled pattern PREG in the string STRING, whose
57544b87433SJohn Marino length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
57644b87433SJohn Marino meaning as with regexec. LAST_START is START + RANGE, where
57744b87433SJohn Marino START and RANGE have the same meaning as with re_search.
57844b87433SJohn Marino Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
57944b87433SJohn Marino otherwise return the error code.
58044b87433SJohn Marino Note: We assume front end functions already check ranges.
58144b87433SJohn Marino (0 <= LAST_START && LAST_START <= LENGTH) */
58244b87433SJohn Marino
58344b87433SJohn Marino static reg_errcode_t
5844536c563SJohn Marino __attribute_warn_unused_result__
re_search_internal(const regex_t * preg,const char * string,Idx length,Idx start,Idx last_start,Idx stop,size_t nmatch,regmatch_t pmatch[],int eflags)585*6ea1f93eSDaniel Fojt re_search_internal (const regex_t *preg, const char *string, Idx length,
586*6ea1f93eSDaniel Fojt Idx start, Idx last_start, Idx stop, size_t nmatch,
587*6ea1f93eSDaniel Fojt regmatch_t pmatch[], int eflags)
58844b87433SJohn Marino {
58944b87433SJohn Marino reg_errcode_t err;
5904536c563SJohn Marino const re_dfa_t *dfa = preg->buffer;
59144b87433SJohn Marino Idx left_lim, right_lim;
59244b87433SJohn Marino int incr;
59344b87433SJohn Marino bool fl_longest_match;
59444b87433SJohn Marino int match_kind;
59544b87433SJohn Marino Idx match_first;
596*6ea1f93eSDaniel Fojt Idx match_last = -1;
59744b87433SJohn Marino Idx extra_nmatch;
59844b87433SJohn Marino bool sb;
59944b87433SJohn Marino int ch;
60044b87433SJohn Marino #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
60144b87433SJohn Marino re_match_context_t mctx = { .dfa = dfa };
60244b87433SJohn Marino #else
60344b87433SJohn Marino re_match_context_t mctx;
60444b87433SJohn Marino #endif
60544b87433SJohn Marino char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
60644b87433SJohn Marino && start != last_start && !preg->can_be_null)
60744b87433SJohn Marino ? preg->fastmap : NULL);
60844b87433SJohn Marino RE_TRANSLATE_TYPE t = preg->translate;
60944b87433SJohn Marino
61044b87433SJohn Marino #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
61144b87433SJohn Marino memset (&mctx, '\0', sizeof (re_match_context_t));
61244b87433SJohn Marino mctx.dfa = dfa;
61344b87433SJohn Marino #endif
61444b87433SJohn Marino
61544b87433SJohn Marino extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
61644b87433SJohn Marino nmatch -= extra_nmatch;
61744b87433SJohn Marino
61844b87433SJohn Marino /* Check if the DFA haven't been compiled. */
619*6ea1f93eSDaniel Fojt if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
620*6ea1f93eSDaniel Fojt || dfa->init_state_word == NULL
621*6ea1f93eSDaniel Fojt || dfa->init_state_nl == NULL
622*6ea1f93eSDaniel Fojt || dfa->init_state_begbuf == NULL))
62344b87433SJohn Marino return REG_NOMATCH;
62444b87433SJohn Marino
62544b87433SJohn Marino #ifdef DEBUG
62644b87433SJohn Marino /* We assume front-end functions already check them. */
62744b87433SJohn Marino assert (0 <= last_start && last_start <= length);
62844b87433SJohn Marino #endif
62944b87433SJohn Marino
63044b87433SJohn Marino /* If initial states with non-begbuf contexts have no elements,
63144b87433SJohn Marino the regex must be anchored. If preg->newline_anchor is set,
63244b87433SJohn Marino we'll never use init_state_nl, so do not check it. */
63344b87433SJohn Marino if (dfa->init_state->nodes.nelem == 0
63444b87433SJohn Marino && dfa->init_state_word->nodes.nelem == 0
63544b87433SJohn Marino && (dfa->init_state_nl->nodes.nelem == 0
63644b87433SJohn Marino || !preg->newline_anchor))
63744b87433SJohn Marino {
63844b87433SJohn Marino if (start != 0 && last_start != 0)
63944b87433SJohn Marino return REG_NOMATCH;
64044b87433SJohn Marino start = last_start = 0;
64144b87433SJohn Marino }
64244b87433SJohn Marino
64344b87433SJohn Marino /* We must check the longest matching, if nmatch > 0. */
64444b87433SJohn Marino fl_longest_match = (nmatch != 0 || dfa->nbackref);
64544b87433SJohn Marino
64644b87433SJohn Marino err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
64744b87433SJohn Marino preg->translate, (preg->syntax & RE_ICASE) != 0,
64844b87433SJohn Marino dfa);
649*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
65044b87433SJohn Marino goto free_return;
65144b87433SJohn Marino mctx.input.stop = stop;
65244b87433SJohn Marino mctx.input.raw_stop = stop;
65344b87433SJohn Marino mctx.input.newline_anchor = preg->newline_anchor;
65444b87433SJohn Marino
65544b87433SJohn Marino err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
656*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
65744b87433SJohn Marino goto free_return;
65844b87433SJohn Marino
65944b87433SJohn Marino /* We will log all the DFA states through which the dfa pass,
66044b87433SJohn Marino if nmatch > 1, or this dfa has "multibyte node", which is a
66144b87433SJohn Marino back-reference or a node which can accept multibyte character or
66244b87433SJohn Marino multi character collating element. */
66344b87433SJohn Marino if (nmatch > 1 || dfa->has_mb_node)
66444b87433SJohn Marino {
66544b87433SJohn Marino /* Avoid overflow. */
666*6ea1f93eSDaniel Fojt if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
667*6ea1f93eSDaniel Fojt <= mctx.input.bufs_len)))
66844b87433SJohn Marino {
66944b87433SJohn Marino err = REG_ESPACE;
67044b87433SJohn Marino goto free_return;
67144b87433SJohn Marino }
67244b87433SJohn Marino
67344b87433SJohn Marino mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
674*6ea1f93eSDaniel Fojt if (__glibc_unlikely (mctx.state_log == NULL))
67544b87433SJohn Marino {
67644b87433SJohn Marino err = REG_ESPACE;
67744b87433SJohn Marino goto free_return;
67844b87433SJohn Marino }
67944b87433SJohn Marino }
68044b87433SJohn Marino else
68144b87433SJohn Marino mctx.state_log = NULL;
68244b87433SJohn Marino
68344b87433SJohn Marino match_first = start;
68444b87433SJohn Marino mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
68544b87433SJohn Marino : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
68644b87433SJohn Marino
6874536c563SJohn Marino /* Check incrementally whether the input string matches. */
68844b87433SJohn Marino incr = (last_start < start) ? -1 : 1;
68944b87433SJohn Marino left_lim = (last_start < start) ? last_start : start;
69044b87433SJohn Marino right_lim = (last_start < start) ? start : last_start;
69144b87433SJohn Marino sb = dfa->mb_cur_max == 1;
69244b87433SJohn Marino match_kind =
69344b87433SJohn Marino (fastmap
69444b87433SJohn Marino ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
69544b87433SJohn Marino | (start <= last_start ? 2 : 0)
69644b87433SJohn Marino | (t != NULL ? 1 : 0))
69744b87433SJohn Marino : 8);
69844b87433SJohn Marino
69944b87433SJohn Marino for (;; match_first += incr)
70044b87433SJohn Marino {
70144b87433SJohn Marino err = REG_NOMATCH;
70244b87433SJohn Marino if (match_first < left_lim || right_lim < match_first)
70344b87433SJohn Marino goto free_return;
70444b87433SJohn Marino
70544b87433SJohn Marino /* Advance as rapidly as possible through the string, until we
70644b87433SJohn Marino find a plausible place to start matching. This may be done
70744b87433SJohn Marino with varying efficiency, so there are various possibilities:
70844b87433SJohn Marino only the most common of them are specialized, in order to
70944b87433SJohn Marino save on code size. We use a switch statement for speed. */
71044b87433SJohn Marino switch (match_kind)
71144b87433SJohn Marino {
71244b87433SJohn Marino case 8:
71344b87433SJohn Marino /* No fastmap. */
71444b87433SJohn Marino break;
71544b87433SJohn Marino
71644b87433SJohn Marino case 7:
71744b87433SJohn Marino /* Fastmap with single-byte translation, match forward. */
718*6ea1f93eSDaniel Fojt while (__glibc_likely (match_first < right_lim)
71944b87433SJohn Marino && !fastmap[t[(unsigned char) string[match_first]]])
72044b87433SJohn Marino ++match_first;
72144b87433SJohn Marino goto forward_match_found_start_or_reached_end;
72244b87433SJohn Marino
72344b87433SJohn Marino case 6:
72444b87433SJohn Marino /* Fastmap without translation, match forward. */
725*6ea1f93eSDaniel Fojt while (__glibc_likely (match_first < right_lim)
72644b87433SJohn Marino && !fastmap[(unsigned char) string[match_first]])
72744b87433SJohn Marino ++match_first;
72844b87433SJohn Marino
72944b87433SJohn Marino forward_match_found_start_or_reached_end:
730*6ea1f93eSDaniel Fojt if (__glibc_unlikely (match_first == right_lim))
73144b87433SJohn Marino {
73244b87433SJohn Marino ch = match_first >= length
73344b87433SJohn Marino ? 0 : (unsigned char) string[match_first];
73444b87433SJohn Marino if (!fastmap[t ? t[ch] : ch])
73544b87433SJohn Marino goto free_return;
73644b87433SJohn Marino }
73744b87433SJohn Marino break;
73844b87433SJohn Marino
73944b87433SJohn Marino case 4:
74044b87433SJohn Marino case 5:
74144b87433SJohn Marino /* Fastmap without multi-byte translation, match backwards. */
74244b87433SJohn Marino while (match_first >= left_lim)
74344b87433SJohn Marino {
74444b87433SJohn Marino ch = match_first >= length
74544b87433SJohn Marino ? 0 : (unsigned char) string[match_first];
74644b87433SJohn Marino if (fastmap[t ? t[ch] : ch])
74744b87433SJohn Marino break;
74844b87433SJohn Marino --match_first;
74944b87433SJohn Marino }
75044b87433SJohn Marino if (match_first < left_lim)
75144b87433SJohn Marino goto free_return;
75244b87433SJohn Marino break;
75344b87433SJohn Marino
75444b87433SJohn Marino default:
75544b87433SJohn Marino /* In this case, we can't determine easily the current byte,
75644b87433SJohn Marino since it might be a component byte of a multibyte
75744b87433SJohn Marino character. Then we use the constructed buffer instead. */
75844b87433SJohn Marino for (;;)
75944b87433SJohn Marino {
76044b87433SJohn Marino /* If MATCH_FIRST is out of the valid range, reconstruct the
76144b87433SJohn Marino buffers. */
76244b87433SJohn Marino __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
763*6ea1f93eSDaniel Fojt if (__glibc_unlikely (offset
764*6ea1f93eSDaniel Fojt >= (__re_size_t) mctx.input.valid_raw_len))
76544b87433SJohn Marino {
76644b87433SJohn Marino err = re_string_reconstruct (&mctx.input, match_first,
76744b87433SJohn Marino eflags);
768*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
76944b87433SJohn Marino goto free_return;
77044b87433SJohn Marino
77144b87433SJohn Marino offset = match_first - mctx.input.raw_mbs_idx;
77244b87433SJohn Marino }
77344b87433SJohn Marino /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
77444b87433SJohn Marino Note that MATCH_FIRST must not be smaller than 0. */
77544b87433SJohn Marino ch = (match_first >= length
77644b87433SJohn Marino ? 0 : re_string_byte_at (&mctx.input, offset));
77744b87433SJohn Marino if (fastmap[ch])
77844b87433SJohn Marino break;
77944b87433SJohn Marino match_first += incr;
78044b87433SJohn Marino if (match_first < left_lim || match_first > right_lim)
78144b87433SJohn Marino {
78244b87433SJohn Marino err = REG_NOMATCH;
78344b87433SJohn Marino goto free_return;
78444b87433SJohn Marino }
78544b87433SJohn Marino }
78644b87433SJohn Marino break;
78744b87433SJohn Marino }
78844b87433SJohn Marino
78944b87433SJohn Marino /* Reconstruct the buffers so that the matcher can assume that
79044b87433SJohn Marino the matching starts from the beginning of the buffer. */
79144b87433SJohn Marino err = re_string_reconstruct (&mctx.input, match_first, eflags);
792*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
79344b87433SJohn Marino goto free_return;
79444b87433SJohn Marino
79544b87433SJohn Marino #ifdef RE_ENABLE_I18N
79644b87433SJohn Marino /* Don't consider this char as a possible match start if it part,
79744b87433SJohn Marino yet isn't the head, of a multibyte character. */
79844b87433SJohn Marino if (!sb && !re_string_first_byte (&mctx.input, 0))
79944b87433SJohn Marino continue;
80044b87433SJohn Marino #endif
80144b87433SJohn Marino
80244b87433SJohn Marino /* It seems to be appropriate one, then use the matcher. */
80344b87433SJohn Marino /* We assume that the matching starts from 0. */
80444b87433SJohn Marino mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
80544b87433SJohn Marino match_last = check_matching (&mctx, fl_longest_match,
80644b87433SJohn Marino start <= last_start ? &match_first : NULL);
807*6ea1f93eSDaniel Fojt if (match_last != -1)
80844b87433SJohn Marino {
809*6ea1f93eSDaniel Fojt if (__glibc_unlikely (match_last == -2))
81044b87433SJohn Marino {
81144b87433SJohn Marino err = REG_ESPACE;
81244b87433SJohn Marino goto free_return;
81344b87433SJohn Marino }
81444b87433SJohn Marino else
81544b87433SJohn Marino {
81644b87433SJohn Marino mctx.match_last = match_last;
81744b87433SJohn Marino if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
81844b87433SJohn Marino {
81944b87433SJohn Marino re_dfastate_t *pstate = mctx.state_log[match_last];
82044b87433SJohn Marino mctx.last_node = check_halt_state_context (&mctx, pstate,
82144b87433SJohn Marino match_last);
82244b87433SJohn Marino }
82344b87433SJohn Marino if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
82444b87433SJohn Marino || dfa->nbackref)
82544b87433SJohn Marino {
82644b87433SJohn Marino err = prune_impossible_nodes (&mctx);
82744b87433SJohn Marino if (err == REG_NOERROR)
82844b87433SJohn Marino break;
829*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOMATCH))
83044b87433SJohn Marino goto free_return;
831*6ea1f93eSDaniel Fojt match_last = -1;
83244b87433SJohn Marino }
83344b87433SJohn Marino else
83444b87433SJohn Marino break; /* We found a match. */
83544b87433SJohn Marino }
83644b87433SJohn Marino }
83744b87433SJohn Marino
83844b87433SJohn Marino match_ctx_clean (&mctx);
83944b87433SJohn Marino }
84044b87433SJohn Marino
84144b87433SJohn Marino #ifdef DEBUG
842*6ea1f93eSDaniel Fojt assert (match_last != -1);
84344b87433SJohn Marino assert (err == REG_NOERROR);
84444b87433SJohn Marino #endif
84544b87433SJohn Marino
84644b87433SJohn Marino /* Set pmatch[] if we need. */
84744b87433SJohn Marino if (nmatch > 0)
84844b87433SJohn Marino {
84944b87433SJohn Marino Idx reg_idx;
85044b87433SJohn Marino
85144b87433SJohn Marino /* Initialize registers. */
85244b87433SJohn Marino for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
85344b87433SJohn Marino pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
85444b87433SJohn Marino
85544b87433SJohn Marino /* Set the points where matching start/end. */
85644b87433SJohn Marino pmatch[0].rm_so = 0;
85744b87433SJohn Marino pmatch[0].rm_eo = mctx.match_last;
85844b87433SJohn Marino /* FIXME: This function should fail if mctx.match_last exceeds
85944b87433SJohn Marino the maximum possible regoff_t value. We need a new error
86044b87433SJohn Marino code REG_OVERFLOW. */
86144b87433SJohn Marino
86244b87433SJohn Marino if (!preg->no_sub && nmatch > 1)
86344b87433SJohn Marino {
86444b87433SJohn Marino err = set_regs (preg, &mctx, nmatch, pmatch,
86544b87433SJohn Marino dfa->has_plural_match && dfa->nbackref > 0);
866*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
86744b87433SJohn Marino goto free_return;
86844b87433SJohn Marino }
86944b87433SJohn Marino
8704536c563SJohn Marino /* At last, add the offset to each register, since we slid
87144b87433SJohn Marino the buffers so that we could assume that the matching starts
87244b87433SJohn Marino from 0. */
87344b87433SJohn Marino for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
87444b87433SJohn Marino if (pmatch[reg_idx].rm_so != -1)
87544b87433SJohn Marino {
87644b87433SJohn Marino #ifdef RE_ENABLE_I18N
877*6ea1f93eSDaniel Fojt if (__glibc_unlikely (mctx.input.offsets_needed != 0))
87844b87433SJohn Marino {
87944b87433SJohn Marino pmatch[reg_idx].rm_so =
88044b87433SJohn Marino (pmatch[reg_idx].rm_so == mctx.input.valid_len
88144b87433SJohn Marino ? mctx.input.valid_raw_len
88244b87433SJohn Marino : mctx.input.offsets[pmatch[reg_idx].rm_so]);
88344b87433SJohn Marino pmatch[reg_idx].rm_eo =
88444b87433SJohn Marino (pmatch[reg_idx].rm_eo == mctx.input.valid_len
88544b87433SJohn Marino ? mctx.input.valid_raw_len
88644b87433SJohn Marino : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
88744b87433SJohn Marino }
88844b87433SJohn Marino #else
88944b87433SJohn Marino assert (mctx.input.offsets_needed == 0);
89044b87433SJohn Marino #endif
89144b87433SJohn Marino pmatch[reg_idx].rm_so += match_first;
89244b87433SJohn Marino pmatch[reg_idx].rm_eo += match_first;
89344b87433SJohn Marino }
89444b87433SJohn Marino for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
89544b87433SJohn Marino {
89644b87433SJohn Marino pmatch[nmatch + reg_idx].rm_so = -1;
89744b87433SJohn Marino pmatch[nmatch + reg_idx].rm_eo = -1;
89844b87433SJohn Marino }
89944b87433SJohn Marino
90044b87433SJohn Marino if (dfa->subexp_map)
90144b87433SJohn Marino for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
90244b87433SJohn Marino if (dfa->subexp_map[reg_idx] != reg_idx)
90344b87433SJohn Marino {
90444b87433SJohn Marino pmatch[reg_idx + 1].rm_so
90544b87433SJohn Marino = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
90644b87433SJohn Marino pmatch[reg_idx + 1].rm_eo
90744b87433SJohn Marino = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
90844b87433SJohn Marino }
90944b87433SJohn Marino }
91044b87433SJohn Marino
91144b87433SJohn Marino free_return:
91244b87433SJohn Marino re_free (mctx.state_log);
91344b87433SJohn Marino if (dfa->nbackref)
91444b87433SJohn Marino match_ctx_free (&mctx);
91544b87433SJohn Marino re_string_destruct (&mctx.input);
91644b87433SJohn Marino return err;
91744b87433SJohn Marino }
91844b87433SJohn Marino
91944b87433SJohn Marino static reg_errcode_t
9204536c563SJohn Marino __attribute_warn_unused_result__
prune_impossible_nodes(re_match_context_t * mctx)92144b87433SJohn Marino prune_impossible_nodes (re_match_context_t *mctx)
92244b87433SJohn Marino {
92344b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
92444b87433SJohn Marino Idx halt_node, match_last;
92544b87433SJohn Marino reg_errcode_t ret;
92644b87433SJohn Marino re_dfastate_t **sifted_states;
92744b87433SJohn Marino re_dfastate_t **lim_states = NULL;
92844b87433SJohn Marino re_sift_context_t sctx;
92944b87433SJohn Marino #ifdef DEBUG
93044b87433SJohn Marino assert (mctx->state_log != NULL);
93144b87433SJohn Marino #endif
93244b87433SJohn Marino match_last = mctx->match_last;
93344b87433SJohn Marino halt_node = mctx->last_node;
93444b87433SJohn Marino
93544b87433SJohn Marino /* Avoid overflow. */
936*6ea1f93eSDaniel Fojt if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
937*6ea1f93eSDaniel Fojt <= match_last))
93844b87433SJohn Marino return REG_ESPACE;
93944b87433SJohn Marino
94044b87433SJohn Marino sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
941*6ea1f93eSDaniel Fojt if (__glibc_unlikely (sifted_states == NULL))
94244b87433SJohn Marino {
94344b87433SJohn Marino ret = REG_ESPACE;
94444b87433SJohn Marino goto free_return;
94544b87433SJohn Marino }
94644b87433SJohn Marino if (dfa->nbackref)
94744b87433SJohn Marino {
94844b87433SJohn Marino lim_states = re_malloc (re_dfastate_t *, match_last + 1);
949*6ea1f93eSDaniel Fojt if (__glibc_unlikely (lim_states == NULL))
95044b87433SJohn Marino {
95144b87433SJohn Marino ret = REG_ESPACE;
95244b87433SJohn Marino goto free_return;
95344b87433SJohn Marino }
95444b87433SJohn Marino while (1)
95544b87433SJohn Marino {
95644b87433SJohn Marino memset (lim_states, '\0',
95744b87433SJohn Marino sizeof (re_dfastate_t *) * (match_last + 1));
95844b87433SJohn Marino sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
95944b87433SJohn Marino match_last);
96044b87433SJohn Marino ret = sift_states_backward (mctx, &sctx);
96144b87433SJohn Marino re_node_set_free (&sctx.limits);
962*6ea1f93eSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
96344b87433SJohn Marino goto free_return;
96444b87433SJohn Marino if (sifted_states[0] != NULL || lim_states[0] != NULL)
96544b87433SJohn Marino break;
96644b87433SJohn Marino do
96744b87433SJohn Marino {
96844b87433SJohn Marino --match_last;
969*6ea1f93eSDaniel Fojt if (match_last < 0)
97044b87433SJohn Marino {
97144b87433SJohn Marino ret = REG_NOMATCH;
97244b87433SJohn Marino goto free_return;
97344b87433SJohn Marino }
97444b87433SJohn Marino } while (mctx->state_log[match_last] == NULL
97544b87433SJohn Marino || !mctx->state_log[match_last]->halt);
97644b87433SJohn Marino halt_node = check_halt_state_context (mctx,
97744b87433SJohn Marino mctx->state_log[match_last],
97844b87433SJohn Marino match_last);
97944b87433SJohn Marino }
98044b87433SJohn Marino ret = merge_state_array (dfa, sifted_states, lim_states,
98144b87433SJohn Marino match_last + 1);
98244b87433SJohn Marino re_free (lim_states);
98344b87433SJohn Marino lim_states = NULL;
984*6ea1f93eSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
98544b87433SJohn Marino goto free_return;
98644b87433SJohn Marino }
98744b87433SJohn Marino else
98844b87433SJohn Marino {
98944b87433SJohn Marino sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
99044b87433SJohn Marino ret = sift_states_backward (mctx, &sctx);
99144b87433SJohn Marino re_node_set_free (&sctx.limits);
992*6ea1f93eSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
99344b87433SJohn Marino goto free_return;
99444b87433SJohn Marino if (sifted_states[0] == NULL)
99544b87433SJohn Marino {
99644b87433SJohn Marino ret = REG_NOMATCH;
99744b87433SJohn Marino goto free_return;
99844b87433SJohn Marino }
99944b87433SJohn Marino }
100044b87433SJohn Marino re_free (mctx->state_log);
100144b87433SJohn Marino mctx->state_log = sifted_states;
100244b87433SJohn Marino sifted_states = NULL;
100344b87433SJohn Marino mctx->last_node = halt_node;
100444b87433SJohn Marino mctx->match_last = match_last;
100544b87433SJohn Marino ret = REG_NOERROR;
100644b87433SJohn Marino free_return:
100744b87433SJohn Marino re_free (sifted_states);
100844b87433SJohn Marino re_free (lim_states);
100944b87433SJohn Marino return ret;
101044b87433SJohn Marino }
101144b87433SJohn Marino
101244b87433SJohn Marino /* Acquire an initial state and return it.
101344b87433SJohn Marino We must select appropriate initial state depending on the context,
101444b87433SJohn Marino since initial states may have constraints like "\<", "^", etc.. */
101544b87433SJohn Marino
101644b87433SJohn Marino static inline re_dfastate_t *
1017*6ea1f93eSDaniel Fojt __attribute__ ((always_inline))
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,Idx idx)101844b87433SJohn Marino acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
101944b87433SJohn Marino Idx idx)
102044b87433SJohn Marino {
102144b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
102244b87433SJohn Marino if (dfa->init_state->has_constraint)
102344b87433SJohn Marino {
102444b87433SJohn Marino unsigned int context;
102544b87433SJohn Marino context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
102644b87433SJohn Marino if (IS_WORD_CONTEXT (context))
102744b87433SJohn Marino return dfa->init_state_word;
102844b87433SJohn Marino else if (IS_ORDINARY_CONTEXT (context))
102944b87433SJohn Marino return dfa->init_state;
103044b87433SJohn Marino else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
103144b87433SJohn Marino return dfa->init_state_begbuf;
103244b87433SJohn Marino else if (IS_NEWLINE_CONTEXT (context))
103344b87433SJohn Marino return dfa->init_state_nl;
103444b87433SJohn Marino else if (IS_BEGBUF_CONTEXT (context))
103544b87433SJohn Marino {
103644b87433SJohn Marino /* It is relatively rare case, then calculate on demand. */
103744b87433SJohn Marino return re_acquire_state_context (err, dfa,
103844b87433SJohn Marino dfa->init_state->entrance_nodes,
103944b87433SJohn Marino context);
104044b87433SJohn Marino }
104144b87433SJohn Marino else
104244b87433SJohn Marino /* Must not happen? */
104344b87433SJohn Marino return dfa->init_state;
104444b87433SJohn Marino }
104544b87433SJohn Marino else
104644b87433SJohn Marino return dfa->init_state;
104744b87433SJohn Marino }
104844b87433SJohn Marino
104944b87433SJohn Marino /* Check whether the regular expression match input string INPUT or not,
1050*6ea1f93eSDaniel Fojt and return the index where the matching end. Return -1 if
1051*6ea1f93eSDaniel Fojt there is no match, and return -2 in case of an error.
105244b87433SJohn Marino FL_LONGEST_MATCH means we want the POSIX longest matching.
105344b87433SJohn Marino If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
105444b87433SJohn Marino next place where we may want to try matching.
10554536c563SJohn Marino Note that the matcher assumes that the matching starts from the current
105644b87433SJohn Marino index of the buffer. */
105744b87433SJohn Marino
105844b87433SJohn Marino static Idx
1059*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
check_matching(re_match_context_t * mctx,bool fl_longest_match,Idx * p_match_first)106044b87433SJohn Marino check_matching (re_match_context_t *mctx, bool fl_longest_match,
106144b87433SJohn Marino Idx *p_match_first)
106244b87433SJohn Marino {
106344b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
106444b87433SJohn Marino reg_errcode_t err;
106544b87433SJohn Marino Idx match = 0;
1066*6ea1f93eSDaniel Fojt Idx match_last = -1;
106744b87433SJohn Marino Idx cur_str_idx = re_string_cur_idx (&mctx->input);
106844b87433SJohn Marino re_dfastate_t *cur_state;
106944b87433SJohn Marino bool at_init_state = p_match_first != NULL;
107044b87433SJohn Marino Idx next_start_idx = cur_str_idx;
107144b87433SJohn Marino
107244b87433SJohn Marino err = REG_NOERROR;
107344b87433SJohn Marino cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
107444b87433SJohn Marino /* An initial state must not be NULL (invalid). */
1075*6ea1f93eSDaniel Fojt if (__glibc_unlikely (cur_state == NULL))
107644b87433SJohn Marino {
107744b87433SJohn Marino assert (err == REG_ESPACE);
1078*6ea1f93eSDaniel Fojt return -2;
107944b87433SJohn Marino }
108044b87433SJohn Marino
108144b87433SJohn Marino if (mctx->state_log != NULL)
108244b87433SJohn Marino {
108344b87433SJohn Marino mctx->state_log[cur_str_idx] = cur_state;
108444b87433SJohn Marino
108544b87433SJohn Marino /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
108644b87433SJohn Marino later. E.g. Processing back references. */
1087*6ea1f93eSDaniel Fojt if (__glibc_unlikely (dfa->nbackref))
108844b87433SJohn Marino {
108944b87433SJohn Marino at_init_state = false;
109044b87433SJohn Marino err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1091*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
109244b87433SJohn Marino return err;
109344b87433SJohn Marino
109444b87433SJohn Marino if (cur_state->has_backref)
109544b87433SJohn Marino {
109644b87433SJohn Marino err = transit_state_bkref (mctx, &cur_state->nodes);
1097*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
109844b87433SJohn Marino return err;
109944b87433SJohn Marino }
110044b87433SJohn Marino }
110144b87433SJohn Marino }
110244b87433SJohn Marino
110344b87433SJohn Marino /* If the RE accepts NULL string. */
1104*6ea1f93eSDaniel Fojt if (__glibc_unlikely (cur_state->halt))
110544b87433SJohn Marino {
110644b87433SJohn Marino if (!cur_state->has_constraint
110744b87433SJohn Marino || check_halt_state_context (mctx, cur_state, cur_str_idx))
110844b87433SJohn Marino {
110944b87433SJohn Marino if (!fl_longest_match)
111044b87433SJohn Marino return cur_str_idx;
111144b87433SJohn Marino else
111244b87433SJohn Marino {
111344b87433SJohn Marino match_last = cur_str_idx;
111444b87433SJohn Marino match = 1;
111544b87433SJohn Marino }
111644b87433SJohn Marino }
111744b87433SJohn Marino }
111844b87433SJohn Marino
111944b87433SJohn Marino while (!re_string_eoi (&mctx->input))
112044b87433SJohn Marino {
112144b87433SJohn Marino re_dfastate_t *old_state = cur_state;
112244b87433SJohn Marino Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
112344b87433SJohn Marino
1124*6ea1f93eSDaniel Fojt if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
11254536c563SJohn Marino && mctx->input.bufs_len < mctx->input.len)
1126*6ea1f93eSDaniel Fojt || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
112744b87433SJohn Marino && mctx->input.valid_len < mctx->input.len))
112844b87433SJohn Marino {
11294536c563SJohn Marino err = extend_buffers (mctx, next_char_idx + 1);
1130*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
113144b87433SJohn Marino {
113244b87433SJohn Marino assert (err == REG_ESPACE);
1133*6ea1f93eSDaniel Fojt return -2;
113444b87433SJohn Marino }
113544b87433SJohn Marino }
113644b87433SJohn Marino
113744b87433SJohn Marino cur_state = transit_state (&err, mctx, cur_state);
113844b87433SJohn Marino if (mctx->state_log != NULL)
113944b87433SJohn Marino cur_state = merge_state_with_log (&err, mctx, cur_state);
114044b87433SJohn Marino
114144b87433SJohn Marino if (cur_state == NULL)
114244b87433SJohn Marino {
114344b87433SJohn Marino /* Reached the invalid state or an error. Try to recover a valid
114444b87433SJohn Marino state using the state log, if available and if we have not
114544b87433SJohn Marino already found a valid (even if not the longest) match. */
1146*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
1147*6ea1f93eSDaniel Fojt return -2;
114844b87433SJohn Marino
114944b87433SJohn Marino if (mctx->state_log == NULL
115044b87433SJohn Marino || (match && !fl_longest_match)
115144b87433SJohn Marino || (cur_state = find_recover_state (&err, mctx)) == NULL)
115244b87433SJohn Marino break;
115344b87433SJohn Marino }
115444b87433SJohn Marino
1155*6ea1f93eSDaniel Fojt if (__glibc_unlikely (at_init_state))
115644b87433SJohn Marino {
115744b87433SJohn Marino if (old_state == cur_state)
115844b87433SJohn Marino next_start_idx = next_char_idx;
115944b87433SJohn Marino else
116044b87433SJohn Marino at_init_state = false;
116144b87433SJohn Marino }
116244b87433SJohn Marino
116344b87433SJohn Marino if (cur_state->halt)
116444b87433SJohn Marino {
116544b87433SJohn Marino /* Reached a halt state.
116644b87433SJohn Marino Check the halt state can satisfy the current context. */
116744b87433SJohn Marino if (!cur_state->has_constraint
116844b87433SJohn Marino || check_halt_state_context (mctx, cur_state,
116944b87433SJohn Marino re_string_cur_idx (&mctx->input)))
117044b87433SJohn Marino {
117144b87433SJohn Marino /* We found an appropriate halt state. */
117244b87433SJohn Marino match_last = re_string_cur_idx (&mctx->input);
117344b87433SJohn Marino match = 1;
117444b87433SJohn Marino
117544b87433SJohn Marino /* We found a match, do not modify match_first below. */
117644b87433SJohn Marino p_match_first = NULL;
117744b87433SJohn Marino if (!fl_longest_match)
117844b87433SJohn Marino break;
117944b87433SJohn Marino }
118044b87433SJohn Marino }
118144b87433SJohn Marino }
118244b87433SJohn Marino
118344b87433SJohn Marino if (p_match_first)
118444b87433SJohn Marino *p_match_first += next_start_idx;
118544b87433SJohn Marino
118644b87433SJohn Marino return match_last;
118744b87433SJohn Marino }
118844b87433SJohn Marino
118944b87433SJohn Marino /* Check NODE match the current context. */
119044b87433SJohn Marino
119144b87433SJohn Marino static bool
check_halt_node_context(const re_dfa_t * dfa,Idx node,unsigned int context)119244b87433SJohn Marino check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
119344b87433SJohn Marino {
119444b87433SJohn Marino re_token_type_t type = dfa->nodes[node].type;
119544b87433SJohn Marino unsigned int constraint = dfa->nodes[node].constraint;
119644b87433SJohn Marino if (type != END_OF_RE)
119744b87433SJohn Marino return false;
119844b87433SJohn Marino if (!constraint)
119944b87433SJohn Marino return true;
120044b87433SJohn Marino if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
120144b87433SJohn Marino return false;
120244b87433SJohn Marino return true;
120344b87433SJohn Marino }
120444b87433SJohn Marino
120544b87433SJohn Marino /* Check the halt state STATE match the current context.
120644b87433SJohn Marino Return 0 if not match, if the node, STATE has, is a halt node and
120744b87433SJohn Marino match the context, return the node. */
120844b87433SJohn Marino
120944b87433SJohn Marino static Idx
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,Idx idx)121044b87433SJohn Marino check_halt_state_context (const re_match_context_t *mctx,
121144b87433SJohn Marino const re_dfastate_t *state, Idx idx)
121244b87433SJohn Marino {
121344b87433SJohn Marino Idx i;
121444b87433SJohn Marino unsigned int context;
121544b87433SJohn Marino #ifdef DEBUG
121644b87433SJohn Marino assert (state->halt);
121744b87433SJohn Marino #endif
121844b87433SJohn Marino context = re_string_context_at (&mctx->input, idx, mctx->eflags);
121944b87433SJohn Marino for (i = 0; i < state->nodes.nelem; ++i)
122044b87433SJohn Marino if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
122144b87433SJohn Marino return state->nodes.elems[i];
122244b87433SJohn Marino return 0;
122344b87433SJohn Marino }
122444b87433SJohn Marino
122544b87433SJohn Marino /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
122644b87433SJohn Marino corresponding to the DFA).
122744b87433SJohn Marino Return the destination node, and update EPS_VIA_NODES;
1228*6ea1f93eSDaniel Fojt return -1 in case of errors. */
122944b87433SJohn Marino
123044b87433SJohn Marino static Idx
proceed_next_node(const re_match_context_t * mctx,Idx nregs,regmatch_t * regs,Idx * pidx,Idx node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)123144b87433SJohn Marino proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
123244b87433SJohn Marino Idx *pidx, Idx node, re_node_set *eps_via_nodes,
123344b87433SJohn Marino struct re_fail_stack_t *fs)
123444b87433SJohn Marino {
123544b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
123644b87433SJohn Marino Idx i;
123744b87433SJohn Marino bool ok;
123844b87433SJohn Marino if (IS_EPSILON_NODE (dfa->nodes[node].type))
123944b87433SJohn Marino {
124044b87433SJohn Marino re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
124144b87433SJohn Marino re_node_set *edests = &dfa->edests[node];
124244b87433SJohn Marino Idx dest_node;
124344b87433SJohn Marino ok = re_node_set_insert (eps_via_nodes, node);
1244*6ea1f93eSDaniel Fojt if (__glibc_unlikely (! ok))
1245*6ea1f93eSDaniel Fojt return -2;
1246*6ea1f93eSDaniel Fojt /* Pick up a valid destination, or return -1 if none
124744b87433SJohn Marino is found. */
1248*6ea1f93eSDaniel Fojt for (dest_node = -1, i = 0; i < edests->nelem; ++i)
124944b87433SJohn Marino {
125044b87433SJohn Marino Idx candidate = edests->elems[i];
125144b87433SJohn Marino if (!re_node_set_contains (cur_nodes, candidate))
125244b87433SJohn Marino continue;
1253*6ea1f93eSDaniel Fojt if (dest_node == -1)
125444b87433SJohn Marino dest_node = candidate;
125544b87433SJohn Marino
125644b87433SJohn Marino else
125744b87433SJohn Marino {
125844b87433SJohn Marino /* In order to avoid infinite loop like "(a*)*", return the second
125944b87433SJohn Marino epsilon-transition if the first was already considered. */
126044b87433SJohn Marino if (re_node_set_contains (eps_via_nodes, dest_node))
126144b87433SJohn Marino return candidate;
126244b87433SJohn Marino
126344b87433SJohn Marino /* Otherwise, push the second epsilon-transition on the fail stack. */
126444b87433SJohn Marino else if (fs != NULL
126544b87433SJohn Marino && push_fail_stack (fs, *pidx, candidate, nregs, regs,
126644b87433SJohn Marino eps_via_nodes))
1267*6ea1f93eSDaniel Fojt return -2;
126844b87433SJohn Marino
126944b87433SJohn Marino /* We know we are going to exit. */
127044b87433SJohn Marino break;
127144b87433SJohn Marino }
127244b87433SJohn Marino }
127344b87433SJohn Marino return dest_node;
127444b87433SJohn Marino }
127544b87433SJohn Marino else
127644b87433SJohn Marino {
127744b87433SJohn Marino Idx naccepted = 0;
127844b87433SJohn Marino re_token_type_t type = dfa->nodes[node].type;
127944b87433SJohn Marino
128044b87433SJohn Marino #ifdef RE_ENABLE_I18N
128144b87433SJohn Marino if (dfa->nodes[node].accept_mb)
128244b87433SJohn Marino naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
128344b87433SJohn Marino else
128444b87433SJohn Marino #endif /* RE_ENABLE_I18N */
128544b87433SJohn Marino if (type == OP_BACK_REF)
128644b87433SJohn Marino {
128744b87433SJohn Marino Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
128844b87433SJohn Marino naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
128944b87433SJohn Marino if (fs != NULL)
129044b87433SJohn Marino {
129144b87433SJohn Marino if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1292*6ea1f93eSDaniel Fojt return -1;
129344b87433SJohn Marino else if (naccepted)
129444b87433SJohn Marino {
129544b87433SJohn Marino char *buf = (char *) re_string_get_buffer (&mctx->input);
129644b87433SJohn Marino if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
129744b87433SJohn Marino naccepted) != 0)
1298*6ea1f93eSDaniel Fojt return -1;
129944b87433SJohn Marino }
130044b87433SJohn Marino }
130144b87433SJohn Marino
130244b87433SJohn Marino if (naccepted == 0)
130344b87433SJohn Marino {
130444b87433SJohn Marino Idx dest_node;
130544b87433SJohn Marino ok = re_node_set_insert (eps_via_nodes, node);
1306*6ea1f93eSDaniel Fojt if (__glibc_unlikely (! ok))
1307*6ea1f93eSDaniel Fojt return -2;
130844b87433SJohn Marino dest_node = dfa->edests[node].elems[0];
130944b87433SJohn Marino if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
131044b87433SJohn Marino dest_node))
131144b87433SJohn Marino return dest_node;
131244b87433SJohn Marino }
131344b87433SJohn Marino }
131444b87433SJohn Marino
131544b87433SJohn Marino if (naccepted != 0
131644b87433SJohn Marino || check_node_accept (mctx, dfa->nodes + node, *pidx))
131744b87433SJohn Marino {
131844b87433SJohn Marino Idx dest_node = dfa->nexts[node];
131944b87433SJohn Marino *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
132044b87433SJohn Marino if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
132144b87433SJohn Marino || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
132244b87433SJohn Marino dest_node)))
1323*6ea1f93eSDaniel Fojt return -1;
132444b87433SJohn Marino re_node_set_empty (eps_via_nodes);
132544b87433SJohn Marino return dest_node;
132644b87433SJohn Marino }
132744b87433SJohn Marino }
1328*6ea1f93eSDaniel Fojt return -1;
132944b87433SJohn Marino }
133044b87433SJohn Marino
133144b87433SJohn Marino static reg_errcode_t
1332*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
push_fail_stack(struct re_fail_stack_t * fs,Idx str_idx,Idx dest_node,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)133344b87433SJohn Marino push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
133444b87433SJohn Marino Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
133544b87433SJohn Marino {
133644b87433SJohn Marino reg_errcode_t err;
133744b87433SJohn Marino Idx num = fs->num++;
133844b87433SJohn Marino if (fs->num == fs->alloc)
133944b87433SJohn Marino {
134044b87433SJohn Marino struct re_fail_stack_ent_t *new_array;
1341*6ea1f93eSDaniel Fojt new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1342*6ea1f93eSDaniel Fojt fs->alloc * 2);
134344b87433SJohn Marino if (new_array == NULL)
134444b87433SJohn Marino return REG_ESPACE;
134544b87433SJohn Marino fs->alloc *= 2;
134644b87433SJohn Marino fs->stack = new_array;
134744b87433SJohn Marino }
134844b87433SJohn Marino fs->stack[num].idx = str_idx;
134944b87433SJohn Marino fs->stack[num].node = dest_node;
135044b87433SJohn Marino fs->stack[num].regs = re_malloc (regmatch_t, nregs);
135144b87433SJohn Marino if (fs->stack[num].regs == NULL)
135244b87433SJohn Marino return REG_ESPACE;
135344b87433SJohn Marino memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
135444b87433SJohn Marino err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
135544b87433SJohn Marino return err;
135644b87433SJohn Marino }
135744b87433SJohn Marino
135844b87433SJohn Marino static Idx
pop_fail_stack(struct re_fail_stack_t * fs,Idx * pidx,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)135944b87433SJohn Marino pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
136044b87433SJohn Marino regmatch_t *regs, re_node_set *eps_via_nodes)
136144b87433SJohn Marino {
136244b87433SJohn Marino Idx num = --fs->num;
1363*6ea1f93eSDaniel Fojt assert (num >= 0);
136444b87433SJohn Marino *pidx = fs->stack[num].idx;
136544b87433SJohn Marino memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
136644b87433SJohn Marino re_node_set_free (eps_via_nodes);
136744b87433SJohn Marino re_free (fs->stack[num].regs);
136844b87433SJohn Marino *eps_via_nodes = fs->stack[num].eps_via_nodes;
136944b87433SJohn Marino return fs->stack[num].node;
137044b87433SJohn Marino }
137144b87433SJohn Marino
137244b87433SJohn Marino /* Set the positions where the subexpressions are starts/ends to registers
137344b87433SJohn Marino PMATCH.
137444b87433SJohn Marino Note: We assume that pmatch[0] is already set, and
137544b87433SJohn Marino pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
137644b87433SJohn Marino
137744b87433SJohn Marino static reg_errcode_t
1378*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,bool fl_backtrack)137944b87433SJohn Marino set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
138044b87433SJohn Marino regmatch_t *pmatch, bool fl_backtrack)
138144b87433SJohn Marino {
13824536c563SJohn Marino const re_dfa_t *dfa = preg->buffer;
138344b87433SJohn Marino Idx idx, cur_node;
138444b87433SJohn Marino re_node_set eps_via_nodes;
138544b87433SJohn Marino struct re_fail_stack_t *fs;
138644b87433SJohn Marino struct re_fail_stack_t fs_body = { 0, 2, NULL };
138744b87433SJohn Marino regmatch_t *prev_idx_match;
138844b87433SJohn Marino bool prev_idx_match_malloced = false;
138944b87433SJohn Marino
139044b87433SJohn Marino #ifdef DEBUG
139144b87433SJohn Marino assert (nmatch > 1);
139244b87433SJohn Marino assert (mctx->state_log != NULL);
139344b87433SJohn Marino #endif
139444b87433SJohn Marino if (fl_backtrack)
139544b87433SJohn Marino {
139644b87433SJohn Marino fs = &fs_body;
139744b87433SJohn Marino fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
139844b87433SJohn Marino if (fs->stack == NULL)
139944b87433SJohn Marino return REG_ESPACE;
140044b87433SJohn Marino }
140144b87433SJohn Marino else
140244b87433SJohn Marino fs = NULL;
140344b87433SJohn Marino
140444b87433SJohn Marino cur_node = dfa->init_node;
140544b87433SJohn Marino re_node_set_init_empty (&eps_via_nodes);
140644b87433SJohn Marino
140744b87433SJohn Marino if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
140844b87433SJohn Marino prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
140944b87433SJohn Marino else
141044b87433SJohn Marino {
141144b87433SJohn Marino prev_idx_match = re_malloc (regmatch_t, nmatch);
141244b87433SJohn Marino if (prev_idx_match == NULL)
141344b87433SJohn Marino {
141444b87433SJohn Marino free_fail_stack_return (fs);
141544b87433SJohn Marino return REG_ESPACE;
141644b87433SJohn Marino }
141744b87433SJohn Marino prev_idx_match_malloced = true;
141844b87433SJohn Marino }
141944b87433SJohn Marino memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
142044b87433SJohn Marino
142144b87433SJohn Marino for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
142244b87433SJohn Marino {
142344b87433SJohn Marino update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
142444b87433SJohn Marino
142544b87433SJohn Marino if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
142644b87433SJohn Marino {
142744b87433SJohn Marino Idx reg_idx;
142844b87433SJohn Marino if (fs)
142944b87433SJohn Marino {
143044b87433SJohn Marino for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
143144b87433SJohn Marino if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
143244b87433SJohn Marino break;
143344b87433SJohn Marino if (reg_idx == nmatch)
143444b87433SJohn Marino {
143544b87433SJohn Marino re_node_set_free (&eps_via_nodes);
143644b87433SJohn Marino if (prev_idx_match_malloced)
143744b87433SJohn Marino re_free (prev_idx_match);
143844b87433SJohn Marino return free_fail_stack_return (fs);
143944b87433SJohn Marino }
144044b87433SJohn Marino cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
144144b87433SJohn Marino &eps_via_nodes);
144244b87433SJohn Marino }
144344b87433SJohn Marino else
144444b87433SJohn Marino {
144544b87433SJohn Marino re_node_set_free (&eps_via_nodes);
144644b87433SJohn Marino if (prev_idx_match_malloced)
144744b87433SJohn Marino re_free (prev_idx_match);
144844b87433SJohn Marino return REG_NOERROR;
144944b87433SJohn Marino }
145044b87433SJohn Marino }
145144b87433SJohn Marino
145244b87433SJohn Marino /* Proceed to next node. */
145344b87433SJohn Marino cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
145444b87433SJohn Marino &eps_via_nodes, fs);
145544b87433SJohn Marino
1456*6ea1f93eSDaniel Fojt if (__glibc_unlikely (cur_node < 0))
145744b87433SJohn Marino {
1458*6ea1f93eSDaniel Fojt if (__glibc_unlikely (cur_node == -2))
145944b87433SJohn Marino {
146044b87433SJohn Marino re_node_set_free (&eps_via_nodes);
146144b87433SJohn Marino if (prev_idx_match_malloced)
146244b87433SJohn Marino re_free (prev_idx_match);
146344b87433SJohn Marino free_fail_stack_return (fs);
146444b87433SJohn Marino return REG_ESPACE;
146544b87433SJohn Marino }
146644b87433SJohn Marino if (fs)
146744b87433SJohn Marino cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
146844b87433SJohn Marino &eps_via_nodes);
146944b87433SJohn Marino else
147044b87433SJohn Marino {
147144b87433SJohn Marino re_node_set_free (&eps_via_nodes);
147244b87433SJohn Marino if (prev_idx_match_malloced)
147344b87433SJohn Marino re_free (prev_idx_match);
147444b87433SJohn Marino return REG_NOMATCH;
147544b87433SJohn Marino }
147644b87433SJohn Marino }
147744b87433SJohn Marino }
147844b87433SJohn Marino re_node_set_free (&eps_via_nodes);
147944b87433SJohn Marino if (prev_idx_match_malloced)
148044b87433SJohn Marino re_free (prev_idx_match);
148144b87433SJohn Marino return free_fail_stack_return (fs);
148244b87433SJohn Marino }
148344b87433SJohn Marino
148444b87433SJohn Marino static reg_errcode_t
free_fail_stack_return(struct re_fail_stack_t * fs)148544b87433SJohn Marino free_fail_stack_return (struct re_fail_stack_t *fs)
148644b87433SJohn Marino {
148744b87433SJohn Marino if (fs)
148844b87433SJohn Marino {
148944b87433SJohn Marino Idx fs_idx;
149044b87433SJohn Marino for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
149144b87433SJohn Marino {
149244b87433SJohn Marino re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
149344b87433SJohn Marino re_free (fs->stack[fs_idx].regs);
149444b87433SJohn Marino }
149544b87433SJohn Marino re_free (fs->stack);
149644b87433SJohn Marino }
149744b87433SJohn Marino return REG_NOERROR;
149844b87433SJohn Marino }
149944b87433SJohn Marino
150044b87433SJohn Marino static void
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,Idx cur_node,Idx cur_idx,Idx nmatch)150144b87433SJohn Marino update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
150244b87433SJohn Marino regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
150344b87433SJohn Marino {
150444b87433SJohn Marino int type = dfa->nodes[cur_node].type;
150544b87433SJohn Marino if (type == OP_OPEN_SUBEXP)
150644b87433SJohn Marino {
150744b87433SJohn Marino Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
150844b87433SJohn Marino
150944b87433SJohn Marino /* We are at the first node of this sub expression. */
151044b87433SJohn Marino if (reg_num < nmatch)
151144b87433SJohn Marino {
151244b87433SJohn Marino pmatch[reg_num].rm_so = cur_idx;
151344b87433SJohn Marino pmatch[reg_num].rm_eo = -1;
151444b87433SJohn Marino }
151544b87433SJohn Marino }
151644b87433SJohn Marino else if (type == OP_CLOSE_SUBEXP)
151744b87433SJohn Marino {
151844b87433SJohn Marino Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
151944b87433SJohn Marino if (reg_num < nmatch)
152044b87433SJohn Marino {
152144b87433SJohn Marino /* We are at the last node of this sub expression. */
152244b87433SJohn Marino if (pmatch[reg_num].rm_so < cur_idx)
152344b87433SJohn Marino {
152444b87433SJohn Marino pmatch[reg_num].rm_eo = cur_idx;
152544b87433SJohn Marino /* This is a non-empty match or we are not inside an optional
152644b87433SJohn Marino subexpression. Accept this right away. */
152744b87433SJohn Marino memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
152844b87433SJohn Marino }
152944b87433SJohn Marino else
153044b87433SJohn Marino {
153144b87433SJohn Marino if (dfa->nodes[cur_node].opt_subexp
153244b87433SJohn Marino && prev_idx_match[reg_num].rm_so != -1)
153344b87433SJohn Marino /* We transited through an empty match for an optional
153444b87433SJohn Marino subexpression, like (a?)*, and this is not the subexp's
153544b87433SJohn Marino first match. Copy back the old content of the registers
153644b87433SJohn Marino so that matches of an inner subexpression are undone as
153744b87433SJohn Marino well, like in ((a?))*. */
153844b87433SJohn Marino memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
153944b87433SJohn Marino else
154044b87433SJohn Marino /* We completed a subexpression, but it may be part of
154144b87433SJohn Marino an optional one, so do not update PREV_IDX_MATCH. */
154244b87433SJohn Marino pmatch[reg_num].rm_eo = cur_idx;
154344b87433SJohn Marino }
154444b87433SJohn Marino }
154544b87433SJohn Marino }
154644b87433SJohn Marino }
154744b87433SJohn Marino
154844b87433SJohn Marino /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
154944b87433SJohn Marino and sift the nodes in each states according to the following rules.
155044b87433SJohn Marino Updated state_log will be wrote to STATE_LOG.
155144b87433SJohn Marino
15524536c563SJohn Marino Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
155344b87433SJohn Marino 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
15544536c563SJohn Marino If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
15554536c563SJohn Marino the LAST_NODE, we throw away the node 'a'.
15564536c563SJohn Marino 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
15574536c563SJohn Marino string 's' and transit to 'b':
155844b87433SJohn Marino i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
15594536c563SJohn Marino away the node 'a'.
156044b87433SJohn Marino ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
15614536c563SJohn Marino thrown away, we throw away the node 'a'.
156244b87433SJohn Marino 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
156344b87433SJohn Marino i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
15644536c563SJohn Marino node 'a'.
156544b87433SJohn Marino ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
15664536c563SJohn Marino we throw away the node 'a'. */
156744b87433SJohn Marino
156844b87433SJohn Marino #define STATE_NODE_CONTAINS(state,node) \
156944b87433SJohn Marino ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
157044b87433SJohn Marino
157144b87433SJohn Marino static reg_errcode_t
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)157244b87433SJohn Marino sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
157344b87433SJohn Marino {
157444b87433SJohn Marino reg_errcode_t err;
157544b87433SJohn Marino int null_cnt = 0;
157644b87433SJohn Marino Idx str_idx = sctx->last_str_idx;
157744b87433SJohn Marino re_node_set cur_dest;
157844b87433SJohn Marino
157944b87433SJohn Marino #ifdef DEBUG
158044b87433SJohn Marino assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
158144b87433SJohn Marino #endif
158244b87433SJohn Marino
158344b87433SJohn Marino /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
158444b87433SJohn Marino transit to the last_node and the last_node itself. */
158544b87433SJohn Marino err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1586*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
158744b87433SJohn Marino return err;
158844b87433SJohn Marino err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1589*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
159044b87433SJohn Marino goto free_return;
159144b87433SJohn Marino
159244b87433SJohn Marino /* Then check each states in the state_log. */
159344b87433SJohn Marino while (str_idx > 0)
159444b87433SJohn Marino {
159544b87433SJohn Marino /* Update counters. */
159644b87433SJohn Marino null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
159744b87433SJohn Marino if (null_cnt > mctx->max_mb_elem_len)
159844b87433SJohn Marino {
159944b87433SJohn Marino memset (sctx->sifted_states, '\0',
160044b87433SJohn Marino sizeof (re_dfastate_t *) * str_idx);
160144b87433SJohn Marino re_node_set_free (&cur_dest);
160244b87433SJohn Marino return REG_NOERROR;
160344b87433SJohn Marino }
160444b87433SJohn Marino re_node_set_empty (&cur_dest);
160544b87433SJohn Marino --str_idx;
160644b87433SJohn Marino
160744b87433SJohn Marino if (mctx->state_log[str_idx])
160844b87433SJohn Marino {
160944b87433SJohn Marino err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1610*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
161144b87433SJohn Marino goto free_return;
161244b87433SJohn Marino }
161344b87433SJohn Marino
161444b87433SJohn Marino /* Add all the nodes which satisfy the following conditions:
161544b87433SJohn Marino - It can epsilon transit to a node in CUR_DEST.
161644b87433SJohn Marino - It is in CUR_SRC.
161744b87433SJohn Marino And update state_log. */
161844b87433SJohn Marino err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1619*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
162044b87433SJohn Marino goto free_return;
162144b87433SJohn Marino }
162244b87433SJohn Marino err = REG_NOERROR;
162344b87433SJohn Marino free_return:
162444b87433SJohn Marino re_node_set_free (&cur_dest);
162544b87433SJohn Marino return err;
162644b87433SJohn Marino }
162744b87433SJohn Marino
162844b87433SJohn Marino static reg_errcode_t
1629*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * cur_dest)163044b87433SJohn Marino build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
163144b87433SJohn Marino Idx str_idx, re_node_set *cur_dest)
163244b87433SJohn Marino {
163344b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
163444b87433SJohn Marino const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
163544b87433SJohn Marino Idx i;
163644b87433SJohn Marino
163744b87433SJohn Marino /* Then build the next sifted state.
16384536c563SJohn Marino We build the next sifted state on 'cur_dest', and update
16394536c563SJohn Marino 'sifted_states[str_idx]' with 'cur_dest'.
164044b87433SJohn Marino Note:
16414536c563SJohn Marino 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
16424536c563SJohn Marino 'cur_src' points the node_set of the old 'state_log[str_idx]'
164344b87433SJohn Marino (with the epsilon nodes pre-filtered out). */
164444b87433SJohn Marino for (i = 0; i < cur_src->nelem; i++)
164544b87433SJohn Marino {
164644b87433SJohn Marino Idx prev_node = cur_src->elems[i];
164744b87433SJohn Marino int naccepted = 0;
164844b87433SJohn Marino bool ok;
164944b87433SJohn Marino
165044b87433SJohn Marino #ifdef DEBUG
165144b87433SJohn Marino re_token_type_t type = dfa->nodes[prev_node].type;
165244b87433SJohn Marino assert (!IS_EPSILON_NODE (type));
165344b87433SJohn Marino #endif
165444b87433SJohn Marino #ifdef RE_ENABLE_I18N
16554536c563SJohn Marino /* If the node may accept "multi byte". */
165644b87433SJohn Marino if (dfa->nodes[prev_node].accept_mb)
165744b87433SJohn Marino naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
165844b87433SJohn Marino str_idx, sctx->last_str_idx);
165944b87433SJohn Marino #endif /* RE_ENABLE_I18N */
166044b87433SJohn Marino
166144b87433SJohn Marino /* We don't check backreferences here.
166244b87433SJohn Marino See update_cur_sifted_state(). */
166344b87433SJohn Marino if (!naccepted
166444b87433SJohn Marino && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
166544b87433SJohn Marino && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
166644b87433SJohn Marino dfa->nexts[prev_node]))
166744b87433SJohn Marino naccepted = 1;
166844b87433SJohn Marino
166944b87433SJohn Marino if (naccepted == 0)
167044b87433SJohn Marino continue;
167144b87433SJohn Marino
167244b87433SJohn Marino if (sctx->limits.nelem)
167344b87433SJohn Marino {
167444b87433SJohn Marino Idx to_idx = str_idx + naccepted;
167544b87433SJohn Marino if (check_dst_limits (mctx, &sctx->limits,
167644b87433SJohn Marino dfa->nexts[prev_node], to_idx,
167744b87433SJohn Marino prev_node, str_idx))
167844b87433SJohn Marino continue;
167944b87433SJohn Marino }
168044b87433SJohn Marino ok = re_node_set_insert (cur_dest, prev_node);
1681*6ea1f93eSDaniel Fojt if (__glibc_unlikely (! ok))
168244b87433SJohn Marino return REG_ESPACE;
168344b87433SJohn Marino }
168444b87433SJohn Marino
168544b87433SJohn Marino return REG_NOERROR;
168644b87433SJohn Marino }
168744b87433SJohn Marino
168844b87433SJohn Marino /* Helper functions. */
168944b87433SJohn Marino
169044b87433SJohn Marino static reg_errcode_t
clean_state_log_if_needed(re_match_context_t * mctx,Idx next_state_log_idx)169144b87433SJohn Marino clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
169244b87433SJohn Marino {
169344b87433SJohn Marino Idx top = mctx->state_log_top;
169444b87433SJohn Marino
16954536c563SJohn Marino if ((next_state_log_idx >= mctx->input.bufs_len
16964536c563SJohn Marino && mctx->input.bufs_len < mctx->input.len)
169744b87433SJohn Marino || (next_state_log_idx >= mctx->input.valid_len
169844b87433SJohn Marino && mctx->input.valid_len < mctx->input.len))
169944b87433SJohn Marino {
170044b87433SJohn Marino reg_errcode_t err;
17014536c563SJohn Marino err = extend_buffers (mctx, next_state_log_idx + 1);
1702*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
170344b87433SJohn Marino return err;
170444b87433SJohn Marino }
170544b87433SJohn Marino
170644b87433SJohn Marino if (top < next_state_log_idx)
170744b87433SJohn Marino {
170844b87433SJohn Marino memset (mctx->state_log + top + 1, '\0',
170944b87433SJohn Marino sizeof (re_dfastate_t *) * (next_state_log_idx - top));
171044b87433SJohn Marino mctx->state_log_top = next_state_log_idx;
171144b87433SJohn Marino }
171244b87433SJohn Marino return REG_NOERROR;
171344b87433SJohn Marino }
171444b87433SJohn Marino
171544b87433SJohn Marino static reg_errcode_t
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,Idx num)171644b87433SJohn Marino merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
171744b87433SJohn Marino re_dfastate_t **src, Idx num)
171844b87433SJohn Marino {
171944b87433SJohn Marino Idx st_idx;
172044b87433SJohn Marino reg_errcode_t err;
172144b87433SJohn Marino for (st_idx = 0; st_idx < num; ++st_idx)
172244b87433SJohn Marino {
172344b87433SJohn Marino if (dst[st_idx] == NULL)
172444b87433SJohn Marino dst[st_idx] = src[st_idx];
172544b87433SJohn Marino else if (src[st_idx] != NULL)
172644b87433SJohn Marino {
172744b87433SJohn Marino re_node_set merged_set;
172844b87433SJohn Marino err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
172944b87433SJohn Marino &src[st_idx]->nodes);
1730*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
173144b87433SJohn Marino return err;
173244b87433SJohn Marino dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
173344b87433SJohn Marino re_node_set_free (&merged_set);
1734*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
173544b87433SJohn Marino return err;
173644b87433SJohn Marino }
173744b87433SJohn Marino }
173844b87433SJohn Marino return REG_NOERROR;
173944b87433SJohn Marino }
174044b87433SJohn Marino
174144b87433SJohn Marino static reg_errcode_t
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * dest_nodes)174244b87433SJohn Marino update_cur_sifted_state (const re_match_context_t *mctx,
174344b87433SJohn Marino re_sift_context_t *sctx, Idx str_idx,
174444b87433SJohn Marino re_node_set *dest_nodes)
174544b87433SJohn Marino {
174644b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
174744b87433SJohn Marino reg_errcode_t err = REG_NOERROR;
174844b87433SJohn Marino const re_node_set *candidates;
174944b87433SJohn Marino candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
175044b87433SJohn Marino : &mctx->state_log[str_idx]->nodes);
175144b87433SJohn Marino
175244b87433SJohn Marino if (dest_nodes->nelem == 0)
175344b87433SJohn Marino sctx->sifted_states[str_idx] = NULL;
175444b87433SJohn Marino else
175544b87433SJohn Marino {
175644b87433SJohn Marino if (candidates)
175744b87433SJohn Marino {
175844b87433SJohn Marino /* At first, add the nodes which can epsilon transit to a node in
175944b87433SJohn Marino DEST_NODE. */
176044b87433SJohn Marino err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1761*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
176244b87433SJohn Marino return err;
176344b87433SJohn Marino
176444b87433SJohn Marino /* Then, check the limitations in the current sift_context. */
176544b87433SJohn Marino if (sctx->limits.nelem)
176644b87433SJohn Marino {
176744b87433SJohn Marino err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
176844b87433SJohn Marino mctx->bkref_ents, str_idx);
1769*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
177044b87433SJohn Marino return err;
177144b87433SJohn Marino }
177244b87433SJohn Marino }
177344b87433SJohn Marino
177444b87433SJohn Marino sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1775*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
177644b87433SJohn Marino return err;
177744b87433SJohn Marino }
177844b87433SJohn Marino
177944b87433SJohn Marino if (candidates && mctx->state_log[str_idx]->has_backref)
178044b87433SJohn Marino {
178144b87433SJohn Marino err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1782*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
178344b87433SJohn Marino return err;
178444b87433SJohn Marino }
178544b87433SJohn Marino return REG_NOERROR;
178644b87433SJohn Marino }
178744b87433SJohn Marino
178844b87433SJohn Marino static reg_errcode_t
1789*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)179044b87433SJohn Marino add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
179144b87433SJohn Marino const re_node_set *candidates)
179244b87433SJohn Marino {
179344b87433SJohn Marino reg_errcode_t err = REG_NOERROR;
179444b87433SJohn Marino Idx i;
179544b87433SJohn Marino
179644b87433SJohn Marino re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1797*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
179844b87433SJohn Marino return err;
179944b87433SJohn Marino
180044b87433SJohn Marino if (!state->inveclosure.alloc)
180144b87433SJohn Marino {
180244b87433SJohn Marino err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1803*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
180444b87433SJohn Marino return REG_ESPACE;
180544b87433SJohn Marino for (i = 0; i < dest_nodes->nelem; i++)
180644b87433SJohn Marino {
180744b87433SJohn Marino err = re_node_set_merge (&state->inveclosure,
180844b87433SJohn Marino dfa->inveclosures + dest_nodes->elems[i]);
1809*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
181044b87433SJohn Marino return REG_ESPACE;
181144b87433SJohn Marino }
181244b87433SJohn Marino }
181344b87433SJohn Marino return re_node_set_add_intersect (dest_nodes, candidates,
181444b87433SJohn Marino &state->inveclosure);
181544b87433SJohn Marino }
181644b87433SJohn Marino
181744b87433SJohn Marino static reg_errcode_t
sub_epsilon_src_nodes(const re_dfa_t * dfa,Idx node,re_node_set * dest_nodes,const re_node_set * candidates)181844b87433SJohn Marino sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
181944b87433SJohn Marino const re_node_set *candidates)
182044b87433SJohn Marino {
182144b87433SJohn Marino Idx ecl_idx;
182244b87433SJohn Marino reg_errcode_t err;
182344b87433SJohn Marino re_node_set *inv_eclosure = dfa->inveclosures + node;
182444b87433SJohn Marino re_node_set except_nodes;
182544b87433SJohn Marino re_node_set_init_empty (&except_nodes);
182644b87433SJohn Marino for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
182744b87433SJohn Marino {
182844b87433SJohn Marino Idx cur_node = inv_eclosure->elems[ecl_idx];
182944b87433SJohn Marino if (cur_node == node)
183044b87433SJohn Marino continue;
183144b87433SJohn Marino if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
183244b87433SJohn Marino {
183344b87433SJohn Marino Idx edst1 = dfa->edests[cur_node].elems[0];
183444b87433SJohn Marino Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1835*6ea1f93eSDaniel Fojt ? dfa->edests[cur_node].elems[1] : -1);
183644b87433SJohn Marino if ((!re_node_set_contains (inv_eclosure, edst1)
183744b87433SJohn Marino && re_node_set_contains (dest_nodes, edst1))
1838*6ea1f93eSDaniel Fojt || (edst2 > 0
183944b87433SJohn Marino && !re_node_set_contains (inv_eclosure, edst2)
184044b87433SJohn Marino && re_node_set_contains (dest_nodes, edst2)))
184144b87433SJohn Marino {
184244b87433SJohn Marino err = re_node_set_add_intersect (&except_nodes, candidates,
184344b87433SJohn Marino dfa->inveclosures + cur_node);
1844*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
184544b87433SJohn Marino {
184644b87433SJohn Marino re_node_set_free (&except_nodes);
184744b87433SJohn Marino return err;
184844b87433SJohn Marino }
184944b87433SJohn Marino }
185044b87433SJohn Marino }
185144b87433SJohn Marino }
185244b87433SJohn Marino for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
185344b87433SJohn Marino {
185444b87433SJohn Marino Idx cur_node = inv_eclosure->elems[ecl_idx];
185544b87433SJohn Marino if (!re_node_set_contains (&except_nodes, cur_node))
185644b87433SJohn Marino {
185744b87433SJohn Marino Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
185844b87433SJohn Marino re_node_set_remove_at (dest_nodes, idx);
185944b87433SJohn Marino }
186044b87433SJohn Marino }
186144b87433SJohn Marino re_node_set_free (&except_nodes);
186244b87433SJohn Marino return REG_NOERROR;
186344b87433SJohn Marino }
186444b87433SJohn Marino
186544b87433SJohn Marino static bool
check_dst_limits(const re_match_context_t * mctx,const re_node_set * limits,Idx dst_node,Idx dst_idx,Idx src_node,Idx src_idx)186644b87433SJohn Marino check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
186744b87433SJohn Marino Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
186844b87433SJohn Marino {
186944b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
187044b87433SJohn Marino Idx lim_idx, src_pos, dst_pos;
187144b87433SJohn Marino
187244b87433SJohn Marino Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
187344b87433SJohn Marino Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
187444b87433SJohn Marino for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
187544b87433SJohn Marino {
187644b87433SJohn Marino Idx subexp_idx;
187744b87433SJohn Marino struct re_backref_cache_entry *ent;
187844b87433SJohn Marino ent = mctx->bkref_ents + limits->elems[lim_idx];
187944b87433SJohn Marino subexp_idx = dfa->nodes[ent->node].opr.idx;
188044b87433SJohn Marino
188144b87433SJohn Marino dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
188244b87433SJohn Marino subexp_idx, dst_node, dst_idx,
188344b87433SJohn Marino dst_bkref_idx);
188444b87433SJohn Marino src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
188544b87433SJohn Marino subexp_idx, src_node, src_idx,
188644b87433SJohn Marino src_bkref_idx);
188744b87433SJohn Marino
188844b87433SJohn Marino /* In case of:
188944b87433SJohn Marino <src> <dst> ( <subexp> )
189044b87433SJohn Marino ( <subexp> ) <src> <dst>
189144b87433SJohn Marino ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
189244b87433SJohn Marino if (src_pos == dst_pos)
189344b87433SJohn Marino continue; /* This is unrelated limitation. */
189444b87433SJohn Marino else
189544b87433SJohn Marino return true;
189644b87433SJohn Marino }
189744b87433SJohn Marino return false;
189844b87433SJohn Marino }
189944b87433SJohn Marino
190044b87433SJohn Marino static int
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,Idx subexp_idx,Idx from_node,Idx bkref_idx)190144b87433SJohn Marino check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
190244b87433SJohn Marino Idx subexp_idx, Idx from_node, Idx bkref_idx)
190344b87433SJohn Marino {
190444b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
190544b87433SJohn Marino const re_node_set *eclosures = dfa->eclosures + from_node;
190644b87433SJohn Marino Idx node_idx;
190744b87433SJohn Marino
190844b87433SJohn Marino /* Else, we are on the boundary: examine the nodes on the epsilon
190944b87433SJohn Marino closure. */
191044b87433SJohn Marino for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
191144b87433SJohn Marino {
191244b87433SJohn Marino Idx node = eclosures->elems[node_idx];
191344b87433SJohn Marino switch (dfa->nodes[node].type)
191444b87433SJohn Marino {
191544b87433SJohn Marino case OP_BACK_REF:
1916*6ea1f93eSDaniel Fojt if (bkref_idx != -1)
191744b87433SJohn Marino {
191844b87433SJohn Marino struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
191944b87433SJohn Marino do
192044b87433SJohn Marino {
192144b87433SJohn Marino Idx dst;
192244b87433SJohn Marino int cpos;
192344b87433SJohn Marino
192444b87433SJohn Marino if (ent->node != node)
192544b87433SJohn Marino continue;
192644b87433SJohn Marino
192744b87433SJohn Marino if (subexp_idx < BITSET_WORD_BITS
192844b87433SJohn Marino && !(ent->eps_reachable_subexps_map
192944b87433SJohn Marino & ((bitset_word_t) 1 << subexp_idx)))
193044b87433SJohn Marino continue;
193144b87433SJohn Marino
193244b87433SJohn Marino /* Recurse trying to reach the OP_OPEN_SUBEXP and
193344b87433SJohn Marino OP_CLOSE_SUBEXP cases below. But, if the
193444b87433SJohn Marino destination node is the same node as the source
193544b87433SJohn Marino node, don't recurse because it would cause an
193644b87433SJohn Marino infinite loop: a regex that exhibits this behavior
193744b87433SJohn Marino is ()\1*\1* */
193844b87433SJohn Marino dst = dfa->edests[node].elems[0];
193944b87433SJohn Marino if (dst == from_node)
194044b87433SJohn Marino {
194144b87433SJohn Marino if (boundaries & 1)
194244b87433SJohn Marino return -1;
194344b87433SJohn Marino else /* if (boundaries & 2) */
194444b87433SJohn Marino return 0;
194544b87433SJohn Marino }
194644b87433SJohn Marino
194744b87433SJohn Marino cpos =
194844b87433SJohn Marino check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
194944b87433SJohn Marino dst, bkref_idx);
195044b87433SJohn Marino if (cpos == -1 /* && (boundaries & 1) */)
195144b87433SJohn Marino return -1;
195244b87433SJohn Marino if (cpos == 0 && (boundaries & 2))
195344b87433SJohn Marino return 0;
195444b87433SJohn Marino
195544b87433SJohn Marino if (subexp_idx < BITSET_WORD_BITS)
195644b87433SJohn Marino ent->eps_reachable_subexps_map
195744b87433SJohn Marino &= ~((bitset_word_t) 1 << subexp_idx);
195844b87433SJohn Marino }
195944b87433SJohn Marino while (ent++->more);
196044b87433SJohn Marino }
196144b87433SJohn Marino break;
196244b87433SJohn Marino
196344b87433SJohn Marino case OP_OPEN_SUBEXP:
196444b87433SJohn Marino if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
196544b87433SJohn Marino return -1;
196644b87433SJohn Marino break;
196744b87433SJohn Marino
196844b87433SJohn Marino case OP_CLOSE_SUBEXP:
196944b87433SJohn Marino if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
197044b87433SJohn Marino return 0;
197144b87433SJohn Marino break;
197244b87433SJohn Marino
197344b87433SJohn Marino default:
197444b87433SJohn Marino break;
197544b87433SJohn Marino }
197644b87433SJohn Marino }
197744b87433SJohn Marino
197844b87433SJohn Marino return (boundaries & 2) ? 1 : 0;
197944b87433SJohn Marino }
198044b87433SJohn Marino
198144b87433SJohn Marino static int
check_dst_limits_calc_pos(const re_match_context_t * mctx,Idx limit,Idx subexp_idx,Idx from_node,Idx str_idx,Idx bkref_idx)198244b87433SJohn Marino check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
198344b87433SJohn Marino Idx subexp_idx, Idx from_node, Idx str_idx,
198444b87433SJohn Marino Idx bkref_idx)
198544b87433SJohn Marino {
198644b87433SJohn Marino struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
198744b87433SJohn Marino int boundaries;
198844b87433SJohn Marino
198944b87433SJohn Marino /* If we are outside the range of the subexpression, return -1 or 1. */
199044b87433SJohn Marino if (str_idx < lim->subexp_from)
199144b87433SJohn Marino return -1;
199244b87433SJohn Marino
199344b87433SJohn Marino if (lim->subexp_to < str_idx)
199444b87433SJohn Marino return 1;
199544b87433SJohn Marino
199644b87433SJohn Marino /* If we are within the subexpression, return 0. */
199744b87433SJohn Marino boundaries = (str_idx == lim->subexp_from);
199844b87433SJohn Marino boundaries |= (str_idx == lim->subexp_to) << 1;
199944b87433SJohn Marino if (boundaries == 0)
200044b87433SJohn Marino return 0;
200144b87433SJohn Marino
200244b87433SJohn Marino /* Else, examine epsilon closure. */
200344b87433SJohn Marino return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
200444b87433SJohn Marino from_node, bkref_idx);
200544b87433SJohn Marino }
200644b87433SJohn Marino
200744b87433SJohn Marino /* Check the limitations of sub expressions LIMITS, and remove the nodes
200844b87433SJohn Marino which are against limitations from DEST_NODES. */
200944b87433SJohn Marino
201044b87433SJohn Marino static reg_errcode_t
check_subexp_limits(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,Idx str_idx)201144b87433SJohn Marino check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
201244b87433SJohn Marino const re_node_set *candidates, re_node_set *limits,
201344b87433SJohn Marino struct re_backref_cache_entry *bkref_ents, Idx str_idx)
201444b87433SJohn Marino {
201544b87433SJohn Marino reg_errcode_t err;
201644b87433SJohn Marino Idx node_idx, lim_idx;
201744b87433SJohn Marino
201844b87433SJohn Marino for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
201944b87433SJohn Marino {
202044b87433SJohn Marino Idx subexp_idx;
202144b87433SJohn Marino struct re_backref_cache_entry *ent;
202244b87433SJohn Marino ent = bkref_ents + limits->elems[lim_idx];
202344b87433SJohn Marino
202444b87433SJohn Marino if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
202544b87433SJohn Marino continue; /* This is unrelated limitation. */
202644b87433SJohn Marino
202744b87433SJohn Marino subexp_idx = dfa->nodes[ent->node].opr.idx;
202844b87433SJohn Marino if (ent->subexp_to == str_idx)
202944b87433SJohn Marino {
2030*6ea1f93eSDaniel Fojt Idx ops_node = -1;
2031*6ea1f93eSDaniel Fojt Idx cls_node = -1;
203244b87433SJohn Marino for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
203344b87433SJohn Marino {
203444b87433SJohn Marino Idx node = dest_nodes->elems[node_idx];
203544b87433SJohn Marino re_token_type_t type = dfa->nodes[node].type;
203644b87433SJohn Marino if (type == OP_OPEN_SUBEXP
203744b87433SJohn Marino && subexp_idx == dfa->nodes[node].opr.idx)
203844b87433SJohn Marino ops_node = node;
203944b87433SJohn Marino else if (type == OP_CLOSE_SUBEXP
204044b87433SJohn Marino && subexp_idx == dfa->nodes[node].opr.idx)
204144b87433SJohn Marino cls_node = node;
204244b87433SJohn Marino }
204344b87433SJohn Marino
204444b87433SJohn Marino /* Check the limitation of the open subexpression. */
204544b87433SJohn Marino /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2046*6ea1f93eSDaniel Fojt if (ops_node >= 0)
204744b87433SJohn Marino {
204844b87433SJohn Marino err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
204944b87433SJohn Marino candidates);
2050*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
205144b87433SJohn Marino return err;
205244b87433SJohn Marino }
205344b87433SJohn Marino
205444b87433SJohn Marino /* Check the limitation of the close subexpression. */
2055*6ea1f93eSDaniel Fojt if (cls_node >= 0)
205644b87433SJohn Marino for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
205744b87433SJohn Marino {
205844b87433SJohn Marino Idx node = dest_nodes->elems[node_idx];
205944b87433SJohn Marino if (!re_node_set_contains (dfa->inveclosures + node,
206044b87433SJohn Marino cls_node)
206144b87433SJohn Marino && !re_node_set_contains (dfa->eclosures + node,
206244b87433SJohn Marino cls_node))
206344b87433SJohn Marino {
206444b87433SJohn Marino /* It is against this limitation.
206544b87433SJohn Marino Remove it form the current sifted state. */
206644b87433SJohn Marino err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
206744b87433SJohn Marino candidates);
2068*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
206944b87433SJohn Marino return err;
207044b87433SJohn Marino --node_idx;
207144b87433SJohn Marino }
207244b87433SJohn Marino }
207344b87433SJohn Marino }
207444b87433SJohn Marino else /* (ent->subexp_to != str_idx) */
207544b87433SJohn Marino {
207644b87433SJohn Marino for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
207744b87433SJohn Marino {
207844b87433SJohn Marino Idx node = dest_nodes->elems[node_idx];
207944b87433SJohn Marino re_token_type_t type = dfa->nodes[node].type;
208044b87433SJohn Marino if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
208144b87433SJohn Marino {
208244b87433SJohn Marino if (subexp_idx != dfa->nodes[node].opr.idx)
208344b87433SJohn Marino continue;
208444b87433SJohn Marino /* It is against this limitation.
208544b87433SJohn Marino Remove it form the current sifted state. */
208644b87433SJohn Marino err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
208744b87433SJohn Marino candidates);
2088*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
208944b87433SJohn Marino return err;
209044b87433SJohn Marino }
209144b87433SJohn Marino }
209244b87433SJohn Marino }
209344b87433SJohn Marino }
209444b87433SJohn Marino return REG_NOERROR;
209544b87433SJohn Marino }
209644b87433SJohn Marino
209744b87433SJohn Marino static reg_errcode_t
2098*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
sift_states_bkref(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,const re_node_set * candidates)209944b87433SJohn Marino sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
210044b87433SJohn Marino Idx str_idx, const re_node_set *candidates)
210144b87433SJohn Marino {
210244b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
210344b87433SJohn Marino reg_errcode_t err;
210444b87433SJohn Marino Idx node_idx, node;
210544b87433SJohn Marino re_sift_context_t local_sctx;
210644b87433SJohn Marino Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
210744b87433SJohn Marino
2108*6ea1f93eSDaniel Fojt if (first_idx == -1)
210944b87433SJohn Marino return REG_NOERROR;
211044b87433SJohn Marino
211144b87433SJohn Marino local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
211244b87433SJohn Marino
211344b87433SJohn Marino for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
211444b87433SJohn Marino {
211544b87433SJohn Marino Idx enabled_idx;
211644b87433SJohn Marino re_token_type_t type;
211744b87433SJohn Marino struct re_backref_cache_entry *entry;
211844b87433SJohn Marino node = candidates->elems[node_idx];
211944b87433SJohn Marino type = dfa->nodes[node].type;
212044b87433SJohn Marino /* Avoid infinite loop for the REs like "()\1+". */
212144b87433SJohn Marino if (node == sctx->last_node && str_idx == sctx->last_str_idx)
212244b87433SJohn Marino continue;
212344b87433SJohn Marino if (type != OP_BACK_REF)
212444b87433SJohn Marino continue;
212544b87433SJohn Marino
212644b87433SJohn Marino entry = mctx->bkref_ents + first_idx;
212744b87433SJohn Marino enabled_idx = first_idx;
212844b87433SJohn Marino do
212944b87433SJohn Marino {
213044b87433SJohn Marino Idx subexp_len;
213144b87433SJohn Marino Idx to_idx;
213244b87433SJohn Marino Idx dst_node;
213344b87433SJohn Marino bool ok;
213444b87433SJohn Marino re_dfastate_t *cur_state;
213544b87433SJohn Marino
213644b87433SJohn Marino if (entry->node != node)
213744b87433SJohn Marino continue;
213844b87433SJohn Marino subexp_len = entry->subexp_to - entry->subexp_from;
213944b87433SJohn Marino to_idx = str_idx + subexp_len;
214044b87433SJohn Marino dst_node = (subexp_len ? dfa->nexts[node]
214144b87433SJohn Marino : dfa->edests[node].elems[0]);
214244b87433SJohn Marino
214344b87433SJohn Marino if (to_idx > sctx->last_str_idx
214444b87433SJohn Marino || sctx->sifted_states[to_idx] == NULL
214544b87433SJohn Marino || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
214644b87433SJohn Marino || check_dst_limits (mctx, &sctx->limits, node,
214744b87433SJohn Marino str_idx, dst_node, to_idx))
214844b87433SJohn Marino continue;
214944b87433SJohn Marino
215044b87433SJohn Marino if (local_sctx.sifted_states == NULL)
215144b87433SJohn Marino {
215244b87433SJohn Marino local_sctx = *sctx;
215344b87433SJohn Marino err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2154*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
215544b87433SJohn Marino goto free_return;
215644b87433SJohn Marino }
215744b87433SJohn Marino local_sctx.last_node = node;
215844b87433SJohn Marino local_sctx.last_str_idx = str_idx;
215944b87433SJohn Marino ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2160*6ea1f93eSDaniel Fojt if (__glibc_unlikely (! ok))
216144b87433SJohn Marino {
216244b87433SJohn Marino err = REG_ESPACE;
216344b87433SJohn Marino goto free_return;
216444b87433SJohn Marino }
216544b87433SJohn Marino cur_state = local_sctx.sifted_states[str_idx];
216644b87433SJohn Marino err = sift_states_backward (mctx, &local_sctx);
2167*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
216844b87433SJohn Marino goto free_return;
216944b87433SJohn Marino if (sctx->limited_states != NULL)
217044b87433SJohn Marino {
217144b87433SJohn Marino err = merge_state_array (dfa, sctx->limited_states,
217244b87433SJohn Marino local_sctx.sifted_states,
217344b87433SJohn Marino str_idx + 1);
2174*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
217544b87433SJohn Marino goto free_return;
217644b87433SJohn Marino }
217744b87433SJohn Marino local_sctx.sifted_states[str_idx] = cur_state;
217844b87433SJohn Marino re_node_set_remove (&local_sctx.limits, enabled_idx);
217944b87433SJohn Marino
218044b87433SJohn Marino /* mctx->bkref_ents may have changed, reload the pointer. */
218144b87433SJohn Marino entry = mctx->bkref_ents + enabled_idx;
218244b87433SJohn Marino }
218344b87433SJohn Marino while (enabled_idx++, entry++->more);
218444b87433SJohn Marino }
218544b87433SJohn Marino err = REG_NOERROR;
218644b87433SJohn Marino free_return:
218744b87433SJohn Marino if (local_sctx.sifted_states != NULL)
218844b87433SJohn Marino {
218944b87433SJohn Marino re_node_set_free (&local_sctx.limits);
219044b87433SJohn Marino }
219144b87433SJohn Marino
219244b87433SJohn Marino return err;
219344b87433SJohn Marino }
219444b87433SJohn Marino
219544b87433SJohn Marino
219644b87433SJohn Marino #ifdef RE_ENABLE_I18N
219744b87433SJohn Marino static int
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx node_idx,Idx str_idx,Idx max_str_idx)219844b87433SJohn Marino sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
219944b87433SJohn Marino Idx node_idx, Idx str_idx, Idx max_str_idx)
220044b87433SJohn Marino {
220144b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
220244b87433SJohn Marino int naccepted;
22034536c563SJohn Marino /* Check the node can accept "multi byte". */
220444b87433SJohn Marino naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
220544b87433SJohn Marino if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
220644b87433SJohn Marino !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
220744b87433SJohn Marino dfa->nexts[node_idx]))
22084536c563SJohn Marino /* The node can't accept the "multi byte", or the
220944b87433SJohn Marino destination was already thrown away, then the node
22104536c563SJohn Marino could't accept the current input "multi byte". */
221144b87433SJohn Marino naccepted = 0;
221244b87433SJohn Marino /* Otherwise, it is sure that the node could accept
22134536c563SJohn Marino 'naccepted' bytes input. */
221444b87433SJohn Marino return naccepted;
221544b87433SJohn Marino }
221644b87433SJohn Marino #endif /* RE_ENABLE_I18N */
221744b87433SJohn Marino
221844b87433SJohn Marino
221944b87433SJohn Marino /* Functions for state transition. */
222044b87433SJohn Marino
222144b87433SJohn Marino /* Return the next state to which the current state STATE will transit by
222244b87433SJohn Marino accepting the current input byte, and update STATE_LOG if necessary.
222344b87433SJohn Marino If STATE can accept a multibyte char/collating element/back reference
222444b87433SJohn Marino update the destination of STATE_LOG. */
222544b87433SJohn Marino
222644b87433SJohn Marino static re_dfastate_t *
2227*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)222844b87433SJohn Marino transit_state (reg_errcode_t *err, re_match_context_t *mctx,
222944b87433SJohn Marino re_dfastate_t *state)
223044b87433SJohn Marino {
223144b87433SJohn Marino re_dfastate_t **trtable;
223244b87433SJohn Marino unsigned char ch;
223344b87433SJohn Marino
223444b87433SJohn Marino #ifdef RE_ENABLE_I18N
223544b87433SJohn Marino /* If the current state can accept multibyte. */
2236*6ea1f93eSDaniel Fojt if (__glibc_unlikely (state->accept_mb))
223744b87433SJohn Marino {
223844b87433SJohn Marino *err = transit_state_mb (mctx, state);
2239*6ea1f93eSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR))
224044b87433SJohn Marino return NULL;
224144b87433SJohn Marino }
224244b87433SJohn Marino #endif /* RE_ENABLE_I18N */
224344b87433SJohn Marino
224444b87433SJohn Marino /* Then decide the next state with the single byte. */
224544b87433SJohn Marino #if 0
224644b87433SJohn Marino if (0)
224744b87433SJohn Marino /* don't use transition table */
224844b87433SJohn Marino return transit_state_sb (err, mctx, state);
224944b87433SJohn Marino #endif
225044b87433SJohn Marino
225144b87433SJohn Marino /* Use transition table */
225244b87433SJohn Marino ch = re_string_fetch_byte (&mctx->input);
225344b87433SJohn Marino for (;;)
225444b87433SJohn Marino {
225544b87433SJohn Marino trtable = state->trtable;
2256*6ea1f93eSDaniel Fojt if (__glibc_likely (trtable != NULL))
225744b87433SJohn Marino return trtable[ch];
225844b87433SJohn Marino
225944b87433SJohn Marino trtable = state->word_trtable;
2260*6ea1f93eSDaniel Fojt if (__glibc_likely (trtable != NULL))
226144b87433SJohn Marino {
226244b87433SJohn Marino unsigned int context;
226344b87433SJohn Marino context
226444b87433SJohn Marino = re_string_context_at (&mctx->input,
226544b87433SJohn Marino re_string_cur_idx (&mctx->input) - 1,
226644b87433SJohn Marino mctx->eflags);
226744b87433SJohn Marino if (IS_WORD_CONTEXT (context))
226844b87433SJohn Marino return trtable[ch + SBC_MAX];
226944b87433SJohn Marino else
227044b87433SJohn Marino return trtable[ch];
227144b87433SJohn Marino }
227244b87433SJohn Marino
227344b87433SJohn Marino if (!build_trtable (mctx->dfa, state))
227444b87433SJohn Marino {
227544b87433SJohn Marino *err = REG_ESPACE;
227644b87433SJohn Marino return NULL;
227744b87433SJohn Marino }
227844b87433SJohn Marino
227944b87433SJohn Marino /* Retry, we now have a transition table. */
228044b87433SJohn Marino }
228144b87433SJohn Marino }
228244b87433SJohn Marino
228344b87433SJohn Marino /* Update the state_log if we need */
228444b87433SJohn Marino static re_dfastate_t *
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)228544b87433SJohn Marino merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
228644b87433SJohn Marino re_dfastate_t *next_state)
228744b87433SJohn Marino {
228844b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
228944b87433SJohn Marino Idx cur_idx = re_string_cur_idx (&mctx->input);
229044b87433SJohn Marino
229144b87433SJohn Marino if (cur_idx > mctx->state_log_top)
229244b87433SJohn Marino {
229344b87433SJohn Marino mctx->state_log[cur_idx] = next_state;
229444b87433SJohn Marino mctx->state_log_top = cur_idx;
229544b87433SJohn Marino }
229644b87433SJohn Marino else if (mctx->state_log[cur_idx] == 0)
229744b87433SJohn Marino {
229844b87433SJohn Marino mctx->state_log[cur_idx] = next_state;
229944b87433SJohn Marino }
230044b87433SJohn Marino else
230144b87433SJohn Marino {
230244b87433SJohn Marino re_dfastate_t *pstate;
230344b87433SJohn Marino unsigned int context;
230444b87433SJohn Marino re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
230544b87433SJohn Marino /* If (state_log[cur_idx] != 0), it implies that cur_idx is
230644b87433SJohn Marino the destination of a multibyte char/collating element/
230744b87433SJohn Marino back reference. Then the next state is the union set of
230844b87433SJohn Marino these destinations and the results of the transition table. */
230944b87433SJohn Marino pstate = mctx->state_log[cur_idx];
231044b87433SJohn Marino log_nodes = pstate->entrance_nodes;
231144b87433SJohn Marino if (next_state != NULL)
231244b87433SJohn Marino {
231344b87433SJohn Marino table_nodes = next_state->entrance_nodes;
231444b87433SJohn Marino *err = re_node_set_init_union (&next_nodes, table_nodes,
231544b87433SJohn Marino log_nodes);
2316*6ea1f93eSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR))
231744b87433SJohn Marino return NULL;
231844b87433SJohn Marino }
231944b87433SJohn Marino else
232044b87433SJohn Marino next_nodes = *log_nodes;
232144b87433SJohn Marino /* Note: We already add the nodes of the initial state,
232244b87433SJohn Marino then we don't need to add them here. */
232344b87433SJohn Marino
232444b87433SJohn Marino context = re_string_context_at (&mctx->input,
232544b87433SJohn Marino re_string_cur_idx (&mctx->input) - 1,
232644b87433SJohn Marino mctx->eflags);
232744b87433SJohn Marino next_state = mctx->state_log[cur_idx]
232844b87433SJohn Marino = re_acquire_state_context (err, dfa, &next_nodes, context);
232944b87433SJohn Marino /* We don't need to check errors here, since the return value of
233044b87433SJohn Marino this function is next_state and ERR is already set. */
233144b87433SJohn Marino
233244b87433SJohn Marino if (table_nodes != NULL)
233344b87433SJohn Marino re_node_set_free (&next_nodes);
233444b87433SJohn Marino }
233544b87433SJohn Marino
2336*6ea1f93eSDaniel Fojt if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
233744b87433SJohn Marino {
233844b87433SJohn Marino /* Check OP_OPEN_SUBEXP in the current state in case that we use them
233944b87433SJohn Marino later. We must check them here, since the back references in the
234044b87433SJohn Marino next state might use them. */
234144b87433SJohn Marino *err = check_subexp_matching_top (mctx, &next_state->nodes,
234244b87433SJohn Marino cur_idx);
2343*6ea1f93eSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR))
234444b87433SJohn Marino return NULL;
234544b87433SJohn Marino
234644b87433SJohn Marino /* If the next state has back references. */
234744b87433SJohn Marino if (next_state->has_backref)
234844b87433SJohn Marino {
234944b87433SJohn Marino *err = transit_state_bkref (mctx, &next_state->nodes);
2350*6ea1f93eSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR))
235144b87433SJohn Marino return NULL;
235244b87433SJohn Marino next_state = mctx->state_log[cur_idx];
235344b87433SJohn Marino }
235444b87433SJohn Marino }
235544b87433SJohn Marino
235644b87433SJohn Marino return next_state;
235744b87433SJohn Marino }
235844b87433SJohn Marino
235944b87433SJohn Marino /* Skip bytes in the input that correspond to part of a
236044b87433SJohn Marino multi-byte match, then look in the log for a state
236144b87433SJohn Marino from which to restart matching. */
236244b87433SJohn Marino static re_dfastate_t *
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)236344b87433SJohn Marino find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
236444b87433SJohn Marino {
236544b87433SJohn Marino re_dfastate_t *cur_state;
236644b87433SJohn Marino do
236744b87433SJohn Marino {
236844b87433SJohn Marino Idx max = mctx->state_log_top;
236944b87433SJohn Marino Idx cur_str_idx = re_string_cur_idx (&mctx->input);
237044b87433SJohn Marino
237144b87433SJohn Marino do
237244b87433SJohn Marino {
237344b87433SJohn Marino if (++cur_str_idx > max)
237444b87433SJohn Marino return NULL;
237544b87433SJohn Marino re_string_skip_bytes (&mctx->input, 1);
237644b87433SJohn Marino }
237744b87433SJohn Marino while (mctx->state_log[cur_str_idx] == NULL);
237844b87433SJohn Marino
237944b87433SJohn Marino cur_state = merge_state_with_log (err, mctx, NULL);
238044b87433SJohn Marino }
238144b87433SJohn Marino while (*err == REG_NOERROR && cur_state == NULL);
238244b87433SJohn Marino return cur_state;
238344b87433SJohn Marino }
238444b87433SJohn Marino
238544b87433SJohn Marino /* Helper functions for transit_state. */
238644b87433SJohn Marino
238744b87433SJohn Marino /* From the node set CUR_NODES, pick up the nodes whose types are
238844b87433SJohn Marino OP_OPEN_SUBEXP and which have corresponding back references in the regular
238944b87433SJohn Marino expression. And register them to use them later for evaluating the
23904536c563SJohn Marino corresponding back references. */
239144b87433SJohn Marino
239244b87433SJohn Marino static reg_errcode_t
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,Idx str_idx)239344b87433SJohn Marino check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
239444b87433SJohn Marino Idx str_idx)
239544b87433SJohn Marino {
239644b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
239744b87433SJohn Marino Idx node_idx;
239844b87433SJohn Marino reg_errcode_t err;
239944b87433SJohn Marino
240044b87433SJohn Marino /* TODO: This isn't efficient.
240144b87433SJohn Marino Because there might be more than one nodes whose types are
240244b87433SJohn Marino OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
240344b87433SJohn Marino nodes.
240444b87433SJohn Marino E.g. RE: (a){2} */
240544b87433SJohn Marino for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
240644b87433SJohn Marino {
240744b87433SJohn Marino Idx node = cur_nodes->elems[node_idx];
240844b87433SJohn Marino if (dfa->nodes[node].type == OP_OPEN_SUBEXP
240944b87433SJohn Marino && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
241044b87433SJohn Marino && (dfa->used_bkref_map
241144b87433SJohn Marino & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
241244b87433SJohn Marino {
241344b87433SJohn Marino err = match_ctx_add_subtop (mctx, node, str_idx);
2414*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
241544b87433SJohn Marino return err;
241644b87433SJohn Marino }
241744b87433SJohn Marino }
241844b87433SJohn Marino return REG_NOERROR;
241944b87433SJohn Marino }
242044b87433SJohn Marino
242144b87433SJohn Marino #if 0
242244b87433SJohn Marino /* Return the next state to which the current state STATE will transit by
242344b87433SJohn Marino accepting the current input byte. */
242444b87433SJohn Marino
242544b87433SJohn Marino static re_dfastate_t *
242644b87433SJohn Marino transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
242744b87433SJohn Marino re_dfastate_t *state)
242844b87433SJohn Marino {
242944b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
243044b87433SJohn Marino re_node_set next_nodes;
243144b87433SJohn Marino re_dfastate_t *next_state;
243244b87433SJohn Marino Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
243344b87433SJohn Marino unsigned int context;
243444b87433SJohn Marino
243544b87433SJohn Marino *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2436*6ea1f93eSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR))
243744b87433SJohn Marino return NULL;
243844b87433SJohn Marino for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
243944b87433SJohn Marino {
244044b87433SJohn Marino Idx cur_node = state->nodes.elems[node_cnt];
244144b87433SJohn Marino if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
244244b87433SJohn Marino {
244344b87433SJohn Marino *err = re_node_set_merge (&next_nodes,
244444b87433SJohn Marino dfa->eclosures + dfa->nexts[cur_node]);
2445*6ea1f93eSDaniel Fojt if (__glibc_unlikely (*err != REG_NOERROR))
244644b87433SJohn Marino {
244744b87433SJohn Marino re_node_set_free (&next_nodes);
244844b87433SJohn Marino return NULL;
244944b87433SJohn Marino }
245044b87433SJohn Marino }
245144b87433SJohn Marino }
245244b87433SJohn Marino context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
245344b87433SJohn Marino next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
245444b87433SJohn Marino /* We don't need to check errors here, since the return value of
245544b87433SJohn Marino this function is next_state and ERR is already set. */
245644b87433SJohn Marino
245744b87433SJohn Marino re_node_set_free (&next_nodes);
245844b87433SJohn Marino re_string_skip_bytes (&mctx->input, 1);
245944b87433SJohn Marino return next_state;
246044b87433SJohn Marino }
246144b87433SJohn Marino #endif
246244b87433SJohn Marino
246344b87433SJohn Marino #ifdef RE_ENABLE_I18N
246444b87433SJohn Marino static reg_errcode_t
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)246544b87433SJohn Marino transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
246644b87433SJohn Marino {
246744b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
246844b87433SJohn Marino reg_errcode_t err;
246944b87433SJohn Marino Idx i;
247044b87433SJohn Marino
247144b87433SJohn Marino for (i = 0; i < pstate->nodes.nelem; ++i)
247244b87433SJohn Marino {
247344b87433SJohn Marino re_node_set dest_nodes, *new_nodes;
247444b87433SJohn Marino Idx cur_node_idx = pstate->nodes.elems[i];
247544b87433SJohn Marino int naccepted;
247644b87433SJohn Marino Idx dest_idx;
247744b87433SJohn Marino unsigned int context;
247844b87433SJohn Marino re_dfastate_t *dest_state;
247944b87433SJohn Marino
248044b87433SJohn Marino if (!dfa->nodes[cur_node_idx].accept_mb)
248144b87433SJohn Marino continue;
248244b87433SJohn Marino
248344b87433SJohn Marino if (dfa->nodes[cur_node_idx].constraint)
248444b87433SJohn Marino {
248544b87433SJohn Marino context = re_string_context_at (&mctx->input,
248644b87433SJohn Marino re_string_cur_idx (&mctx->input),
248744b87433SJohn Marino mctx->eflags);
248844b87433SJohn Marino if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
248944b87433SJohn Marino context))
249044b87433SJohn Marino continue;
249144b87433SJohn Marino }
249244b87433SJohn Marino
249344b87433SJohn Marino /* How many bytes the node can accept? */
249444b87433SJohn Marino naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
249544b87433SJohn Marino re_string_cur_idx (&mctx->input));
249644b87433SJohn Marino if (naccepted == 0)
249744b87433SJohn Marino continue;
249844b87433SJohn Marino
24994536c563SJohn Marino /* The node can accepts 'naccepted' bytes. */
250044b87433SJohn Marino dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
250144b87433SJohn Marino mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
250244b87433SJohn Marino : mctx->max_mb_elem_len);
250344b87433SJohn Marino err = clean_state_log_if_needed (mctx, dest_idx);
2504*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
250544b87433SJohn Marino return err;
250644b87433SJohn Marino #ifdef DEBUG
2507*6ea1f93eSDaniel Fojt assert (dfa->nexts[cur_node_idx] != -1);
250844b87433SJohn Marino #endif
250944b87433SJohn Marino new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
251044b87433SJohn Marino
251144b87433SJohn Marino dest_state = mctx->state_log[dest_idx];
251244b87433SJohn Marino if (dest_state == NULL)
251344b87433SJohn Marino dest_nodes = *new_nodes;
251444b87433SJohn Marino else
251544b87433SJohn Marino {
251644b87433SJohn Marino err = re_node_set_init_union (&dest_nodes,
251744b87433SJohn Marino dest_state->entrance_nodes, new_nodes);
2518*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
251944b87433SJohn Marino return err;
252044b87433SJohn Marino }
252144b87433SJohn Marino context = re_string_context_at (&mctx->input, dest_idx - 1,
252244b87433SJohn Marino mctx->eflags);
252344b87433SJohn Marino mctx->state_log[dest_idx]
252444b87433SJohn Marino = re_acquire_state_context (&err, dfa, &dest_nodes, context);
252544b87433SJohn Marino if (dest_state != NULL)
252644b87433SJohn Marino re_node_set_free (&dest_nodes);
2527*6ea1f93eSDaniel Fojt if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2528*6ea1f93eSDaniel Fojt && err != REG_NOERROR))
252944b87433SJohn Marino return err;
253044b87433SJohn Marino }
253144b87433SJohn Marino return REG_NOERROR;
253244b87433SJohn Marino }
253344b87433SJohn Marino #endif /* RE_ENABLE_I18N */
253444b87433SJohn Marino
253544b87433SJohn Marino static reg_errcode_t
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)253644b87433SJohn Marino transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
253744b87433SJohn Marino {
253844b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
253944b87433SJohn Marino reg_errcode_t err;
254044b87433SJohn Marino Idx i;
254144b87433SJohn Marino Idx cur_str_idx = re_string_cur_idx (&mctx->input);
254244b87433SJohn Marino
254344b87433SJohn Marino for (i = 0; i < nodes->nelem; ++i)
254444b87433SJohn Marino {
254544b87433SJohn Marino Idx dest_str_idx, prev_nelem, bkc_idx;
254644b87433SJohn Marino Idx node_idx = nodes->elems[i];
254744b87433SJohn Marino unsigned int context;
254844b87433SJohn Marino const re_token_t *node = dfa->nodes + node_idx;
254944b87433SJohn Marino re_node_set *new_dest_nodes;
255044b87433SJohn Marino
25514536c563SJohn Marino /* Check whether 'node' is a backreference or not. */
255244b87433SJohn Marino if (node->type != OP_BACK_REF)
255344b87433SJohn Marino continue;
255444b87433SJohn Marino
255544b87433SJohn Marino if (node->constraint)
255644b87433SJohn Marino {
255744b87433SJohn Marino context = re_string_context_at (&mctx->input, cur_str_idx,
255844b87433SJohn Marino mctx->eflags);
255944b87433SJohn Marino if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
256044b87433SJohn Marino continue;
256144b87433SJohn Marino }
256244b87433SJohn Marino
25634536c563SJohn Marino /* 'node' is a backreference.
256444b87433SJohn Marino Check the substring which the substring matched. */
256544b87433SJohn Marino bkc_idx = mctx->nbkref_ents;
256644b87433SJohn Marino err = get_subexp (mctx, node_idx, cur_str_idx);
2567*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
256844b87433SJohn Marino goto free_return;
256944b87433SJohn Marino
25704536c563SJohn Marino /* And add the epsilon closures (which is 'new_dest_nodes') of
257144b87433SJohn Marino the backreference to appropriate state_log. */
257244b87433SJohn Marino #ifdef DEBUG
2573*6ea1f93eSDaniel Fojt assert (dfa->nexts[node_idx] != -1);
257444b87433SJohn Marino #endif
257544b87433SJohn Marino for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
257644b87433SJohn Marino {
257744b87433SJohn Marino Idx subexp_len;
257844b87433SJohn Marino re_dfastate_t *dest_state;
257944b87433SJohn Marino struct re_backref_cache_entry *bkref_ent;
258044b87433SJohn Marino bkref_ent = mctx->bkref_ents + bkc_idx;
258144b87433SJohn Marino if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
258244b87433SJohn Marino continue;
258344b87433SJohn Marino subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
258444b87433SJohn Marino new_dest_nodes = (subexp_len == 0
258544b87433SJohn Marino ? dfa->eclosures + dfa->edests[node_idx].elems[0]
258644b87433SJohn Marino : dfa->eclosures + dfa->nexts[node_idx]);
258744b87433SJohn Marino dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
258844b87433SJohn Marino - bkref_ent->subexp_from);
258944b87433SJohn Marino context = re_string_context_at (&mctx->input, dest_str_idx - 1,
259044b87433SJohn Marino mctx->eflags);
259144b87433SJohn Marino dest_state = mctx->state_log[dest_str_idx];
259244b87433SJohn Marino prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
259344b87433SJohn Marino : mctx->state_log[cur_str_idx]->nodes.nelem);
25944536c563SJohn Marino /* Add 'new_dest_node' to state_log. */
259544b87433SJohn Marino if (dest_state == NULL)
259644b87433SJohn Marino {
259744b87433SJohn Marino mctx->state_log[dest_str_idx]
259844b87433SJohn Marino = re_acquire_state_context (&err, dfa, new_dest_nodes,
259944b87433SJohn Marino context);
2600*6ea1f93eSDaniel Fojt if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2601*6ea1f93eSDaniel Fojt && err != REG_NOERROR))
260244b87433SJohn Marino goto free_return;
260344b87433SJohn Marino }
260444b87433SJohn Marino else
260544b87433SJohn Marino {
260644b87433SJohn Marino re_node_set dest_nodes;
260744b87433SJohn Marino err = re_node_set_init_union (&dest_nodes,
260844b87433SJohn Marino dest_state->entrance_nodes,
260944b87433SJohn Marino new_dest_nodes);
2610*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
261144b87433SJohn Marino {
261244b87433SJohn Marino re_node_set_free (&dest_nodes);
261344b87433SJohn Marino goto free_return;
261444b87433SJohn Marino }
261544b87433SJohn Marino mctx->state_log[dest_str_idx]
261644b87433SJohn Marino = re_acquire_state_context (&err, dfa, &dest_nodes, context);
261744b87433SJohn Marino re_node_set_free (&dest_nodes);
2618*6ea1f93eSDaniel Fojt if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2619*6ea1f93eSDaniel Fojt && err != REG_NOERROR))
262044b87433SJohn Marino goto free_return;
262144b87433SJohn Marino }
262244b87433SJohn Marino /* We need to check recursively if the backreference can epsilon
262344b87433SJohn Marino transit. */
262444b87433SJohn Marino if (subexp_len == 0
262544b87433SJohn Marino && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
262644b87433SJohn Marino {
262744b87433SJohn Marino err = check_subexp_matching_top (mctx, new_dest_nodes,
262844b87433SJohn Marino cur_str_idx);
2629*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
263044b87433SJohn Marino goto free_return;
263144b87433SJohn Marino err = transit_state_bkref (mctx, new_dest_nodes);
2632*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
263344b87433SJohn Marino goto free_return;
263444b87433SJohn Marino }
263544b87433SJohn Marino }
263644b87433SJohn Marino }
263744b87433SJohn Marino err = REG_NOERROR;
263844b87433SJohn Marino free_return:
263944b87433SJohn Marino return err;
264044b87433SJohn Marino }
264144b87433SJohn Marino
264244b87433SJohn Marino /* Enumerate all the candidates which the backreference BKREF_NODE can match
264344b87433SJohn Marino at BKREF_STR_IDX, and register them by match_ctx_add_entry().
264444b87433SJohn Marino Note that we might collect inappropriate candidates here.
264544b87433SJohn Marino However, the cost of checking them strictly here is too high, then we
264644b87433SJohn Marino delay these checking for prune_impossible_nodes(). */
264744b87433SJohn Marino
264844b87433SJohn Marino static reg_errcode_t
2649*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
get_subexp(re_match_context_t * mctx,Idx bkref_node,Idx bkref_str_idx)265044b87433SJohn Marino get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
265144b87433SJohn Marino {
265244b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
265344b87433SJohn Marino Idx subexp_num, sub_top_idx;
265444b87433SJohn Marino const char *buf = (const char *) re_string_get_buffer (&mctx->input);
265544b87433SJohn Marino /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
265644b87433SJohn Marino Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2657*6ea1f93eSDaniel Fojt if (cache_idx != -1)
265844b87433SJohn Marino {
265944b87433SJohn Marino const struct re_backref_cache_entry *entry
266044b87433SJohn Marino = mctx->bkref_ents + cache_idx;
266144b87433SJohn Marino do
266244b87433SJohn Marino if (entry->node == bkref_node)
266344b87433SJohn Marino return REG_NOERROR; /* We already checked it. */
266444b87433SJohn Marino while (entry++->more);
266544b87433SJohn Marino }
266644b87433SJohn Marino
266744b87433SJohn Marino subexp_num = dfa->nodes[bkref_node].opr.idx;
266844b87433SJohn Marino
266944b87433SJohn Marino /* For each sub expression */
267044b87433SJohn Marino for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
267144b87433SJohn Marino {
267244b87433SJohn Marino reg_errcode_t err;
267344b87433SJohn Marino re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
267444b87433SJohn Marino re_sub_match_last_t *sub_last;
267544b87433SJohn Marino Idx sub_last_idx, sl_str, bkref_str_off;
267644b87433SJohn Marino
267744b87433SJohn Marino if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
267844b87433SJohn Marino continue; /* It isn't related. */
267944b87433SJohn Marino
268044b87433SJohn Marino sl_str = sub_top->str_idx;
268144b87433SJohn Marino bkref_str_off = bkref_str_idx;
268244b87433SJohn Marino /* At first, check the last node of sub expressions we already
268344b87433SJohn Marino evaluated. */
268444b87433SJohn Marino for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
268544b87433SJohn Marino {
268644b87433SJohn Marino regoff_t sl_str_diff;
268744b87433SJohn Marino sub_last = sub_top->lasts[sub_last_idx];
268844b87433SJohn Marino sl_str_diff = sub_last->str_idx - sl_str;
268944b87433SJohn Marino /* The matched string by the sub expression match with the substring
269044b87433SJohn Marino at the back reference? */
269144b87433SJohn Marino if (sl_str_diff > 0)
269244b87433SJohn Marino {
2693*6ea1f93eSDaniel Fojt if (__glibc_unlikely (bkref_str_off + sl_str_diff
2694*6ea1f93eSDaniel Fojt > mctx->input.valid_len))
269544b87433SJohn Marino {
269644b87433SJohn Marino /* Not enough chars for a successful match. */
269744b87433SJohn Marino if (bkref_str_off + sl_str_diff > mctx->input.len)
269844b87433SJohn Marino break;
269944b87433SJohn Marino
270044b87433SJohn Marino err = clean_state_log_if_needed (mctx,
270144b87433SJohn Marino bkref_str_off
270244b87433SJohn Marino + sl_str_diff);
2703*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
270444b87433SJohn Marino return err;
270544b87433SJohn Marino buf = (const char *) re_string_get_buffer (&mctx->input);
270644b87433SJohn Marino }
270744b87433SJohn Marino if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
270844b87433SJohn Marino /* We don't need to search this sub expression any more. */
270944b87433SJohn Marino break;
271044b87433SJohn Marino }
271144b87433SJohn Marino bkref_str_off += sl_str_diff;
271244b87433SJohn Marino sl_str += sl_str_diff;
271344b87433SJohn Marino err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
271444b87433SJohn Marino bkref_str_idx);
271544b87433SJohn Marino
271644b87433SJohn Marino /* Reload buf, since the preceding call might have reallocated
271744b87433SJohn Marino the buffer. */
271844b87433SJohn Marino buf = (const char *) re_string_get_buffer (&mctx->input);
271944b87433SJohn Marino
272044b87433SJohn Marino if (err == REG_NOMATCH)
272144b87433SJohn Marino continue;
2722*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
272344b87433SJohn Marino return err;
272444b87433SJohn Marino }
272544b87433SJohn Marino
272644b87433SJohn Marino if (sub_last_idx < sub_top->nlasts)
272744b87433SJohn Marino continue;
272844b87433SJohn Marino if (sub_last_idx > 0)
272944b87433SJohn Marino ++sl_str;
273044b87433SJohn Marino /* Then, search for the other last nodes of the sub expression. */
273144b87433SJohn Marino for (; sl_str <= bkref_str_idx; ++sl_str)
273244b87433SJohn Marino {
273344b87433SJohn Marino Idx cls_node;
273444b87433SJohn Marino regoff_t sl_str_off;
273544b87433SJohn Marino const re_node_set *nodes;
273644b87433SJohn Marino sl_str_off = sl_str - sub_top->str_idx;
273744b87433SJohn Marino /* The matched string by the sub expression match with the substring
273844b87433SJohn Marino at the back reference? */
273944b87433SJohn Marino if (sl_str_off > 0)
274044b87433SJohn Marino {
2741*6ea1f93eSDaniel Fojt if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
274244b87433SJohn Marino {
274344b87433SJohn Marino /* If we are at the end of the input, we cannot match. */
274444b87433SJohn Marino if (bkref_str_off >= mctx->input.len)
274544b87433SJohn Marino break;
274644b87433SJohn Marino
27474536c563SJohn Marino err = extend_buffers (mctx, bkref_str_off + 1);
2748*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
274944b87433SJohn Marino return err;
275044b87433SJohn Marino
275144b87433SJohn Marino buf = (const char *) re_string_get_buffer (&mctx->input);
275244b87433SJohn Marino }
275344b87433SJohn Marino if (buf [bkref_str_off++] != buf[sl_str - 1])
275444b87433SJohn Marino break; /* We don't need to search this sub expression
275544b87433SJohn Marino any more. */
275644b87433SJohn Marino }
275744b87433SJohn Marino if (mctx->state_log[sl_str] == NULL)
275844b87433SJohn Marino continue;
275944b87433SJohn Marino /* Does this state have a ')' of the sub expression? */
276044b87433SJohn Marino nodes = &mctx->state_log[sl_str]->nodes;
276144b87433SJohn Marino cls_node = find_subexp_node (dfa, nodes, subexp_num,
276244b87433SJohn Marino OP_CLOSE_SUBEXP);
2763*6ea1f93eSDaniel Fojt if (cls_node == -1)
276444b87433SJohn Marino continue; /* No. */
276544b87433SJohn Marino if (sub_top->path == NULL)
276644b87433SJohn Marino {
276744b87433SJohn Marino sub_top->path = calloc (sizeof (state_array_t),
276844b87433SJohn Marino sl_str - sub_top->str_idx + 1);
276944b87433SJohn Marino if (sub_top->path == NULL)
277044b87433SJohn Marino return REG_ESPACE;
277144b87433SJohn Marino }
277244b87433SJohn Marino /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
277344b87433SJohn Marino in the current context? */
277444b87433SJohn Marino err = check_arrival (mctx, sub_top->path, sub_top->node,
277544b87433SJohn Marino sub_top->str_idx, cls_node, sl_str,
277644b87433SJohn Marino OP_CLOSE_SUBEXP);
277744b87433SJohn Marino if (err == REG_NOMATCH)
277844b87433SJohn Marino continue;
2779*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
278044b87433SJohn Marino return err;
278144b87433SJohn Marino sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2782*6ea1f93eSDaniel Fojt if (__glibc_unlikely (sub_last == NULL))
278344b87433SJohn Marino return REG_ESPACE;
278444b87433SJohn Marino err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
278544b87433SJohn Marino bkref_str_idx);
2786*6ea1f93eSDaniel Fojt buf = (const char *) re_string_get_buffer (&mctx->input);
278744b87433SJohn Marino if (err == REG_NOMATCH)
278844b87433SJohn Marino continue;
2789*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
2790*6ea1f93eSDaniel Fojt return err;
279144b87433SJohn Marino }
279244b87433SJohn Marino }
279344b87433SJohn Marino return REG_NOERROR;
279444b87433SJohn Marino }
279544b87433SJohn Marino
279644b87433SJohn Marino /* Helper functions for get_subexp(). */
279744b87433SJohn Marino
279844b87433SJohn Marino /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
279944b87433SJohn Marino If it can arrive, register the sub expression expressed with SUB_TOP
280044b87433SJohn Marino and SUB_LAST. */
280144b87433SJohn Marino
280244b87433SJohn Marino static reg_errcode_t
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,Idx bkref_node,Idx bkref_str)280344b87433SJohn Marino get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
280444b87433SJohn Marino re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
280544b87433SJohn Marino {
280644b87433SJohn Marino reg_errcode_t err;
280744b87433SJohn Marino Idx to_idx;
280844b87433SJohn Marino /* Can the subexpression arrive the back reference? */
280944b87433SJohn Marino err = check_arrival (mctx, &sub_last->path, sub_last->node,
281044b87433SJohn Marino sub_last->str_idx, bkref_node, bkref_str,
281144b87433SJohn Marino OP_OPEN_SUBEXP);
281244b87433SJohn Marino if (err != REG_NOERROR)
281344b87433SJohn Marino return err;
281444b87433SJohn Marino err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
281544b87433SJohn Marino sub_last->str_idx);
2816*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
281744b87433SJohn Marino return err;
281844b87433SJohn Marino to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
281944b87433SJohn Marino return clean_state_log_if_needed (mctx, to_idx);
282044b87433SJohn Marino }
282144b87433SJohn Marino
282244b87433SJohn Marino /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
282344b87433SJohn Marino Search '(' if FL_OPEN, or search ')' otherwise.
282444b87433SJohn Marino TODO: This function isn't efficient...
282544b87433SJohn Marino Because there might be more than one nodes whose types are
282644b87433SJohn Marino OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
282744b87433SJohn Marino nodes.
282844b87433SJohn Marino E.g. RE: (a){2} */
282944b87433SJohn Marino
283044b87433SJohn Marino static Idx
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,Idx subexp_idx,int type)283144b87433SJohn Marino find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
283244b87433SJohn Marino Idx subexp_idx, int type)
283344b87433SJohn Marino {
283444b87433SJohn Marino Idx cls_idx;
283544b87433SJohn Marino for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
283644b87433SJohn Marino {
283744b87433SJohn Marino Idx cls_node = nodes->elems[cls_idx];
283844b87433SJohn Marino const re_token_t *node = dfa->nodes + cls_node;
283944b87433SJohn Marino if (node->type == type
284044b87433SJohn Marino && node->opr.idx == subexp_idx)
284144b87433SJohn Marino return cls_node;
284244b87433SJohn Marino }
2843*6ea1f93eSDaniel Fojt return -1;
284444b87433SJohn Marino }
284544b87433SJohn Marino
284644b87433SJohn Marino /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
284744b87433SJohn Marino LAST_NODE at LAST_STR. We record the path onto PATH since it will be
284844b87433SJohn Marino heavily reused.
284944b87433SJohn Marino Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
285044b87433SJohn Marino
285144b87433SJohn Marino static reg_errcode_t
2852*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
check_arrival(re_match_context_t * mctx,state_array_t * path,Idx top_node,Idx top_str,Idx last_node,Idx last_str,int type)285344b87433SJohn Marino check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
285444b87433SJohn Marino Idx top_str, Idx last_node, Idx last_str, int type)
285544b87433SJohn Marino {
285644b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
285744b87433SJohn Marino reg_errcode_t err = REG_NOERROR;
285844b87433SJohn Marino Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
285944b87433SJohn Marino re_dfastate_t *cur_state = NULL;
286044b87433SJohn Marino re_node_set *cur_nodes, next_nodes;
286144b87433SJohn Marino re_dfastate_t **backup_state_log;
286244b87433SJohn Marino unsigned int context;
286344b87433SJohn Marino
286444b87433SJohn Marino subexp_num = dfa->nodes[top_node].opr.idx;
286544b87433SJohn Marino /* Extend the buffer if we need. */
2866*6ea1f93eSDaniel Fojt if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
286744b87433SJohn Marino {
286844b87433SJohn Marino re_dfastate_t **new_array;
286944b87433SJohn Marino Idx old_alloc = path->alloc;
28704536c563SJohn Marino Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
28714536c563SJohn Marino Idx new_alloc;
2872*6ea1f93eSDaniel Fojt if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
28734536c563SJohn Marino return REG_ESPACE;
28744536c563SJohn Marino new_alloc = old_alloc + incr_alloc;
2875*6ea1f93eSDaniel Fojt if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
287644b87433SJohn Marino return REG_ESPACE;
287744b87433SJohn Marino new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2878*6ea1f93eSDaniel Fojt if (__glibc_unlikely (new_array == NULL))
287944b87433SJohn Marino return REG_ESPACE;
288044b87433SJohn Marino path->array = new_array;
288144b87433SJohn Marino path->alloc = new_alloc;
288244b87433SJohn Marino memset (new_array + old_alloc, '\0',
288344b87433SJohn Marino sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
288444b87433SJohn Marino }
288544b87433SJohn Marino
288644b87433SJohn Marino str_idx = path->next_idx ? path->next_idx : top_str;
288744b87433SJohn Marino
288844b87433SJohn Marino /* Temporary modify MCTX. */
288944b87433SJohn Marino backup_state_log = mctx->state_log;
289044b87433SJohn Marino backup_cur_idx = mctx->input.cur_idx;
289144b87433SJohn Marino mctx->state_log = path->array;
289244b87433SJohn Marino mctx->input.cur_idx = str_idx;
289344b87433SJohn Marino
289444b87433SJohn Marino /* Setup initial node set. */
289544b87433SJohn Marino context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
289644b87433SJohn Marino if (str_idx == top_str)
289744b87433SJohn Marino {
289844b87433SJohn Marino err = re_node_set_init_1 (&next_nodes, top_node);
2899*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
290044b87433SJohn Marino return err;
290144b87433SJohn Marino err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2902*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
290344b87433SJohn Marino {
290444b87433SJohn Marino re_node_set_free (&next_nodes);
290544b87433SJohn Marino return err;
290644b87433SJohn Marino }
290744b87433SJohn Marino }
290844b87433SJohn Marino else
290944b87433SJohn Marino {
291044b87433SJohn Marino cur_state = mctx->state_log[str_idx];
291144b87433SJohn Marino if (cur_state && cur_state->has_backref)
291244b87433SJohn Marino {
291344b87433SJohn Marino err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2914*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
291544b87433SJohn Marino return err;
291644b87433SJohn Marino }
291744b87433SJohn Marino else
291844b87433SJohn Marino re_node_set_init_empty (&next_nodes);
291944b87433SJohn Marino }
292044b87433SJohn Marino if (str_idx == top_str || (cur_state && cur_state->has_backref))
292144b87433SJohn Marino {
292244b87433SJohn Marino if (next_nodes.nelem)
292344b87433SJohn Marino {
292444b87433SJohn Marino err = expand_bkref_cache (mctx, &next_nodes, str_idx,
292544b87433SJohn Marino subexp_num, type);
2926*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
292744b87433SJohn Marino {
292844b87433SJohn Marino re_node_set_free (&next_nodes);
292944b87433SJohn Marino return err;
293044b87433SJohn Marino }
293144b87433SJohn Marino }
293244b87433SJohn Marino cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2933*6ea1f93eSDaniel Fojt if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
293444b87433SJohn Marino {
293544b87433SJohn Marino re_node_set_free (&next_nodes);
293644b87433SJohn Marino return err;
293744b87433SJohn Marino }
293844b87433SJohn Marino mctx->state_log[str_idx] = cur_state;
293944b87433SJohn Marino }
294044b87433SJohn Marino
294144b87433SJohn Marino for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
294244b87433SJohn Marino {
294344b87433SJohn Marino re_node_set_empty (&next_nodes);
294444b87433SJohn Marino if (mctx->state_log[str_idx + 1])
294544b87433SJohn Marino {
294644b87433SJohn Marino err = re_node_set_merge (&next_nodes,
294744b87433SJohn Marino &mctx->state_log[str_idx + 1]->nodes);
2948*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
294944b87433SJohn Marino {
295044b87433SJohn Marino re_node_set_free (&next_nodes);
295144b87433SJohn Marino return err;
295244b87433SJohn Marino }
295344b87433SJohn Marino }
295444b87433SJohn Marino if (cur_state)
295544b87433SJohn Marino {
295644b87433SJohn Marino err = check_arrival_add_next_nodes (mctx, str_idx,
295744b87433SJohn Marino &cur_state->non_eps_nodes,
295844b87433SJohn Marino &next_nodes);
2959*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
296044b87433SJohn Marino {
296144b87433SJohn Marino re_node_set_free (&next_nodes);
296244b87433SJohn Marino return err;
296344b87433SJohn Marino }
296444b87433SJohn Marino }
296544b87433SJohn Marino ++str_idx;
296644b87433SJohn Marino if (next_nodes.nelem)
296744b87433SJohn Marino {
296844b87433SJohn Marino err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2969*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
297044b87433SJohn Marino {
297144b87433SJohn Marino re_node_set_free (&next_nodes);
297244b87433SJohn Marino return err;
297344b87433SJohn Marino }
297444b87433SJohn Marino err = expand_bkref_cache (mctx, &next_nodes, str_idx,
297544b87433SJohn Marino subexp_num, type);
2976*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
297744b87433SJohn Marino {
297844b87433SJohn Marino re_node_set_free (&next_nodes);
297944b87433SJohn Marino return err;
298044b87433SJohn Marino }
298144b87433SJohn Marino }
298244b87433SJohn Marino context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
298344b87433SJohn Marino cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2984*6ea1f93eSDaniel Fojt if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
298544b87433SJohn Marino {
298644b87433SJohn Marino re_node_set_free (&next_nodes);
298744b87433SJohn Marino return err;
298844b87433SJohn Marino }
298944b87433SJohn Marino mctx->state_log[str_idx] = cur_state;
299044b87433SJohn Marino null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
299144b87433SJohn Marino }
299244b87433SJohn Marino re_node_set_free (&next_nodes);
299344b87433SJohn Marino cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
299444b87433SJohn Marino : &mctx->state_log[last_str]->nodes);
299544b87433SJohn Marino path->next_idx = str_idx;
299644b87433SJohn Marino
299744b87433SJohn Marino /* Fix MCTX. */
299844b87433SJohn Marino mctx->state_log = backup_state_log;
299944b87433SJohn Marino mctx->input.cur_idx = backup_cur_idx;
300044b87433SJohn Marino
300144b87433SJohn Marino /* Then check the current node set has the node LAST_NODE. */
300244b87433SJohn Marino if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
300344b87433SJohn Marino return REG_NOERROR;
300444b87433SJohn Marino
300544b87433SJohn Marino return REG_NOMATCH;
300644b87433SJohn Marino }
300744b87433SJohn Marino
300844b87433SJohn Marino /* Helper functions for check_arrival. */
300944b87433SJohn Marino
301044b87433SJohn Marino /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
301144b87433SJohn Marino to NEXT_NODES.
301244b87433SJohn Marino TODO: This function is similar to the functions transit_state*(),
301344b87433SJohn Marino however this function has many additional works.
301444b87433SJohn Marino Can't we unify them? */
301544b87433SJohn Marino
301644b87433SJohn Marino static reg_errcode_t
3017*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
check_arrival_add_next_nodes(re_match_context_t * mctx,Idx str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)301844b87433SJohn Marino check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
301944b87433SJohn Marino re_node_set *cur_nodes, re_node_set *next_nodes)
302044b87433SJohn Marino {
302144b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
302244b87433SJohn Marino bool ok;
302344b87433SJohn Marino Idx cur_idx;
302444b87433SJohn Marino #ifdef RE_ENABLE_I18N
302544b87433SJohn Marino reg_errcode_t err = REG_NOERROR;
302644b87433SJohn Marino #endif
302744b87433SJohn Marino re_node_set union_set;
302844b87433SJohn Marino re_node_set_init_empty (&union_set);
302944b87433SJohn Marino for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
303044b87433SJohn Marino {
303144b87433SJohn Marino int naccepted = 0;
303244b87433SJohn Marino Idx cur_node = cur_nodes->elems[cur_idx];
303344b87433SJohn Marino #ifdef DEBUG
303444b87433SJohn Marino re_token_type_t type = dfa->nodes[cur_node].type;
303544b87433SJohn Marino assert (!IS_EPSILON_NODE (type));
303644b87433SJohn Marino #endif
303744b87433SJohn Marino #ifdef RE_ENABLE_I18N
30384536c563SJohn Marino /* If the node may accept "multi byte". */
303944b87433SJohn Marino if (dfa->nodes[cur_node].accept_mb)
304044b87433SJohn Marino {
304144b87433SJohn Marino naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
304244b87433SJohn Marino str_idx);
304344b87433SJohn Marino if (naccepted > 1)
304444b87433SJohn Marino {
304544b87433SJohn Marino re_dfastate_t *dest_state;
304644b87433SJohn Marino Idx next_node = dfa->nexts[cur_node];
304744b87433SJohn Marino Idx next_idx = str_idx + naccepted;
304844b87433SJohn Marino dest_state = mctx->state_log[next_idx];
304944b87433SJohn Marino re_node_set_empty (&union_set);
305044b87433SJohn Marino if (dest_state)
305144b87433SJohn Marino {
305244b87433SJohn Marino err = re_node_set_merge (&union_set, &dest_state->nodes);
3053*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
305444b87433SJohn Marino {
305544b87433SJohn Marino re_node_set_free (&union_set);
305644b87433SJohn Marino return err;
305744b87433SJohn Marino }
305844b87433SJohn Marino }
305944b87433SJohn Marino ok = re_node_set_insert (&union_set, next_node);
3060*6ea1f93eSDaniel Fojt if (__glibc_unlikely (! ok))
306144b87433SJohn Marino {
306244b87433SJohn Marino re_node_set_free (&union_set);
306344b87433SJohn Marino return REG_ESPACE;
306444b87433SJohn Marino }
306544b87433SJohn Marino mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
306644b87433SJohn Marino &union_set);
3067*6ea1f93eSDaniel Fojt if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3068*6ea1f93eSDaniel Fojt && err != REG_NOERROR))
306944b87433SJohn Marino {
307044b87433SJohn Marino re_node_set_free (&union_set);
307144b87433SJohn Marino return err;
307244b87433SJohn Marino }
307344b87433SJohn Marino }
307444b87433SJohn Marino }
307544b87433SJohn Marino #endif /* RE_ENABLE_I18N */
307644b87433SJohn Marino if (naccepted
307744b87433SJohn Marino || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
307844b87433SJohn Marino {
307944b87433SJohn Marino ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3080*6ea1f93eSDaniel Fojt if (__glibc_unlikely (! ok))
308144b87433SJohn Marino {
308244b87433SJohn Marino re_node_set_free (&union_set);
308344b87433SJohn Marino return REG_ESPACE;
308444b87433SJohn Marino }
308544b87433SJohn Marino }
308644b87433SJohn Marino }
308744b87433SJohn Marino re_node_set_free (&union_set);
308844b87433SJohn Marino return REG_NOERROR;
308944b87433SJohn Marino }
309044b87433SJohn Marino
309144b87433SJohn Marino /* For all the nodes in CUR_NODES, add the epsilon closures of them to
309244b87433SJohn Marino CUR_NODES, however exclude the nodes which are:
309344b87433SJohn Marino - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
309444b87433SJohn Marino - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
309544b87433SJohn Marino */
309644b87433SJohn Marino
309744b87433SJohn Marino static reg_errcode_t
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,Idx ex_subexp,int type)309844b87433SJohn Marino check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
309944b87433SJohn Marino Idx ex_subexp, int type)
310044b87433SJohn Marino {
310144b87433SJohn Marino reg_errcode_t err;
310244b87433SJohn Marino Idx idx, outside_node;
310344b87433SJohn Marino re_node_set new_nodes;
310444b87433SJohn Marino #ifdef DEBUG
310544b87433SJohn Marino assert (cur_nodes->nelem);
310644b87433SJohn Marino #endif
310744b87433SJohn Marino err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3108*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
310944b87433SJohn Marino return err;
311044b87433SJohn Marino /* Create a new node set NEW_NODES with the nodes which are epsilon
311144b87433SJohn Marino closures of the node in CUR_NODES. */
311244b87433SJohn Marino
311344b87433SJohn Marino for (idx = 0; idx < cur_nodes->nelem; ++idx)
311444b87433SJohn Marino {
311544b87433SJohn Marino Idx cur_node = cur_nodes->elems[idx];
311644b87433SJohn Marino const re_node_set *eclosure = dfa->eclosures + cur_node;
311744b87433SJohn Marino outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3118*6ea1f93eSDaniel Fojt if (outside_node == -1)
311944b87433SJohn Marino {
312044b87433SJohn Marino /* There are no problematic nodes, just merge them. */
312144b87433SJohn Marino err = re_node_set_merge (&new_nodes, eclosure);
3122*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
312344b87433SJohn Marino {
312444b87433SJohn Marino re_node_set_free (&new_nodes);
312544b87433SJohn Marino return err;
312644b87433SJohn Marino }
312744b87433SJohn Marino }
312844b87433SJohn Marino else
312944b87433SJohn Marino {
313044b87433SJohn Marino /* There are problematic nodes, re-calculate incrementally. */
313144b87433SJohn Marino err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
313244b87433SJohn Marino ex_subexp, type);
3133*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
313444b87433SJohn Marino {
313544b87433SJohn Marino re_node_set_free (&new_nodes);
313644b87433SJohn Marino return err;
313744b87433SJohn Marino }
313844b87433SJohn Marino }
313944b87433SJohn Marino }
314044b87433SJohn Marino re_node_set_free (cur_nodes);
314144b87433SJohn Marino *cur_nodes = new_nodes;
314244b87433SJohn Marino return REG_NOERROR;
314344b87433SJohn Marino }
314444b87433SJohn Marino
314544b87433SJohn Marino /* Helper function for check_arrival_expand_ecl.
314644b87433SJohn Marino Check incrementally the epsilon closure of TARGET, and if it isn't
314744b87433SJohn Marino problematic append it to DST_NODES. */
314844b87433SJohn Marino
314944b87433SJohn Marino static reg_errcode_t
3150*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,Idx target,Idx ex_subexp,int type)315144b87433SJohn Marino check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
315244b87433SJohn Marino Idx target, Idx ex_subexp, int type)
315344b87433SJohn Marino {
315444b87433SJohn Marino Idx cur_node;
315544b87433SJohn Marino for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
315644b87433SJohn Marino {
315744b87433SJohn Marino bool ok;
315844b87433SJohn Marino
315944b87433SJohn Marino if (dfa->nodes[cur_node].type == type
316044b87433SJohn Marino && dfa->nodes[cur_node].opr.idx == ex_subexp)
316144b87433SJohn Marino {
316244b87433SJohn Marino if (type == OP_CLOSE_SUBEXP)
316344b87433SJohn Marino {
316444b87433SJohn Marino ok = re_node_set_insert (dst_nodes, cur_node);
3165*6ea1f93eSDaniel Fojt if (__glibc_unlikely (! ok))
316644b87433SJohn Marino return REG_ESPACE;
316744b87433SJohn Marino }
316844b87433SJohn Marino break;
316944b87433SJohn Marino }
317044b87433SJohn Marino ok = re_node_set_insert (dst_nodes, cur_node);
3171*6ea1f93eSDaniel Fojt if (__glibc_unlikely (! ok))
317244b87433SJohn Marino return REG_ESPACE;
317344b87433SJohn Marino if (dfa->edests[cur_node].nelem == 0)
317444b87433SJohn Marino break;
317544b87433SJohn Marino if (dfa->edests[cur_node].nelem == 2)
317644b87433SJohn Marino {
317744b87433SJohn Marino reg_errcode_t err;
317844b87433SJohn Marino err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
317944b87433SJohn Marino dfa->edests[cur_node].elems[1],
318044b87433SJohn Marino ex_subexp, type);
3181*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
318244b87433SJohn Marino return err;
318344b87433SJohn Marino }
318444b87433SJohn Marino cur_node = dfa->edests[cur_node].elems[0];
318544b87433SJohn Marino }
318644b87433SJohn Marino return REG_NOERROR;
318744b87433SJohn Marino }
318844b87433SJohn Marino
318944b87433SJohn Marino
319044b87433SJohn Marino /* For all the back references in the current state, calculate the
319144b87433SJohn Marino destination of the back references by the appropriate entry
319244b87433SJohn Marino in MCTX->BKREF_ENTS. */
319344b87433SJohn Marino
319444b87433SJohn Marino static reg_errcode_t
3195*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,Idx cur_str,Idx subexp_num,int type)319644b87433SJohn Marino expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
319744b87433SJohn Marino Idx cur_str, Idx subexp_num, int type)
319844b87433SJohn Marino {
319944b87433SJohn Marino const re_dfa_t *const dfa = mctx->dfa;
320044b87433SJohn Marino reg_errcode_t err;
320144b87433SJohn Marino Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
320244b87433SJohn Marino struct re_backref_cache_entry *ent;
320344b87433SJohn Marino
3204*6ea1f93eSDaniel Fojt if (cache_idx_start == -1)
320544b87433SJohn Marino return REG_NOERROR;
320644b87433SJohn Marino
320744b87433SJohn Marino restart:
320844b87433SJohn Marino ent = mctx->bkref_ents + cache_idx_start;
320944b87433SJohn Marino do
321044b87433SJohn Marino {
321144b87433SJohn Marino Idx to_idx, next_node;
321244b87433SJohn Marino
321344b87433SJohn Marino /* Is this entry ENT is appropriate? */
321444b87433SJohn Marino if (!re_node_set_contains (cur_nodes, ent->node))
321544b87433SJohn Marino continue; /* No. */
321644b87433SJohn Marino
321744b87433SJohn Marino to_idx = cur_str + ent->subexp_to - ent->subexp_from;
321844b87433SJohn Marino /* Calculate the destination of the back reference, and append it
321944b87433SJohn Marino to MCTX->STATE_LOG. */
322044b87433SJohn Marino if (to_idx == cur_str)
322144b87433SJohn Marino {
322244b87433SJohn Marino /* The backreference did epsilon transit, we must re-check all the
322344b87433SJohn Marino node in the current state. */
322444b87433SJohn Marino re_node_set new_dests;
322544b87433SJohn Marino reg_errcode_t err2, err3;
322644b87433SJohn Marino next_node = dfa->edests[ent->node].elems[0];
322744b87433SJohn Marino if (re_node_set_contains (cur_nodes, next_node))
322844b87433SJohn Marino continue;
322944b87433SJohn Marino err = re_node_set_init_1 (&new_dests, next_node);
323044b87433SJohn Marino err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
323144b87433SJohn Marino err3 = re_node_set_merge (cur_nodes, &new_dests);
323244b87433SJohn Marino re_node_set_free (&new_dests);
3233*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3234*6ea1f93eSDaniel Fojt || err3 != REG_NOERROR))
323544b87433SJohn Marino {
323644b87433SJohn Marino err = (err != REG_NOERROR ? err
323744b87433SJohn Marino : (err2 != REG_NOERROR ? err2 : err3));
323844b87433SJohn Marino return err;
323944b87433SJohn Marino }
324044b87433SJohn Marino /* TODO: It is still inefficient... */
324144b87433SJohn Marino goto restart;
324244b87433SJohn Marino }
324344b87433SJohn Marino else
324444b87433SJohn Marino {
324544b87433SJohn Marino re_node_set union_set;
324644b87433SJohn Marino next_node = dfa->nexts[ent->node];
324744b87433SJohn Marino if (mctx->state_log[to_idx])
324844b87433SJohn Marino {
324944b87433SJohn Marino bool ok;
325044b87433SJohn Marino if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
325144b87433SJohn Marino next_node))
325244b87433SJohn Marino continue;
325344b87433SJohn Marino err = re_node_set_init_copy (&union_set,
325444b87433SJohn Marino &mctx->state_log[to_idx]->nodes);
325544b87433SJohn Marino ok = re_node_set_insert (&union_set, next_node);
3256*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR || ! ok))
325744b87433SJohn Marino {
325844b87433SJohn Marino re_node_set_free (&union_set);
325944b87433SJohn Marino err = err != REG_NOERROR ? err : REG_ESPACE;
326044b87433SJohn Marino return err;
326144b87433SJohn Marino }
326244b87433SJohn Marino }
326344b87433SJohn Marino else
326444b87433SJohn Marino {
326544b87433SJohn Marino err = re_node_set_init_1 (&union_set, next_node);
3266*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
326744b87433SJohn Marino return err;
326844b87433SJohn Marino }
326944b87433SJohn Marino mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
327044b87433SJohn Marino re_node_set_free (&union_set);
3271*6ea1f93eSDaniel Fojt if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3272*6ea1f93eSDaniel Fojt && err != REG_NOERROR))
327344b87433SJohn Marino return err;
327444b87433SJohn Marino }
327544b87433SJohn Marino }
327644b87433SJohn Marino while (ent++->more);
327744b87433SJohn Marino return REG_NOERROR;
327844b87433SJohn Marino }
327944b87433SJohn Marino
328044b87433SJohn Marino /* Build transition table for the state.
328144b87433SJohn Marino Return true if successful. */
328244b87433SJohn Marino
328344b87433SJohn Marino static bool
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)328444b87433SJohn Marino build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
328544b87433SJohn Marino {
328644b87433SJohn Marino reg_errcode_t err;
328744b87433SJohn Marino Idx i, j;
328844b87433SJohn Marino int ch;
328944b87433SJohn Marino bool need_word_trtable = false;
329044b87433SJohn Marino bitset_word_t elem, mask;
329144b87433SJohn Marino bool dests_node_malloced = false;
329244b87433SJohn Marino bool dest_states_malloced = false;
32934536c563SJohn Marino Idx ndests; /* Number of the destination states from 'state'. */
329444b87433SJohn Marino re_dfastate_t **trtable;
329544b87433SJohn Marino re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
329644b87433SJohn Marino re_node_set follows, *dests_node;
329744b87433SJohn Marino bitset_t *dests_ch;
329844b87433SJohn Marino bitset_t acceptable;
329944b87433SJohn Marino
330044b87433SJohn Marino struct dests_alloc
330144b87433SJohn Marino {
330244b87433SJohn Marino re_node_set dests_node[SBC_MAX];
330344b87433SJohn Marino bitset_t dests_ch[SBC_MAX];
330444b87433SJohn Marino } *dests_alloc;
330544b87433SJohn Marino
330644b87433SJohn Marino /* We build DFA states which corresponds to the destination nodes
33074536c563SJohn Marino from 'state'. 'dests_node[i]' represents the nodes which i-th
33084536c563SJohn Marino destination state contains, and 'dests_ch[i]' represents the
330944b87433SJohn Marino characters which i-th destination state accepts. */
331044b87433SJohn Marino if (__libc_use_alloca (sizeof (struct dests_alloc)))
331144b87433SJohn Marino dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
331244b87433SJohn Marino else
331344b87433SJohn Marino {
331444b87433SJohn Marino dests_alloc = re_malloc (struct dests_alloc, 1);
3315*6ea1f93eSDaniel Fojt if (__glibc_unlikely (dests_alloc == NULL))
331644b87433SJohn Marino return false;
331744b87433SJohn Marino dests_node_malloced = true;
331844b87433SJohn Marino }
331944b87433SJohn Marino dests_node = dests_alloc->dests_node;
332044b87433SJohn Marino dests_ch = dests_alloc->dests_ch;
332144b87433SJohn Marino
33224536c563SJohn Marino /* Initialize transition table. */
332344b87433SJohn Marino state->word_trtable = state->trtable = NULL;
332444b87433SJohn Marino
33254536c563SJohn Marino /* At first, group all nodes belonging to 'state' into several
332644b87433SJohn Marino destinations. */
332744b87433SJohn Marino ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3328*6ea1f93eSDaniel Fojt if (__glibc_unlikely (ndests <= 0))
332944b87433SJohn Marino {
333044b87433SJohn Marino if (dests_node_malloced)
3331*6ea1f93eSDaniel Fojt re_free (dests_alloc);
33324536c563SJohn Marino /* Return false in case of an error, true otherwise. */
333344b87433SJohn Marino if (ndests == 0)
333444b87433SJohn Marino {
333544b87433SJohn Marino state->trtable = (re_dfastate_t **)
333644b87433SJohn Marino calloc (sizeof (re_dfastate_t *), SBC_MAX);
3337*6ea1f93eSDaniel Fojt if (__glibc_unlikely (state->trtable == NULL))
3338008e37b6SJohn Marino return false;
333944b87433SJohn Marino return true;
334044b87433SJohn Marino }
334144b87433SJohn Marino return false;
334244b87433SJohn Marino }
334344b87433SJohn Marino
334444b87433SJohn Marino err = re_node_set_alloc (&follows, ndests + 1);
3345*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
334644b87433SJohn Marino goto out_free;
334744b87433SJohn Marino
334844b87433SJohn Marino /* Avoid arithmetic overflow in size calculation. */
3349*6ea1f93eSDaniel Fojt size_t ndests_max
3350*6ea1f93eSDaniel Fojt = ((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3351*6ea1f93eSDaniel Fojt / (3 * sizeof (re_dfastate_t *)));
3352*6ea1f93eSDaniel Fojt if (__glibc_unlikely (ndests_max < ndests))
335344b87433SJohn Marino goto out_free;
335444b87433SJohn Marino
335544b87433SJohn Marino if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
335644b87433SJohn Marino + ndests * 3 * sizeof (re_dfastate_t *)))
335744b87433SJohn Marino dest_states = (re_dfastate_t **)
335844b87433SJohn Marino alloca (ndests * 3 * sizeof (re_dfastate_t *));
335944b87433SJohn Marino else
336044b87433SJohn Marino {
3361*6ea1f93eSDaniel Fojt dest_states = re_malloc (re_dfastate_t *, ndests * 3);
3362*6ea1f93eSDaniel Fojt if (__glibc_unlikely (dest_states == NULL))
336344b87433SJohn Marino {
336444b87433SJohn Marino out_free:
336544b87433SJohn Marino if (dest_states_malloced)
3366*6ea1f93eSDaniel Fojt re_free (dest_states);
336744b87433SJohn Marino re_node_set_free (&follows);
336844b87433SJohn Marino for (i = 0; i < ndests; ++i)
336944b87433SJohn Marino re_node_set_free (dests_node + i);
337044b87433SJohn Marino if (dests_node_malloced)
3371*6ea1f93eSDaniel Fojt re_free (dests_alloc);
337244b87433SJohn Marino return false;
337344b87433SJohn Marino }
337444b87433SJohn Marino dest_states_malloced = true;
337544b87433SJohn Marino }
337644b87433SJohn Marino dest_states_word = dest_states + ndests;
337744b87433SJohn Marino dest_states_nl = dest_states_word + ndests;
337844b87433SJohn Marino bitset_empty (acceptable);
337944b87433SJohn Marino
338044b87433SJohn Marino /* Then build the states for all destinations. */
338144b87433SJohn Marino for (i = 0; i < ndests; ++i)
338244b87433SJohn Marino {
338344b87433SJohn Marino Idx next_node;
338444b87433SJohn Marino re_node_set_empty (&follows);
338544b87433SJohn Marino /* Merge the follows of this destination states. */
338644b87433SJohn Marino for (j = 0; j < dests_node[i].nelem; ++j)
338744b87433SJohn Marino {
338844b87433SJohn Marino next_node = dfa->nexts[dests_node[i].elems[j]];
3389*6ea1f93eSDaniel Fojt if (next_node != -1)
339044b87433SJohn Marino {
339144b87433SJohn Marino err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3392*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
339344b87433SJohn Marino goto out_free;
339444b87433SJohn Marino }
339544b87433SJohn Marino }
339644b87433SJohn Marino dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3397*6ea1f93eSDaniel Fojt if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
339844b87433SJohn Marino goto out_free;
339944b87433SJohn Marino /* If the new state has context constraint,
340044b87433SJohn Marino build appropriate states for these contexts. */
340144b87433SJohn Marino if (dest_states[i]->has_constraint)
340244b87433SJohn Marino {
340344b87433SJohn Marino dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
340444b87433SJohn Marino CONTEXT_WORD);
3405*6ea1f93eSDaniel Fojt if (__glibc_unlikely (dest_states_word[i] == NULL
3406*6ea1f93eSDaniel Fojt && err != REG_NOERROR))
340744b87433SJohn Marino goto out_free;
340844b87433SJohn Marino
340944b87433SJohn Marino if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
341044b87433SJohn Marino need_word_trtable = true;
341144b87433SJohn Marino
341244b87433SJohn Marino dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
341344b87433SJohn Marino CONTEXT_NEWLINE);
3414*6ea1f93eSDaniel Fojt if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
341544b87433SJohn Marino goto out_free;
341644b87433SJohn Marino }
341744b87433SJohn Marino else
341844b87433SJohn Marino {
341944b87433SJohn Marino dest_states_word[i] = dest_states[i];
342044b87433SJohn Marino dest_states_nl[i] = dest_states[i];
342144b87433SJohn Marino }
342244b87433SJohn Marino bitset_merge (acceptable, dests_ch[i]);
342344b87433SJohn Marino }
342444b87433SJohn Marino
3425*6ea1f93eSDaniel Fojt if (!__glibc_unlikely (need_word_trtable))
342644b87433SJohn Marino {
342744b87433SJohn Marino /* We don't care about whether the following character is a word
342844b87433SJohn Marino character, or we are in a single-byte character set so we can
342944b87433SJohn Marino discern by looking at the character code: allocate a
343044b87433SJohn Marino 256-entry transition table. */
343144b87433SJohn Marino trtable = state->trtable =
343244b87433SJohn Marino (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3433*6ea1f93eSDaniel Fojt if (__glibc_unlikely (trtable == NULL))
343444b87433SJohn Marino goto out_free;
343544b87433SJohn Marino
343644b87433SJohn Marino /* For all characters ch...: */
343744b87433SJohn Marino for (i = 0; i < BITSET_WORDS; ++i)
343844b87433SJohn Marino for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
343944b87433SJohn Marino elem;
344044b87433SJohn Marino mask <<= 1, elem >>= 1, ++ch)
3441*6ea1f93eSDaniel Fojt if (__glibc_unlikely (elem & 1))
344244b87433SJohn Marino {
344344b87433SJohn Marino /* There must be exactly one destination which accepts
344444b87433SJohn Marino character ch. See group_nodes_into_DFAstates. */
344544b87433SJohn Marino for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
344644b87433SJohn Marino ;
344744b87433SJohn Marino
344844b87433SJohn Marino /* j-th destination accepts the word character ch. */
344944b87433SJohn Marino if (dfa->word_char[i] & mask)
345044b87433SJohn Marino trtable[ch] = dest_states_word[j];
345144b87433SJohn Marino else
345244b87433SJohn Marino trtable[ch] = dest_states[j];
345344b87433SJohn Marino }
345444b87433SJohn Marino }
345544b87433SJohn Marino else
345644b87433SJohn Marino {
345744b87433SJohn Marino /* We care about whether the following character is a word
345844b87433SJohn Marino character, and we are in a multi-byte character set: discern
345944b87433SJohn Marino by looking at the character code: build two 256-entry
346044b87433SJohn Marino transition tables, one starting at trtable[0] and one
346144b87433SJohn Marino starting at trtable[SBC_MAX]. */
346244b87433SJohn Marino trtable = state->word_trtable =
346344b87433SJohn Marino (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3464*6ea1f93eSDaniel Fojt if (__glibc_unlikely (trtable == NULL))
346544b87433SJohn Marino goto out_free;
346644b87433SJohn Marino
346744b87433SJohn Marino /* For all characters ch...: */
346844b87433SJohn Marino for (i = 0; i < BITSET_WORDS; ++i)
346944b87433SJohn Marino for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
347044b87433SJohn Marino elem;
347144b87433SJohn Marino mask <<= 1, elem >>= 1, ++ch)
3472*6ea1f93eSDaniel Fojt if (__glibc_unlikely (elem & 1))
347344b87433SJohn Marino {
347444b87433SJohn Marino /* There must be exactly one destination which accepts
347544b87433SJohn Marino character ch. See group_nodes_into_DFAstates. */
347644b87433SJohn Marino for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
347744b87433SJohn Marino ;
347844b87433SJohn Marino
347944b87433SJohn Marino /* j-th destination accepts the word character ch. */
348044b87433SJohn Marino trtable[ch] = dest_states[j];
348144b87433SJohn Marino trtable[ch + SBC_MAX] = dest_states_word[j];
348244b87433SJohn Marino }
348344b87433SJohn Marino }
348444b87433SJohn Marino
348544b87433SJohn Marino /* new line */
348644b87433SJohn Marino if (bitset_contain (acceptable, NEWLINE_CHAR))
348744b87433SJohn Marino {
348844b87433SJohn Marino /* The current state accepts newline character. */
348944b87433SJohn Marino for (j = 0; j < ndests; ++j)
349044b87433SJohn Marino if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
349144b87433SJohn Marino {
349244b87433SJohn Marino /* k-th destination accepts newline character. */
349344b87433SJohn Marino trtable[NEWLINE_CHAR] = dest_states_nl[j];
349444b87433SJohn Marino if (need_word_trtable)
349544b87433SJohn Marino trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
349644b87433SJohn Marino /* There must be only one destination which accepts
349744b87433SJohn Marino newline. See group_nodes_into_DFAstates. */
349844b87433SJohn Marino break;
349944b87433SJohn Marino }
350044b87433SJohn Marino }
350144b87433SJohn Marino
350244b87433SJohn Marino if (dest_states_malloced)
3503*6ea1f93eSDaniel Fojt re_free (dest_states);
350444b87433SJohn Marino
350544b87433SJohn Marino re_node_set_free (&follows);
350644b87433SJohn Marino for (i = 0; i < ndests; ++i)
350744b87433SJohn Marino re_node_set_free (dests_node + i);
350844b87433SJohn Marino
350944b87433SJohn Marino if (dests_node_malloced)
3510*6ea1f93eSDaniel Fojt re_free (dests_alloc);
351144b87433SJohn Marino
351244b87433SJohn Marino return true;
351344b87433SJohn Marino }
351444b87433SJohn Marino
351544b87433SJohn Marino /* Group all nodes belonging to STATE into several destinations.
351644b87433SJohn Marino Then for all destinations, set the nodes belonging to the destination
351744b87433SJohn Marino to DESTS_NODE[i] and set the characters accepted by the destination
351844b87433SJohn Marino to DEST_CH[i]. This function return the number of destinations. */
351944b87433SJohn Marino
352044b87433SJohn Marino static Idx
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)352144b87433SJohn Marino group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
352244b87433SJohn Marino re_node_set *dests_node, bitset_t *dests_ch)
352344b87433SJohn Marino {
352444b87433SJohn Marino reg_errcode_t err;
352544b87433SJohn Marino bool ok;
352644b87433SJohn Marino Idx i, j, k;
35274536c563SJohn Marino Idx ndests; /* Number of the destinations from 'state'. */
352844b87433SJohn Marino bitset_t accepts; /* Characters a node can accept. */
352944b87433SJohn Marino const re_node_set *cur_nodes = &state->nodes;
353044b87433SJohn Marino bitset_empty (accepts);
353144b87433SJohn Marino ndests = 0;
353244b87433SJohn Marino
35334536c563SJohn Marino /* For all the nodes belonging to 'state', */
353444b87433SJohn Marino for (i = 0; i < cur_nodes->nelem; ++i)
353544b87433SJohn Marino {
353644b87433SJohn Marino re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
353744b87433SJohn Marino re_token_type_t type = node->type;
353844b87433SJohn Marino unsigned int constraint = node->constraint;
353944b87433SJohn Marino
354044b87433SJohn Marino /* Enumerate all single byte character this node can accept. */
354144b87433SJohn Marino if (type == CHARACTER)
354244b87433SJohn Marino bitset_set (accepts, node->opr.c);
354344b87433SJohn Marino else if (type == SIMPLE_BRACKET)
354444b87433SJohn Marino {
354544b87433SJohn Marino bitset_merge (accepts, node->opr.sbcset);
354644b87433SJohn Marino }
354744b87433SJohn Marino else if (type == OP_PERIOD)
354844b87433SJohn Marino {
354944b87433SJohn Marino #ifdef RE_ENABLE_I18N
355044b87433SJohn Marino if (dfa->mb_cur_max > 1)
355144b87433SJohn Marino bitset_merge (accepts, dfa->sb_char);
355244b87433SJohn Marino else
355344b87433SJohn Marino #endif
355444b87433SJohn Marino bitset_set_all (accepts);
355544b87433SJohn Marino if (!(dfa->syntax & RE_DOT_NEWLINE))
355644b87433SJohn Marino bitset_clear (accepts, '\n');
355744b87433SJohn Marino if (dfa->syntax & RE_DOT_NOT_NULL)
355844b87433SJohn Marino bitset_clear (accepts, '\0');
355944b87433SJohn Marino }
356044b87433SJohn Marino #ifdef RE_ENABLE_I18N
356144b87433SJohn Marino else if (type == OP_UTF8_PERIOD)
356244b87433SJohn Marino {
356344b87433SJohn Marino if (ASCII_CHARS % BITSET_WORD_BITS == 0)
356444b87433SJohn Marino memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
356544b87433SJohn Marino else
356644b87433SJohn Marino bitset_merge (accepts, utf8_sb_map);
356744b87433SJohn Marino if (!(dfa->syntax & RE_DOT_NEWLINE))
356844b87433SJohn Marino bitset_clear (accepts, '\n');
356944b87433SJohn Marino if (dfa->syntax & RE_DOT_NOT_NULL)
357044b87433SJohn Marino bitset_clear (accepts, '\0');
357144b87433SJohn Marino }
357244b87433SJohn Marino #endif
357344b87433SJohn Marino else
357444b87433SJohn Marino continue;
357544b87433SJohn Marino
35764536c563SJohn Marino /* Check the 'accepts' and sift the characters which are not
357744b87433SJohn Marino match it the context. */
357844b87433SJohn Marino if (constraint)
357944b87433SJohn Marino {
358044b87433SJohn Marino if (constraint & NEXT_NEWLINE_CONSTRAINT)
358144b87433SJohn Marino {
358244b87433SJohn Marino bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
358344b87433SJohn Marino bitset_empty (accepts);
358444b87433SJohn Marino if (accepts_newline)
358544b87433SJohn Marino bitset_set (accepts, NEWLINE_CHAR);
358644b87433SJohn Marino else
358744b87433SJohn Marino continue;
358844b87433SJohn Marino }
358944b87433SJohn Marino if (constraint & NEXT_ENDBUF_CONSTRAINT)
359044b87433SJohn Marino {
359144b87433SJohn Marino bitset_empty (accepts);
359244b87433SJohn Marino continue;
359344b87433SJohn Marino }
359444b87433SJohn Marino
359544b87433SJohn Marino if (constraint & NEXT_WORD_CONSTRAINT)
359644b87433SJohn Marino {
359744b87433SJohn Marino bitset_word_t any_set = 0;
359844b87433SJohn Marino if (type == CHARACTER && !node->word_char)
359944b87433SJohn Marino {
360044b87433SJohn Marino bitset_empty (accepts);
360144b87433SJohn Marino continue;
360244b87433SJohn Marino }
360344b87433SJohn Marino #ifdef RE_ENABLE_I18N
360444b87433SJohn Marino if (dfa->mb_cur_max > 1)
360544b87433SJohn Marino for (j = 0; j < BITSET_WORDS; ++j)
360644b87433SJohn Marino any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
360744b87433SJohn Marino else
360844b87433SJohn Marino #endif
360944b87433SJohn Marino for (j = 0; j < BITSET_WORDS; ++j)
361044b87433SJohn Marino any_set |= (accepts[j] &= dfa->word_char[j]);
361144b87433SJohn Marino if (!any_set)
361244b87433SJohn Marino continue;
361344b87433SJohn Marino }
361444b87433SJohn Marino if (constraint & NEXT_NOTWORD_CONSTRAINT)
361544b87433SJohn Marino {
361644b87433SJohn Marino bitset_word_t any_set = 0;
361744b87433SJohn Marino if (type == CHARACTER && node->word_char)
361844b87433SJohn Marino {
361944b87433SJohn Marino bitset_empty (accepts);
362044b87433SJohn Marino continue;
362144b87433SJohn Marino }
362244b87433SJohn Marino #ifdef RE_ENABLE_I18N
362344b87433SJohn Marino if (dfa->mb_cur_max > 1)
362444b87433SJohn Marino for (j = 0; j < BITSET_WORDS; ++j)
362544b87433SJohn Marino any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
362644b87433SJohn Marino else
362744b87433SJohn Marino #endif
362844b87433SJohn Marino for (j = 0; j < BITSET_WORDS; ++j)
362944b87433SJohn Marino any_set |= (accepts[j] &= ~dfa->word_char[j]);
363044b87433SJohn Marino if (!any_set)
363144b87433SJohn Marino continue;
363244b87433SJohn Marino }
363344b87433SJohn Marino }
363444b87433SJohn Marino
36354536c563SJohn Marino /* Then divide 'accepts' into DFA states, or create a new
363644b87433SJohn Marino state. Above, we make sure that accepts is not empty. */
363744b87433SJohn Marino for (j = 0; j < ndests; ++j)
363844b87433SJohn Marino {
363944b87433SJohn Marino bitset_t intersec; /* Intersection sets, see below. */
364044b87433SJohn Marino bitset_t remains;
364144b87433SJohn Marino /* Flags, see below. */
364244b87433SJohn Marino bitset_word_t has_intersec, not_subset, not_consumed;
364344b87433SJohn Marino
364444b87433SJohn Marino /* Optimization, skip if this state doesn't accept the character. */
364544b87433SJohn Marino if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
364644b87433SJohn Marino continue;
364744b87433SJohn Marino
36484536c563SJohn Marino /* Enumerate the intersection set of this state and 'accepts'. */
364944b87433SJohn Marino has_intersec = 0;
365044b87433SJohn Marino for (k = 0; k < BITSET_WORDS; ++k)
365144b87433SJohn Marino has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
365244b87433SJohn Marino /* And skip if the intersection set is empty. */
365344b87433SJohn Marino if (!has_intersec)
365444b87433SJohn Marino continue;
365544b87433SJohn Marino
36564536c563SJohn Marino /* Then check if this state is a subset of 'accepts'. */
365744b87433SJohn Marino not_subset = not_consumed = 0;
365844b87433SJohn Marino for (k = 0; k < BITSET_WORDS; ++k)
365944b87433SJohn Marino {
366044b87433SJohn Marino not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
366144b87433SJohn Marino not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
366244b87433SJohn Marino }
366344b87433SJohn Marino
36644536c563SJohn Marino /* If this state isn't a subset of 'accepts', create a
36654536c563SJohn Marino new group state, which has the 'remains'. */
366644b87433SJohn Marino if (not_subset)
366744b87433SJohn Marino {
366844b87433SJohn Marino bitset_copy (dests_ch[ndests], remains);
366944b87433SJohn Marino bitset_copy (dests_ch[j], intersec);
367044b87433SJohn Marino err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3671*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
367244b87433SJohn Marino goto error_return;
367344b87433SJohn Marino ++ndests;
367444b87433SJohn Marino }
367544b87433SJohn Marino
367644b87433SJohn Marino /* Put the position in the current group. */
367744b87433SJohn Marino ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3678*6ea1f93eSDaniel Fojt if (__glibc_unlikely (! ok))
367944b87433SJohn Marino goto error_return;
368044b87433SJohn Marino
368144b87433SJohn Marino /* If all characters are consumed, go to next node. */
368244b87433SJohn Marino if (!not_consumed)
368344b87433SJohn Marino break;
368444b87433SJohn Marino }
368544b87433SJohn Marino /* Some characters remain, create a new group. */
368644b87433SJohn Marino if (j == ndests)
368744b87433SJohn Marino {
368844b87433SJohn Marino bitset_copy (dests_ch[ndests], accepts);
368944b87433SJohn Marino err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3690*6ea1f93eSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
369144b87433SJohn Marino goto error_return;
369244b87433SJohn Marino ++ndests;
369344b87433SJohn Marino bitset_empty (accepts);
369444b87433SJohn Marino }
369544b87433SJohn Marino }
369644b87433SJohn Marino return ndests;
369744b87433SJohn Marino error_return:
369844b87433SJohn Marino for (j = 0; j < ndests; ++j)
369944b87433SJohn Marino re_node_set_free (dests_node + j);
3700*6ea1f93eSDaniel Fojt return -1;
370144b87433SJohn Marino }
370244b87433SJohn Marino
370344b87433SJohn Marino #ifdef RE_ENABLE_I18N
37044536c563SJohn Marino /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
370544b87433SJohn Marino Return the number of the bytes the node accepts.
370644b87433SJohn Marino STR_IDX is the current index of the input string.
370744b87433SJohn Marino
370844b87433SJohn Marino This function handles the nodes which can accept one character, or
370944b87433SJohn Marino one collating element like '.', '[a-z]', opposite to the other nodes
371044b87433SJohn Marino can only accept one byte. */
371144b87433SJohn Marino
3712*6ea1f93eSDaniel Fojt # ifdef _LIBC
3713*6ea1f93eSDaniel Fojt # include <locale/weight.h>
3714*6ea1f93eSDaniel Fojt # endif
3715*6ea1f93eSDaniel Fojt
371644b87433SJohn Marino static int
check_node_accept_bytes(const re_dfa_t * dfa,Idx node_idx,const re_string_t * input,Idx str_idx)371744b87433SJohn Marino check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
371844b87433SJohn Marino const re_string_t *input, Idx str_idx)
371944b87433SJohn Marino {
372044b87433SJohn Marino const re_token_t *node = dfa->nodes + node_idx;
372144b87433SJohn Marino int char_len, elem_len;
372244b87433SJohn Marino Idx i;
372344b87433SJohn Marino
3724*6ea1f93eSDaniel Fojt if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
372544b87433SJohn Marino {
372644b87433SJohn Marino unsigned char c = re_string_byte_at (input, str_idx), d;
3727*6ea1f93eSDaniel Fojt if (__glibc_likely (c < 0xc2))
372844b87433SJohn Marino return 0;
372944b87433SJohn Marino
373044b87433SJohn Marino if (str_idx + 2 > input->len)
373144b87433SJohn Marino return 0;
373244b87433SJohn Marino
373344b87433SJohn Marino d = re_string_byte_at (input, str_idx + 1);
373444b87433SJohn Marino if (c < 0xe0)
373544b87433SJohn Marino return (d < 0x80 || d > 0xbf) ? 0 : 2;
373644b87433SJohn Marino else if (c < 0xf0)
373744b87433SJohn Marino {
373844b87433SJohn Marino char_len = 3;
373944b87433SJohn Marino if (c == 0xe0 && d < 0xa0)
374044b87433SJohn Marino return 0;
374144b87433SJohn Marino }
374244b87433SJohn Marino else if (c < 0xf8)
374344b87433SJohn Marino {
374444b87433SJohn Marino char_len = 4;
374544b87433SJohn Marino if (c == 0xf0 && d < 0x90)
374644b87433SJohn Marino return 0;
374744b87433SJohn Marino }
374844b87433SJohn Marino else if (c < 0xfc)
374944b87433SJohn Marino {
375044b87433SJohn Marino char_len = 5;
375144b87433SJohn Marino if (c == 0xf8 && d < 0x88)
375244b87433SJohn Marino return 0;
375344b87433SJohn Marino }
375444b87433SJohn Marino else if (c < 0xfe)
375544b87433SJohn Marino {
375644b87433SJohn Marino char_len = 6;
375744b87433SJohn Marino if (c == 0xfc && d < 0x84)
375844b87433SJohn Marino return 0;
375944b87433SJohn Marino }
376044b87433SJohn Marino else
376144b87433SJohn Marino return 0;
376244b87433SJohn Marino
376344b87433SJohn Marino if (str_idx + char_len > input->len)
376444b87433SJohn Marino return 0;
376544b87433SJohn Marino
376644b87433SJohn Marino for (i = 1; i < char_len; ++i)
376744b87433SJohn Marino {
376844b87433SJohn Marino d = re_string_byte_at (input, str_idx + i);
376944b87433SJohn Marino if (d < 0x80 || d > 0xbf)
377044b87433SJohn Marino return 0;
377144b87433SJohn Marino }
377244b87433SJohn Marino return char_len;
377344b87433SJohn Marino }
377444b87433SJohn Marino
377544b87433SJohn Marino char_len = re_string_char_size_at (input, str_idx);
377644b87433SJohn Marino if (node->type == OP_PERIOD)
377744b87433SJohn Marino {
377844b87433SJohn Marino if (char_len <= 1)
377944b87433SJohn Marino return 0;
378044b87433SJohn Marino /* FIXME: I don't think this if is needed, as both '\n'
378144b87433SJohn Marino and '\0' are char_len == 1. */
378244b87433SJohn Marino /* '.' accepts any one character except the following two cases. */
378344b87433SJohn Marino if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
378444b87433SJohn Marino re_string_byte_at (input, str_idx) == '\n') ||
378544b87433SJohn Marino ((dfa->syntax & RE_DOT_NOT_NULL) &&
378644b87433SJohn Marino re_string_byte_at (input, str_idx) == '\0'))
378744b87433SJohn Marino return 0;
378844b87433SJohn Marino return char_len;
378944b87433SJohn Marino }
379044b87433SJohn Marino
379144b87433SJohn Marino elem_len = re_string_elem_size_at (input, str_idx);
379244b87433SJohn Marino if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
379344b87433SJohn Marino return 0;
379444b87433SJohn Marino
379544b87433SJohn Marino if (node->type == COMPLEX_BRACKET)
379644b87433SJohn Marino {
379744b87433SJohn Marino const re_charset_t *cset = node->opr.mbcset;
379844b87433SJohn Marino # ifdef _LIBC
379944b87433SJohn Marino const unsigned char *pin
380044b87433SJohn Marino = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
380144b87433SJohn Marino Idx j;
380244b87433SJohn Marino uint32_t nrules;
380344b87433SJohn Marino # endif /* _LIBC */
380444b87433SJohn Marino int match_len = 0;
380544b87433SJohn Marino wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
380644b87433SJohn Marino ? re_string_wchar_at (input, str_idx) : 0);
380744b87433SJohn Marino
380844b87433SJohn Marino /* match with multibyte character? */
380944b87433SJohn Marino for (i = 0; i < cset->nmbchars; ++i)
381044b87433SJohn Marino if (wc == cset->mbchars[i])
381144b87433SJohn Marino {
381244b87433SJohn Marino match_len = char_len;
381344b87433SJohn Marino goto check_node_accept_bytes_match;
381444b87433SJohn Marino }
381544b87433SJohn Marino /* match with character_class? */
381644b87433SJohn Marino for (i = 0; i < cset->nchar_classes; ++i)
381744b87433SJohn Marino {
381844b87433SJohn Marino wctype_t wt = cset->char_classes[i];
381944b87433SJohn Marino if (__iswctype (wc, wt))
382044b87433SJohn Marino {
382144b87433SJohn Marino match_len = char_len;
382244b87433SJohn Marino goto check_node_accept_bytes_match;
382344b87433SJohn Marino }
382444b87433SJohn Marino }
382544b87433SJohn Marino
382644b87433SJohn Marino # ifdef _LIBC
382744b87433SJohn Marino nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
382844b87433SJohn Marino if (nrules != 0)
382944b87433SJohn Marino {
383044b87433SJohn Marino unsigned int in_collseq = 0;
383144b87433SJohn Marino const int32_t *table, *indirect;
383244b87433SJohn Marino const unsigned char *weights, *extra;
383344b87433SJohn Marino const char *collseqwc;
383444b87433SJohn Marino
383544b87433SJohn Marino /* match with collating_symbol? */
383644b87433SJohn Marino if (cset->ncoll_syms)
383744b87433SJohn Marino extra = (const unsigned char *)
383844b87433SJohn Marino _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
383944b87433SJohn Marino for (i = 0; i < cset->ncoll_syms; ++i)
384044b87433SJohn Marino {
384144b87433SJohn Marino const unsigned char *coll_sym = extra + cset->coll_syms[i];
384244b87433SJohn Marino /* Compare the length of input collating element and
384344b87433SJohn Marino the length of current collating element. */
384444b87433SJohn Marino if (*coll_sym != elem_len)
384544b87433SJohn Marino continue;
384644b87433SJohn Marino /* Compare each bytes. */
384744b87433SJohn Marino for (j = 0; j < *coll_sym; j++)
384844b87433SJohn Marino if (pin[j] != coll_sym[1 + j])
384944b87433SJohn Marino break;
385044b87433SJohn Marino if (j == *coll_sym)
385144b87433SJohn Marino {
385244b87433SJohn Marino /* Match if every bytes is equal. */
385344b87433SJohn Marino match_len = j;
385444b87433SJohn Marino goto check_node_accept_bytes_match;
385544b87433SJohn Marino }
385644b87433SJohn Marino }
385744b87433SJohn Marino
385844b87433SJohn Marino if (cset->nranges)
385944b87433SJohn Marino {
386044b87433SJohn Marino if (elem_len <= char_len)
386144b87433SJohn Marino {
386244b87433SJohn Marino collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
386344b87433SJohn Marino in_collseq = __collseq_table_lookup (collseqwc, wc);
386444b87433SJohn Marino }
386544b87433SJohn Marino else
386644b87433SJohn Marino in_collseq = find_collation_sequence_value (pin, elem_len);
386744b87433SJohn Marino }
386844b87433SJohn Marino /* match with range expression? */
38694536c563SJohn Marino /* FIXME: Implement rational ranges here, too. */
387044b87433SJohn Marino for (i = 0; i < cset->nranges; ++i)
387144b87433SJohn Marino if (cset->range_starts[i] <= in_collseq
387244b87433SJohn Marino && in_collseq <= cset->range_ends[i])
387344b87433SJohn Marino {
387444b87433SJohn Marino match_len = elem_len;
387544b87433SJohn Marino goto check_node_accept_bytes_match;
387644b87433SJohn Marino }
387744b87433SJohn Marino
387844b87433SJohn Marino /* match with equivalence_class? */
387944b87433SJohn Marino if (cset->nequiv_classes)
388044b87433SJohn Marino {
388144b87433SJohn Marino const unsigned char *cp = pin;
388244b87433SJohn Marino table = (const int32_t *)
388344b87433SJohn Marino _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
388444b87433SJohn Marino weights = (const unsigned char *)
388544b87433SJohn Marino _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
388644b87433SJohn Marino extra = (const unsigned char *)
388744b87433SJohn Marino _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
388844b87433SJohn Marino indirect = (const int32_t *)
388944b87433SJohn Marino _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3890*6ea1f93eSDaniel Fojt int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3891*6ea1f93eSDaniel Fojt int32_t rule = idx >> 24;
3892*6ea1f93eSDaniel Fojt idx &= 0xffffff;
389344b87433SJohn Marino if (idx > 0)
3894*6ea1f93eSDaniel Fojt {
3895*6ea1f93eSDaniel Fojt size_t weight_len = weights[idx];
389644b87433SJohn Marino for (i = 0; i < cset->nequiv_classes; ++i)
389744b87433SJohn Marino {
389844b87433SJohn Marino int32_t equiv_class_idx = cset->equiv_classes[i];
3899*6ea1f93eSDaniel Fojt int32_t equiv_class_rule = equiv_class_idx >> 24;
390044b87433SJohn Marino equiv_class_idx &= 0xffffff;
3901*6ea1f93eSDaniel Fojt if (weights[equiv_class_idx] == weight_len
3902*6ea1f93eSDaniel Fojt && equiv_class_rule == rule
3903*6ea1f93eSDaniel Fojt && memcmp (weights + idx + 1,
3904*6ea1f93eSDaniel Fojt weights + equiv_class_idx + 1,
3905*6ea1f93eSDaniel Fojt weight_len) == 0)
390644b87433SJohn Marino {
390744b87433SJohn Marino match_len = elem_len;
390844b87433SJohn Marino goto check_node_accept_bytes_match;
390944b87433SJohn Marino }
391044b87433SJohn Marino }
391144b87433SJohn Marino }
391244b87433SJohn Marino }
391344b87433SJohn Marino }
391444b87433SJohn Marino else
391544b87433SJohn Marino # endif /* _LIBC */
391644b87433SJohn Marino {
391744b87433SJohn Marino /* match with range expression? */
391844b87433SJohn Marino for (i = 0; i < cset->nranges; ++i)
391944b87433SJohn Marino {
39204536c563SJohn Marino if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
392144b87433SJohn Marino {
392244b87433SJohn Marino match_len = char_len;
392344b87433SJohn Marino goto check_node_accept_bytes_match;
392444b87433SJohn Marino }
392544b87433SJohn Marino }
392644b87433SJohn Marino }
392744b87433SJohn Marino check_node_accept_bytes_match:
392844b87433SJohn Marino if (!cset->non_match)
392944b87433SJohn Marino return match_len;
393044b87433SJohn Marino else
393144b87433SJohn Marino {
393244b87433SJohn Marino if (match_len > 0)
393344b87433SJohn Marino return 0;
393444b87433SJohn Marino else
393544b87433SJohn Marino return (elem_len > char_len) ? elem_len : char_len;
393644b87433SJohn Marino }
393744b87433SJohn Marino }
393844b87433SJohn Marino return 0;
393944b87433SJohn Marino }
394044b87433SJohn Marino
394144b87433SJohn Marino # ifdef _LIBC
394244b87433SJohn Marino static unsigned int
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)394344b87433SJohn Marino find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
394444b87433SJohn Marino {
394544b87433SJohn Marino uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
394644b87433SJohn Marino if (nrules == 0)
394744b87433SJohn Marino {
394844b87433SJohn Marino if (mbs_len == 1)
394944b87433SJohn Marino {
395044b87433SJohn Marino /* No valid character. Match it as a single byte character. */
395144b87433SJohn Marino const unsigned char *collseq = (const unsigned char *)
395244b87433SJohn Marino _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
395344b87433SJohn Marino return collseq[mbs[0]];
395444b87433SJohn Marino }
395544b87433SJohn Marino return UINT_MAX;
395644b87433SJohn Marino }
395744b87433SJohn Marino else
395844b87433SJohn Marino {
395944b87433SJohn Marino int32_t idx;
396044b87433SJohn Marino const unsigned char *extra = (const unsigned char *)
396144b87433SJohn Marino _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
396244b87433SJohn Marino int32_t extrasize = (const unsigned char *)
396344b87433SJohn Marino _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
396444b87433SJohn Marino
396544b87433SJohn Marino for (idx = 0; idx < extrasize;)
396644b87433SJohn Marino {
396744b87433SJohn Marino int mbs_cnt;
396844b87433SJohn Marino bool found = false;
396944b87433SJohn Marino int32_t elem_mbs_len;
397044b87433SJohn Marino /* Skip the name of collating element name. */
397144b87433SJohn Marino idx = idx + extra[idx] + 1;
397244b87433SJohn Marino elem_mbs_len = extra[idx++];
397344b87433SJohn Marino if (mbs_len == elem_mbs_len)
397444b87433SJohn Marino {
397544b87433SJohn Marino for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
397644b87433SJohn Marino if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
397744b87433SJohn Marino break;
397844b87433SJohn Marino if (mbs_cnt == elem_mbs_len)
397944b87433SJohn Marino /* Found the entry. */
398044b87433SJohn Marino found = true;
398144b87433SJohn Marino }
398244b87433SJohn Marino /* Skip the byte sequence of the collating element. */
398344b87433SJohn Marino idx += elem_mbs_len;
398444b87433SJohn Marino /* Adjust for the alignment. */
398544b87433SJohn Marino idx = (idx + 3) & ~3;
398644b87433SJohn Marino /* Skip the collation sequence value. */
398744b87433SJohn Marino idx += sizeof (uint32_t);
398844b87433SJohn Marino /* Skip the wide char sequence of the collating element. */
39894536c563SJohn Marino idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
399044b87433SJohn Marino /* If we found the entry, return the sequence value. */
399144b87433SJohn Marino if (found)
399244b87433SJohn Marino return *(uint32_t *) (extra + idx);
399344b87433SJohn Marino /* Skip the collation sequence value. */
399444b87433SJohn Marino idx += sizeof (uint32_t);
399544b87433SJohn Marino }
399644b87433SJohn Marino return UINT_MAX;
399744b87433SJohn Marino }
399844b87433SJohn Marino }
399944b87433SJohn Marino # endif /* _LIBC */
400044b87433SJohn Marino #endif /* RE_ENABLE_I18N */
400144b87433SJohn Marino
400244b87433SJohn Marino /* Check whether the node accepts the byte which is IDX-th
400344b87433SJohn Marino byte of the INPUT. */
400444b87433SJohn Marino
400544b87433SJohn Marino static bool
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,Idx idx)400644b87433SJohn Marino check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
400744b87433SJohn Marino Idx idx)
400844b87433SJohn Marino {
400944b87433SJohn Marino unsigned char ch;
401044b87433SJohn Marino ch = re_string_byte_at (&mctx->input, idx);
401144b87433SJohn Marino switch (node->type)
401244b87433SJohn Marino {
401344b87433SJohn Marino case CHARACTER:
401444b87433SJohn Marino if (node->opr.c != ch)
401544b87433SJohn Marino return false;
401644b87433SJohn Marino break;
401744b87433SJohn Marino
401844b87433SJohn Marino case SIMPLE_BRACKET:
401944b87433SJohn Marino if (!bitset_contain (node->opr.sbcset, ch))
402044b87433SJohn Marino return false;
402144b87433SJohn Marino break;
402244b87433SJohn Marino
402344b87433SJohn Marino #ifdef RE_ENABLE_I18N
402444b87433SJohn Marino case OP_UTF8_PERIOD:
402544b87433SJohn Marino if (ch >= ASCII_CHARS)
402644b87433SJohn Marino return false;
4027*6ea1f93eSDaniel Fojt FALLTHROUGH;
402844b87433SJohn Marino #endif
402944b87433SJohn Marino case OP_PERIOD:
403044b87433SJohn Marino if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
403144b87433SJohn Marino || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
403244b87433SJohn Marino return false;
403344b87433SJohn Marino break;
403444b87433SJohn Marino
403544b87433SJohn Marino default:
403644b87433SJohn Marino return false;
403744b87433SJohn Marino }
403844b87433SJohn Marino
403944b87433SJohn Marino if (node->constraint)
404044b87433SJohn Marino {
404144b87433SJohn Marino /* The node has constraints. Check whether the current context
404244b87433SJohn Marino satisfies the constraints. */
404344b87433SJohn Marino unsigned int context = re_string_context_at (&mctx->input, idx,
404444b87433SJohn Marino mctx->eflags);
404544b87433SJohn Marino if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
404644b87433SJohn Marino return false;
404744b87433SJohn Marino }
404844b87433SJohn Marino
404944b87433SJohn Marino return true;
405044b87433SJohn Marino }
405144b87433SJohn Marino
405244b87433SJohn Marino /* Extend the buffers, if the buffers have run out. */
405344b87433SJohn Marino
405444b87433SJohn Marino static reg_errcode_t
4055*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
extend_buffers(re_match_context_t * mctx,int min_len)40564536c563SJohn Marino extend_buffers (re_match_context_t *mctx, int min_len)
405744b87433SJohn Marino {
405844b87433SJohn Marino reg_errcode_t ret;
405944b87433SJohn Marino re_string_t *pstr = &mctx->input;
406044b87433SJohn Marino
406144b87433SJohn Marino /* Avoid overflow. */
4062*6ea1f93eSDaniel Fojt if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4063*6ea1f93eSDaniel Fojt <= pstr->bufs_len))
406444b87433SJohn Marino return REG_ESPACE;
406544b87433SJohn Marino
40664536c563SJohn Marino /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
40674536c563SJohn Marino ret = re_string_realloc_buffers (pstr,
40684536c563SJohn Marino MAX (min_len,
40694536c563SJohn Marino MIN (pstr->len, pstr->bufs_len * 2)));
4070*6ea1f93eSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
407144b87433SJohn Marino return ret;
407244b87433SJohn Marino
407344b87433SJohn Marino if (mctx->state_log != NULL)
407444b87433SJohn Marino {
407544b87433SJohn Marino /* And double the length of state_log. */
407644b87433SJohn Marino /* XXX We have no indication of the size of this buffer. If this
407744b87433SJohn Marino allocation fail we have no indication that the state_log array
407844b87433SJohn Marino does not have the right size. */
407944b87433SJohn Marino re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
408044b87433SJohn Marino pstr->bufs_len + 1);
4081*6ea1f93eSDaniel Fojt if (__glibc_unlikely (new_array == NULL))
408244b87433SJohn Marino return REG_ESPACE;
408344b87433SJohn Marino mctx->state_log = new_array;
408444b87433SJohn Marino }
408544b87433SJohn Marino
408644b87433SJohn Marino /* Then reconstruct the buffers. */
408744b87433SJohn Marino if (pstr->icase)
408844b87433SJohn Marino {
408944b87433SJohn Marino #ifdef RE_ENABLE_I18N
409044b87433SJohn Marino if (pstr->mb_cur_max > 1)
409144b87433SJohn Marino {
409244b87433SJohn Marino ret = build_wcs_upper_buffer (pstr);
4093*6ea1f93eSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
409444b87433SJohn Marino return ret;
409544b87433SJohn Marino }
409644b87433SJohn Marino else
409744b87433SJohn Marino #endif /* RE_ENABLE_I18N */
409844b87433SJohn Marino build_upper_buffer (pstr);
409944b87433SJohn Marino }
410044b87433SJohn Marino else
410144b87433SJohn Marino {
410244b87433SJohn Marino #ifdef RE_ENABLE_I18N
410344b87433SJohn Marino if (pstr->mb_cur_max > 1)
410444b87433SJohn Marino build_wcs_buffer (pstr);
410544b87433SJohn Marino else
410644b87433SJohn Marino #endif /* RE_ENABLE_I18N */
410744b87433SJohn Marino {
410844b87433SJohn Marino if (pstr->trans != NULL)
410944b87433SJohn Marino re_string_translate_buffer (pstr);
411044b87433SJohn Marino }
411144b87433SJohn Marino }
411244b87433SJohn Marino return REG_NOERROR;
411344b87433SJohn Marino }
411444b87433SJohn Marino
411544b87433SJohn Marino
411644b87433SJohn Marino /* Functions for matching context. */
411744b87433SJohn Marino
411844b87433SJohn Marino /* Initialize MCTX. */
411944b87433SJohn Marino
412044b87433SJohn Marino static reg_errcode_t
4121*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
match_ctx_init(re_match_context_t * mctx,int eflags,Idx n)412244b87433SJohn Marino match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
412344b87433SJohn Marino {
412444b87433SJohn Marino mctx->eflags = eflags;
4125*6ea1f93eSDaniel Fojt mctx->match_last = -1;
412644b87433SJohn Marino if (n > 0)
412744b87433SJohn Marino {
412844b87433SJohn Marino /* Avoid overflow. */
412944b87433SJohn Marino size_t max_object_size =
413044b87433SJohn Marino MAX (sizeof (struct re_backref_cache_entry),
413144b87433SJohn Marino sizeof (re_sub_match_top_t *));
4132*6ea1f93eSDaniel Fojt if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
413344b87433SJohn Marino return REG_ESPACE;
413444b87433SJohn Marino
413544b87433SJohn Marino mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
413644b87433SJohn Marino mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4137*6ea1f93eSDaniel Fojt if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
413844b87433SJohn Marino return REG_ESPACE;
413944b87433SJohn Marino }
414044b87433SJohn Marino /* Already zero-ed by the caller.
414144b87433SJohn Marino else
414244b87433SJohn Marino mctx->bkref_ents = NULL;
414344b87433SJohn Marino mctx->nbkref_ents = 0;
414444b87433SJohn Marino mctx->nsub_tops = 0; */
414544b87433SJohn Marino mctx->abkref_ents = n;
414644b87433SJohn Marino mctx->max_mb_elem_len = 1;
414744b87433SJohn Marino mctx->asub_tops = n;
414844b87433SJohn Marino return REG_NOERROR;
414944b87433SJohn Marino }
415044b87433SJohn Marino
415144b87433SJohn Marino /* Clean the entries which depend on the current input in MCTX.
415244b87433SJohn Marino This function must be invoked when the matcher changes the start index
415344b87433SJohn Marino of the input, or changes the input string. */
415444b87433SJohn Marino
415544b87433SJohn Marino static void
match_ctx_clean(re_match_context_t * mctx)415644b87433SJohn Marino match_ctx_clean (re_match_context_t *mctx)
415744b87433SJohn Marino {
415844b87433SJohn Marino Idx st_idx;
415944b87433SJohn Marino for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
416044b87433SJohn Marino {
416144b87433SJohn Marino Idx sl_idx;
416244b87433SJohn Marino re_sub_match_top_t *top = mctx->sub_tops[st_idx];
416344b87433SJohn Marino for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
416444b87433SJohn Marino {
416544b87433SJohn Marino re_sub_match_last_t *last = top->lasts[sl_idx];
416644b87433SJohn Marino re_free (last->path.array);
416744b87433SJohn Marino re_free (last);
416844b87433SJohn Marino }
416944b87433SJohn Marino re_free (top->lasts);
417044b87433SJohn Marino if (top->path)
417144b87433SJohn Marino {
417244b87433SJohn Marino re_free (top->path->array);
417344b87433SJohn Marino re_free (top->path);
417444b87433SJohn Marino }
4175*6ea1f93eSDaniel Fojt re_free (top);
417644b87433SJohn Marino }
417744b87433SJohn Marino
417844b87433SJohn Marino mctx->nsub_tops = 0;
417944b87433SJohn Marino mctx->nbkref_ents = 0;
418044b87433SJohn Marino }
418144b87433SJohn Marino
418244b87433SJohn Marino /* Free all the memory associated with MCTX. */
418344b87433SJohn Marino
418444b87433SJohn Marino static void
match_ctx_free(re_match_context_t * mctx)418544b87433SJohn Marino match_ctx_free (re_match_context_t *mctx)
418644b87433SJohn Marino {
418744b87433SJohn Marino /* First, free all the memory associated with MCTX->SUB_TOPS. */
418844b87433SJohn Marino match_ctx_clean (mctx);
418944b87433SJohn Marino re_free (mctx->sub_tops);
419044b87433SJohn Marino re_free (mctx->bkref_ents);
419144b87433SJohn Marino }
419244b87433SJohn Marino
419344b87433SJohn Marino /* Add a new backreference entry to MCTX.
419444b87433SJohn Marino Note that we assume that caller never call this function with duplicate
419544b87433SJohn Marino entry, and call with STR_IDX which isn't smaller than any existing entry.
419644b87433SJohn Marino */
419744b87433SJohn Marino
419844b87433SJohn Marino static reg_errcode_t
4199*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
match_ctx_add_entry(re_match_context_t * mctx,Idx node,Idx str_idx,Idx from,Idx to)420044b87433SJohn Marino match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
420144b87433SJohn Marino Idx to)
420244b87433SJohn Marino {
420344b87433SJohn Marino if (mctx->nbkref_ents >= mctx->abkref_ents)
420444b87433SJohn Marino {
420544b87433SJohn Marino struct re_backref_cache_entry* new_entry;
420644b87433SJohn Marino new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
420744b87433SJohn Marino mctx->abkref_ents * 2);
4208*6ea1f93eSDaniel Fojt if (__glibc_unlikely (new_entry == NULL))
420944b87433SJohn Marino {
421044b87433SJohn Marino re_free (mctx->bkref_ents);
421144b87433SJohn Marino return REG_ESPACE;
421244b87433SJohn Marino }
421344b87433SJohn Marino mctx->bkref_ents = new_entry;
421444b87433SJohn Marino memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
421544b87433SJohn Marino sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
421644b87433SJohn Marino mctx->abkref_ents *= 2;
421744b87433SJohn Marino }
421844b87433SJohn Marino if (mctx->nbkref_ents > 0
421944b87433SJohn Marino && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
422044b87433SJohn Marino mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
422144b87433SJohn Marino
422244b87433SJohn Marino mctx->bkref_ents[mctx->nbkref_ents].node = node;
422344b87433SJohn Marino mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
422444b87433SJohn Marino mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
422544b87433SJohn Marino mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
422644b87433SJohn Marino
422744b87433SJohn Marino /* This is a cache that saves negative results of check_dst_limits_calc_pos.
422844b87433SJohn Marino If bit N is clear, means that this entry won't epsilon-transition to
422944b87433SJohn Marino an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
423044b87433SJohn Marino it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
423144b87433SJohn Marino such node.
423244b87433SJohn Marino
423344b87433SJohn Marino A backreference does not epsilon-transition unless it is empty, so set
423444b87433SJohn Marino to all zeros if FROM != TO. */
423544b87433SJohn Marino mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
423644b87433SJohn Marino = (from == to ? -1 : 0);
423744b87433SJohn Marino
423844b87433SJohn Marino mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
423944b87433SJohn Marino if (mctx->max_mb_elem_len < to - from)
424044b87433SJohn Marino mctx->max_mb_elem_len = to - from;
424144b87433SJohn Marino return REG_NOERROR;
424244b87433SJohn Marino }
424344b87433SJohn Marino
4244*6ea1f93eSDaniel Fojt /* Return the first entry with the same str_idx, or -1 if none is
424544b87433SJohn Marino found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
424644b87433SJohn Marino
424744b87433SJohn Marino static Idx
search_cur_bkref_entry(const re_match_context_t * mctx,Idx str_idx)424844b87433SJohn Marino search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
424944b87433SJohn Marino {
425044b87433SJohn Marino Idx left, right, mid, last;
425144b87433SJohn Marino last = right = mctx->nbkref_ents;
425244b87433SJohn Marino for (left = 0; left < right;)
425344b87433SJohn Marino {
425444b87433SJohn Marino mid = (left + right) / 2;
425544b87433SJohn Marino if (mctx->bkref_ents[mid].str_idx < str_idx)
425644b87433SJohn Marino left = mid + 1;
425744b87433SJohn Marino else
425844b87433SJohn Marino right = mid;
425944b87433SJohn Marino }
426044b87433SJohn Marino if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
426144b87433SJohn Marino return left;
426244b87433SJohn Marino else
4263*6ea1f93eSDaniel Fojt return -1;
426444b87433SJohn Marino }
426544b87433SJohn Marino
426644b87433SJohn Marino /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
426744b87433SJohn Marino at STR_IDX. */
426844b87433SJohn Marino
426944b87433SJohn Marino static reg_errcode_t
4270*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
match_ctx_add_subtop(re_match_context_t * mctx,Idx node,Idx str_idx)427144b87433SJohn Marino match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
427244b87433SJohn Marino {
427344b87433SJohn Marino #ifdef DEBUG
427444b87433SJohn Marino assert (mctx->sub_tops != NULL);
427544b87433SJohn Marino assert (mctx->asub_tops > 0);
427644b87433SJohn Marino #endif
4277*6ea1f93eSDaniel Fojt if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
427844b87433SJohn Marino {
427944b87433SJohn Marino Idx new_asub_tops = mctx->asub_tops * 2;
428044b87433SJohn Marino re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
428144b87433SJohn Marino re_sub_match_top_t *,
428244b87433SJohn Marino new_asub_tops);
4283*6ea1f93eSDaniel Fojt if (__glibc_unlikely (new_array == NULL))
428444b87433SJohn Marino return REG_ESPACE;
428544b87433SJohn Marino mctx->sub_tops = new_array;
428644b87433SJohn Marino mctx->asub_tops = new_asub_tops;
428744b87433SJohn Marino }
428844b87433SJohn Marino mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4289*6ea1f93eSDaniel Fojt if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
429044b87433SJohn Marino return REG_ESPACE;
429144b87433SJohn Marino mctx->sub_tops[mctx->nsub_tops]->node = node;
429244b87433SJohn Marino mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
429344b87433SJohn Marino return REG_NOERROR;
429444b87433SJohn Marino }
429544b87433SJohn Marino
429644b87433SJohn Marino /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
429744b87433SJohn Marino at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
429844b87433SJohn Marino
429944b87433SJohn Marino static re_sub_match_last_t *
match_ctx_add_sublast(re_sub_match_top_t * subtop,Idx node,Idx str_idx)430044b87433SJohn Marino match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
430144b87433SJohn Marino {
430244b87433SJohn Marino re_sub_match_last_t *new_entry;
4303*6ea1f93eSDaniel Fojt if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
430444b87433SJohn Marino {
430544b87433SJohn Marino Idx new_alasts = 2 * subtop->alasts + 1;
430644b87433SJohn Marino re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
430744b87433SJohn Marino re_sub_match_last_t *,
430844b87433SJohn Marino new_alasts);
4309*6ea1f93eSDaniel Fojt if (__glibc_unlikely (new_array == NULL))
431044b87433SJohn Marino return NULL;
431144b87433SJohn Marino subtop->lasts = new_array;
431244b87433SJohn Marino subtop->alasts = new_alasts;
431344b87433SJohn Marino }
431444b87433SJohn Marino new_entry = calloc (1, sizeof (re_sub_match_last_t));
4315*6ea1f93eSDaniel Fojt if (__glibc_likely (new_entry != NULL))
431644b87433SJohn Marino {
431744b87433SJohn Marino subtop->lasts[subtop->nlasts] = new_entry;
431844b87433SJohn Marino new_entry->node = node;
431944b87433SJohn Marino new_entry->str_idx = str_idx;
432044b87433SJohn Marino ++subtop->nlasts;
432144b87433SJohn Marino }
432244b87433SJohn Marino return new_entry;
432344b87433SJohn Marino }
432444b87433SJohn Marino
432544b87433SJohn Marino static void
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,Idx last_node,Idx last_str_idx)432644b87433SJohn Marino sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
432744b87433SJohn Marino re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
432844b87433SJohn Marino {
432944b87433SJohn Marino sctx->sifted_states = sifted_sts;
433044b87433SJohn Marino sctx->limited_states = limited_sts;
433144b87433SJohn Marino sctx->last_node = last_node;
433244b87433SJohn Marino sctx->last_str_idx = last_str_idx;
433344b87433SJohn Marino re_node_set_init_empty (&sctx->limits);
433444b87433SJohn Marino }
4335