1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n);
22 static void match_ctx_clean (re_match_context_t *mctx);
23 static void match_ctx_free (re_match_context_t *cache);
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to);
26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 Idx str_idx);
29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 Idx node, Idx str_idx);
31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 re_dfastate_t **limited_sts, Idx last_node,
33 Idx last_str_idx);
34 static reg_errcode_t re_search_internal (const regex_t *preg,
35 const char *string, Idx length,
36 Idx start, Idx last_start, Idx stop,
37 size_t nmatch, regmatch_t pmatch[],
38 int eflags);
39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 const char *string1, Idx length1,
41 const char *string2, Idx length2,
42 Idx start, regoff_t range,
43 struct re_registers *regs,
44 Idx stop, bool ret_len);
45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 const char *string, Idx length, Idx start,
47 regoff_t range, Idx stop,
48 struct re_registers *regs,
49 bool ret_len);
50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51 Idx nregs, int regs_allocated);
52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 Idx *p_match_first);
55 static Idx check_halt_state_context (const re_match_context_t *mctx,
56 const re_dfastate_t *state, Idx idx);
57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 regmatch_t *prev_idx_match, Idx cur_node,
59 Idx cur_idx, Idx nmatch);
60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 Idx str_idx, Idx dest_node, Idx nregs,
62 regmatch_t *regs, regmatch_t *prevregs,
63 re_node_set *eps_via_nodes);
64 static reg_errcode_t set_regs (const regex_t *preg,
65 const re_match_context_t *mctx,
66 size_t nmatch, regmatch_t *pmatch,
67 bool fl_backtrack);
68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69
70 static int sift_states_iter_mb (const re_match_context_t *mctx,
71 re_sift_context_t *sctx,
72 Idx node_idx, Idx str_idx, Idx max_str_idx);
73 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
74 re_sift_context_t *sctx);
75 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
76 re_sift_context_t *sctx, Idx str_idx,
77 re_node_set *cur_dest);
78 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 Idx str_idx,
81 re_node_set *dest_nodes);
82 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
83 re_node_set *dest_nodes,
84 const re_node_set *candidates);
85 static bool check_dst_limits (const re_match_context_t *mctx,
86 const re_node_set *limits,
87 Idx dst_node, Idx dst_idx, Idx src_node,
88 Idx src_idx);
89 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
90 int boundaries, Idx subexp_idx,
91 Idx from_node, Idx bkref_idx);
92 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
93 Idx limit, Idx subexp_idx,
94 Idx node, Idx str_idx,
95 Idx bkref_idx);
96 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
97 re_node_set *dest_nodes,
98 const re_node_set *candidates,
99 re_node_set *limits,
100 struct re_backref_cache_entry *bkref_ents,
101 Idx str_idx);
102 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
103 re_sift_context_t *sctx,
104 Idx str_idx, const re_node_set *candidates);
105 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
106 re_dfastate_t **dst,
107 re_dfastate_t **src, Idx num);
108 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
109 re_match_context_t *mctx);
110 static re_dfastate_t *transit_state (reg_errcode_t *err,
111 re_match_context_t *mctx,
112 re_dfastate_t *state);
113 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
114 re_match_context_t *mctx,
115 re_dfastate_t *next_state);
116 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
117 re_node_set *cur_nodes,
118 Idx str_idx);
119 #if 0
120 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
121 re_match_context_t *mctx,
122 re_dfastate_t *pstate);
123 #endif
124 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
125 re_dfastate_t *pstate);
126 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
127 const re_node_set *nodes);
128 static reg_errcode_t get_subexp (re_match_context_t *mctx,
129 Idx bkref_node, Idx bkref_str_idx);
130 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
131 const re_sub_match_top_t *sub_top,
132 re_sub_match_last_t *sub_last,
133 Idx bkref_node, Idx bkref_str);
134 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
135 Idx subexp_idx, int type);
136 static reg_errcode_t check_arrival (re_match_context_t *mctx,
137 state_array_t *path, Idx top_node,
138 Idx top_str, Idx last_node, Idx last_str,
139 int type);
140 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
141 Idx str_idx,
142 re_node_set *cur_nodes,
143 re_node_set *next_nodes);
144 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
145 re_node_set *cur_nodes,
146 Idx ex_subexp, int type);
147 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
148 re_node_set *dst_nodes,
149 Idx target, Idx ex_subexp,
150 int type);
151 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
152 re_node_set *cur_nodes, Idx cur_str,
153 Idx subexp_num, int type);
154 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
155 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
156 const re_string_t *input, Idx idx);
157 #ifdef _LIBC
158 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
159 size_t name_len);
160 #endif
161 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
162 const re_dfastate_t *state,
163 re_node_set *states_node,
164 bitset_t *states_ch);
165 static bool check_node_accept (const re_match_context_t *mctx,
166 const re_token_t *node, Idx idx);
167 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
168
169 /* Entry point for POSIX code. */
170
171 /* regexec searches for a given pattern, specified by PREG, in the
172 string STRING.
173
174 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
175 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
176 least NMATCH elements, and we set them to the offsets of the
177 corresponding matched substrings.
178
179 EFLAGS specifies "execution flags" which affect matching: if
180 REG_NOTBOL is set, then ^ does not match at the beginning of the
181 string; if REG_NOTEOL is set, then $ does not match at the end.
182
183 Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
184 EFLAGS is invalid. */
185
186 int
regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[_REGEX_NELTS (nmatch)],int eflags)187 regexec (const regex_t *__restrict preg, const char *__restrict string,
188 size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
189 {
190 reg_errcode_t err;
191 Idx start, length;
192 re_dfa_t *dfa = preg->buffer;
193
194 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
195 return REG_BADPAT;
196
197 if (eflags & REG_STARTEND)
198 {
199 start = pmatch[0].rm_so;
200 length = pmatch[0].rm_eo;
201 }
202 else
203 {
204 start = 0;
205 length = strlen (string);
206 }
207
208 lock_lock (dfa->lock);
209 if (preg->no_sub)
210 err = re_search_internal (preg, string, length, start, length,
211 length, 0, NULL, eflags);
212 else
213 err = re_search_internal (preg, string, length, start, length,
214 length, nmatch, pmatch, eflags);
215 lock_unlock (dfa->lock);
216 return err != REG_NOERROR;
217 }
218
#ifdef _LIBC
libc_hidden_def (__regexec)

# include <shlib-compat.h>
versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);

# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
__typeof__ (__regexec) __compat_regexec;

int
attribute_compat_text_section
__compat_regexec (const regex_t *__restrict preg,
		  const char *__restrict string, size_t nmatch,
		  regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
{
  /* The pre-2.3.4 interface did not know REG_STARTEND; keep only the
     flags the old callers could legitimately pass.  */
  int old_eflags = eflags & (REG_NOTBOL | REG_NOTEOL);
  return regexec (preg, string, nmatch, pmatch, old_eflags);
}
compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
# endif
#endif
240
241 /* Entry points for GNU code. */
242
243 /* re_match, re_search, re_match_2, re_search_2
244
245 The former two functions operate on STRING with length LENGTH,
   while the latter two operate on the concatenation of STRING1 and STRING2
247 with lengths LENGTH1 and LENGTH2, respectively.
248
249 re_match() matches the compiled pattern in BUFP against the string,
250 starting at index START.
251
252 re_search() first tries matching at index START, then it tries to match
253 starting from index START + 1, and so on. The last start position tried
254 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
255 way as re_match().)
256
257 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
258 the first STOP characters of the concatenation of the strings should be
259 concerned.
260
261 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
   and all groups are stored in REGS.  (For the "_2" variants, the offsets are
263 computed relative to the concatenation, not relative to the individual
264 strings.)
265
266 On success, re_match* functions return the length of the match, re_search*
267 return the position of the start of the match. They return -1 on
268 match failure, -2 on error. */
269
270 regoff_t
re_match(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,struct re_registers * regs)271 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
272 Idx start, struct re_registers *regs)
273 {
274 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
275 }
276 #ifdef _LIBC
weak_alias(__re_match,re_match)277 weak_alias (__re_match, re_match)
278 #endif
279
280 regoff_t
281 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
282 Idx start, regoff_t range, struct re_registers *regs)
283 {
284 return re_search_stub (bufp, string, length, start, range, length, regs,
285 false);
286 }
287 #ifdef _LIBC
weak_alias(__re_search,re_search)288 weak_alias (__re_search, re_search)
289 #endif
290
291 regoff_t
292 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
293 const char *string2, Idx length2, Idx start,
294 struct re_registers *regs, Idx stop)
295 {
296 return re_search_2_stub (bufp, string1, length1, string2, length2,
297 start, 0, regs, stop, true);
298 }
299 #ifdef _LIBC
weak_alias(__re_match_2,re_match_2)300 weak_alias (__re_match_2, re_match_2)
301 #endif
302
303 regoff_t
304 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
305 const char *string2, Idx length2, Idx start, regoff_t range,
306 struct re_registers *regs, Idx stop)
307 {
308 return re_search_2_stub (bufp, string1, length1, string2, length2,
309 start, range, regs, stop, false);
310 }
311 #ifdef _LIBC
weak_alias(__re_search_2,re_search_2)312 weak_alias (__re_search_2, re_search_2)
313 #endif
314
315 static regoff_t
316 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
317 Idx length1, const char *string2, Idx length2, Idx start,
318 regoff_t range, struct re_registers *regs,
319 Idx stop, bool ret_len)
320 {
321 const char *str;
322 regoff_t rval;
323 Idx len;
324 char *s = NULL;
325
326 if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
327 || INT_ADD_WRAPV (length1, length2, &len))))
328 return -2;
329
330 /* Concatenate the strings. */
331 if (length2 > 0)
332 if (length1 > 0)
333 {
334 s = re_malloc (char, len);
335
336 if (__glibc_unlikely (s == NULL))
337 return -2;
338 #ifdef _LIBC
339 memcpy (__mempcpy (s, string1, length1), string2, length2);
340 #else
341 memcpy (s, string1, length1);
342 memcpy (s + length1, string2, length2);
343 #endif
344 str = s;
345 }
346 else
347 str = string2;
348 else
349 str = string1;
350
351 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
352 ret_len);
353 re_free (s);
354 return rval;
355 }
356
357 /* The parameters have the same meaning as those of re_search.
358 Additional parameters:
359 If RET_LEN is true the length of the match is returned (re_match style);
360 otherwise the position of the match is returned. */
361
static regoff_t
re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
		Idx start, regoff_t range, Idx stop, struct re_registers *regs,
		bool ret_len)
{
  reg_errcode_t result;
  regmatch_t *pmatch;
  Idx nregs;
  regoff_t rval;
  int eflags = 0;
  re_dfa_t *dfa = bufp->buffer;
  Idx last_start = start + range;

  /* Check for out-of-range.  */
  if (__glibc_unlikely (start < 0 || start > length))
    return -1;
  /* Clamp LAST_START into [0, LENGTH].  The second disjunct of each test
     catches signed overflow of START + RANGE in either direction.  */
  if (__glibc_unlikely (length < last_start
			|| (0 <= range && last_start < start)))
    last_start = length;
  else if (__glibc_unlikely (last_start < 0
			     || (range < 0 && start <= last_start)))
    last_start = 0;

  lock_lock (dfa->lock);

  eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
  eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;

  /* Compile fastmap if we haven't yet.  */
  if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
    re_compile_fastmap (bufp);

  /* no_sub means the caller never receives register data.  */
  if (__glibc_unlikely (bufp->no_sub))
    regs = NULL;

  /* We need at least 1 register.  */
  if (regs == NULL)
    nregs = 1;
  else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
			     && regs->num_regs <= bufp->re_nsub))
    {
      /* Fixed-size register arrays too small for all subexpressions:
	 report only as many as fit.  */
      nregs = regs->num_regs;
      if (__glibc_unlikely (nregs < 1))
	{
	  /* Nothing can be copied to regs.  */
	  regs = NULL;
	  nregs = 1;
	}
    }
  else
    nregs = bufp->re_nsub + 1;
  pmatch = re_malloc (regmatch_t, nregs);
  if (__glibc_unlikely (pmatch == NULL))
    {
      rval = -2;
      goto out;
    }

  result = re_search_internal (bufp, string, length, start, last_start, stop,
			       nregs, pmatch, eflags);

  rval = 0;

  /* I hope we needn't fill their regs with -1's when no match was found.  */
  if (result != REG_NOERROR)
    rval = result == REG_NOMATCH ? -1 : -2;
  else if (regs != NULL)
    {
      /* If caller wants register contents data back, copy them.  */
      bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
					   bufp->regs_allocated);
      if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
	rval = -2;
    }

  if (__glibc_likely (rval == 0))
    {
      if (ret_len)
	{
	  /* re_match-style result: the length of the match.  */
	  DEBUG_ASSERT (pmatch[0].rm_so == start);
	  rval = pmatch[0].rm_eo - start;
	}
      else
	/* re_search-style result: the position where the match starts.  */
	rval = pmatch[0].rm_so;
    }
  re_free (pmatch);
 out:
  lock_unlock (dfa->lock);
  return rval;
}
452
453 static unsigned
re_copy_regs(struct re_registers * regs,regmatch_t * pmatch,Idx nregs,int regs_allocated)454 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
455 int regs_allocated)
456 {
457 int rval = REGS_REALLOCATE;
458 Idx i;
459 Idx need_regs = nregs + 1;
460 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
461 uses. */
462
463 /* Have the register data arrays been allocated? */
464 if (regs_allocated == REGS_UNALLOCATED)
465 { /* No. So allocate them with malloc. */
466 regs->start = re_malloc (regoff_t, need_regs);
467 if (__glibc_unlikely (regs->start == NULL))
468 return REGS_UNALLOCATED;
469 regs->end = re_malloc (regoff_t, need_regs);
470 if (__glibc_unlikely (regs->end == NULL))
471 {
472 re_free (regs->start);
473 return REGS_UNALLOCATED;
474 }
475 regs->num_regs = need_regs;
476 }
477 else if (regs_allocated == REGS_REALLOCATE)
478 { /* Yes. If we need more elements than were already
479 allocated, reallocate them. If we need fewer, just
480 leave it alone. */
481 if (__glibc_unlikely (need_regs > regs->num_regs))
482 {
483 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
484 regoff_t *new_end;
485 if (__glibc_unlikely (new_start == NULL))
486 return REGS_UNALLOCATED;
487 new_end = re_realloc (regs->end, regoff_t, need_regs);
488 if (__glibc_unlikely (new_end == NULL))
489 {
490 re_free (new_start);
491 return REGS_UNALLOCATED;
492 }
493 regs->start = new_start;
494 regs->end = new_end;
495 regs->num_regs = need_regs;
496 }
497 }
498 else
499 {
500 DEBUG_ASSERT (regs_allocated == REGS_FIXED);
501 /* This function may not be called with REGS_FIXED and nregs too big. */
502 DEBUG_ASSERT (nregs <= regs->num_regs);
503 rval = REGS_FIXED;
504 }
505
506 /* Copy the regs. */
507 for (i = 0; i < nregs; ++i)
508 {
509 regs->start[i] = pmatch[i].rm_so;
510 regs->end[i] = pmatch[i].rm_eo;
511 }
512 for ( ; i < regs->num_regs; ++i)
513 regs->start[i] = regs->end[i] = -1;
514
515 return rval;
516 }
517
518 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
519 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
520 this memory for recording register information. STARTS and ENDS
521 must be allocated using the malloc library routine, and must each
522 be at least NUM_REGS * sizeof (regoff_t) bytes long.
523
524 If NUM_REGS == 0, then subsequent matches should allocate their own
525 register data.
526
527 Unless this function is called, the first search or match using
528 PATTERN_BUFFER will allocate its own register data, without
529 freeing the old data. */
530
531 void
re_set_registers(struct re_pattern_buffer * bufp,struct re_registers * regs,__re_size_t num_regs,regoff_t * starts,regoff_t * ends)532 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
533 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
534 {
535 if (num_regs)
536 {
537 bufp->regs_allocated = REGS_REALLOCATE;
538 regs->num_regs = num_regs;
539 regs->start = starts;
540 regs->end = ends;
541 }
542 else
543 {
544 bufp->regs_allocated = REGS_UNALLOCATED;
545 regs->num_regs = 0;
546 regs->start = regs->end = NULL;
547 }
548 }
549 #ifdef _LIBC
weak_alias(__re_set_registers,re_set_registers)550 weak_alias (__re_set_registers, re_set_registers)
551 #endif
552
553 /* Entry points compatible with 4.2 BSD regex library. We don't define
554 them unless specifically requested. */
555
#if defined _REGEX_RE_COMP || defined _LIBC
int
# ifdef _LIBC
weak_function
# endif
re_exec (const char *s)
{
  /* Nonzero iff S matches the pattern previously compiled into
     re_comp_buf by re_comp.  */
  return regexec (&re_comp_buf, s, 0, NULL, 0) == 0;
}
#endif /* _REGEX_RE_COMP */
566
567 /* Internal entry point. */
568
569 /* Searches for a compiled pattern PREG in the string STRING, whose
570 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
571 meaning as with regexec. LAST_START is START + RANGE, where
572 START and RANGE have the same meaning as with re_search.
573 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
574 otherwise return the error code.
575 Note: We assume front end functions already check ranges.
576 (0 <= LAST_START && LAST_START <= LENGTH) */
577
static reg_errcode_t
__attribute_warn_unused_result__
re_search_internal (const regex_t *preg, const char *string, Idx length,
		    Idx start, Idx last_start, Idx stop, size_t nmatch,
		    regmatch_t pmatch[], int eflags)
{
  reg_errcode_t err;
  const re_dfa_t *dfa = preg->buffer;
  Idx left_lim, right_lim;
  int incr;
  bool fl_longest_match;
  int match_kind;
  Idx match_first;
  Idx match_last = -1;
  Idx extra_nmatch;
  bool sb;
  int ch;
  re_match_context_t mctx = { .dfa = dfa };
  /* Use the fastmap only when it is valid and the search actually scans
     over more than one start position; a pattern that can match the
     empty string makes the fastmap useless.  */
  char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
		    && start != last_start && !preg->can_be_null)
		   ? preg->fastmap : NULL);
  RE_TRANSLATE_TYPE t = preg->translate;

  /* Registers beyond those the pattern can set are reported as -1 at
     the end (see the EXTRA_NMATCH loop below).  */
  extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
  nmatch -= extra_nmatch;

  /* Check if the DFA hasn't been compiled.  */
  if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
			|| dfa->init_state_word == NULL
			|| dfa->init_state_nl == NULL
			|| dfa->init_state_begbuf == NULL))
    return REG_NOMATCH;

  /* We assume front-end functions already check them.  */
  DEBUG_ASSERT (0 <= last_start && last_start <= length);

  /* If initial states with non-begbuf contexts have no elements,
     the regex must be anchored.  If preg->newline_anchor is set,
     we'll never use init_state_nl, so do not check it.  */
  if (dfa->init_state->nodes.nelem == 0
      && dfa->init_state_word->nodes.nelem == 0
      && (dfa->init_state_nl->nodes.nelem == 0
	  || !preg->newline_anchor))
    {
      /* An anchored pattern can only match at position 0.  */
      if (start != 0 && last_start != 0)
	return REG_NOMATCH;
      start = last_start = 0;
    }

  /* We must check the longest matching, if nmatch > 0.  */
  fl_longest_match = (nmatch != 0 || dfa->nbackref);

  err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
			    preg->translate, (preg->syntax & RE_ICASE) != 0,
			    dfa);
  if (__glibc_unlikely (err != REG_NOERROR))
    goto free_return;
  mctx.input.stop = stop;
  mctx.input.raw_stop = stop;
  mctx.input.newline_anchor = preg->newline_anchor;

  err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
  if (__glibc_unlikely (err != REG_NOERROR))
    goto free_return;

  /* We will log all the DFA states through which the dfa passes,
     if nmatch > 1, or this dfa has "multibyte node", which is a
     back-reference or a node which can accept multibyte character or
     multi character collating element.  */
  if (nmatch > 1 || dfa->has_mb_node)
    {
      /* Avoid overflow.  */
      if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
			     <= mctx.input.bufs_len)))
	{
	  err = REG_ESPACE;
	  goto free_return;
	}

      mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
      if (__glibc_unlikely (mctx.state_log == NULL))
	{
	  err = REG_ESPACE;
	  goto free_return;
	}
    }

  match_first = start;
  mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;

  /* Check incrementally whether the input string matches.  */
  incr = (last_start < start) ? -1 : 1;
  left_lim = (last_start < start) ? last_start : start;
  right_lim = (last_start < start) ? start : last_start;
  sb = dfa->mb_cur_max == 1;
  /* Encode the scanning strategy in a small value: bit 2 means the
     fastmap can be indexed with raw bytes, bit 1 means forward scan,
     bit 0 means a translation table is in effect; 8 means no fastmap.
     Only the common combinations get specialized cases below.  */
  match_kind =
    (fastmap
     ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
	| (start <= last_start ? 2 : 0)
	| (t != NULL ? 1 : 0))
     : 8);

  for (;; match_first += incr)
    {
      err = REG_NOMATCH;
      if (match_first < left_lim || right_lim < match_first)
	goto free_return;

      /* Advance as rapidly as possible through the string, until we
	 find a plausible place to start matching.  This may be done
	 with varying efficiency, so there are various possibilities:
	 only the most common of them are specialized, in order to
	 save on code size.  We use a switch statement for speed.  */
      switch (match_kind)
	{
	case 8:
	  /* No fastmap.  */
	  break;

	case 7:
	  /* Fastmap with single-byte translation, match forward.  */
	  while (__glibc_likely (match_first < right_lim)
		 && !fastmap[t[(unsigned char) string[match_first]]])
	    ++match_first;
	  goto forward_match_found_start_or_reached_end;

	case 6:
	  /* Fastmap without translation, match forward.  */
	  while (__glibc_likely (match_first < right_lim)
		 && !fastmap[(unsigned char) string[match_first]])
	    ++match_first;

	forward_match_found_start_or_reached_end:
	  if (__glibc_unlikely (match_first == right_lim))
	    {
	      /* Reached the end of the scan window; the position is
		 still viable only if the fastmap accepts the byte
		 there ('\0' past the end of the string).  */
	      ch = match_first >= length
		   ? 0 : (unsigned char) string[match_first];
	      if (!fastmap[t ? t[ch] : ch])
		goto free_return;
	    }
	  break;

	case 4:
	case 5:
	  /* Fastmap without multi-byte translation, match backwards.  */
	  while (match_first >= left_lim)
	    {
	      ch = match_first >= length
		   ? 0 : (unsigned char) string[match_first];
	      if (fastmap[t ? t[ch] : ch])
		break;
	      --match_first;
	    }
	  if (match_first < left_lim)
	    goto free_return;
	  break;

	default:
	  /* In this case, we can't determine easily the current byte,
	     since it might be a component byte of a multibyte
	     character.  Then we use the constructed buffer instead.  */
	  for (;;)
	    {
	      /* If MATCH_FIRST is out of the valid range, reconstruct the
		 buffers.  */
	      __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
	      if (__glibc_unlikely (offset
				    >= (__re_size_t) mctx.input.valid_raw_len))
		{
		  err = re_string_reconstruct (&mctx.input, match_first,
					       eflags);
		  if (__glibc_unlikely (err != REG_NOERROR))
		    goto free_return;

		  offset = match_first - mctx.input.raw_mbs_idx;
		}
	      /* Use buffer byte if OFFSET is in buffer, otherwise '\0'.  */
	      ch = (offset < mctx.input.valid_len
		    ? re_string_byte_at (&mctx.input, offset) : 0);
	      if (fastmap[ch])
		break;
	      match_first += incr;
	      if (match_first < left_lim || match_first > right_lim)
		{
		  err = REG_NOMATCH;
		  goto free_return;
		}
	    }
	  break;
	}

      /* Reconstruct the buffers so that the matcher can assume that
	 the matching starts from the beginning of the buffer.  */
      err = re_string_reconstruct (&mctx.input, match_first, eflags);
      if (__glibc_unlikely (err != REG_NOERROR))
	goto free_return;

      /* Don't consider this char as a possible match start if it is part,
	 yet isn't the head, of a multibyte character.  */
      if (!sb && !re_string_first_byte (&mctx.input, 0))
	continue;

      /* It seems to be appropriate one, then use the matcher.  */
      /* We assume that the matching starts from 0.  */
      mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
      match_last = check_matching (&mctx, fl_longest_match,
				   start <= last_start ? &match_first : NULL);
      if (match_last != -1)
	{
	  if (__glibc_unlikely (match_last == -2))
	    {
	      err = REG_ESPACE;
	      goto free_return;
	    }
	  else
	    {
	      mctx.match_last = match_last;
	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
		{
		  re_dfastate_t *pstate = mctx.state_log[match_last];
		  mctx.last_node = check_halt_state_context (&mctx, pstate,
							     match_last);
		}
	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
		  || dfa->nbackref)
		{
		  /* The candidate match may be invalidated by
		     back-reference or plural-match constraints; verify
		     it, retrying the outer loop on REG_NOMATCH.  */
		  err = prune_impossible_nodes (&mctx);
		  if (err == REG_NOERROR)
		    break;
		  if (__glibc_unlikely (err != REG_NOMATCH))
		    goto free_return;
		  match_last = -1;
		}
	      else
		break; /* We found a match.  */
	    }
	}

      match_ctx_clean (&mctx);
    }

  DEBUG_ASSERT (match_last != -1);
  DEBUG_ASSERT (err == REG_NOERROR);

  /* Set pmatch[] if we need.  */
  if (nmatch > 0)
    {
      Idx reg_idx;

      /* Initialize registers.  */
      for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;

      /* Set the points where matching start/end.  */
      pmatch[0].rm_so = 0;
      pmatch[0].rm_eo = mctx.match_last;
      /* FIXME: This function should fail if mctx.match_last exceeds
	 the maximum possible regoff_t value.  We need a new error
	 code REG_OVERFLOW.  */

      if (!preg->no_sub && nmatch > 1)
	{
	  err = set_regs (preg, &mctx, nmatch, pmatch,
			  dfa->has_plural_match && dfa->nbackref > 0);
	  if (__glibc_unlikely (err != REG_NOERROR))
	    goto free_return;
	}

      /* At last, add the offset to each register, since we slid
	 the buffers so that we could assume that the matching starts
	 from 0.  */
      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
	if (pmatch[reg_idx].rm_so != -1)
	  {
	    /* When the internal buffer differs in length from the raw
	       input (offsets_needed), map offsets back to raw-input
	       positions first.  */
	    if (__glibc_unlikely (mctx.input.offsets_needed != 0))
	      {
		pmatch[reg_idx].rm_so =
		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
		   ? mctx.input.valid_raw_len
		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
		pmatch[reg_idx].rm_eo =
		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
		   ? mctx.input.valid_raw_len
		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
	      }
	    pmatch[reg_idx].rm_so += match_first;
	    pmatch[reg_idx].rm_eo += match_first;
	  }
      /* Registers the pattern cannot set are reported unset.  */
      for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
	{
	  pmatch[nmatch + reg_idx].rm_so = -1;
	  pmatch[nmatch + reg_idx].rm_eo = -1;
	}

      /* Copy offsets between equivalent subexpressions, per the
	 compiler-built subexp_map.  */
      if (dfa->subexp_map)
	for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
	  if (dfa->subexp_map[reg_idx] != reg_idx)
	    {
	      pmatch[reg_idx + 1].rm_so
		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
	      pmatch[reg_idx + 1].rm_eo
		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
	    }
    }

 free_return:
  re_free (mctx.state_log);
  if (dfa->nbackref)
    match_ctx_free (&mctx);
  re_string_destruct (&mctx.input);
  return err;
}
891
/* Re-examine the match recorded in MCTX by sifting the logged DFA
   states backward from the match end, discarding states that cannot
   take part in a valid match (used when the pattern has plural matches
   or back-references).  May shorten MCTX->match_last.  Returns
   REG_NOMATCH if no match survives, REG_ESPACE on allocation failure,
   REG_NOERROR otherwise.  */
static reg_errcode_t
__attribute_warn_unused_result__
prune_impossible_nodes (re_match_context_t *mctx)
{
  const re_dfa_t *const dfa = mctx->dfa;
  Idx halt_node, match_last;
  reg_errcode_t ret;
  re_dfastate_t **sifted_states;
  re_dfastate_t **lim_states = NULL;
  re_sift_context_t sctx;
  DEBUG_ASSERT (mctx->state_log != NULL);
  match_last = mctx->match_last;
  halt_node = mctx->last_node;

  /* Avoid overflow.  */
  if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
			<= match_last))
    return REG_ESPACE;

  sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
  if (__glibc_unlikely (sifted_states == NULL))
    {
      ret = REG_ESPACE;
      goto free_return;
    }
  if (dfa->nbackref)
    {
      lim_states = re_malloc (re_dfastate_t *, match_last + 1);
      if (__glibc_unlikely (lim_states == NULL))
	{
	  ret = REG_ESPACE;
	  goto free_return;
	}
      /* Retry with progressively shorter candidate matches until the
	 backward sift reaches the start of the buffer.  */
      while (1)
	{
	  memset (lim_states, '\0',
		  sizeof (re_dfastate_t *) * (match_last + 1));
	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
			 match_last);
	  ret = sift_states_backward (mctx, &sctx);
	  re_node_set_free (&sctx.limits);
	  if (__glibc_unlikely (ret != REG_NOERROR))
	    goto free_return;
	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
	    break;
	  /* This candidate failed: back up to the previous halting
	     state in the log; no match if none remains.  */
	  do
	    {
	      --match_last;
	      if (match_last < 0)
		{
		  ret = REG_NOMATCH;
		  goto free_return;
		}
	    } while (mctx->state_log[match_last] == NULL
		     || !mctx->state_log[match_last]->halt);
	  halt_node = check_halt_state_context (mctx,
						mctx->state_log[match_last],
						match_last);
	}
      ret = merge_state_array (dfa, sifted_states, lim_states,
			       match_last + 1);
      re_free (lim_states);
      lim_states = NULL;
      if (__glibc_unlikely (ret != REG_NOERROR))
	goto free_return;
    }
  else
    {
      /* No back-references: a single backward sift suffices.  */
      sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
      ret = sift_states_backward (mctx, &sctx);
      re_node_set_free (&sctx.limits);
      if (__glibc_unlikely (ret != REG_NOERROR))
	goto free_return;
      if (sifted_states[0] == NULL)
	{
	  ret = REG_NOMATCH;
	  goto free_return;
	}
    }
  /* Install the sifted states as the new state log; ownership moves to
     MCTX, so clear the local pointer before the cleanup frees.  */
  re_free (mctx->state_log);
  mctx->state_log = sifted_states;
  sifted_states = NULL;
  mctx->last_node = halt_node;
  mctx->match_last = match_last;
  ret = REG_NOERROR;
 free_return:
  re_free (sifted_states);
  re_free (lim_states);
  return ret;
}
982
/* Acquire an initial state and return it.
   We must select the appropriate initial state depending on the context
   at IDX, since initial states may have constraints like "\<", "^", etc.
   On allocation failure, *ERR is set and NULL is returned.  */

static __always_inline re_dfastate_t *
acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
                            Idx idx)
{
  const re_dfa_t *const dfa = mctx->dfa;
  if (dfa->init_state->has_constraint)
    {
      unsigned int context;
      /* The context is determined by the character *before* IDX.  */
      context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
      if (IS_WORD_CONTEXT (context))
        return dfa->init_state_word;
      else if (IS_ORDINARY_CONTEXT (context))
        return dfa->init_state;
      else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
        return dfa->init_state_begbuf;
      else if (IS_NEWLINE_CONTEXT (context))
        return dfa->init_state_nl;
      else if (IS_BEGBUF_CONTEXT (context))
        {
          /* It is a relatively rare case, so calculate on demand.  */
          return re_acquire_state_context (err, dfa,
                                           dfa->init_state->entrance_nodes,
                                           context);
        }
      else
        /* Must not happen?  */
        return dfa->init_state;
    }
  else
    return dfa->init_state;
}
1018
/* Check whether the regular expression matches the input string, and
   return the index where the match ends.  Return -1 if there is no
   match, and -2 in case of an error.
   FL_LONGEST_MATCH means we want the POSIX longest matching.
   If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
   next place where we may want to try matching.
   Note that the matcher assumes that the matching starts from the current
   index of the buffer.  */

static Idx
__attribute_warn_unused_result__
check_matching (re_match_context_t *mctx, bool fl_longest_match,
                Idx *p_match_first)
{
  const re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err;
  Idx match = 0;                /* Nonzero once any halt state was reached.  */
  Idx match_last = -1;          /* End index of the best match so far.  */
  Idx cur_str_idx = re_string_cur_idx (&mctx->input);
  re_dfastate_t *cur_state;
  /* While true, the automaton has not left its initial state, so the
     caller's retry position (*P_MATCH_FIRST) may still be advanced.  */
  bool at_init_state = p_match_first != NULL;
  Idx next_start_idx = cur_str_idx;

  err = REG_NOERROR;
  cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
  /* An initial state must not be NULL (invalid).  */
  if (__glibc_unlikely (cur_state == NULL))
    {
      DEBUG_ASSERT (err == REG_ESPACE);
      return -2;
    }

  if (mctx->state_log != NULL)
    {
      mctx->state_log[cur_str_idx] = cur_state;

      /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
         later.  E.g. Processing back references.  */
      if (__glibc_unlikely (dfa->nbackref))
        {
          at_init_state = false;
          err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
          if (__glibc_unlikely (err != REG_NOERROR))
            /* NOTE(review): this returns a reg_errcode_t (positive) from a
               function whose error convention is -2 — confirm callers cope.  */
            return err;

          if (cur_state->has_backref)
            {
              err = transit_state_bkref (mctx, &cur_state->nodes);
              if (__glibc_unlikely (err != REG_NOERROR))
                return err;
            }
        }
    }

  /* If the RE accepts NULL string.  */
  if (__glibc_unlikely (cur_state->halt))
    {
      if (!cur_state->has_constraint
          || check_halt_state_context (mctx, cur_state, cur_str_idx))
        {
          if (!fl_longest_match)
            return cur_str_idx;
          else
            {
              match_last = cur_str_idx;
              match = 1;
            }
        }
    }

  /* Main loop: advance one character (or multibyte sequence) at a time
     until end of input or an unrecoverable dead state.  */
  while (!re_string_eoi (&mctx->input))
    {
      re_dfastate_t *old_state = cur_state;
      Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;

      if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
           && mctx->input.bufs_len < mctx->input.len)
          || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
              && mctx->input.valid_len < mctx->input.len))
        {
          err = extend_buffers (mctx, next_char_idx + 1);
          if (__glibc_unlikely (err != REG_NOERROR))
            {
              DEBUG_ASSERT (err == REG_ESPACE);
              return -2;
            }
        }

      cur_state = transit_state (&err, mctx, cur_state);
      if (mctx->state_log != NULL)
        cur_state = merge_state_with_log (&err, mctx, cur_state);

      if (cur_state == NULL)
        {
          /* Reached the invalid state or an error.  Try to recover a valid
             state using the state log, if available and if we have not
             already found a valid (even if not the longest) match.  */
          if (__glibc_unlikely (err != REG_NOERROR))
            return -2;

          if (mctx->state_log == NULL
              || (match && !fl_longest_match)
              || (cur_state = find_recover_state (&err, mctx)) == NULL)
            break;
        }

      if (__glibc_unlikely (at_init_state))
        {
          /* Still spinning in the initial state: the caller may skip
             straight past this character on the next attempt.  */
          if (old_state == cur_state)
            next_start_idx = next_char_idx;
          else
            at_init_state = false;
        }

      if (cur_state->halt)
        {
          /* Reached a halt state.
             Check the halt state can satisfy the current context.  */
          if (!cur_state->has_constraint
              || check_halt_state_context (mctx, cur_state,
                                           re_string_cur_idx (&mctx->input)))
            {
              /* We found an appropriate halt state.  */
              match_last = re_string_cur_idx (&mctx->input);
              match = 1;

              /* We found a match, do not modify match_first below.  */
              p_match_first = NULL;
              if (!fl_longest_match)
                break;
            }
        }
    }

  if (p_match_first)
    *p_match_first += next_start_idx;

  return match_last;
}
1158
1159 /* Check NODE match the current context. */
1160
1161 static bool
check_halt_node_context(const re_dfa_t * dfa,Idx node,unsigned int context)1162 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1163 {
1164 re_token_type_t type = dfa->nodes[node].type;
1165 unsigned int constraint = dfa->nodes[node].constraint;
1166 if (type != END_OF_RE)
1167 return false;
1168 if (!constraint)
1169 return true;
1170 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1171 return false;
1172 return true;
1173 }
1174
/* Check whether the halt state STATE matches the current context at
   string index IDX.  Return 0 if no node matches; otherwise return the
   first halt node in STATE that satisfies the context.  (Node 0 is never
   a halt node, so 0 is unambiguous as "no match".)  */

static Idx
check_halt_state_context (const re_match_context_t *mctx,
                          const re_dfastate_t *state, Idx idx)
{
  Idx i;
  unsigned int context;
  DEBUG_ASSERT (state->halt);
  context = re_string_context_at (&mctx->input, idx, mctx->eflags);
  for (i = 0; i < state->nodes.nelem; ++i)
    if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
      return state->nodes.elems[i];
  return 0;
}
1192
/* Compute the next node to which "NFA" transits from NODE ("NFA" is a NFA
   corresponding to the DFA).
   Advance *PIDX past any characters the transition consumes, and update
   EPS_VIA_NODES with epsilon nodes traversed.  If FS is non-NULL,
   alternative epsilon destinations are pushed on the fail stack for
   backtracking.
   Return the destination node, -1 on match failure, -2 on error.  */

static Idx
proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
                   regmatch_t *prevregs,
                   Idx *pidx, Idx node, re_node_set *eps_via_nodes,
                   struct re_fail_stack_t *fs)
{
  const re_dfa_t *const dfa = mctx->dfa;
  if (IS_EPSILON_NODE (dfa->nodes[node].type))
    {
      /* Epsilon transition: pick a destination that is present in the
         state log at the current string index.  */
      re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
      re_node_set *edests = &dfa->edests[node];

      if (! re_node_set_contains (eps_via_nodes, node))
        {
          bool ok = re_node_set_insert (eps_via_nodes, node);
          if (__glibc_unlikely (! ok))
            return -2;
        }

      /* Pick a valid destination, or return -1 if none is found.  */
      Idx dest_node = -1;
      for (Idx i = 0; i < edests->nelem; i++)
        {
          Idx candidate = edests->elems[i];
          if (!re_node_set_contains (cur_nodes, candidate))
            continue;
          if (dest_node == -1)
            dest_node = candidate;

          else
            {
              /* In order to avoid infinite loop like "(a*)*", return the second
                 epsilon-transition if the first was already considered.  */
              if (re_node_set_contains (eps_via_nodes, dest_node))
                return candidate;

              /* Otherwise, push the second epsilon-transition on the fail stack.  */
              else if (fs != NULL
                       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
                                           prevregs, eps_via_nodes))
                return -2;

              /* We know we are going to exit.  */
              break;
            }
        }
      return dest_node;
    }
  else
    {
      /* Non-epsilon node: it must consume input (or, for OP_BACK_REF,
         possibly the empty string).  */
      Idx naccepted = 0;
      re_token_type_t type = dfa->nodes[node].type;

      if (dfa->nodes[node].accept_mb)
        naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
      else if (type == OP_BACK_REF)
        {
          /* Length of the referenced group's previous match.  */
          Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
          if (subexp_idx < nregs)
            naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
          if (fs != NULL)
            {
              if (subexp_idx >= nregs
                  || regs[subexp_idx].rm_so == -1
                  || regs[subexp_idx].rm_eo == -1)
                return -1;
              else if (naccepted)
                {
                  /* Verify the input actually repeats the group text.  */
                  char *buf = (char *) re_string_get_buffer (&mctx->input);
                  if (mctx->input.valid_len - *pidx < naccepted
                      || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
                                  naccepted)
                          != 0))
                    return -1;
                }
            }

          if (naccepted == 0)
            {
              /* Empty back reference behaves like an epsilon transition.  */
              Idx dest_node;
              bool ok = re_node_set_insert (eps_via_nodes, node);
              if (__glibc_unlikely (! ok))
                return -2;
              dest_node = dfa->edests[node].elems[0];
              if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
                                        dest_node))
                return dest_node;
            }
        }

      if (naccepted != 0
          || check_node_accept (mctx, dfa->nodes + node, *pidx))
        {
          Idx dest_node = dfa->nexts[node];
          *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
          if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
                     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
                                               dest_node)))
            return -1;
          /* A character was consumed; the epsilon loop guard resets.  */
          re_node_set_empty (eps_via_nodes);
          return dest_node;
        }
    }
  return -1;
}
1303
/* Push a backtracking entry on fail stack FS: the string index STR_IDX,
   the alternative destination DEST_NODE, copies of the NREGS current and
   previous register arrays (stored back to back in one allocation), and a
   copy of EPS_VIA_NODES.  Grow the stack as needed.
   Return REG_NOERROR on success or REG_ESPACE on allocation failure.  */

static reg_errcode_t
__attribute_warn_unused_result__
push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
                 Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
                 re_node_set *eps_via_nodes)
{
  reg_errcode_t err;
  Idx num = fs->num++;
  if (fs->num == fs->alloc)
    {
      struct re_fail_stack_ent_t *new_array;
      new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
                              fs->alloc * 2);
      if (new_array == NULL)
        return REG_ESPACE;
      fs->alloc *= 2;
      fs->stack = new_array;
    }
  fs->stack[num].idx = str_idx;
  fs->stack[num].node = dest_node;
  /* One buffer holds both register arrays: REGS, then PREVREGS.  */
  fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
  if (fs->stack[num].regs == NULL)
    return REG_ESPACE;
  memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
  memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
  err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
  return err;
}
1332
/* Pop the top entry of fail stack FS, restoring *PIDX and the two NREGS
   register arrays REGS/PREVREGS saved by push_fail_stack.  Ownership of
   the saved eps_via_nodes set transfers to *EPS_VIA_NODES (the previous
   set is freed).  Return the saved node, or -1 if the stack is empty.  */

static Idx
pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
                regmatch_t *regs, regmatch_t *prevregs,
                re_node_set *eps_via_nodes)
{
  if (fs == NULL || fs->num == 0)
    return -1;
  Idx num = --fs->num;
  *pidx = fs->stack[num].idx;
  memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
  memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
  re_node_set_free (eps_via_nodes);
  re_free (fs->stack[num].regs);
  /* Shallow transfer: the popped entry's set is now owned by the caller.  */
  *eps_via_nodes = fs->stack[num].eps_via_nodes;
  DEBUG_ASSERT (0 <= fs->stack[num].node);
  return fs->stack[num].node;
}
1350
1351
1352 #define DYNARRAY_STRUCT regmatch_list
1353 #define DYNARRAY_ELEMENT regmatch_t
1354 #define DYNARRAY_PREFIX regmatch_list_
1355 #include <malloc/dynarray-skeleton.c>
1356
/* Set the positions where the subexpressions start/end in registers
   PMATCH by replaying a path through the state log from pmatch[0].rm_so
   to pmatch[0].rm_eo.  If FL_BACKTRACK, use a fail stack to explore
   alternative epsilon transitions.
   Note: We assume that pmatch[0] is already set, and
   pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.
   Return REG_NOERROR, REG_NOMATCH, or REG_ESPACE.  */

static reg_errcode_t
__attribute_warn_unused_result__
set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
          regmatch_t *pmatch, bool fl_backtrack)
{
  const re_dfa_t *dfa = preg->buffer;
  Idx idx, cur_node;
  re_node_set eps_via_nodes;
  struct re_fail_stack_t *fs;
  struct re_fail_stack_t fs_body = { 0, 2, NULL };
  /* Snapshot of PMATCH at the previous index, used by update_regs to
     undo empty matches of optional subexpressions.  */
  struct regmatch_list prev_match;
  regmatch_list_init (&prev_match);

  DEBUG_ASSERT (nmatch > 1);
  DEBUG_ASSERT (mctx->state_log != NULL);
  if (fl_backtrack)
    {
      fs = &fs_body;
      fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
      if (fs->stack == NULL)
        return REG_ESPACE;
    }
  else
    fs = NULL;

  cur_node = dfa->init_node;
  re_node_set_init_empty (&eps_via_nodes);

  if (!regmatch_list_resize (&prev_match, nmatch))
    {
      regmatch_list_free (&prev_match);
      free_fail_stack_return (fs);
      return REG_ESPACE;
    }
  regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
  memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);

  for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
    {
      update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);

      if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
          || (fs && re_node_set_contains (&eps_via_nodes, cur_node)))
        {
          Idx reg_idx;
          cur_node = -1;
          if (fs)
            {
              /* If any group was opened but never closed, the path is
                 inconsistent: backtrack instead of accepting.  */
              for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
                if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
                  {
                    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
                                               prev_idx_match, &eps_via_nodes);
                    break;
                  }
            }
          if (cur_node < 0)
            {
              /* Accept: reached the last node at the match end.  */
              re_node_set_free (&eps_via_nodes);
              regmatch_list_free (&prev_match);
              return free_fail_stack_return (fs);
            }
        }

      /* Proceed to next node.  */
      cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
                                    &idx, cur_node,
                                    &eps_via_nodes, fs);

      if (__glibc_unlikely (cur_node < 0))
        {
          if (__glibc_unlikely (cur_node == -2))
            {
              re_node_set_free (&eps_via_nodes);
              regmatch_list_free (&prev_match);
              free_fail_stack_return (fs);
              return REG_ESPACE;
            }
          /* Dead end: backtrack to the most recent alternative.  */
          cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
                                     prev_idx_match, &eps_via_nodes);
          if (cur_node < 0)
            {
              re_node_set_free (&eps_via_nodes);
              regmatch_list_free (&prev_match);
              free_fail_stack_return (fs);
              return REG_NOMATCH;
            }
        }
    }
  re_node_set_free (&eps_via_nodes);
  regmatch_list_free (&prev_match);
  return free_fail_stack_return (fs);
}
1455
1456 static reg_errcode_t
free_fail_stack_return(struct re_fail_stack_t * fs)1457 free_fail_stack_return (struct re_fail_stack_t *fs)
1458 {
1459 if (fs)
1460 {
1461 Idx fs_idx;
1462 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1463 {
1464 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1465 re_free (fs->stack[fs_idx].regs);
1466 }
1467 re_free (fs->stack);
1468 }
1469 return REG_NOERROR;
1470 }
1471
/* Update the register arrays while replaying the match path: if CUR_NODE
   opens subexpression N, record CUR_IDX as its start; if it closes
   subexpression N, record CUR_IDX as its end.  PREV_IDX_MATCH holds the
   registers as of the last non-empty close and is used to undo empty
   matches of optional subexpressions (e.g. "(a?)*").  */

static void
update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
             regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
{
  int type = dfa->nodes[cur_node].type;
  if (type == OP_OPEN_SUBEXP)
    {
      /* Subexpression K maps to pmatch[K + 1].  */
      Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;

      /* We are at the first node of this sub expression.  */
      if (reg_num < nmatch)
        {
          pmatch[reg_num].rm_so = cur_idx;
          /* Mark "open but not yet closed".  */
          pmatch[reg_num].rm_eo = -1;
        }
    }
  else if (type == OP_CLOSE_SUBEXP)
    {
      /* We are at the last node of this sub expression.  */
      Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
      if (reg_num < nmatch)
        {
          if (pmatch[reg_num].rm_so < cur_idx)
            {
              pmatch[reg_num].rm_eo = cur_idx;
              /* This is a non-empty match or we are not inside an optional
                 subexpression.  Accept this right away.  */
              memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
            }
          else
            {
              if (dfa->nodes[cur_node].opt_subexp
                  && prev_idx_match[reg_num].rm_so != -1)
                /* We transited through an empty match for an optional
                   subexpression, like (a?)*, and this is not the subexp's
                   first match.  Copy back the old content of the registers
                   so that matches of an inner subexpression are undone as
                   well, like in ((a?))*.  */
                memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
              else
                /* We completed a subexpression, but it may be part of
                   an optional one, so do not update PREV_IDX_MATCH.  */
                pmatch[reg_num].rm_eo = cur_idx;
            }
        }
    }
}
1519
1520 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1521 and sift the nodes in each states according to the following rules.
1522 Updated state_log will be wrote to STATE_LOG.
1523
1524 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1525 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1526 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1527 the LAST_NODE, we throw away the node 'a'.
1528 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1529 string 's' and transit to 'b':
1530 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1531 away the node 'a'.
1532 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1533 thrown away, we throw away the node 'a'.
1534 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1535 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1536 node 'a'.
1537 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1538 we throw away the node 'a'. */
1539
1540 #define STATE_NODE_CONTAINS(state,node) \
1541 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1542
/* Walk the state log backward from SCTX->last_str_idx to 0, writing the
   sifted states (see the rules in the comment above) into
   SCTX->sifted_states.  Bail out early once more than max_mb_elem_len
   consecutive sifted states are empty, since no multibyte character
   could bridge a longer gap.  Return REG_NOERROR or REG_ESPACE.  */

static reg_errcode_t
sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
{
  reg_errcode_t err;
  int null_cnt = 0;             /* Consecutive NULL sifted states seen.  */
  Idx str_idx = sctx->last_str_idx;
  re_node_set cur_dest;

  DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);

  /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
     transit to the last_node and the last_node itself.  */
  err = re_node_set_init_1 (&cur_dest, sctx->last_node);
  if (__glibc_unlikely (err != REG_NOERROR))
    return err;
  err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
  if (__glibc_unlikely (err != REG_NOERROR))
    goto free_return;

  /* Then check each states in the state_log.  */
  while (str_idx > 0)
    {
      /* Update counters.  */
      null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
      if (null_cnt > mctx->max_mb_elem_len)
        {
          /* No transition can span this many empty states; the rest of
             the sifted log is all NULL.  */
          memset (sctx->sifted_states, '\0',
                  sizeof (re_dfastate_t *) * str_idx);
          re_node_set_free (&cur_dest);
          return REG_NOERROR;
        }
      re_node_set_empty (&cur_dest);
      --str_idx;

      if (mctx->state_log[str_idx])
        {
          err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
          if (__glibc_unlikely (err != REG_NOERROR))
            goto free_return;
        }

      /* Add all the nodes which satisfy the following conditions:
         - It can epsilon transit to a node in CUR_DEST.
         - It is in CUR_SRC.
         And update state_log.  */
      err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
      if (__glibc_unlikely (err != REG_NOERROR))
        goto free_return;
    }
  err = REG_NOERROR;
 free_return:
  re_node_set_free (&cur_dest);
  return err;
}
1597
/* Collect into CUR_DEST the non-epsilon nodes of state_log[STR_IDX] that
   accept the input at STR_IDX and whose successor survives in the
   already-sifted state at the landing index (subject to the limits in
   SCTX).  Return REG_NOERROR or REG_ESPACE.  */

static reg_errcode_t
__attribute_warn_unused_result__
build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
                     Idx str_idx, re_node_set *cur_dest)
{
  const re_dfa_t *const dfa = mctx->dfa;
  const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
  Idx i;

  /* Then build the next sifted state.
     We build the next sifted state on 'cur_dest', and update
     'sifted_states[str_idx]' with 'cur_dest'.
     Note:
     'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
     'cur_src' points the node_set of the old 'state_log[str_idx]'
     (with the epsilon nodes pre-filtered out).  */
  for (i = 0; i < cur_src->nelem; i++)
    {
      Idx prev_node = cur_src->elems[i];
      int naccepted = 0;        /* Bytes the node accepts at STR_IDX.  */
      bool ok;
      DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));

      /* If the node may accept "multi byte".  */
      if (dfa->nodes[prev_node].accept_mb)
        naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
                                         str_idx, sctx->last_str_idx);

      /* We don't check backreferences here.
         See update_cur_sifted_state().  */
      if (!naccepted
          && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
          && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
                                  dfa->nexts[prev_node]))
        naccepted = 1;

      if (naccepted == 0)
        continue;

      if (sctx->limits.nelem)
        {
          /* Discard transitions that would violate a back-reference
             limitation recorded in SCTX->limits.  */
          Idx to_idx = str_idx + naccepted;
          if (check_dst_limits (mctx, &sctx->limits,
                                dfa->nexts[prev_node], to_idx,
                                prev_node, str_idx))
            continue;
        }
      ok = re_node_set_insert (cur_dest, prev_node);
      if (__glibc_unlikely (! ok))
        return REG_ESPACE;
    }

  return REG_NOERROR;
}
1652
1653 /* Helper functions. */
1654
/* Make sure the state log covers indices up to NEXT_STATE_LOG_IDX:
   extend the input buffers if that index lies beyond what has been
   translated so far, and zero any state-log slots between the current
   top and the new index.  Return REG_NOERROR or an error from
   extend_buffers.  */

static reg_errcode_t
clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
{
  Idx top = mctx->state_log_top;

  if ((next_state_log_idx >= mctx->input.bufs_len
       && mctx->input.bufs_len < mctx->input.len)
      || (next_state_log_idx >= mctx->input.valid_len
          && mctx->input.valid_len < mctx->input.len))
    {
      reg_errcode_t err;
      err = extend_buffers (mctx, next_state_log_idx + 1);
      if (__glibc_unlikely (err != REG_NOERROR))
        return err;
    }

  if (top < next_state_log_idx)
    {
      /* Slots above the old top were never written; clear them so later
         reads see "no state" rather than garbage.  */
      DEBUG_ASSERT (mctx->state_log != NULL);
      memset (mctx->state_log + top + 1, '\0',
              sizeof (re_dfastate_t *) * (next_state_log_idx - top));
      mctx->state_log_top = next_state_log_idx;
    }
  return REG_NOERROR;
}
1680
/* Merge NUM states of SRC into DST element-wise: keep DST where SRC is
   NULL, take SRC where DST is NULL, and otherwise replace DST with the
   state whose node set is the union of both.  Return REG_NOERROR or an
   error from state acquisition.  */

static reg_errcode_t
merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
                   re_dfastate_t **src, Idx num)
{
  Idx st_idx;
  reg_errcode_t err;
  for (st_idx = 0; st_idx < num; ++st_idx)
    {
      if (dst[st_idx] == NULL)
        dst[st_idx] = src[st_idx];
      else if (src[st_idx] != NULL)
        {
          re_node_set merged_set;
          err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
                                        &src[st_idx]->nodes);
          if (__glibc_unlikely (err != REG_NOERROR))
            return err;
          /* States are interned; acquire the canonical merged state.  */
          dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
          re_node_set_free (&merged_set);
          if (__glibc_unlikely (err != REG_NOERROR))
            return err;
        }
    }
  return REG_NOERROR;
}
1706
/* Finish the sifted state for STR_IDX: extend DEST_NODES with epsilon
   sources from the old state-log entry, apply any back-reference limits,
   intern the resulting node set as sifted_states[STR_IDX], and re-sift
   back references if the old state had any.  Return REG_NOERROR or an
   error code.  */

static reg_errcode_t
update_cur_sifted_state (const re_match_context_t *mctx,
                         re_sift_context_t *sctx, Idx str_idx,
                         re_node_set *dest_nodes)
{
  const re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err = REG_NOERROR;
  /* Nodes of the original (unsifted) state at STR_IDX, or NULL.  */
  const re_node_set *candidates;
  candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
                : &mctx->state_log[str_idx]->nodes);

  if (dest_nodes->nelem == 0)
    sctx->sifted_states[str_idx] = NULL;
  else
    {
      if (candidates)
        {
          /* At first, add the nodes which can epsilon transit to a node in
             DEST_NODE.  */
          err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
          if (__glibc_unlikely (err != REG_NOERROR))
            return err;

          /* Then, check the limitations in the current sift_context.  */
          if (sctx->limits.nelem)
            {
              err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
                                         mctx->bkref_ents, str_idx);
              if (__glibc_unlikely (err != REG_NOERROR))
                return err;
            }
        }

      sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
      if (__glibc_unlikely (err != REG_NOERROR))
        return err;
    }

  if (candidates && mctx->state_log[str_idx]->has_backref)
    {
      err = sift_states_bkref (mctx, sctx, str_idx, candidates);
      if (__glibc_unlikely (err != REG_NOERROR))
        return err;
    }
  return REG_NOERROR;
}
1753
/* Add to DEST_NODES every node of CANDIDATES that can reach some node in
   DEST_NODES through epsilon transitions, using the (lazily built and
   cached) inverse epsilon closure of the interned state for DEST_NODES.
   Return REG_NOERROR or REG_ESPACE.  */

static reg_errcode_t
__attribute_warn_unused_result__
add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
                       const re_node_set *candidates)
{
  reg_errcode_t err = REG_NOERROR;
  Idx i;

  re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
  if (__glibc_unlikely (err != REG_NOERROR))
    return err;

  /* Build the state's inverse eclosure once and cache it on the state.  */
  if (!state->inveclosure.alloc)
    {
      err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
      if (__glibc_unlikely (err != REG_NOERROR))
        return REG_ESPACE;
      for (i = 0; i < dest_nodes->nelem; i++)
        {
          err = re_node_set_merge (&state->inveclosure,
                                   dfa->inveclosures + dest_nodes->elems[i]);
          if (__glibc_unlikely (err != REG_NOERROR))
            return REG_ESPACE;
        }
    }
  return re_node_set_add_intersect (dest_nodes, candidates,
                                    &state->inveclosure);
}
1782
/* Remove from DEST_NODES the inverse epsilon closure of NODE, except for
   nodes that are still needed because they epsilon-reach some other
   surviving member of DEST_NODES (computed into EXCEPT_NODES from
   CANDIDATES).  Return REG_NOERROR or an error code.  */

static reg_errcode_t
sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
                       const re_node_set *candidates)
{
  Idx ecl_idx;
  reg_errcode_t err;
  re_node_set *inv_eclosure = dfa->inveclosures + node;
  re_node_set except_nodes;   /* Nodes exempt from removal.  */
  re_node_set_init_empty (&except_nodes);

  /* Pass 1: find epsilon nodes in NODE's inverse eclosure that also lead
     to a destination outside that eclosure but inside DEST_NODES; their
     own epsilon sources must be kept.  */
  for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
    {
      Idx cur_node = inv_eclosure->elems[ecl_idx];
      if (cur_node == node)
        continue;
      if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
        {
          Idx edst1 = dfa->edests[cur_node].elems[0];
          Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
                       ? dfa->edests[cur_node].elems[1] : -1);
          if ((!re_node_set_contains (inv_eclosure, edst1)
               && re_node_set_contains (dest_nodes, edst1))
              || (edst2 > 0
                  && !re_node_set_contains (inv_eclosure, edst2)
                  && re_node_set_contains (dest_nodes, edst2)))
            {
              err = re_node_set_add_intersect (&except_nodes, candidates,
                                               dfa->inveclosures + cur_node);
              if (__glibc_unlikely (err != REG_NOERROR))
                {
                  re_node_set_free (&except_nodes);
                  return err;
                }
            }
        }
    }

  /* Pass 2: remove every non-exempt member of the inverse eclosure.  */
  for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
    {
      Idx cur_node = inv_eclosure->elems[ecl_idx];
      if (!re_node_set_contains (&except_nodes, cur_node))
        {
          Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
          re_node_set_remove_at (dest_nodes, idx);
        }
    }
  re_node_set_free (&except_nodes);
  return REG_NOERROR;
}
1830
/* Return true if moving from SRC_NODE at SRC_IDX to DST_NODE at DST_IDX
   would cross the boundary of some back-reference-limited subexpression
   in LIMITS (i.e. source and destination lie in different positions
   relative to the limited subexpression), making the transition invalid
   under that limitation.  */

static bool
check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
                  Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
{
  const re_dfa_t *const dfa = mctx->dfa;
  Idx lim_idx, src_pos, dst_pos;

  Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
  Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
    {
      Idx subexp_idx;
      struct re_backref_cache_entry *ent;
      ent = mctx->bkref_ents + limits->elems[lim_idx];
      subexp_idx = dfa->nodes[ent->node].opr.idx;

      /* Position of each endpoint relative to the limited subexpression:
         -1 = before, 0 = inside, 1 = after.  */
      dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
                                           subexp_idx, dst_node, dst_idx,
                                           dst_bkref_idx);
      src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
                                           subexp_idx, src_node, src_idx,
                                           src_bkref_idx);

      /* In case of:
         <src> <dst> ( <subexp> )
         ( <subexp> ) <src> <dst>
         ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
      if (src_pos == dst_pos)
        continue; /* This is unrelated limitation.  */
      else
        return true;
    }
  return false;
}
1865
/* Helper for check_dst_limits_calc_pos when the string index sits exactly
   on a boundary of the limited subexpression SUBEXP_IDX.  BOUNDARIES has
   bit 0 set when on the start boundary and bit 1 when on the end boundary.
   Examine the epsilon closure of FROM_NODE (following back references via
   the cache entry at BKREF_IDX) to decide whether the position counts as
   before (-1), inside (0), or after (1) the subexpression.  */

static int
check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
                             Idx subexp_idx, Idx from_node, Idx bkref_idx)
{
  const re_dfa_t *const dfa = mctx->dfa;
  const re_node_set *eclosures = dfa->eclosures + from_node;
  Idx node_idx;

  /* Else, we are on the boundary: examine the nodes on the epsilon
     closure.  */
  for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
    {
      Idx node = eclosures->elems[node_idx];
      switch (dfa->nodes[node].type)
        {
        case OP_BACK_REF:
          if (bkref_idx != -1)
            {
              struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
              /* Scan the chained cache entries for this string index.  */
              do
                {
                  Idx dst;
                  int cpos;

                  if (ent->node != node)
                    continue;

                  if (subexp_idx < BITSET_WORD_BITS
                      && !(ent->eps_reachable_subexps_map
                           & ((bitset_word_t) 1 << subexp_idx)))
                    continue;

                  /* Recurse trying to reach the OP_OPEN_SUBEXP and
                     OP_CLOSE_SUBEXP cases below.  But, if the
                     destination node is the same node as the source
                     node, don't recurse because it would cause an
                     infinite loop: a regex that exhibits this behavior
                     is ()\1*\1* */
                  dst = dfa->edests[node].elems[0];
                  if (dst == from_node)
                    {
                      if (boundaries & 1)
                        return -1;
                      else /* if (boundaries & 2) */
                        return 0;
                    }

                  cpos =
                    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
                                                 dst, bkref_idx);
                  if (cpos == -1 /* && (boundaries & 1) */)
                    return -1;
                  if (cpos == 0 && (boundaries & 2))
                    return 0;

                  /* This subexpression is unreachable via this entry;
                     remember that so future scans can skip it.  */
                  if (subexp_idx < BITSET_WORD_BITS)
                    ent->eps_reachable_subexps_map
                      &= ~((bitset_word_t) 1 << subexp_idx);
                }
              while (ent++->more);
            }
          break;

        case OP_OPEN_SUBEXP:
          /* On the start boundary and can re-open the subexp: before.  */
          if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
            return -1;
          break;

        case OP_CLOSE_SUBEXP:
          /* On the end boundary and can still close the subexp: inside.  */
          if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
            return 0;
          break;

        default:
          break;
        }
    }

  /* Not on a decisive node: past the end if on the end boundary.  */
  return (boundaries & 2) ? 1 : 0;
}
1946
1947 static int
check_dst_limits_calc_pos(const re_match_context_t * mctx,Idx limit,Idx subexp_idx,Idx from_node,Idx str_idx,Idx bkref_idx)1948 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1949 Idx subexp_idx, Idx from_node, Idx str_idx,
1950 Idx bkref_idx)
1951 {
1952 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1953 int boundaries;
1954
1955 /* If we are outside the range of the subexpression, return -1 or 1. */
1956 if (str_idx < lim->subexp_from)
1957 return -1;
1958
1959 if (lim->subexp_to < str_idx)
1960 return 1;
1961
1962 /* If we are within the subexpression, return 0. */
1963 boundaries = (str_idx == lim->subexp_from);
1964 boundaries |= (str_idx == lim->subexp_to) << 1;
1965 if (boundaries == 0)
1966 return 0;
1967
1968 /* Else, examine epsilon closure. */
1969 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1970 from_node, bkref_idx);
1971 }
1972
/* Check the limitations of sub expressions LIMITS, and remove the nodes
   which are against limitations from DEST_NODES.

   DFA        - the compiled automaton.
   DEST_NODES - the node set being sifted; offending nodes are removed.
   CANDIDATES - the nodes that may legitimately appear in sifted states.
   LIMITS     - indexes into BKREF_ENTS of the active limitations.
   BKREF_ENTS - the backreference cache of the match context.
   STR_IDX    - the current string index.
   Return REG_NOERROR, or an error code from sub_epsilon_src_nodes.  */

static reg_errcode_t
check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
		     const re_node_set *candidates, re_node_set *limits,
		     struct re_backref_cache_entry *bkref_ents, Idx str_idx)
{
  reg_errcode_t err;
  Idx node_idx, lim_idx;

  for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
    {
      Idx subexp_idx;
      struct re_backref_cache_entry *ent;
      ent = bkref_ents + limits->elems[lim_idx];

      /* Only limitations whose subexpression span covers STR_IDX
	 matter here.  */
      if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
	continue; /* This is unrelated limitation.  */

      subexp_idx = dfa->nodes[ent->node].opr.idx;
      if (ent->subexp_to == str_idx)
	{
	  /* STR_IDX is exactly the close of the subexpression: locate
	     its OP_OPEN_SUBEXP / OP_CLOSE_SUBEXP nodes in DEST_NODES,
	     if present.  */
	  Idx ops_node = -1;
	  Idx cls_node = -1;
	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
	    {
	      Idx node = dest_nodes->elems[node_idx];
	      re_token_type_t type = dfa->nodes[node].type;
	      if (type == OP_OPEN_SUBEXP
		  && subexp_idx == dfa->nodes[node].opr.idx)
		ops_node = node;
	      else if (type == OP_CLOSE_SUBEXP
		       && subexp_idx == dfa->nodes[node].opr.idx)
		cls_node = node;
	    }

	  /* Check the limitation of the open subexpression.  */
	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
	  if (ops_node >= 0)
	    {
	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
					   candidates);
	      if (__glibc_unlikely (err != REG_NOERROR))
		return err;
	    }

	  /* Check the limitation of the close subexpression.  */
	  if (cls_node >= 0)
	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
	      {
		Idx node = dest_nodes->elems[node_idx];
		if (!re_node_set_contains (dfa->inveclosures + node,
					   cls_node)
		    && !re_node_set_contains (dfa->eclosures + node,
					      cls_node))
		  {
		    /* It is against this limitation.
		       Remove it from the current sifted state.  */
		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
						 candidates);
		    if (__glibc_unlikely (err != REG_NOERROR))
		      return err;
		    /* Removal compacted the set; re-examine this slot.  */
		    --node_idx;
		  }
	      }
	}
      else /* (ent->subexp_to != str_idx)  */
	{
	  /* STR_IDX is strictly inside the subexpression: none of its
	     boundary nodes may survive in the sifted state.  */
	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
	    {
	      Idx node = dest_nodes->elems[node_idx];
	      re_token_type_t type = dfa->nodes[node].type;
	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
		{
		  if (subexp_idx != dfa->nodes[node].opr.idx)
		    continue;
		  /* It is against this limitation.
		     Remove it from the current sifted state.  */
		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
					       candidates);
		  if (__glibc_unlikely (err != REG_NOERROR))
		    return err;
		}
	    }
	}
    }
  return REG_NOERROR;
}
2062
/* Sift backreference candidates at STR_IDX: for each OP_BACK_REF node in
   CANDIDATES with a cache entry at STR_IDX, re-run the backward sift
   with that entry enabled as a limitation, merging the surviving states
   into SCTX->limited_states (when non-NULL).
   Return REG_NOERROR, or REG_ESPACE on allocation failure.  */
static reg_errcode_t
__attribute_warn_unused_result__
sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
		   Idx str_idx, const re_node_set *candidates)
{
  const re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err;
  Idx node_idx, node;
  re_sift_context_t local_sctx;
  Idx first_idx = search_cur_bkref_entry (mctx, str_idx);

  /* No cache entry at STR_IDX: nothing to sift.  */
  if (first_idx == -1)
    return REG_NOERROR;

  local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */

  for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
    {
      Idx enabled_idx;
      re_token_type_t type;
      struct re_backref_cache_entry *entry;
      node = candidates->elems[node_idx];
      type = dfa->nodes[node].type;
      /* Avoid infinite loop for the REs like "()\1+".  */
      if (node == sctx->last_node && str_idx == sctx->last_str_idx)
	continue;
      if (type != OP_BACK_REF)
	continue;

      entry = mctx->bkref_ents + first_idx;
      enabled_idx = first_idx;
      do
	{
	  Idx subexp_len;
	  Idx to_idx;
	  Idx dst_node;
	  bool ok;
	  re_dfastate_t *cur_state;

	  if (entry->node != node)
	    continue;
	  subexp_len = entry->subexp_to - entry->subexp_from;
	  to_idx = str_idx + subexp_len;
	  /* A zero-length match transits via an epsilon edge, a
	     non-empty one via the normal next node.  */
	  dst_node = (subexp_len ? dfa->nexts[node]
		      : dfa->edests[node].elems[0]);

	  /* Skip entries whose destination is out of range, was already
	     sifted away, or violates the destination limits.  */
	  if (to_idx > sctx->last_str_idx
	      || sctx->sifted_states[to_idx] == NULL
	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
	      || check_dst_limits (mctx, &sctx->limits, node,
				   str_idx, dst_node, to_idx))
	    continue;

	  /* Lazily set up the local sift context on first use.  */
	  if (local_sctx.sifted_states == NULL)
	    {
	      local_sctx = *sctx;
	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
	      if (__glibc_unlikely (err != REG_NOERROR))
		goto free_return;
	    }
	  local_sctx.last_node = node;
	  local_sctx.last_str_idx = str_idx;
	  /* Temporarily enable this cache entry as a limitation and
	     sift backward from STR_IDX under it.  */
	  ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
	  if (__glibc_unlikely (! ok))
	    {
	      err = REG_ESPACE;
	      goto free_return;
	    }
	  cur_state = local_sctx.sifted_states[str_idx];
	  err = sift_states_backward (mctx, &local_sctx);
	  if (__glibc_unlikely (err != REG_NOERROR))
	    goto free_return;
	  if (sctx->limited_states != NULL)
	    {
	      err = merge_state_array (dfa, sctx->limited_states,
				       local_sctx.sifted_states,
				       str_idx + 1);
	      if (__glibc_unlikely (err != REG_NOERROR))
		goto free_return;
	    }
	  /* Restore the state at STR_IDX and disable the limitation
	     again before trying the next entry.  */
	  local_sctx.sifted_states[str_idx] = cur_state;
	  re_node_set_remove (&local_sctx.limits, enabled_idx);

	  /* mctx->bkref_ents may have changed, reload the pointer.  */
	  entry = mctx->bkref_ents + enabled_idx;
	}
      while (enabled_idx++, entry++->more);
    }
  err = REG_NOERROR;
 free_return:
  if (local_sctx.sifted_states != NULL)
    {
      re_node_set_free (&local_sctx.limits);
    }

  return err;
}
2160
2161
2162 static int
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx node_idx,Idx str_idx,Idx max_str_idx)2163 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2164 Idx node_idx, Idx str_idx, Idx max_str_idx)
2165 {
2166 const re_dfa_t *const dfa = mctx->dfa;
2167 int naccepted;
2168 /* Check the node can accept "multi byte". */
2169 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2170 if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2171 && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2172 dfa->nexts[node_idx]))
2173 /* The node can't accept the "multi byte", or the
2174 destination was already thrown away, then the node
2175 couldn't accept the current input "multi byte". */
2176 naccepted = 0;
2177 /* Otherwise, it is sure that the node could accept
2178 'naccepted' bytes input. */
2179 return naccepted;
2180 }
2181
2182 /* Functions for state transition. */
2183
2184 /* Return the next state to which the current state STATE will transit by
2185 accepting the current input byte, and update STATE_LOG if necessary.
2186 Return NULL on failure.
2187 If STATE can accept a multibyte char/collating element/back reference
2188 update the destination of STATE_LOG. */
2189
static re_dfastate_t *
__attribute_warn_unused_result__
transit_state (reg_errcode_t *err, re_match_context_t *mctx,
	       re_dfastate_t *state)
{
  re_dfastate_t **trtable;
  unsigned char ch;

  /* If the current state can accept multibyte.  */
  if (__glibc_unlikely (state->accept_mb))
    {
      /* Register multibyte/collating-element/backref destinations in
	 the state log first; the single-byte transition below still
	 applies.  */
      *err = transit_state_mb (mctx, state);
      if (__glibc_unlikely (*err != REG_NOERROR))
	return NULL;
    }

  /* Then decide the next state with the single byte.  */
#if 0
  if (0)
    /* don't use transition table  */
    return transit_state_sb (err, mctx, state);
#endif

  /* Use transition table  */
  ch = re_string_fetch_byte (&mctx->input);
  for (;;)
    {
      trtable = state->trtable;
      if (__glibc_likely (trtable != NULL))
	return trtable[ch];

      /* word_trtable holds two banks of SBC_MAX entries; which bank is
	 used depends on whether the preceding context is a word
	 context.  */
      trtable = state->word_trtable;
      if (__glibc_likely (trtable != NULL))
	{
	  unsigned int context;
	  context
	    = re_string_context_at (&mctx->input,
				    re_string_cur_idx (&mctx->input) - 1,
				    mctx->eflags);
	  if (IS_WORD_CONTEXT (context))
	    return trtable[ch + SBC_MAX];
	  else
	    return trtable[ch];
	}

      /* Neither table exists yet: build one, then loop to use it.  */
      if (!build_trtable (mctx->dfa, state))
	{
	  *err = REG_ESPACE;
	  return NULL;
	}

      /* Retry, we now have a transition table.  */
    }
}
2244
/* Update the state_log if we need.  Merge NEXT_STATE (the result of the
   single-byte transition) with any state already logged at the current
   input index, store the merged state back into the log, and return it.
   On error set *ERR and return NULL.  */
static re_dfastate_t *
merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
		      re_dfastate_t *next_state)
{
  const re_dfa_t *const dfa = mctx->dfa;
  Idx cur_idx = re_string_cur_idx (&mctx->input);

  if (cur_idx > mctx->state_log_top)
    {
      /* Beyond the logged area: nothing to merge with.  */
      mctx->state_log[cur_idx] = next_state;
      mctx->state_log_top = cur_idx;
    }
  else if (mctx->state_log[cur_idx] == 0)
    {
      /* Empty slot: just record the new state.  */
      mctx->state_log[cur_idx] = next_state;
    }
  else
    {
      re_dfastate_t *pstate;
      unsigned int context;
      re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
      /* If (state_log[cur_idx] != 0), it implies that cur_idx is
	 the destination of a multibyte char/collating element/
	 back reference.  Then the next state is the union set of
	 these destinations and the results of the transition table.  */
      pstate = mctx->state_log[cur_idx];
      log_nodes = pstate->entrance_nodes;
      if (next_state != NULL)
	{
	  table_nodes = next_state->entrance_nodes;
	  *err = re_node_set_init_union (&next_nodes, table_nodes,
					 log_nodes);
	  if (__glibc_unlikely (*err != REG_NOERROR))
	    return NULL;
	}
      else
	next_nodes = *log_nodes;
      /* Note: We already add the nodes of the initial state,
	 then we don't need to add them here.  */

      context = re_string_context_at (&mctx->input,
				      re_string_cur_idx (&mctx->input) - 1,
				      mctx->eflags);
      next_state = mctx->state_log[cur_idx]
	= re_acquire_state_context (err, dfa, &next_nodes, context);
      /* We don't need to check errors here, since the return value of
	 this function is next_state and ERR is already set.  */

      if (table_nodes != NULL)
	re_node_set_free (&next_nodes);
    }

  if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
    {
      /* Check OP_OPEN_SUBEXP in the current state in case that we use them
	 later.  We must check them here, since the back references in the
	 next state might use them.  */
      *err = check_subexp_matching_top (mctx, &next_state->nodes,
					cur_idx);
      if (__glibc_unlikely (*err != REG_NOERROR))
	return NULL;

      /* If the next state has back references.  */
      if (next_state->has_backref)
	{
	  *err = transit_state_bkref (mctx, &next_state->nodes);
	  if (__glibc_unlikely (*err != REG_NOERROR))
	    return NULL;
	  /* transit_state_bkref may have replaced the logged state.  */
	  next_state = mctx->state_log[cur_idx];
	}
    }

  return next_state;
}
2320
2321 /* Skip bytes in the input that correspond to part of a
2322 multi-byte match, then look in the log for a state
2323 from which to restart matching. */
2324 static re_dfastate_t *
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2325 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2326 {
2327 re_dfastate_t *cur_state;
2328 do
2329 {
2330 Idx max = mctx->state_log_top;
2331 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2332
2333 do
2334 {
2335 if (++cur_str_idx > max)
2336 return NULL;
2337 re_string_skip_bytes (&mctx->input, 1);
2338 }
2339 while (mctx->state_log[cur_str_idx] == NULL);
2340
2341 cur_state = merge_state_with_log (err, mctx, NULL);
2342 }
2343 while (*err == REG_NOERROR && cur_state == NULL);
2344 return cur_state;
2345 }
2346
2347 /* Helper functions for transit_state. */
2348
2349 /* From the node set CUR_NODES, pick up the nodes whose types are
2350 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2351 expression. And register them to use them later for evaluating the
2352 corresponding back references. */
2353
2354 static reg_errcode_t
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,Idx str_idx)2355 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2356 Idx str_idx)
2357 {
2358 const re_dfa_t *const dfa = mctx->dfa;
2359 Idx node_idx;
2360 reg_errcode_t err;
2361
2362 /* TODO: This isn't efficient.
2363 Because there might be more than one nodes whose types are
2364 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2365 nodes.
2366 E.g. RE: (a){2} */
2367 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2368 {
2369 Idx node = cur_nodes->elems[node_idx];
2370 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2371 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2372 && (dfa->used_bkref_map
2373 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2374 {
2375 err = match_ctx_add_subtop (mctx, node, str_idx);
2376 if (__glibc_unlikely (err != REG_NOERROR))
2377 return err;
2378 }
2379 }
2380 return REG_NOERROR;
2381 }
2382
#if 0
/* Return the next state to which the current state STATE will transit by
   accepting the current input byte.  Return NULL on failure.
   NOTE(review): dead code (compiled out); kept for reference — the
   table-driven path in transit_state is used instead.  */

static re_dfastate_t *
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
		  re_dfastate_t *state)
{
  const re_dfa_t *const dfa = mctx->dfa;
  re_node_set next_nodes;
  re_dfastate_t *next_state;
  Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
  unsigned int context;

  *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
  if (__glibc_unlikely (*err != REG_NOERROR))
    return NULL;
  /* Collect the epsilon closures of the successors of every node that
     accepts the current byte.  */
  for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
    {
      Idx cur_node = state->nodes.elems[node_cnt];
      if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
	{
	  *err = re_node_set_merge (&next_nodes,
				    dfa->eclosures + dfa->nexts[cur_node]);
	  if (__glibc_unlikely (*err != REG_NOERROR))
	    {
	      re_node_set_free (&next_nodes);
	      return NULL;
	    }
	}
    }
  context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
  next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
  /* We don't need to check errors here, since the return value of
     this function is next_state and ERR is already set.  */

  re_node_set_free (&next_nodes);
  re_string_skip_bytes (&mctx->input, 1);
  return next_state;
}
#endif
2424
/* For every node of PSTATE that can accept a multibyte character or
   collating element at the current input position, register its
   destination in mctx->state_log at the index where the accepted bytes
   end.  Return REG_NOERROR or an error code.  */
static reg_errcode_t
transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
{
  const re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err;
  Idx i;

  for (i = 0; i < pstate->nodes.nelem; ++i)
    {
      re_node_set dest_nodes, *new_nodes;
      Idx cur_node_idx = pstate->nodes.elems[i];
      int naccepted;
      Idx dest_idx;
      unsigned int context;
      re_dfastate_t *dest_state;

      if (!dfa->nodes[cur_node_idx].accept_mb)
	continue;

      /* Honour any context constraint on the node before accepting.  */
      if (dfa->nodes[cur_node_idx].constraint)
	{
	  context = re_string_context_at (&mctx->input,
					  re_string_cur_idx (&mctx->input),
					  mctx->eflags);
	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
					   context))
	    continue;
	}

      /* How many bytes the node can accept?  */
      naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
					   re_string_cur_idx (&mctx->input));
      if (naccepted == 0)
	continue;

      /* The node can accepts 'naccepted' bytes.  */
      dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
      /* Track the longest multibyte element seen, used to size later
	 buffer extensions.  */
      mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
			       : mctx->max_mb_elem_len);
      err = clean_state_log_if_needed (mctx, dest_idx);
      if (__glibc_unlikely (err != REG_NOERROR))
	return err;
      DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
      new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];

      /* Merge with whatever state is already logged at DEST_IDX.  */
      dest_state = mctx->state_log[dest_idx];
      if (dest_state == NULL)
	dest_nodes = *new_nodes;
      else
	{
	  err = re_node_set_init_union (&dest_nodes,
					dest_state->entrance_nodes, new_nodes);
	  if (__glibc_unlikely (err != REG_NOERROR))
	    return err;
	}
      context = re_string_context_at (&mctx->input, dest_idx - 1,
				      mctx->eflags);
      mctx->state_log[dest_idx]
	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
      if (dest_state != NULL)
	re_node_set_free (&dest_nodes);
      if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
			    && err != REG_NOERROR))
	return err;
    }
  return REG_NOERROR;
}
2492
/* For each OP_BACK_REF node in NODES, evaluate the back reference at the
   current input index (via get_subexp) and register the epsilon closures
   of its destinations in mctx->state_log.  Recurse for zero-length
   matches, which may enable further backreference transitions at the
   same index.  Return REG_NOERROR or an error code.  */
static reg_errcode_t
transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
{
  const re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err;
  Idx i;
  Idx cur_str_idx = re_string_cur_idx (&mctx->input);

  for (i = 0; i < nodes->nelem; ++i)
    {
      Idx dest_str_idx, prev_nelem, bkc_idx;
      Idx node_idx = nodes->elems[i];
      unsigned int context;
      const re_token_t *node = dfa->nodes + node_idx;
      re_node_set *new_dest_nodes;

      /* Check whether 'node' is a backreference or not.  */
      if (node->type != OP_BACK_REF)
	continue;

      if (node->constraint)
	{
	  context = re_string_context_at (&mctx->input, cur_str_idx,
					  mctx->eflags);
	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
	    continue;
	}

      /* 'node' is a backreference.
	 Check the substring which the substring matched.  */
      bkc_idx = mctx->nbkref_ents;
      err = get_subexp (mctx, node_idx, cur_str_idx);
      if (__glibc_unlikely (err != REG_NOERROR))
	goto free_return;

      /* And add the epsilon closures (which is 'new_dest_nodes') of
	 the backreference to appropriate state_log.  */
      DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
      /* Walk only the cache entries added by the get_subexp call
	 above (indexes >= BKC_IDX).  */
      for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
	{
	  Idx subexp_len;
	  re_dfastate_t *dest_state;
	  struct re_backref_cache_entry *bkref_ent;
	  bkref_ent = mctx->bkref_ents + bkc_idx;
	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
	    continue;
	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
	  /* Zero-length matches transit via the epsilon edge; others
	     via the normal successor.  */
	  new_dest_nodes = (subexp_len == 0
			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
			    : dfa->eclosures + dfa->nexts[node_idx]);
	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
			  - bkref_ent->subexp_from);
	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
					  mctx->eflags);
	  dest_state = mctx->state_log[dest_str_idx];
	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
			: mctx->state_log[cur_str_idx]->nodes.nelem);
	  /* Add 'new_dest_node' to state_log.  */
	  if (dest_state == NULL)
	    {
	      mctx->state_log[dest_str_idx]
		= re_acquire_state_context (&err, dfa, new_dest_nodes,
					    context);
	      if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
				    && err != REG_NOERROR))
		goto free_return;
	    }
	  else
	    {
	      re_node_set dest_nodes;
	      err = re_node_set_init_union (&dest_nodes,
					    dest_state->entrance_nodes,
					    new_dest_nodes);
	      if (__glibc_unlikely (err != REG_NOERROR))
		{
		  re_node_set_free (&dest_nodes);
		  goto free_return;
		}
	      mctx->state_log[dest_str_idx]
		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
	      re_node_set_free (&dest_nodes);
	      if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
				    && err != REG_NOERROR))
		goto free_return;
	    }
	  /* We need to check recursively if the backreference can epsilon
	     transit.  */
	  if (subexp_len == 0
	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
	    {
	      err = check_subexp_matching_top (mctx, new_dest_nodes,
					       cur_str_idx);
	      if (__glibc_unlikely (err != REG_NOERROR))
		goto free_return;
	      err = transit_state_bkref (mctx, new_dest_nodes);
	      if (__glibc_unlikely (err != REG_NOERROR))
		goto free_return;
	    }
	}
    }
  err = REG_NOERROR;
 free_return:
  return err;
}
2597
/* Enumerate all the candidates which the backreference BKREF_NODE can match
   at BKREF_STR_IDX, and register them by match_ctx_add_entry().
   Note that we might collect inappropriate candidates here.
   However, the cost of checking them strictly here is too high, then we
   delay these checking for prune_impossible_nodes().  */

static reg_errcode_t
__attribute_warn_unused_result__
get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
{
  const re_dfa_t *const dfa = mctx->dfa;
  Idx subexp_num, sub_top_idx;
  const char *buf = (const char *) re_string_get_buffer (&mctx->input);
  /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
  Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
  if (cache_idx != -1)
    {
      const struct re_backref_cache_entry *entry
	= mctx->bkref_ents + cache_idx;
      do
	if (entry->node == bkref_node)
	  return REG_NOERROR; /* We already checked it.  */
      while (entry++->more);
    }

  subexp_num = dfa->nodes[bkref_node].opr.idx;

  /* For each sub expression  */
  for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
    {
      reg_errcode_t err;
      re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
      re_sub_match_last_t *sub_last;
      Idx sub_last_idx, sl_str, bkref_str_off;

      if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
	continue; /* It isn't related.  */

      sl_str = sub_top->str_idx;
      bkref_str_off = bkref_str_idx;
      /* At first, check the last node of sub expressions we already
	 evaluated.  */
      for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
	{
	  regoff_t sl_str_diff;
	  sub_last = sub_top->lasts[sub_last_idx];
	  sl_str_diff = sub_last->str_idx - sl_str;
	  /* The matched string by the sub expression match with the substring
	     at the back reference?  */
	  if (sl_str_diff > 0)
	    {
	      if (__glibc_unlikely (bkref_str_off + sl_str_diff
				    > mctx->input.valid_len))
		{
		  /* Not enough chars for a successful match.  */
		  if (bkref_str_off + sl_str_diff > mctx->input.len)
		    break;

		  err = clean_state_log_if_needed (mctx,
						   bkref_str_off
						   + sl_str_diff);
		  if (__glibc_unlikely (err != REG_NOERROR))
		    return err;
		  /* The buffer may have been reallocated; reload.  */
		  buf = (const char *) re_string_get_buffer (&mctx->input);
		}
	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
		/* We don't need to search this sub expression any more.  */
		break;
	    }
	  bkref_str_off += sl_str_diff;
	  sl_str += sl_str_diff;
	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
				bkref_str_idx);

	  /* Reload buf, since the preceding call might have reallocated
	     the buffer.  */
	  buf = (const char *) re_string_get_buffer (&mctx->input);

	  if (err == REG_NOMATCH)
	    continue;
	  if (__glibc_unlikely (err != REG_NOERROR))
	    return err;
	}

      /* If the inner loop broke out early, this sub_top is dead.  */
      if (sub_last_idx < sub_top->nlasts)
	continue;
      if (sub_last_idx > 0)
	++sl_str;
      /* Then, search for the other last nodes of the sub expression.  */
      for (; sl_str <= bkref_str_idx; ++sl_str)
	{
	  Idx cls_node;
	  regoff_t sl_str_off;
	  const re_node_set *nodes;
	  sl_str_off = sl_str - sub_top->str_idx;
	  /* The matched string by the sub expression match with the substring
	     at the back reference?  */
	  if (sl_str_off > 0)
	    {
	      if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
		{
		  /* If we are at the end of the input, we cannot match.  */
		  if (bkref_str_off >= mctx->input.len)
		    break;

		  err = extend_buffers (mctx, bkref_str_off + 1);
		  if (__glibc_unlikely (err != REG_NOERROR))
		    return err;

		  /* The buffer may have been reallocated; reload.  */
		  buf = (const char *) re_string_get_buffer (&mctx->input);
		}
	      /* Compare one byte at a time as SL_STR advances.  */
	      if (buf [bkref_str_off++] != buf[sl_str - 1])
		break; /* We don't need to search this sub expression
			  any more.  */
	    }
	  if (mctx->state_log[sl_str] == NULL)
	    continue;
	  /* Does this state have a ')' of the sub expression?  */
	  nodes = &mctx->state_log[sl_str]->nodes;
	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
				       OP_CLOSE_SUBEXP);
	  if (cls_node == -1)
	    continue; /* No.  */
	  if (sub_top->path == NULL)
	    {
	      sub_top->path = calloc (sizeof (state_array_t),
				      sl_str - sub_top->str_idx + 1);
	      if (sub_top->path == NULL)
		return REG_ESPACE;
	    }
	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
	     in the current context?  */
	  err = check_arrival (mctx, sub_top->path, sub_top->node,
			       sub_top->str_idx, cls_node, sl_str,
			       OP_CLOSE_SUBEXP);
	  if (err == REG_NOMATCH)
	    continue;
	  if (__glibc_unlikely (err != REG_NOERROR))
	    return err;
	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
	  if (__glibc_unlikely (sub_last == NULL))
	    return REG_ESPACE;
	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
				bkref_str_idx);
	  /* The buffer may have been reallocated; reload.  */
	  buf = (const char *) re_string_get_buffer (&mctx->input);
	  if (err == REG_NOMATCH)
	    continue;
	  if (__glibc_unlikely (err != REG_NOERROR))
	    return err;
	}
    }
  return REG_NOERROR;
}
2751
2752 /* Helper functions for get_subexp(). */
2753
2754 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2755 If it can arrive, register the sub expression expressed with SUB_TOP
2756 and SUB_LAST. */
2757
2758 static reg_errcode_t
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,Idx bkref_node,Idx bkref_str)2759 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2760 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2761 {
2762 reg_errcode_t err;
2763 Idx to_idx;
2764 /* Can the subexpression arrive the back reference? */
2765 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2766 sub_last->str_idx, bkref_node, bkref_str,
2767 OP_OPEN_SUBEXP);
2768 if (err != REG_NOERROR)
2769 return err;
2770 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2771 sub_last->str_idx);
2772 if (__glibc_unlikely (err != REG_NOERROR))
2773 return err;
2774 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2775 return clean_state_log_if_needed (mctx, to_idx);
2776 }
2777
2778 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2779 Search '(' if FL_OPEN, or search ')' otherwise.
2780 TODO: This function isn't efficient...
2781 Because there might be more than one nodes whose types are
2782 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2783 nodes.
2784 E.g. RE: (a){2} */
2785
2786 static Idx
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,Idx subexp_idx,int type)2787 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2788 Idx subexp_idx, int type)
2789 {
2790 Idx cls_idx;
2791 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2792 {
2793 Idx cls_node = nodes->elems[cls_idx];
2794 const re_token_t *node = dfa->nodes + cls_node;
2795 if (node->type == type
2796 && node->opr.idx == subexp_idx)
2797 return cls_node;
2798 }
2799 return -1;
2800 }
2801
/* Check whether the node TOP_NODE at TOP_STR can arrive at the node
   LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2804 heavily reused.
2805 Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2806 REG_ESPACE if memory is exhausted. */
2807
static reg_errcode_t
__attribute_warn_unused_result__
check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
	       Idx top_str, Idx last_node, Idx last_str, int type)
{
  const re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err = REG_NOERROR;
  Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
  re_dfastate_t *cur_state = NULL;
  re_node_set *cur_nodes, next_nodes;
  re_dfastate_t **backup_state_log;
  unsigned int context;

  subexp_num = dfa->nodes[top_node].opr.idx;
  /* Extend the buffer if we need.  PATH->array must be able to hold a
     state for every position up to LAST_STR plus the longest possible
     multibyte element.  */
  if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
    {
      re_dfastate_t **new_array;
      Idx old_alloc = path->alloc;
      Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
      Idx new_alloc;
      /* Guard both the Idx addition and the byte-size multiplication
	 against overflow before reallocating.  */
      if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
	return REG_ESPACE;
      new_alloc = old_alloc + incr_alloc;
      if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
	return REG_ESPACE;
      new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
      if (__glibc_unlikely (new_array == NULL))
	return REG_ESPACE;
      path->array = new_array;
      path->alloc = new_alloc;
      /* Newly added slots must read as "no state yet".  */
      memset (new_array + old_alloc, '\0',
	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
    }

  /* Resume from where a previous call on this PATH left off, if any;
     otherwise start at TOP_STR.  */
  str_idx = path->next_idx ? path->next_idx : top_str;

  /* Temporary modify MCTX: point the state log at PATH's array so the
     helpers below record states into the cached path.  Restored under
     "Fix MCTX" below.  NOTE(review): the error returns between here and
     the restore leave MCTX pointing at PATH — callers appear to treat
     any error as fatal; confirm before reusing MCTX after failure.  */
  backup_state_log = mctx->state_log;
  backup_cur_idx = mctx->input.cur_idx;
  mctx->state_log = path->array;
  mctx->input.cur_idx = str_idx;

  /* Setup initial node set.  */
  context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
  if (str_idx == top_str)
    {
      /* Fresh start: seed with TOP_NODE's filtered epsilon closure.  */
      err = re_node_set_init_1 (&next_nodes, top_node);
      if (__glibc_unlikely (err != REG_NOERROR))
	return err;
      err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
      if (__glibc_unlikely (err != REG_NOERROR))
	{
	  re_node_set_free (&next_nodes);
	  return err;
	}
    }
  else
    {
      /* Resuming: re-derive the node set from the cached state, but only
	 when it contains a back reference (otherwise it needs no
	 re-expansion).  */
      cur_state = mctx->state_log[str_idx];
      if (cur_state && cur_state->has_backref)
	{
	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
	  if (__glibc_unlikely (err != REG_NOERROR))
	    return err;
	}
      else
	re_node_set_init_empty (&next_nodes);
    }
  if (str_idx == top_str || (cur_state && cur_state->has_backref))
    {
      if (next_nodes.nelem)
	{
	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
				    subexp_num, type);
	  if (__glibc_unlikely (err != REG_NOERROR))
	    {
	      re_node_set_free (&next_nodes);
	      return err;
	    }
	}
      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
      if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
	{
	  re_node_set_free (&next_nodes);
	  return err;
	}
      mctx->state_log[str_idx] = cur_state;
    }

  /* Advance one string position per iteration, merging the transitions
     of the current state into NEXT_NODES.  NULL_CNT counts consecutive
     positions with no state; give up once it exceeds the longest
     multibyte element (nothing can bridge a longer gap).  */
  for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
    {
      re_node_set_empty (&next_nodes);
      /* States may already have been recorded ahead of us by the
	 multibyte path in check_arrival_add_next_nodes; merge them.  */
      if (mctx->state_log[str_idx + 1])
	{
	  err = re_node_set_merge (&next_nodes,
				   &mctx->state_log[str_idx + 1]->nodes);
	  if (__glibc_unlikely (err != REG_NOERROR))
	    {
	      re_node_set_free (&next_nodes);
	      return err;
	    }
	}
      if (cur_state)
	{
	  err = check_arrival_add_next_nodes (mctx, str_idx,
					      &cur_state->non_eps_nodes,
					      &next_nodes);
	  if (__glibc_unlikely (err != REG_NOERROR))
	    {
	      re_node_set_free (&next_nodes);
	      return err;
	    }
	}
      ++str_idx;
      if (next_nodes.nelem)
	{
	  /* Close NEXT_NODES under epsilon transitions (minus the
	     excluded subexpression) and under cached back references.  */
	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
	  if (__glibc_unlikely (err != REG_NOERROR))
	    {
	      re_node_set_free (&next_nodes);
	      return err;
	    }
	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
				    subexp_num, type);
	  if (__glibc_unlikely (err != REG_NOERROR))
	    {
	      re_node_set_free (&next_nodes);
	      return err;
	    }
	}
      context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
      cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
      if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
	{
	  re_node_set_free (&next_nodes);
	  return err;
	}
      mctx->state_log[str_idx] = cur_state;
      null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
    }
  re_node_set_free (&next_nodes);
  cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
	       : &mctx->state_log[last_str]->nodes);
  /* Remember how far this PATH has been computed so the next call can
     resume instead of recomputing.  */
  path->next_idx = str_idx;

  /* Fix MCTX.  */
  mctx->state_log = backup_state_log;
  mctx->input.cur_idx = backup_cur_idx;

  /* Then check the current node set has the node LAST_NODE.  */
  if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
    return REG_NOERROR;

  return REG_NOMATCH;
}
2964
2965 /* Helper functions for check_arrival. */
2966
2967 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2968 to NEXT_NODES.
   TODO: This function is similar to the transit_state*() functions,
   but it performs a good deal of additional work.
   Can they be unified?  */
2972
static reg_errcode_t
__attribute_warn_unused_result__
check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
			      re_node_set *cur_nodes, re_node_set *next_nodes)
{
  const re_dfa_t *const dfa = mctx->dfa;
  bool ok;
  Idx cur_idx;
  reg_errcode_t err = REG_NOERROR;
  /* Scratch set, reused across iterations of the multibyte path; freed
     once on every exit from this function.  */
  re_node_set union_set;
  re_node_set_init_empty (&union_set);
  for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
    {
      int naccepted = 0;
      Idx cur_node = cur_nodes->elems[cur_idx];
      /* CUR_NODES is expected to hold only non-epsilon nodes (it comes
	 from a state's non_eps_nodes).  */
      DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));

      /* If the node may accept "multi byte".  */
      if (dfa->nodes[cur_node].accept_mb)
	{
	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
					       str_idx);
	  if (naccepted > 1)
	    {
	      /* The node consumed more than one byte, so its successor
		 lands beyond STR_IDX + 1: record it directly in the
		 state log at the landing position instead of in
		 NEXT_NODES.  */
	      re_dfastate_t *dest_state;
	      Idx next_node = dfa->nexts[cur_node];
	      Idx next_idx = str_idx + naccepted;
	      dest_state = mctx->state_log[next_idx];
	      re_node_set_empty (&union_set);
	      if (dest_state)
		{
		  /* Keep whatever nodes are already logged there.  */
		  err = re_node_set_merge (&union_set, &dest_state->nodes);
		  if (__glibc_unlikely (err != REG_NOERROR))
		    {
		      re_node_set_free (&union_set);
		      return err;
		    }
		}
	      ok = re_node_set_insert (&union_set, next_node);
	      if (__glibc_unlikely (! ok))
		{
		  re_node_set_free (&union_set);
		  return REG_ESPACE;
		}
	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
							    &union_set);
	      if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
				    && err != REG_NOERROR))
		{
		  re_node_set_free (&union_set);
		  return err;
		}
	    }
	}

      /* Single-byte acceptance (or a one-byte multibyte acceptance):
	 the successor belongs at STR_IDX + 1, i.e. in NEXT_NODES.  */
      if (naccepted
	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
	{
	  ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
	  if (__glibc_unlikely (! ok))
	    {
	      re_node_set_free (&union_set);
	      return REG_ESPACE;
	    }
	}
    }
  re_node_set_free (&union_set);
  return REG_NOERROR;
}
3042
3043 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3044 CUR_NODES, however exclude the nodes which are:
3045 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3046 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3047 */
3048
3049 static reg_errcode_t
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,Idx ex_subexp,int type)3050 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3051 Idx ex_subexp, int type)
3052 {
3053 reg_errcode_t err;
3054 Idx idx, outside_node;
3055 re_node_set new_nodes;
3056 DEBUG_ASSERT (cur_nodes->nelem);
3057 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3058 if (__glibc_unlikely (err != REG_NOERROR))
3059 return err;
3060 /* Create a new node set NEW_NODES with the nodes which are epsilon
3061 closures of the node in CUR_NODES. */
3062
3063 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3064 {
3065 Idx cur_node = cur_nodes->elems[idx];
3066 const re_node_set *eclosure = dfa->eclosures + cur_node;
3067 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3068 if (outside_node == -1)
3069 {
3070 /* There are no problematic nodes, just merge them. */
3071 err = re_node_set_merge (&new_nodes, eclosure);
3072 if (__glibc_unlikely (err != REG_NOERROR))
3073 {
3074 re_node_set_free (&new_nodes);
3075 return err;
3076 }
3077 }
3078 else
3079 {
3080 /* There are problematic nodes, re-calculate incrementally. */
3081 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3082 ex_subexp, type);
3083 if (__glibc_unlikely (err != REG_NOERROR))
3084 {
3085 re_node_set_free (&new_nodes);
3086 return err;
3087 }
3088 }
3089 }
3090 re_node_set_free (cur_nodes);
3091 *cur_nodes = new_nodes;
3092 return REG_NOERROR;
3093 }
3094
3095 /* Helper function for check_arrival_expand_ecl.
3096 Check incrementally the epsilon closure of TARGET, and if it isn't
3097 problematic append it to DST_NODES. */
3098
static reg_errcode_t
__attribute_warn_unused_result__
check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
			      Idx target, Idx ex_subexp, int type)
{
  Idx cur_node;
  /* Walk the epsilon transitions starting at TARGET: follow edests[0]
     iteratively and recurse on edests[1] where a node has two epsilon
     destinations.  Stop on a node already in DST_NODES, on a node with
     no epsilon destinations, or at the EX_SUBEXP boundary node.  */
  for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
    {
      bool ok;

      if (dfa->nodes[cur_node].type == type
	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
	{
	  /* Hit the boundary of the excluded subexpression.  When
	     looking for OP_CLOSE_SUBEXP the boundary node itself is
	     still included; either way the walk stops here.  */
	  if (type == OP_CLOSE_SUBEXP)
	    {
	      ok = re_node_set_insert (dst_nodes, cur_node);
	      if (__glibc_unlikely (! ok))
		return REG_ESPACE;
	    }
	  break;
	}
      ok = re_node_set_insert (dst_nodes, cur_node);
      if (__glibc_unlikely (! ok))
	return REG_ESPACE;
      if (dfa->edests[cur_node].nelem == 0)
	break;
      /* Two epsilon destinations: handle the second recursively, then
	 continue the iterative walk on the first.  */
      if (dfa->edests[cur_node].nelem == 2)
	{
	  reg_errcode_t err;
	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
					      dfa->edests[cur_node].elems[1],
					      ex_subexp, type);
	  if (__glibc_unlikely (err != REG_NOERROR))
	    return err;
	}
      cur_node = dfa->edests[cur_node].elems[0];
    }
  return REG_NOERROR;
}
3138
3139
3140 /* For all the back references in the current state, calculate the
3141 destination of the back references by the appropriate entry
3142 in MCTX->BKREF_ENTS. */
3143
static reg_errcode_t
__attribute_warn_unused_result__
expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
		    Idx cur_str, Idx subexp_num, int type)
{
  const re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err;
  /* Index of the first cache entry whose STR_IDX equals CUR_STR, or -1
     if there is none.  */
  Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
  struct re_backref_cache_entry *ent;

  if (cache_idx_start == -1)
    return REG_NOERROR;

 restart:
  ent = mctx->bkref_ents + cache_idx_start;
  do
    {
      Idx to_idx, next_node;

      /* Is this entry ENT appropriate, i.e. is its back-reference node
	 a member of CUR_NODES?  */
      if (!re_node_set_contains (cur_nodes, ent->node))
	continue; /* No.  */

      /* The back reference consumes as many characters as the
	 referenced subexpression matched.  */
      to_idx = cur_str + ent->subexp_to - ent->subexp_from;
      /* Calculate the destination of the back reference, and append it
	 to MCTX->STATE_LOG.  */
      if (to_idx == cur_str)
	{
	  /* The backreference did epsilon transit, we must re-check all the
	     node in the current state.  The expanded successors are
	     merged into CUR_NODES itself, so the scan over the cache is
	     restarted to re-evaluate every entry against the enlarged
	     set.  */
	  re_node_set new_dests;
	  reg_errcode_t err2, err3;
	  next_node = dfa->edests[ent->node].elems[0];
	  if (re_node_set_contains (cur_nodes, next_node))
	    continue;
	  err = re_node_set_init_1 (&new_dests, next_node);
	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
	  err3 = re_node_set_merge (cur_nodes, &new_dests);
	  re_node_set_free (&new_dests);
	  if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
				|| err3 != REG_NOERROR))
	    {
	      /* Report the first failure among the three calls.  */
	      err = (err != REG_NOERROR ? err
		     : (err2 != REG_NOERROR ? err2 : err3));
	      return err;
	    }
	  /* TODO: It is still inefficient...  */
	  goto restart;
	}
      else
	{
	  /* Non-epsilon transit: record the node following the back
	     reference in the state log at the landing position.  */
	  re_node_set union_set;
	  next_node = dfa->nexts[ent->node];
	  if (mctx->state_log[to_idx])
	    {
	      bool ok;
	      /* Already logged?  Then there is nothing to add.  */
	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
					next_node))
		continue;
	      err = re_node_set_init_copy (&union_set,
					   &mctx->state_log[to_idx]->nodes);
	      ok = re_node_set_insert (&union_set, next_node);
	      if (__glibc_unlikely (err != REG_NOERROR || ! ok))
		{
		  re_node_set_free (&union_set);
		  err = err != REG_NOERROR ? err : REG_ESPACE;
		  return err;
		}
	    }
	  else
	    {
	      err = re_node_set_init_1 (&union_set, next_node);
	      if (__glibc_unlikely (err != REG_NOERROR))
		return err;
	    }
	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
	  re_node_set_free (&union_set);
	  if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
				&& err != REG_NOERROR))
	    return err;
	}
    }
  /* Entries for the same string index are chained via the MORE flag.  */
  while (ent++->more);
  return REG_NOERROR;
}
3229
3230 /* Build transition table for the state.
3231 Return true if successful. */
3232
static bool __attribute_noinline__
build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
{
  reg_errcode_t err;
  Idx i, j;
  int ch;
  bool need_word_trtable = false;
  bitset_word_t elem, mask;
  Idx ndests; /* Number of the destination states from 'state'.  */
  re_dfastate_t **trtable;
  /* Per-destination states: the plain destination, plus variants built
     under CONTEXT_WORD and CONTEXT_NEWLINE for destinations that carry
     context constraints.  */
  re_dfastate_t *dest_states[SBC_MAX];
  re_dfastate_t *dest_states_word[SBC_MAX];
  re_dfastate_t *dest_states_nl[SBC_MAX];
  re_node_set follows;
  bitset_t acceptable;

  /* We build DFA states which corresponds to the destination nodes
     from 'state'.  'dests_node[i]' represents the nodes which i-th
     destination state contains, and 'dests_ch[i]' represents the
     characters which i-th destination state accepts.  */
  re_node_set dests_node[SBC_MAX];
  bitset_t dests_ch[SBC_MAX];

  /* Initialize transition table.  */
  state->word_trtable = state->trtable = NULL;

  /* At first, group all nodes belonging to 'state' into several
     destinations.  */
  ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
  if (__glibc_unlikely (ndests <= 0))
    {
      /* Return false in case of an error, true otherwise.  */
      if (ndests == 0)
	{
	  /* No outgoing transitions: an all-NULL table is still
	     cached so this state is not regrouped on every lookup.  */
	  state->trtable = (re_dfastate_t **)
	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
	  if (__glibc_unlikely (state->trtable == NULL))
	    return false;
	  return true;
	}
      return false;
    }

  err = re_node_set_alloc (&follows, ndests + 1);
  if (__glibc_unlikely (err != REG_NOERROR))
    {
      /* Shared cleanup for every failure below this point.  */
    out_free:
      re_node_set_free (&follows);
      for (i = 0; i < ndests; ++i)
	re_node_set_free (dests_node + i);
      return false;
    }

  bitset_empty (acceptable);

  /* Then build the states for all destinations.  */
  for (i = 0; i < ndests; ++i)
    {
      Idx next_node;
      re_node_set_empty (&follows);
      /* Merge the follows of this destination states.  */
      for (j = 0; j < dests_node[i].nelem; ++j)
	{
	  next_node = dfa->nexts[dests_node[i].elems[j]];
	  if (next_node != -1)
	    {
	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
	      if (__glibc_unlikely (err != REG_NOERROR))
		goto out_free;
	    }
	}
      dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
      if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
	goto out_free;
      /* If the new state has context constraint,
	 build appropriate states for these contexts.  */
      if (dest_states[i]->has_constraint)
	{
	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
							  CONTEXT_WORD);
	  if (__glibc_unlikely (dest_states_word[i] == NULL
				&& err != REG_NOERROR))
	    goto out_free;

	  /* In a multibyte locale the byte value alone cannot tell
	     word from non-word characters, so a doubled table keyed on
	     word-ness is required.  */
	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
	    need_word_trtable = true;

	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
							CONTEXT_NEWLINE);
	  if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
	    goto out_free;
	}
      else
	{
	  /* No constraint: the same state serves every context.  */
	  dest_states_word[i] = dest_states[i];
	  dest_states_nl[i] = dest_states[i];
	}
      bitset_merge (acceptable, dests_ch[i]);
    }

  if (!__glibc_unlikely (need_word_trtable))
    {
      /* We don't care about whether the following character is a word
	 character, or we are in a single-byte character set so we can
	 discern by looking at the character code: allocate a
	 256-entry transition table.  */
      trtable = state->trtable =
	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
      if (__glibc_unlikely (trtable == NULL))
	goto out_free;

      /* For all characters ch...: */
      for (i = 0; i < BITSET_WORDS; ++i)
	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
	     elem;
	     mask <<= 1, elem >>= 1, ++ch)
	  if (__glibc_unlikely (elem & 1))
	    {
	      /* There must be exactly one destination which accepts
		 character ch.  See group_nodes_into_DFAstates.  */
	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
		;

	      /* j-th destination accepts the word character ch.  */
	      if (dfa->word_char[i] & mask)
		trtable[ch] = dest_states_word[j];
	      else
		trtable[ch] = dest_states[j];
	    }
    }
  else
    {
      /* We care about whether the following character is a word
	 character, and we are in a multi-byte character set: discern
	 by looking at the character code: build two 256-entry
	 transition tables, one starting at trtable[0] and one
	 starting at trtable[SBC_MAX].  */
      trtable = state->word_trtable =
	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
      if (__glibc_unlikely (trtable == NULL))
	goto out_free;

      /* For all characters ch...: */
      for (i = 0; i < BITSET_WORDS; ++i)
	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
	     elem;
	     mask <<= 1, elem >>= 1, ++ch)
	  if (__glibc_unlikely (elem & 1))
	    {
	      /* There must be exactly one destination which accepts
		 character ch.  See group_nodes_into_DFAstates.  */
	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
		;

	      /* j-th destination accepts the word character ch.  */
	      trtable[ch] = dest_states[j];
	      trtable[ch + SBC_MAX] = dest_states_word[j];
	    }
    }

  /* new line */
  if (bitset_contain (acceptable, NEWLINE_CHAR))
    {
      /* The current state accepts newline character.  */
      for (j = 0; j < ndests; ++j)
	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
	  {
	    /* k-th destination accepts newline character.  */
	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
	    if (need_word_trtable)
	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
	    /* There must be only one destination which accepts
	       newline.  See group_nodes_into_DFAstates.  */
	    break;
	  }
    }

  re_node_set_free (&follows);
  for (i = 0; i < ndests; ++i)
    re_node_set_free (dests_node + i);
  return true;
}
3415
3416 /* Group all nodes belonging to STATE into several destinations.
3417 Then for all destinations, set the nodes belonging to the destination
3418 to DESTS_NODE[i] and set the characters accepted by the destination
3419 to DEST_CH[i]. Return the number of destinations if successful,
3420 -1 on internal error. */
3421
static Idx
group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
			    re_node_set *dests_node, bitset_t *dests_ch)
{
  reg_errcode_t err;
  bool ok;
  Idx i, j, k;
  Idx ndests; /* Number of the destinations from 'state'.  */
  bitset_t accepts; /* Characters a node can accept.  */
  const re_node_set *cur_nodes = &state->nodes;
  bitset_empty (accepts);
  ndests = 0;

  /* For all the nodes belonging to 'state',  */
  for (i = 0; i < cur_nodes->nelem; ++i)
    {
      re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
      re_token_type_t type = node->type;
      unsigned int constraint = node->constraint;

      /* Enumerate all single byte character this node can accept.  */
      if (type == CHARACTER)
	bitset_set (accepts, node->opr.c);
      else if (type == SIMPLE_BRACKET)
	{
	  bitset_merge (accepts, node->opr.sbcset);
	}
      else if (type == OP_PERIOD)
	{
	  /* In a multibyte locale '.' only accepts the single-byte
	     characters here; multibyte ones go through
	     check_node_accept_bytes.  */
	  if (dfa->mb_cur_max > 1)
	    bitset_merge (accepts, dfa->sb_char);
	  else
	    bitset_set_all (accepts);
	  if (!(dfa->syntax & RE_DOT_NEWLINE))
	    bitset_clear (accepts, '\n');
	  if (dfa->syntax & RE_DOT_NOT_NULL)
	    bitset_clear (accepts, '\0');
	}
      else if (type == OP_UTF8_PERIOD)
	{
	  /* UTF-8 '.': accept the ASCII range as single bytes.  */
	  if (ASCII_CHARS % BITSET_WORD_BITS == 0)
	    memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
	  else
	    bitset_merge (accepts, utf8_sb_map);
	  if (!(dfa->syntax & RE_DOT_NEWLINE))
	    bitset_clear (accepts, '\n');
	  if (dfa->syntax & RE_DOT_NOT_NULL)
	    bitset_clear (accepts, '\0');
	}
      else
	continue;

      /* Check the 'accepts' set and sift out the characters which do
	 not match the node's context constraint.  */
      if (constraint)
	{
	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
	    {
	      /* Only newline can satisfy this constraint.  */
	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
	      bitset_empty (accepts);
	      if (accepts_newline)
		bitset_set (accepts, NEWLINE_CHAR);
	      else
		continue;
	    }
	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
	    {
	      /* No character at all can satisfy an end-of-buffer
		 constraint.  */
	      bitset_empty (accepts);
	      continue;
	    }

	  if (constraint & NEXT_WORD_CONSTRAINT)
	    {
	      bitset_word_t any_set = 0;
	      if (type == CHARACTER && !node->word_char)
		{
		  bitset_empty (accepts);
		  continue;
		}
	      /* Restrict ACCEPTS to word characters (in a multibyte
		 locale also keep non-single-byte slots, which are
		 decided elsewhere).  */
	      if (dfa->mb_cur_max > 1)
		for (j = 0; j < BITSET_WORDS; ++j)
		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
	      else
		for (j = 0; j < BITSET_WORDS; ++j)
		  any_set |= (accepts[j] &= dfa->word_char[j]);
	      if (!any_set)
		continue;
	    }
	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
	    {
	      bitset_word_t any_set = 0;
	      if (type == CHARACTER && node->word_char)
		{
		  bitset_empty (accepts);
		  continue;
		}
	      /* Restrict ACCEPTS to non-word characters.  */
	      if (dfa->mb_cur_max > 1)
		for (j = 0; j < BITSET_WORDS; ++j)
		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
	      else
		for (j = 0; j < BITSET_WORDS; ++j)
		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
	      if (!any_set)
		continue;
	    }
	}

      /* Then divide 'accepts' into DFA states, or create a new
	 state.  Above, we make sure that accepts is not empty.  */
      for (j = 0; j < ndests; ++j)
	{
	  bitset_t intersec; /* Intersection sets, see below.  */
	  bitset_t remains;
	  /* Flags, see below.  */
	  bitset_word_t has_intersec, not_subset, not_consumed;

	  /* Optimization, skip if this state doesn't accept the character.  */
	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
	    continue;

	  /* Enumerate the intersection set of this state and 'accepts'.  */
	  has_intersec = 0;
	  for (k = 0; k < BITSET_WORDS; ++k)
	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
	  /* And skip if the intersection set is empty.  */
	  if (!has_intersec)
	    continue;

	  /* Then check if this state is a subset of 'accepts'.  */
	  not_subset = not_consumed = 0;
	  for (k = 0; k < BITSET_WORDS; ++k)
	    {
	      /* REMAINS: characters of destination j outside ACCEPTS;
		 ACCEPTS keeps only the characters not yet assigned.  */
	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
	    }

	  /* If this state isn't a subset of 'accepts', create a
	     new group state, which has the 'remains'.  */
	  if (not_subset)
	    {
	      /* Split destination j: the old group keeps the
		 intersection, the new one takes the remainder.  */
	      bitset_copy (dests_ch[ndests], remains);
	      bitset_copy (dests_ch[j], intersec);
	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
	      if (__glibc_unlikely (err != REG_NOERROR))
		goto error_return;
	      ++ndests;
	    }

	  /* Put the position in the current group.  */
	  ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
	  if (__glibc_unlikely (! ok))
	    goto error_return;

	  /* If all characters are consumed, go to next node.  */
	  if (!not_consumed)
	    break;
	}
      /* Some characters remain, create a new group.  */
      if (j == ndests)
	{
	  bitset_copy (dests_ch[ndests], accepts);
	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
	  if (__glibc_unlikely (err != REG_NOERROR))
	    goto error_return;
	  ++ndests;
	  bitset_empty (accepts);
	}
    }
  assume (ndests <= SBC_MAX);
  return ndests;
 error_return:
  for (j = 0; j < ndests; ++j)
    re_node_set_free (dests_node + j);
  return -1;
}
3597
3598 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3599 Return the number of the bytes the node accepts.
3600 STR_IDX is the current index of the input string.
3601
3602 This function handles the nodes which can accept one character, or
3603 one collating element like '.', '[a-z]', opposite to the other nodes
3604 can only accept one byte. */
3605
3606 #ifdef _LIBC
3607 # include <locale/weight.h>
3608 #endif
3609
static int
check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
			 const re_string_t *input, Idx str_idx)
{
  const re_token_t *node = dfa->nodes + node_idx;
  int char_len, elem_len;
  Idx i;

  if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
    {
      /* Validate a complete multibyte UTF-8 sequence by hand and
	 return its length; 0 means "not accepted here" (ASCII bytes
	 are handled by the single-byte table, not this path).  */
      unsigned char c = re_string_byte_at (input, str_idx), d;
      /* Bytes below 0xc2 cannot start a valid multibyte sequence
	 (0x80-0xc1 are continuation bytes or overlong starters).  */
      if (__glibc_likely (c < 0xc2))
	return 0;

      if (str_idx + 2 > input->len)
	return 0;

      d = re_string_byte_at (input, str_idx + 1);
      if (c < 0xe0)
	return (d < 0x80 || d > 0xbf) ? 0 : 2;
      else if (c < 0xf0)
	{
	  char_len = 3;
	  /* Reject overlong encodings for each length class.  */
	  if (c == 0xe0 && d < 0xa0)
	    return 0;
	}
      else if (c < 0xf8)
	{
	  char_len = 4;
	  if (c == 0xf0 && d < 0x90)
	    return 0;
	}
      else if (c < 0xfc)
	{
	  char_len = 5;
	  if (c == 0xf8 && d < 0x88)
	    return 0;
	}
      else if (c < 0xfe)
	{
	  char_len = 6;
	  if (c == 0xfc && d < 0x84)
	    return 0;
	}
      else
	return 0;

      if (str_idx + char_len > input->len)
	return 0;

      /* Every remaining byte must be a continuation byte.  */
      for (i = 1; i < char_len; ++i)
	{
	  d = re_string_byte_at (input, str_idx + i);
	  if (d < 0x80 || d > 0xbf)
	    return 0;
	}
      return char_len;
    }

  char_len = re_string_char_size_at (input, str_idx);
  if (node->type == OP_PERIOD)
    {
      /* Only multibyte characters reach this path; single-byte ones
	 are handled via the transition table.  */
      if (char_len <= 1)
	return 0;
      /* FIXME: I don't think this if is needed, as both '\n'
	 and '\0' are char_len == 1.  */
      /* '.' accepts any one character except the following two cases.  */
      if ((!(dfa->syntax & RE_DOT_NEWLINE)
	   && re_string_byte_at (input, str_idx) == '\n')
	  || ((dfa->syntax & RE_DOT_NOT_NULL)
	      && re_string_byte_at (input, str_idx) == '\0'))
	return 0;
      return char_len;
    }

  elem_len = re_string_elem_size_at (input, str_idx);
  if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
    return 0;

  if (node->type == COMPLEX_BRACKET)
    {
      const re_charset_t *cset = node->opr.mbcset;
#ifdef _LIBC
      const unsigned char *pin
	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
      Idx j;
      uint32_t nrules;
#endif
      int match_len = 0;
      /* Decode the wide character only when some matcher below will
	 actually use it.  */
      wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
		    ? re_string_wchar_at (input, str_idx) : 0);

      /* match with multibyte character?  */
      for (i = 0; i < cset->nmbchars; ++i)
	if (wc == cset->mbchars[i])
	  {
	    match_len = char_len;
	    goto check_node_accept_bytes_match;
	  }
      /* match with character_class?  */
      for (i = 0; i < cset->nchar_classes; ++i)
	{
	  wctype_t wt = cset->char_classes[i];
	  if (__iswctype (wc, wt))
	    {
	      match_len = char_len;
	      goto check_node_accept_bytes_match;
	    }
	}

#ifdef _LIBC
      nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
      if (nrules != 0)
	{
	  /* The locale defines collation rules: compare by collating
	     elements and collation sequence values.  */
	  unsigned int in_collseq = 0;
	  const int32_t *table, *indirect;
	  const unsigned char *weights, *extra;
	  const char *collseqwc;

	  /* match with collating_symbol?  */
	  if (cset->ncoll_syms)
	    extra = (const unsigned char *)
	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
	  for (i = 0; i < cset->ncoll_syms; ++i)
	    {
	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
	      /* Compare the length of input collating element and
		 the length of current collating element.  */
	      if (*coll_sym != elem_len)
		continue;
	      /* Compare each bytes.  */
	      for (j = 0; j < *coll_sym; j++)
		if (pin[j] != coll_sym[1 + j])
		  break;
	      if (j == *coll_sym)
		{
		  /* Match if every bytes is equal.  */
		  match_len = j;
		  goto check_node_accept_bytes_match;
		}
	    }

	  if (cset->nranges)
	    {
	      /* Determine the collation sequence value of the input
		 element, from the wide character when it is a plain
		 character, otherwise from the collating-element
		 table.  */
	      if (elem_len <= char_len)
		{
		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
		  in_collseq = __collseq_table_lookup (collseqwc, wc);
		}
	      else
		in_collseq = find_collation_sequence_value (pin, elem_len);
	    }
	  /* match with range expression?  */
	  /* FIXME: Implement rational ranges here, too.  */
	  for (i = 0; i < cset->nranges; ++i)
	    if (cset->range_starts[i] <= in_collseq
		&& in_collseq <= cset->range_ends[i])
	      {
		match_len = elem_len;
		goto check_node_accept_bytes_match;
	      }

	  /* match with equivalence_class?  */
	  if (cset->nequiv_classes)
	    {
	      const unsigned char *cp = pin;
	      table = (const int32_t *)
		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
	      weights = (const unsigned char *)
		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
	      extra = (const unsigned char *)
		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
	      indirect = (const int32_t *)
		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
	      /* findidx packs the rule number into the top byte of the
		 returned index; split them apart.  */
	      int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
	      int32_t rule = idx >> 24;
	      idx &= 0xffffff;
	      if (idx > 0)
		{
		  size_t weight_len = weights[idx];
		  for (i = 0; i < cset->nequiv_classes; ++i)
		    {
		      int32_t equiv_class_idx = cset->equiv_classes[i];
		      int32_t equiv_class_rule = equiv_class_idx >> 24;
		      equiv_class_idx &= 0xffffff;
		      /* Equivalent iff same rule and identical weight
			 sequences.  */
		      if (weights[equiv_class_idx] == weight_len
			  && equiv_class_rule == rule
			  && memcmp (weights + idx + 1,
				     weights + equiv_class_idx + 1,
				     weight_len) == 0)
			{
			  match_len = elem_len;
			  goto check_node_accept_bytes_match;
			}
		    }
		}
	    }
	}
      else
#endif /* _LIBC */
	{
	  /* No collation rules: ranges compare by wide character
	     value.  */
	  /* match with range expression?  */
	  for (i = 0; i < cset->nranges; ++i)
	    {
	      if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
		{
		  match_len = char_len;
		  goto check_node_accept_bytes_match;
		}
	    }
	}
    check_node_accept_bytes_match:
      /* For a negated bracket ([^...]), a hit means rejection and a
	 miss means accepting the whole element/character.  */
      if (!cset->non_match)
	return match_len;
      else
	{
	  if (match_len > 0)
	    return 0;
	  else
	    return (elem_len > char_len) ? elem_len : char_len;
	}
    }
  return 0;
}
3834
3835 #ifdef _LIBC
/* Look up the collation sequence value of the MBS_LEN-byte collating
   element MBS in the current LC_COLLATE data; return UINT_MAX if it is
   not found (or cannot be represented).  */
static unsigned int
find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
{
  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
  if (nrules == 0)
    {
      if (mbs_len == 1)
	{
	  /* No valid character.  Match it as a single byte character.  */
	  const unsigned char *collseq = (const unsigned char *)
	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
	  return collseq[mbs[0]];
	}
      return UINT_MAX;
    }
  else
    {
      /* Linear scan of the variable-length collating-element records in
	 the SYMB_EXTRAMB table.  NOTE(review): the record layout below
	 (name, byte sequence, padding, sequence value, wide-char
	 sequence) mirrors the locale-data format; verify against the
	 localedata writer if this is changed.  */
      int32_t idx;
      const unsigned char *extra = (const unsigned char *)
	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
      int32_t extrasize = (const unsigned char *)
	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;

      for (idx = 0; idx < extrasize;)
	{
	  int mbs_cnt;
	  bool found = false;
	  int32_t elem_mbs_len;
	  /* Skip the name of collating element name.  */
	  idx = idx + extra[idx] + 1;
	  elem_mbs_len = extra[idx++];
	  if (mbs_len == elem_mbs_len)
	    {
	      /* Byte-compare the element's multibyte sequence.  */
	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
		  break;
	      if (mbs_cnt == elem_mbs_len)
		/* Found the entry.  */
		found = true;
	    }
	  /* Skip the byte sequence of the collating element.  */
	  idx += elem_mbs_len;
	  /* Adjust for the alignment.  */
	  idx = (idx + 3) & ~3;
	  /* Skip the collation sequence value.  */
	  idx += sizeof (uint32_t);
	  /* Skip the wide char sequence of the collating element.  */
	  idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
	  /* If we found the entry, return the sequence value.  */
	  if (found)
	    return *(uint32_t *) (extra + idx);
	  /* Skip the collation sequence value.  */
	  idx += sizeof (uint32_t);
	}
      return UINT_MAX;
    }
}
3893 #endif /* _LIBC */
3894
3895 /* Check whether the node accepts the byte which is IDX-th
3896 byte of the INPUT. */
3897
3898 static bool
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,Idx idx)3899 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3900 Idx idx)
3901 {
3902 unsigned char ch;
3903 ch = re_string_byte_at (&mctx->input, idx);
3904 switch (node->type)
3905 {
3906 case CHARACTER:
3907 if (node->opr.c != ch)
3908 return false;
3909 break;
3910
3911 case SIMPLE_BRACKET:
3912 if (!bitset_contain (node->opr.sbcset, ch))
3913 return false;
3914 break;
3915
3916 case OP_UTF8_PERIOD:
3917 if (ch >= ASCII_CHARS)
3918 return false;
3919 FALLTHROUGH;
3920 case OP_PERIOD:
3921 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3922 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3923 return false;
3924 break;
3925
3926 default:
3927 return false;
3928 }
3929
3930 if (node->constraint)
3931 {
3932 /* The node has constraints. Check whether the current context
3933 satisfies the constraints. */
3934 unsigned int context = re_string_context_at (&mctx->input, idx,
3935 mctx->eflags);
3936 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3937 return false;
3938 }
3939
3940 return true;
3941 }
3942
3943 /* Extend the buffers, if the buffers have run out. */
3944
3945 static reg_errcode_t
3946 __attribute_warn_unused_result__
extend_buffers(re_match_context_t * mctx,int min_len)3947 extend_buffers (re_match_context_t *mctx, int min_len)
3948 {
3949 reg_errcode_t ret;
3950 re_string_t *pstr = &mctx->input;
3951
3952 /* Avoid overflow. */
3953 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3954 <= pstr->bufs_len))
3955 return REG_ESPACE;
3956
3957 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
3958 ret = re_string_realloc_buffers (pstr,
3959 MAX (min_len,
3960 MIN (pstr->len, pstr->bufs_len * 2)));
3961 if (__glibc_unlikely (ret != REG_NOERROR))
3962 return ret;
3963
3964 if (mctx->state_log != NULL)
3965 {
3966 /* And double the length of state_log. */
3967 /* XXX We have no indication of the size of this buffer. If this
3968 allocation fail we have no indication that the state_log array
3969 does not have the right size. */
3970 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3971 pstr->bufs_len + 1);
3972 if (__glibc_unlikely (new_array == NULL))
3973 return REG_ESPACE;
3974 mctx->state_log = new_array;
3975 }
3976
3977 /* Then reconstruct the buffers. */
3978 if (pstr->icase)
3979 {
3980 if (pstr->mb_cur_max > 1)
3981 {
3982 ret = build_wcs_upper_buffer (pstr);
3983 if (__glibc_unlikely (ret != REG_NOERROR))
3984 return ret;
3985 }
3986 else
3987 build_upper_buffer (pstr);
3988 }
3989 else
3990 {
3991 if (pstr->mb_cur_max > 1)
3992 build_wcs_buffer (pstr);
3993 else
3994 {
3995 if (pstr->trans != NULL)
3996 re_string_translate_buffer (pstr);
3997 }
3998 }
3999 return REG_NOERROR;
4000 }
4001
4002
4003 /* Functions for matching context. */
4004
4005 /* Initialize MCTX. */
4006
4007 static reg_errcode_t
4008 __attribute_warn_unused_result__
match_ctx_init(re_match_context_t * mctx,int eflags,Idx n)4009 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4010 {
4011 mctx->eflags = eflags;
4012 mctx->match_last = -1;
4013 if (n > 0)
4014 {
4015 /* Avoid overflow. */
4016 size_t max_object_size =
4017 MAX (sizeof (struct re_backref_cache_entry),
4018 sizeof (re_sub_match_top_t *));
4019 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4020 return REG_ESPACE;
4021
4022 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4023 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4024 if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4025 return REG_ESPACE;
4026 }
4027 /* Already zero-ed by the caller.
4028 else
4029 mctx->bkref_ents = NULL;
4030 mctx->nbkref_ents = 0;
4031 mctx->nsub_tops = 0; */
4032 mctx->abkref_ents = n;
4033 mctx->max_mb_elem_len = 1;
4034 mctx->asub_tops = n;
4035 return REG_NOERROR;
4036 }
4037
4038 /* Clean the entries which depend on the current input in MCTX.
4039 This function must be invoked when the matcher changes the start index
4040 of the input, or changes the input string. */
4041
4042 static void
match_ctx_clean(re_match_context_t * mctx)4043 match_ctx_clean (re_match_context_t *mctx)
4044 {
4045 Idx st_idx;
4046 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4047 {
4048 Idx sl_idx;
4049 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4050 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4051 {
4052 re_sub_match_last_t *last = top->lasts[sl_idx];
4053 re_free (last->path.array);
4054 re_free (last);
4055 }
4056 re_free (top->lasts);
4057 if (top->path)
4058 {
4059 re_free (top->path->array);
4060 re_free (top->path);
4061 }
4062 re_free (top);
4063 }
4064
4065 mctx->nsub_tops = 0;
4066 mctx->nbkref_ents = 0;
4067 }
4068
4069 /* Free all the memory associated with MCTX. */
4070
4071 static void
match_ctx_free(re_match_context_t * mctx)4072 match_ctx_free (re_match_context_t *mctx)
4073 {
4074 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4075 match_ctx_clean (mctx);
4076 re_free (mctx->sub_tops);
4077 re_free (mctx->bkref_ents);
4078 }
4079
4080 /* Add a new backreference entry to MCTX.
4081 Note that we assume that caller never call this function with duplicate
4082 entry, and call with STR_IDX which isn't smaller than any existing entry.
4083 */
4084
4085 static reg_errcode_t
4086 __attribute_warn_unused_result__
match_ctx_add_entry(re_match_context_t * mctx,Idx node,Idx str_idx,Idx from,Idx to)4087 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4088 Idx to)
4089 {
4090 if (mctx->nbkref_ents >= mctx->abkref_ents)
4091 {
4092 struct re_backref_cache_entry* new_entry;
4093 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4094 mctx->abkref_ents * 2);
4095 if (__glibc_unlikely (new_entry == NULL))
4096 {
4097 re_free (mctx->bkref_ents);
4098 return REG_ESPACE;
4099 }
4100 mctx->bkref_ents = new_entry;
4101 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4102 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4103 mctx->abkref_ents *= 2;
4104 }
4105 if (mctx->nbkref_ents > 0
4106 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4107 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4108
4109 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4110 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4111 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4112 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4113
4114 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4115 If bit N is clear, means that this entry won't epsilon-transition to
4116 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4117 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4118 such node.
4119
4120 A backreference does not epsilon-transition unless it is empty, so set
4121 to all zeros if FROM != TO. */
4122 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4123 = (from == to ? -1 : 0);
4124
4125 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4126 if (mctx->max_mb_elem_len < to - from)
4127 mctx->max_mb_elem_len = to - from;
4128 return REG_NOERROR;
4129 }
4130
4131 /* Return the first entry with the same str_idx, or -1 if none is
4132 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4133
4134 static Idx
search_cur_bkref_entry(const re_match_context_t * mctx,Idx str_idx)4135 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4136 {
4137 Idx left, right, mid, last;
4138 last = right = mctx->nbkref_ents;
4139 for (left = 0; left < right;)
4140 {
4141 mid = (left + right) / 2;
4142 if (mctx->bkref_ents[mid].str_idx < str_idx)
4143 left = mid + 1;
4144 else
4145 right = mid;
4146 }
4147 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4148 return left;
4149 else
4150 return -1;
4151 }
4152
4153 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4154 at STR_IDX. */
4155
4156 static reg_errcode_t
4157 __attribute_warn_unused_result__
match_ctx_add_subtop(re_match_context_t * mctx,Idx node,Idx str_idx)4158 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4159 {
4160 DEBUG_ASSERT (mctx->sub_tops != NULL);
4161 DEBUG_ASSERT (mctx->asub_tops > 0);
4162 if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4163 {
4164 Idx new_asub_tops = mctx->asub_tops * 2;
4165 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4166 re_sub_match_top_t *,
4167 new_asub_tops);
4168 if (__glibc_unlikely (new_array == NULL))
4169 return REG_ESPACE;
4170 mctx->sub_tops = new_array;
4171 mctx->asub_tops = new_asub_tops;
4172 }
4173 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4174 if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4175 return REG_ESPACE;
4176 mctx->sub_tops[mctx->nsub_tops]->node = node;
4177 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4178 return REG_NOERROR;
4179 }
4180
4181 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4182 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4183 Return the new entry if successful, NULL if memory is exhausted. */
4184
4185 static re_sub_match_last_t *
match_ctx_add_sublast(re_sub_match_top_t * subtop,Idx node,Idx str_idx)4186 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4187 {
4188 re_sub_match_last_t *new_entry;
4189 if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4190 {
4191 Idx new_alasts = 2 * subtop->alasts + 1;
4192 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4193 re_sub_match_last_t *,
4194 new_alasts);
4195 if (__glibc_unlikely (new_array == NULL))
4196 return NULL;
4197 subtop->lasts = new_array;
4198 subtop->alasts = new_alasts;
4199 }
4200 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4201 if (__glibc_likely (new_entry != NULL))
4202 {
4203 subtop->lasts[subtop->nlasts] = new_entry;
4204 new_entry->node = node;
4205 new_entry->str_idx = str_idx;
4206 ++subtop->nlasts;
4207 }
4208 return new_entry;
4209 }
4210
4211 static void
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,Idx last_node,Idx last_str_idx)4212 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4213 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4214 {
4215 sctx->sifted_states = sifted_sts;
4216 sctx->limited_states = limited_sts;
4217 sctx->last_node = last_node;
4218 sctx->last_str_idx = last_str_idx;
4219 re_node_set_init_empty (&sctx->limits);
4220 }
4221