1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n);
22 static void match_ctx_clean (re_match_context_t *mctx);
23 static void match_ctx_free (re_match_context_t *cache);
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to);
26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 Idx str_idx);
29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 Idx node, Idx str_idx);
31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 re_dfastate_t **limited_sts, Idx last_node,
33 Idx last_str_idx);
34 static reg_errcode_t re_search_internal (const regex_t *preg,
35 const char *string, Idx length,
36 Idx start, Idx last_start, Idx stop,
37 size_t nmatch, regmatch_t pmatch[],
38 int eflags);
39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 const char *string1, Idx length1,
41 const char *string2, Idx length2,
42 Idx start, regoff_t range,
43 struct re_registers *regs,
44 Idx stop, bool ret_len);
45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 const char *string, Idx length, Idx start,
47 regoff_t range, Idx stop,
48 struct re_registers *regs,
49 bool ret_len);
50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51 Idx nregs, int regs_allocated);
52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 Idx *p_match_first);
55 static Idx check_halt_state_context (const re_match_context_t *mctx,
56 const re_dfastate_t *state, Idx idx);
57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 regmatch_t *prev_idx_match, Idx cur_node,
59 Idx cur_idx, Idx nmatch);
60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 Idx str_idx, Idx dest_node, Idx nregs,
62 regmatch_t *regs, regmatch_t *prevregs,
63 re_node_set *eps_via_nodes);
64 static reg_errcode_t set_regs (const regex_t *preg,
65 const re_match_context_t *mctx,
66 size_t nmatch, regmatch_t *pmatch,
67 bool fl_backtrack);
68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69
70 #ifdef RE_ENABLE_I18N
71 static int sift_states_iter_mb (const re_match_context_t *mctx,
72 re_sift_context_t *sctx,
73 Idx node_idx, Idx str_idx, Idx max_str_idx);
74 #endif /* RE_ENABLE_I18N */
75 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
76 re_sift_context_t *sctx);
77 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
78 re_sift_context_t *sctx, Idx str_idx,
79 re_node_set *cur_dest);
80 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
81 re_sift_context_t *sctx,
82 Idx str_idx,
83 re_node_set *dest_nodes);
84 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
85 re_node_set *dest_nodes,
86 const re_node_set *candidates);
87 static bool check_dst_limits (const re_match_context_t *mctx,
88 const re_node_set *limits,
89 Idx dst_node, Idx dst_idx, Idx src_node,
90 Idx src_idx);
91 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
92 int boundaries, Idx subexp_idx,
93 Idx from_node, Idx bkref_idx);
94 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
95 Idx limit, Idx subexp_idx,
96 Idx node, Idx str_idx,
97 Idx bkref_idx);
98 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
99 re_node_set *dest_nodes,
100 const re_node_set *candidates,
101 re_node_set *limits,
102 struct re_backref_cache_entry *bkref_ents,
103 Idx str_idx);
104 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
105 re_sift_context_t *sctx,
106 Idx str_idx, const re_node_set *candidates);
107 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
108 re_dfastate_t **dst,
109 re_dfastate_t **src, Idx num);
110 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111 re_match_context_t *mctx);
112 static re_dfastate_t *transit_state (reg_errcode_t *err,
113 re_match_context_t *mctx,
114 re_dfastate_t *state);
115 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
116 re_match_context_t *mctx,
117 re_dfastate_t *next_state);
118 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
119 re_node_set *cur_nodes,
120 Idx str_idx);
121 #if 0
122 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
123 re_match_context_t *mctx,
124 re_dfastate_t *pstate);
125 #endif
126 #ifdef RE_ENABLE_I18N
127 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128 re_dfastate_t *pstate);
129 #endif /* RE_ENABLE_I18N */
130 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131 const re_node_set *nodes);
132 static reg_errcode_t get_subexp (re_match_context_t *mctx,
133 Idx bkref_node, Idx bkref_str_idx);
134 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
135 const re_sub_match_top_t *sub_top,
136 re_sub_match_last_t *sub_last,
137 Idx bkref_node, Idx bkref_str);
138 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139 Idx subexp_idx, int type);
140 static reg_errcode_t check_arrival (re_match_context_t *mctx,
141 state_array_t *path, Idx top_node,
142 Idx top_str, Idx last_node, Idx last_str,
143 int type);
144 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
145 Idx str_idx,
146 re_node_set *cur_nodes,
147 re_node_set *next_nodes);
148 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
149 re_node_set *cur_nodes,
150 Idx ex_subexp, int type);
151 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
152 re_node_set *dst_nodes,
153 Idx target, Idx ex_subexp,
154 int type);
155 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
156 re_node_set *cur_nodes, Idx cur_str,
157 Idx subexp_num, int type);
158 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
159 #ifdef RE_ENABLE_I18N
160 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161 const re_string_t *input, Idx idx);
162 # ifdef _LIBC
163 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
164 size_t name_len);
165 # endif /* _LIBC */
166 #endif /* RE_ENABLE_I18N */
167 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
168 const re_dfastate_t *state,
169 re_node_set *states_node,
170 bitset_t *states_ch);
171 static bool check_node_accept (const re_match_context_t *mctx,
172 const re_token_t *node, Idx idx);
173 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
174
175 /* Entry point for POSIX code. */
176
177 /* regexec searches for a given pattern, specified by PREG, in the
178 string STRING.
179
180 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
181 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
182 least NMATCH elements, and we set them to the offsets of the
183 corresponding matched substrings.
184
185 EFLAGS specifies "execution flags" which affect matching: if
186 REG_NOTBOL is set, then ^ does not match at the beginning of the
187 string; if REG_NOTEOL is set, then $ does not match at the end.
188
189 Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
190 EFLAGS is invalid. */
191
192 int
193 regexec (const regex_t *__restrict preg, const char *__restrict string,
194 size_t nmatch, regmatch_t pmatch[], int eflags)
195 {
196 reg_errcode_t err;
197 Idx start, length;
198 re_dfa_t *dfa = preg->buffer;
199
200 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
201 return REG_BADPAT;
202
203 if (eflags & REG_STARTEND)
204 {
205 start = pmatch[0].rm_so;
206 length = pmatch[0].rm_eo;
207 }
208 else
209 {
210 start = 0;
211 length = strlen (string);
212 }
213
214 lock_lock (dfa->lock);
215 if (preg->no_sub)
216 err = re_search_internal (preg, string, length, start, length,
217 length, 0, NULL, eflags);
218 else
219 err = re_search_internal (preg, string, length, start, length,
220 length, nmatch, pmatch, eflags);
221 lock_unlock (dfa->lock);
222 return err != REG_NOERROR;
223 }
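/* Illustrative sketch (not part of the library sources): how a caller might
   use regexec with REG_STARTEND as described in the comment above.  The
   caller code, pattern and variable names below are assumptions made for
   illustration only.  */
#if 0
#include <regex.h>
#include <stdio.h>
#include <string.h>

static void
example_regexec (void)
{
  regex_t re;
  regmatch_t m[2];
  const char *text = "key=value";

  if (regcomp (&re, "([^=]*)=", REG_EXTENDED) != 0)
    return;

  /* With REG_STARTEND, pmatch[0] on entry selects the substring to search;
     reported offsets remain relative to TEXT.  */
  m[0].rm_so = 0;
  m[0].rm_eo = strlen (text);
  if (regexec (&re, text, 2, m, REG_STARTEND) == 0)
    printf ("group 1: %.*s\n", (int) (m[1].rm_eo - m[1].rm_so),
            text + m[1].rm_so);

  regfree (&re);
}
#endif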
224
225 #ifdef _LIBC
226 libc_hidden_def (__regexec)
227
228 # include <shlib-compat.h>
229 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
230
231 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
232 __typeof__ (__regexec) __compat_regexec;
233
234 int
235 attribute_compat_text_section
236 __compat_regexec (const regex_t *__restrict preg,
237 const char *__restrict string, size_t nmatch,
238 regmatch_t pmatch[], int eflags)
239 {
240 return regexec (preg, string, nmatch, pmatch,
241 eflags & (REG_NOTBOL | REG_NOTEOL));
242 }
243 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
244 # endif
245 #endif
246
247 /* Entry points for GNU code. */
248
249 /* re_match, re_search, re_match_2, re_search_2
250
251 The former two functions operate on STRING with length LENGTH,
252 while the latter two operate on the concatenation of STRING1 and STRING2
253 with lengths LENGTH1 and LENGTH2, respectively.
254
255 re_match() matches the compiled pattern in BUFP against the string,
256 starting at index START.
257
258 re_search() first tries matching at index START, then it tries to match
259 starting from index START + 1, and so on. The last start position tried
260 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
261 way as re_match().)
262
263 The parameter STOP of re_{match,search}_2 specifies that no match may
264 extend beyond the first STOP characters of the concatenation of the
265 strings.
266
267 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
268 and of all groups are stored in REGS. (For the "_2" variants, the offsets are
269 computed relative to the concatenation, not relative to the individual
270 strings.)
271
272 On success, the re_match* functions return the length of the match, and
273 the re_search* functions return the position of its start. They return -1 on
274 match failure, -2 on error. */
275
276 regoff_t
277 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
278 Idx start, struct re_registers *regs)
279 {
280 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
281 }
282 #ifdef _LIBC
283 weak_alias (__re_match, re_match)
284 #endif
285
286 regoff_t
287 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
288 Idx start, regoff_t range, struct re_registers *regs)
289 {
290 return re_search_stub (bufp, string, length, start, range, length, regs,
291 false);
292 }
293 #ifdef _LIBC
294 weak_alias (__re_search, re_search)
295 #endif
296
297 regoff_t
298 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
299 const char *string2, Idx length2, Idx start,
300 struct re_registers *regs, Idx stop)
301 {
302 return re_search_2_stub (bufp, string1, length1, string2, length2,
303 start, 0, regs, stop, true);
304 }
305 #ifdef _LIBC
306 weak_alias (__re_match_2, re_match_2)
307 #endif
308
309 regoff_t
310 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
311 const char *string2, Idx length2, Idx start, regoff_t range,
312 struct re_registers *regs, Idx stop)
313 {
314 return re_search_2_stub (bufp, string1, length1, string2, length2,
315 start, range, regs, stop, false);
316 }
317 #ifdef _LIBC
318 weak_alias (__re_search_2, re_search_2)
319 #endif
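/* Illustrative sketch of the GNU entry points documented above.  The calling
   code, pattern and variable names are assumptions for illustration, not
   something defined in this file.  */
#if 0
#include <regex.h>
#include <stdio.h>
#include <string.h>

static void
example_re_search (void)
{
  struct re_pattern_buffer buf;
  struct re_registers regs;
  const char *text = "abc abd";
  const char *err;

  memset (&buf, 0, sizeof buf);
  re_syntax_options = RE_SYNTAX_POSIX_EXTENDED;
  err = re_compile_pattern ("ab(c|d)", strlen ("ab(c|d)"), &buf);
  if (err != NULL)
    return;

  /* Try every start position from START to START + RANGE; the return value
     is the position where the match starts, or -1 / -2 as described above.  */
  regoff_t pos = re_search (&buf, text, strlen (text), 0, strlen (text),
                            &regs);
  if (pos >= 0)
    printf ("match at %ld, group 1 at [%ld,%ld)\n", (long) pos,
            (long) regs.start[1], (long) regs.end[1]);

  regfree (&buf);
}
#endif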
320
321 static regoff_t
322 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
323 Idx length1, const char *string2, Idx length2, Idx start,
324 regoff_t range, struct re_registers *regs,
325 Idx stop, bool ret_len)
326 {
327 const char *str;
328 regoff_t rval;
329 Idx len;
330 char *s = NULL;
331
332 if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
333 || INT_ADD_WRAPV (length1, length2, &len))))
334 return -2;
335
336 /* Concatenate the strings. */
337 if (length2 > 0)
338 if (length1 > 0)
339 {
340 s = re_malloc (char, len);
341
342 if (__glibc_unlikely (s == NULL))
343 return -2;
344 #ifdef _LIBC
345 memcpy (__mempcpy (s, string1, length1), string2, length2);
346 #else
347 memcpy (s, string1, length1);
348 memcpy (s + length1, string2, length2);
349 #endif
350 str = s;
351 }
352 else
353 str = string2;
354 else
355 str = string1;
356
357 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
358 ret_len);
359 re_free (s);
360 return rval;
361 }
362
363 /* The parameters have the same meaning as those of re_search.
364 Additional parameters:
365 If RET_LEN is true, the length of the match is returned (re_match style);
366 otherwise the position of the match is returned. */
367
368 static regoff_t
369 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
370 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
371 bool ret_len)
372 {
373 reg_errcode_t result;
374 regmatch_t *pmatch;
375 Idx nregs;
376 regoff_t rval;
377 int eflags = 0;
378 re_dfa_t *dfa = bufp->buffer;
379 Idx last_start = start + range;
380
381 /* Check for out-of-range. */
382 if (__glibc_unlikely (start < 0 || start > length))
383 return -1;
384 if (__glibc_unlikely (length < last_start
385 || (0 <= range && last_start < start)))
386 last_start = length;
387 else if (__glibc_unlikely (last_start < 0
388 || (range < 0 && start <= last_start)))
389 last_start = 0;
390
391 lock_lock (dfa->lock);
392
393 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
394 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
395
396 /* Compile fastmap if we haven't yet. */
397 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
398 re_compile_fastmap (bufp);
399
400 if (__glibc_unlikely (bufp->no_sub))
401 regs = NULL;
402
403 /* We need at least 1 register. */
404 if (regs == NULL)
405 nregs = 1;
406 else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
407 && regs->num_regs <= bufp->re_nsub))
408 {
409 nregs = regs->num_regs;
410 if (__glibc_unlikely (nregs < 1))
411 {
412 /* Nothing can be copied to regs. */
413 regs = NULL;
414 nregs = 1;
415 }
416 }
417 else
418 nregs = bufp->re_nsub + 1;
419 pmatch = re_malloc (regmatch_t, nregs);
420 if (__glibc_unlikely (pmatch == NULL))
421 {
422 rval = -2;
423 goto out;
424 }
425
426 result = re_search_internal (bufp, string, length, start, last_start, stop,
427 nregs, pmatch, eflags);
428
429 rval = 0;
430
431 /* I hope we needn't fill their regs with -1's when no match was found. */
432 if (result != REG_NOERROR)
433 rval = result == REG_NOMATCH ? -1 : -2;
434 else if (regs != NULL)
435 {
436 /* If caller wants register contents data back, copy them. */
437 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
438 bufp->regs_allocated);
439 if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
440 rval = -2;
441 }
442
443 if (__glibc_likely (rval == 0))
444 {
445 if (ret_len)
446 {
447 DEBUG_ASSERT (pmatch[0].rm_so == start);
448 rval = pmatch[0].rm_eo - start;
449 }
450 else
451 rval = pmatch[0].rm_so;
452 }
453 re_free (pmatch);
454 out:
455 lock_unlock (dfa->lock);
456 return rval;
457 }
458
459 static unsigned
460 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
461 int regs_allocated)
462 {
463 int rval = REGS_REALLOCATE;
464 Idx i;
465 Idx need_regs = nregs + 1;
466 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
467 uses. */
468
469 /* Have the register data arrays been allocated? */
470 if (regs_allocated == REGS_UNALLOCATED)
471 { /* No. So allocate them with malloc. */
472 regs->start = re_malloc (regoff_t, need_regs);
473 if (__glibc_unlikely (regs->start == NULL))
474 return REGS_UNALLOCATED;
475 regs->end = re_malloc (regoff_t, need_regs);
476 if (__glibc_unlikely (regs->end == NULL))
477 {
478 re_free (regs->start);
479 return REGS_UNALLOCATED;
480 }
481 regs->num_regs = need_regs;
482 }
483 else if (regs_allocated == REGS_REALLOCATE)
484 { /* Yes. If we need more elements than were already
485 allocated, reallocate them. If we need fewer, just
486 leave it alone. */
487 if (__glibc_unlikely (need_regs > regs->num_regs))
488 {
489 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
490 regoff_t *new_end;
491 if (__glibc_unlikely (new_start == NULL))
492 return REGS_UNALLOCATED;
493 new_end = re_realloc (regs->end, regoff_t, need_regs);
494 if (__glibc_unlikely (new_end == NULL))
495 {
496 re_free (new_start);
497 return REGS_UNALLOCATED;
498 }
499 regs->start = new_start;
500 regs->end = new_end;
501 regs->num_regs = need_regs;
502 }
503 }
504 else
505 {
506 DEBUG_ASSERT (regs_allocated == REGS_FIXED);
507 /* This function may not be called with REGS_FIXED and nregs too big. */
508 DEBUG_ASSERT (nregs <= regs->num_regs);
509 rval = REGS_FIXED;
510 }
511
512 /* Copy the regs. */
513 for (i = 0; i < nregs; ++i)
514 {
515 regs->start[i] = pmatch[i].rm_so;
516 regs->end[i] = pmatch[i].rm_eo;
517 }
518 for ( ; i < regs->num_regs; ++i)
519 regs->start[i] = regs->end[i] = -1;
520
521 return rval;
522 }
523
524 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
525 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
526 this memory for recording register information. STARTS and ENDS
527 must be allocated using the malloc library routine, and must each
528 be at least NUM_REGS * sizeof (regoff_t) bytes long.
529
530 If NUM_REGS == 0, then subsequent matches should allocate their own
531 register data.
532
533 Unless this function is called, the first search or match using
534 PATTERN_BUFFER will allocate its own register data, without
535 freeing the old data. */
536
537 void
538 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
539 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
540 {
541 if (num_regs)
542 {
543 bufp->regs_allocated = REGS_REALLOCATE;
544 regs->num_regs = num_regs;
545 regs->start = starts;
546 regs->end = ends;
547 }
548 else
549 {
550 bufp->regs_allocated = REGS_UNALLOCATED;
551 regs->num_regs = 0;
552 regs->start = regs->end = NULL;
553 }
554 }
555 #ifdef _LIBC
556 weak_alias (__re_set_registers, re_set_registers)
557 #endif
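/* Illustrative sketch of the contract documented above re_set_registers:
   STARTS and ENDS come from malloc and hold at least NUM_REGS elements.
   The helper below and its sizing choice are assumptions for illustration.  */
#if 0
#include <regex.h>
#include <stdlib.h>

static int
example_re_set_registers (struct re_pattern_buffer *bufp,
                          struct re_registers *regs, size_t nsub)
{
  /* One slot per group, one for the whole match, one for the -1 marker.  */
  size_t num_regs = nsub + 2;
  regoff_t *starts = malloc (num_regs * sizeof *starts);
  regoff_t *ends = malloc (num_regs * sizeof *ends);
  if (starts == NULL || ends == NULL)
    {
      free (starts);
      free (ends);
      return -1;
    }
  /* Subsequent re_search/re_match calls on BUFP reuse this storage.  */
  re_set_registers (bufp, regs, num_regs, starts, ends);
  return 0;
}
#endif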
558
559 /* Entry points compatible with 4.2 BSD regex library. We don't define
560 them unless specifically requested. */
561
562 #if defined _REGEX_RE_COMP || defined _LIBC
563 int
564 # ifdef _LIBC
565 weak_function
566 # endif
567 re_exec (const char *s)
568 {
569 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
570 }
571 #endif /* _REGEX_RE_COMP */
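/* Illustrative sketch of the 4.2 BSD style interface above: re_comp compiles
   into a single global buffer and re_exec matches against it.  This caller is
   an assumption (re_comp itself is defined elsewhere, and _REGEX_RE_COMP must
   be requested to expose these declarations).  */
#if 0
#include <regex.h>
#include <stdio.h>

static void
example_re_exec (void)
{
  const char *err = re_comp ("foo.*bar");
  if (err != NULL)            /* Non-NULL means a compilation error.  */
    return;
  if (re_exec ("xxfoo--barxx"))
    puts ("matched");
}
#endif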
572
573 /* Internal entry point. */
574
575 /* Searches for a compiled pattern PREG in the string STRING, whose
576 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
577 meaning as with regexec. LAST_START is START + RANGE, where
578 START and RANGE have the same meaning as with re_search.
579 Return REG_NOERROR if we find a match, REG_NOMATCH if there is none,
580 and otherwise the appropriate error code.
581 Note: We assume front end functions already check ranges.
582 (0 <= LAST_START && LAST_START <= LENGTH) */
583
584 static reg_errcode_t
585 __attribute_warn_unused_result__
586 re_search_internal (const regex_t *preg, const char *string, Idx length,
587 Idx start, Idx last_start, Idx stop, size_t nmatch,
588 regmatch_t pmatch[], int eflags)
589 {
590 reg_errcode_t err;
591 const re_dfa_t *dfa = preg->buffer;
592 Idx left_lim, right_lim;
593 int incr;
594 bool fl_longest_match;
595 int match_kind;
596 Idx match_first;
597 Idx match_last = -1;
598 Idx extra_nmatch;
599 bool sb;
600 int ch;
601 re_match_context_t mctx = { .dfa = dfa };
602 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
603 && start != last_start && !preg->can_be_null)
604 ? preg->fastmap : NULL);
605 RE_TRANSLATE_TYPE t = preg->translate;
606
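/* Slots in PMATCH beyond what the pattern can fill (EXTRA_NMATCH of them)
   are reported as unused, i.e. set to -1, once a match is found below.  */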
607 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
608 nmatch -= extra_nmatch;
609
610 /* Check if the DFA hasn't been compiled. */
611 if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
612 || dfa->init_state_word == NULL
613 || dfa->init_state_nl == NULL
614 || dfa->init_state_begbuf == NULL))
615 return REG_NOMATCH;
616
617 /* We assume front-end functions already check them. */
618 DEBUG_ASSERT (0 <= last_start && last_start <= length);
619
620 /* If initial states with non-begbuf contexts have no elements,
621 the regex must be anchored. If preg->newline_anchor is set,
622 we'll never use init_state_nl, so do not check it. */
623 if (dfa->init_state->nodes.nelem == 0
624 && dfa->init_state_word->nodes.nelem == 0
625 && (dfa->init_state_nl->nodes.nelem == 0
626 || !preg->newline_anchor))
627 {
628 if (start != 0 && last_start != 0)
629 return REG_NOMATCH;
630 start = last_start = 0;
631 }
632
633 /* We must check for the longest match if nmatch > 0 or there are back-references. */
634 fl_longest_match = (nmatch != 0 || dfa->nbackref);
635
636 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
637 preg->translate, (preg->syntax & RE_ICASE) != 0,
638 dfa);
639 if (__glibc_unlikely (err != REG_NOERROR))
640 goto free_return;
641 mctx.input.stop = stop;
642 mctx.input.raw_stop = stop;
643 mctx.input.newline_anchor = preg->newline_anchor;
644
645 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
646 if (__glibc_unlikely (err != REG_NOERROR))
647 goto free_return;
648
649 /* We will log all the DFA states through which the dfa passes
650 if nmatch > 1, or if this dfa has a "multibyte node", i.e. a
651 back-reference or a node which can accept a multibyte character or
652 a multi-character collating element. */
653 if (nmatch > 1 || dfa->has_mb_node)
654 {
655 /* Avoid overflow. */
656 if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
657 <= mctx.input.bufs_len)))
658 {
659 err = REG_ESPACE;
660 goto free_return;
661 }
662
663 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
664 if (__glibc_unlikely (mctx.state_log == NULL))
665 {
666 err = REG_ESPACE;
667 goto free_return;
668 }
669 }
670
671 match_first = start;
672 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
673 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
674
675 /* Check incrementally whether the input string matches. */
676 incr = (last_start < start) ? -1 : 1;
677 left_lim = (last_start < start) ? last_start : start;
678 right_lim = (last_start < start) ? start : last_start;
679 sb = dfa->mb_cur_max == 1;
680 match_kind =
681 (fastmap
682 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
683 | (start <= last_start ? 2 : 0)
684 | (t != NULL ? 1 : 0))
685 : 8);
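/* MATCH_KIND is a small bit mask consumed by the switch below: 8 means no
   fastmap at all; otherwise bit 2 (value 4) is set for single-byte input or
   no case-folding/translation, bit 1 (value 2) for a forward search, and
   bit 0 (value 1) when a translation table is in use.  */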
686
687 for (;; match_first += incr)
688 {
689 err = REG_NOMATCH;
690 if (match_first < left_lim || right_lim < match_first)
691 goto free_return;
692
693 /* Advance as rapidly as possible through the string, until we
694 find a plausible place to start matching. This may be done
695 with varying efficiency, so there are various possibilities:
696 only the most common of them are specialized, in order to
697 save on code size. We use a switch statement for speed. */
698 switch (match_kind)
699 {
700 case 8:
701 /* No fastmap. */
702 break;
703
704 case 7:
705 /* Fastmap with single-byte translation, match forward. */
706 while (__glibc_likely (match_first < right_lim)
707 && !fastmap[t[(unsigned char) string[match_first]]])
708 ++match_first;
709 goto forward_match_found_start_or_reached_end;
710
711 case 6:
712 /* Fastmap without translation, match forward. */
713 while (__glibc_likely (match_first < right_lim)
714 && !fastmap[(unsigned char) string[match_first]])
715 ++match_first;
716
717 forward_match_found_start_or_reached_end:
718 if (__glibc_unlikely (match_first == right_lim))
719 {
720 ch = match_first >= length
721 ? 0 : (unsigned char) string[match_first];
722 if (!fastmap[t ? t[ch] : ch])
723 goto free_return;
724 }
725 break;
726
727 case 4:
728 case 5:
729 /* Fastmap without multi-byte translation, match backwards. */
730 while (match_first >= left_lim)
731 {
732 ch = match_first >= length
733 ? 0 : (unsigned char) string[match_first];
734 if (fastmap[t ? t[ch] : ch])
735 break;
736 --match_first;
737 }
738 if (match_first < left_lim)
739 goto free_return;
740 break;
741
742 default:
743 /* In this case, we can't easily determine the current byte,
744 since it might be a component byte of a multibyte
745 character, so we use the constructed buffer instead. */
746 for (;;)
747 {
748 /* If MATCH_FIRST is out of the valid range, reconstruct the
749 buffers. */
750 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
751 if (__glibc_unlikely (offset
752 >= (__re_size_t) mctx.input.valid_raw_len))
753 {
754 err = re_string_reconstruct (&mctx.input, match_first,
755 eflags);
756 if (__glibc_unlikely (err != REG_NOERROR))
757 goto free_return;
758
759 offset = match_first - mctx.input.raw_mbs_idx;
760 }
761 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
762 Note that MATCH_FIRST must not be smaller than 0. */
763 ch = (match_first >= length
764 ? 0 : re_string_byte_at (&mctx.input, offset));
765 if (fastmap[ch])
766 break;
767 match_first += incr;
768 if (match_first < left_lim || match_first > right_lim)
769 {
770 err = REG_NOMATCH;
771 goto free_return;
772 }
773 }
774 break;
775 }
776
777 /* Reconstruct the buffers so that the matcher can assume that
778 the matching starts from the beginning of the buffer. */
779 err = re_string_reconstruct (&mctx.input, match_first, eflags);
780 if (__glibc_unlikely (err != REG_NOERROR))
781 goto free_return;
782
783 #ifdef RE_ENABLE_I18N
784 /* Don't consider this char as a possible match start if it is part,
785 yet isn't the head, of a multibyte character. */
786 if (!sb && !re_string_first_byte (&mctx.input, 0))
787 continue;
788 #endif
789
790 /* It seems to be an appropriate one, so use the matcher. */
791 /* We assume that the matching starts from 0. */
792 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
793 match_last = check_matching (&mctx, fl_longest_match,
794 start <= last_start ? &match_first : NULL);
795 if (match_last != -1)
796 {
797 if (__glibc_unlikely (match_last == -2))
798 {
799 err = REG_ESPACE;
800 goto free_return;
801 }
802 else
803 {
804 mctx.match_last = match_last;
805 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
806 {
807 re_dfastate_t *pstate = mctx.state_log[match_last];
808 mctx.last_node = check_halt_state_context (&mctx, pstate,
809 match_last);
810 }
811 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
812 || dfa->nbackref)
813 {
814 err = prune_impossible_nodes (&mctx);
815 if (err == REG_NOERROR)
816 break;
817 if (__glibc_unlikely (err != REG_NOMATCH))
818 goto free_return;
819 match_last = -1;
820 }
821 else
822 break; /* We found a match. */
823 }
824 }
825
826 match_ctx_clean (&mctx);
827 }
828
829 DEBUG_ASSERT (match_last != -1);
830 DEBUG_ASSERT (err == REG_NOERROR);
831
832 /* Set pmatch[] if we need to. */
833 if (nmatch > 0)
834 {
835 Idx reg_idx;
836
837 /* Initialize registers. */
838 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
839 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
840
841 /* Set the points where the match starts/ends. */
842 pmatch[0].rm_so = 0;
843 pmatch[0].rm_eo = mctx.match_last;
844 /* FIXME: This function should fail if mctx.match_last exceeds
845 the maximum possible regoff_t value. We need a new error
846 code REG_OVERFLOW. */
847
848 if (!preg->no_sub && nmatch > 1)
849 {
850 err = set_regs (preg, &mctx, nmatch, pmatch,
851 dfa->has_plural_match && dfa->nbackref > 0);
852 if (__glibc_unlikely (err != REG_NOERROR))
853 goto free_return;
854 }
855
856 /* Finally, add the offset to each register, since we slid
857 the buffers so that we could assume that the matching starts
858 from 0. */
859 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
860 if (pmatch[reg_idx].rm_so != -1)
861 {
862 #ifdef RE_ENABLE_I18N
863 if (__glibc_unlikely (mctx.input.offsets_needed != 0))
864 {
865 pmatch[reg_idx].rm_so =
866 (pmatch[reg_idx].rm_so == mctx.input.valid_len
867 ? mctx.input.valid_raw_len
868 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
869 pmatch[reg_idx].rm_eo =
870 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
871 ? mctx.input.valid_raw_len
872 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
873 }
874 #else
875 DEBUG_ASSERT (mctx.input.offsets_needed == 0);
876 #endif
877 pmatch[reg_idx].rm_so += match_first;
878 pmatch[reg_idx].rm_eo += match_first;
879 }
880 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
881 {
882 pmatch[nmatch + reg_idx].rm_so = -1;
883 pmatch[nmatch + reg_idx].rm_eo = -1;
884 }
885
886 if (dfa->subexp_map)
887 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
888 if (dfa->subexp_map[reg_idx] != reg_idx)
889 {
890 pmatch[reg_idx + 1].rm_so
891 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
892 pmatch[reg_idx + 1].rm_eo
893 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
894 }
895 }
896
897 free_return:
898 re_free (mctx.state_log);
899 if (dfa->nbackref)
900 match_ctx_free (&mctx);
901 re_string_destruct (&mctx.input);
902 return err;
903 }
904
905 static reg_errcode_t
906 __attribute_warn_unused_result__
907 prune_impossible_nodes (re_match_context_t *mctx)
908 {
909 const re_dfa_t *const dfa = mctx->dfa;
910 Idx halt_node, match_last;
911 reg_errcode_t ret;
912 re_dfastate_t **sifted_states;
913 re_dfastate_t **lim_states = NULL;
914 re_sift_context_t sctx;
915 DEBUG_ASSERT (mctx->state_log != NULL);
916 match_last = mctx->match_last;
917 halt_node = mctx->last_node;
918
919 /* Avoid overflow. */
920 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
921 <= match_last))
922 return REG_ESPACE;
923
924 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
925 if (__glibc_unlikely (sifted_states == NULL))
926 {
927 ret = REG_ESPACE;
928 goto free_return;
929 }
930 if (dfa->nbackref)
931 {
932 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
933 if (__glibc_unlikely (lim_states == NULL))
934 {
935 ret = REG_ESPACE;
936 goto free_return;
937 }
938 while (1)
939 {
940 memset (lim_states, '\0',
941 sizeof (re_dfastate_t *) * (match_last + 1));
942 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
943 match_last);
944 ret = sift_states_backward (mctx, &sctx);
945 re_node_set_free (&sctx.limits);
946 if (__glibc_unlikely (ret != REG_NOERROR))
947 goto free_return;
948 if (sifted_states[0] != NULL || lim_states[0] != NULL)
949 break;
950 do
951 {
952 --match_last;
953 if (match_last < 0)
954 {
955 ret = REG_NOMATCH;
956 goto free_return;
957 }
958 } while (mctx->state_log[match_last] == NULL
959 || !mctx->state_log[match_last]->halt);
960 halt_node = check_halt_state_context (mctx,
961 mctx->state_log[match_last],
962 match_last);
963 }
964 ret = merge_state_array (dfa, sifted_states, lim_states,
965 match_last + 1);
966 re_free (lim_states);
967 lim_states = NULL;
968 if (__glibc_unlikely (ret != REG_NOERROR))
969 goto free_return;
970 }
971 else
972 {
973 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
974 ret = sift_states_backward (mctx, &sctx);
975 re_node_set_free (&sctx.limits);
976 if (__glibc_unlikely (ret != REG_NOERROR))
977 goto free_return;
978 if (sifted_states[0] == NULL)
979 {
980 ret = REG_NOMATCH;
981 goto free_return;
982 }
983 }
984 re_free (mctx->state_log);
985 mctx->state_log = sifted_states;
986 sifted_states = NULL;
987 mctx->last_node = halt_node;
988 mctx->match_last = match_last;
989 ret = REG_NOERROR;
990 free_return:
991 re_free (sifted_states);
992 re_free (lim_states);
993 return ret;
994 }
995
996 /* Acquire an initial state and return it.
997 We must select an appropriate initial state depending on the context,
998 since initial states may have constraints like "\<", "^", etc. */
999
1000 static inline re_dfastate_t *
1001 __attribute__ ((always_inline))
1002 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1003 Idx idx)
1004 {
1005 const re_dfa_t *const dfa = mctx->dfa;
1006 if (dfa->init_state->has_constraint)
1007 {
1008 unsigned int context;
1009 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1010 if (IS_WORD_CONTEXT (context))
1011 return dfa->init_state_word;
1012 else if (IS_ORDINARY_CONTEXT (context))
1013 return dfa->init_state;
1014 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1015 return dfa->init_state_begbuf;
1016 else if (IS_NEWLINE_CONTEXT (context))
1017 return dfa->init_state_nl;
1018 else if (IS_BEGBUF_CONTEXT (context))
1019 {
1020 /* This is a relatively rare case, so calculate it on demand. */
1021 return re_acquire_state_context (err, dfa,
1022 dfa->init_state->entrance_nodes,
1023 context);
1024 }
1025 else
1026 /* Must not happen? */
1027 return dfa->init_state;
1028 }
1029 else
1030 return dfa->init_state;
1031 }
1032
1033 /* Check whether the regular expression matches the input string INPUT,
1034 and return the index where the match ends. Return -1 if
1035 there is no match, and return -2 in case of an error.
1036 FL_LONGEST_MATCH means we want the POSIX longest matching.
1037 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1038 next place where we may want to try matching.
1039 Note that the matcher assumes that the matching starts from the current
1040 index of the buffer. */
1041
1042 static Idx
1043 __attribute_warn_unused_result__
1044 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1045 Idx *p_match_first)
1046 {
1047 const re_dfa_t *const dfa = mctx->dfa;
1048 reg_errcode_t err;
1049 Idx match = 0;
1050 Idx match_last = -1;
1051 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1052 re_dfastate_t *cur_state;
1053 bool at_init_state = p_match_first != NULL;
1054 Idx next_start_idx = cur_str_idx;
1055
1056 err = REG_NOERROR;
1057 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1058 /* An initial state must not be NULL (invalid). */
1059 if (__glibc_unlikely (cur_state == NULL))
1060 {
1061 DEBUG_ASSERT (err == REG_ESPACE);
1062 return -2;
1063 }
1064
1065 if (mctx->state_log != NULL)
1066 {
1067 mctx->state_log[cur_str_idx] = cur_state;
1068
1069 /* Check OP_OPEN_SUBEXP in the initial state in case we use them
1070 later, e.g. when processing back references. */
1071 if (__glibc_unlikely (dfa->nbackref))
1072 {
1073 at_init_state = false;
1074 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1075 if (__glibc_unlikely (err != REG_NOERROR))
1076 return err;
1077
1078 if (cur_state->has_backref)
1079 {
1080 err = transit_state_bkref (mctx, &cur_state->nodes);
1081 if (__glibc_unlikely (err != REG_NOERROR))
1082 return err;
1083 }
1084 }
1085 }
1086
1087 /* If the RE accepts the empty string. */
1088 if (__glibc_unlikely (cur_state->halt))
1089 {
1090 if (!cur_state->has_constraint
1091 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1092 {
1093 if (!fl_longest_match)
1094 return cur_str_idx;
1095 else
1096 {
1097 match_last = cur_str_idx;
1098 match = 1;
1099 }
1100 }
1101 }
1102
1103 while (!re_string_eoi (&mctx->input))
1104 {
1105 re_dfastate_t *old_state = cur_state;
1106 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1107
1108 if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1109 && mctx->input.bufs_len < mctx->input.len)
1110 || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1111 && mctx->input.valid_len < mctx->input.len))
1112 {
1113 err = extend_buffers (mctx, next_char_idx + 1);
1114 if (__glibc_unlikely (err != REG_NOERROR))
1115 {
1116 DEBUG_ASSERT (err == REG_ESPACE);
1117 return -2;
1118 }
1119 }
1120
1121 cur_state = transit_state (&err, mctx, cur_state);
1122 if (mctx->state_log != NULL)
1123 cur_state = merge_state_with_log (&err, mctx, cur_state);
1124
1125 if (cur_state == NULL)
1126 {
1127 /* Reached the invalid state or an error. Try to recover a valid
1128 state using the state log, if available and if we have not
1129 already found a valid (even if not the longest) match. */
1130 if (__glibc_unlikely (err != REG_NOERROR))
1131 return -2;
1132
1133 if (mctx->state_log == NULL
1134 || (match && !fl_longest_match)
1135 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1136 break;
1137 }
1138
1139 if (__glibc_unlikely (at_init_state))
1140 {
1141 if (old_state == cur_state)
1142 next_start_idx = next_char_idx;
1143 else
1144 at_init_state = false;
1145 }
1146
1147 if (cur_state->halt)
1148 {
1149 /* Reached a halt state.
1150 Check whether the halt state can satisfy the current context. */
1151 if (!cur_state->has_constraint
1152 || check_halt_state_context (mctx, cur_state,
1153 re_string_cur_idx (&mctx->input)))
1154 {
1155 /* We found an appropriate halt state. */
1156 match_last = re_string_cur_idx (&mctx->input);
1157 match = 1;
1158
1159 /* We found a match, do not modify match_first below. */
1160 p_match_first = NULL;
1161 if (!fl_longest_match)
1162 break;
1163 }
1164 }
1165 }
1166
1167 if (p_match_first)
1168 *p_match_first += next_start_idx;
1169
1170 return match_last;
1171 }
1172
1173 /* Check whether NODE matches the current context. */
1174
1175 static bool
1176 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1177 {
1178 re_token_type_t type = dfa->nodes[node].type;
1179 unsigned int constraint = dfa->nodes[node].constraint;
1180 if (type != END_OF_RE)
1181 return false;
1182 if (!constraint)
1183 return true;
1184 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1185 return false;
1186 return true;
1187 }
1188
1189 /* Check whether the halt state STATE matches the current context.
1190 Return 0 if it does not match; if a node that STATE has is a halt node
1191 and matches the context, return that node. */
1192
1193 static Idx
1194 check_halt_state_context (const re_match_context_t *mctx,
1195 const re_dfastate_t *state, Idx idx)
1196 {
1197 Idx i;
1198 unsigned int context;
1199 DEBUG_ASSERT (state->halt);
1200 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1201 for (i = 0; i < state->nodes.nelem; ++i)
1202 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1203 return state->nodes.elems[i];
1204 return 0;
1205 }
1206
1207 /* Compute the next node to which the "NFA" transits from NODE (the "NFA"
1208 is an NFA corresponding to the DFA).
1209 Return the destination node, and update EPS_VIA_NODES;
1210 return -1 on match failure, -2 on error. */
1211
1212 static Idx
1213 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1214 regmatch_t *prevregs,
1215 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1216 struct re_fail_stack_t *fs)
1217 {
1218 const re_dfa_t *const dfa = mctx->dfa;
1219 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1220 {
1221 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1222 re_node_set *edests = &dfa->edests[node];
1223 bool ok = re_node_set_insert (eps_via_nodes, node);
1224 if (__glibc_unlikely (! ok))
1225 return -2;
1226
1227 /* Pick a valid destination, or return -1 if none is found. */
1228 Idx dest_node = -1;
1229 for (Idx i = 0; i < edests->nelem; i++)
1230 {
1231 Idx candidate = edests->elems[i];
1232 if (!re_node_set_contains (cur_nodes, candidate))
1233 continue;
1234 if (dest_node == -1)
1235 dest_node = candidate;
1236
1237 else
1238 {
1239 /* In order to avoid an infinite loop in cases like "(a*)*", return the second
1240 epsilon-transition if the first was already considered. */
1241 if (re_node_set_contains (eps_via_nodes, dest_node))
1242 return candidate;
1243
1244 /* Otherwise, push the second epsilon-transition on the fail stack. */
1245 else if (fs != NULL
1246 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1247 prevregs, eps_via_nodes))
1248 return -2;
1249
1250 /* We know we are going to exit. */
1251 break;
1252 }
1253 }
1254 return dest_node;
1255 }
1256 else
1257 {
1258 Idx naccepted = 0;
1259 re_token_type_t type = dfa->nodes[node].type;
1260
1261 #ifdef RE_ENABLE_I18N
1262 if (dfa->nodes[node].accept_mb)
1263 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1264 else
1265 #endif /* RE_ENABLE_I18N */
1266 if (type == OP_BACK_REF)
1267 {
1268 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1269 if (subexp_idx < nregs)
1270 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1271 if (fs != NULL)
1272 {
1273 if (subexp_idx >= nregs
1274 || regs[subexp_idx].rm_so == -1
1275 || regs[subexp_idx].rm_eo == -1)
1276 return -1;
1277 else if (naccepted)
1278 {
1279 char *buf = (char *) re_string_get_buffer (&mctx->input);
1280 if (mctx->input.valid_len - *pidx < naccepted
1281 || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1282 naccepted)
1283 != 0))
1284 return -1;
1285 }
1286 }
1287
1288 if (naccepted == 0)
1289 {
1290 Idx dest_node;
1291 bool ok = re_node_set_insert (eps_via_nodes, node);
1292 if (__glibc_unlikely (! ok))
1293 return -2;
1294 dest_node = dfa->edests[node].elems[0];
1295 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1296 dest_node))
1297 return dest_node;
1298 }
1299 }
1300
1301 if (naccepted != 0
1302 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1303 {
1304 Idx dest_node = dfa->nexts[node];
1305 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1306 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1307 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1308 dest_node)))
1309 return -1;
1310 re_node_set_empty (eps_via_nodes);
1311 return dest_node;
1312 }
1313 }
1314 return -1;
1315 }
1316
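/* Push an entry onto the fail stack FS, recording the string index STR_IDX,
   the node DEST_NODE to resume from, copies of REGS and PREVREGS (NREGS
   each), and EPS_VIA_NODES, so that set_regs can backtrack later.  */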
1317 static reg_errcode_t
1318 __attribute_warn_unused_result__
1319 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1320 Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
1321 re_node_set *eps_via_nodes)
1322 {
1323 reg_errcode_t err;
1324 Idx num = fs->num++;
1325 if (fs->num == fs->alloc)
1326 {
1327 struct re_fail_stack_ent_t *new_array;
1328 new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1329 fs->alloc * 2);
1330 if (new_array == NULL)
1331 return REG_ESPACE;
1332 fs->alloc *= 2;
1333 fs->stack = new_array;
1334 }
1335 fs->stack[num].idx = str_idx;
1336 fs->stack[num].node = dest_node;
1337 fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1338 if (fs->stack[num].regs == NULL)
1339 return REG_ESPACE;
1340 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1341 memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1342 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1343 return err;
1344 }
1345
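/* Pop the most recent entry from FS, restoring *PIDX, REGS, PREVREGS and
   EPS_VIA_NODES from it; return the saved node, or -1 if FS is NULL or
   empty.  */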
1346 static Idx
1347 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1348 regmatch_t *regs, regmatch_t *prevregs,
1349 re_node_set *eps_via_nodes)
1350 {
1351 if (fs == NULL || fs->num == 0)
1352 return -1;
1353 Idx num = --fs->num;
1354 *pidx = fs->stack[num].idx;
1355 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1356 memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
1357 re_node_set_free (eps_via_nodes);
1358 re_free (fs->stack[num].regs);
1359 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1360 DEBUG_ASSERT (0 <= fs->stack[num].node);
1361 return fs->stack[num].node;
1362 }
1363
1364
1365 #define DYNARRAY_STRUCT regmatch_list
1366 #define DYNARRAY_ELEMENT regmatch_t
1367 #define DYNARRAY_PREFIX regmatch_list_
1368 #include <malloc/dynarray-skeleton.c>
1369
1370 /* Set the positions where the subexpressions start/end in the registers
1371 PMATCH.
1372 Note: We assume that pmatch[0] is already set, and
1373 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1374
1375 static reg_errcode_t
1376 __attribute_warn_unused_result__
1377 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1378 regmatch_t *pmatch, bool fl_backtrack)
1379 {
1380 const re_dfa_t *dfa = preg->buffer;
1381 Idx idx, cur_node;
1382 re_node_set eps_via_nodes;
1383 struct re_fail_stack_t *fs;
1384 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1385 struct regmatch_list prev_match;
1386 regmatch_list_init (&prev_match);
1387
1388 DEBUG_ASSERT (nmatch > 1);
1389 DEBUG_ASSERT (mctx->state_log != NULL);
1390 if (fl_backtrack)
1391 {
1392 fs = &fs_body;
1393 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1394 if (fs->stack == NULL)
1395 return REG_ESPACE;
1396 }
1397 else
1398 fs = NULL;
1399
1400 cur_node = dfa->init_node;
1401 re_node_set_init_empty (&eps_via_nodes);
1402
1403 if (!regmatch_list_resize (&prev_match, nmatch))
1404 {
1405 regmatch_list_free (&prev_match);
1406 free_fail_stack_return (fs);
1407 return REG_ESPACE;
1408 }
1409 regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1410 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1411
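/* Walk the NFA from the start of the match to its end; update_regs records
   subexpression boundaries as OP_OPEN_SUBEXP / OP_CLOSE_SUBEXP nodes are
   traversed, and the fail stack (if any) lets us backtrack.  */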
1412 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1413 {
1414 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1415
1416 if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1417 || re_node_set_contains (&eps_via_nodes, cur_node))
1418 {
1419 Idx reg_idx;
1420 cur_node = -1;
1421 if (fs)
1422 {
1423 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1424 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1425 {
1426 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1427 prev_idx_match, &eps_via_nodes);
1428 break;
1429 }
1430 }
1431 if (cur_node < 0)
1432 {
1433 re_node_set_free (&eps_via_nodes);
1434 regmatch_list_free (&prev_match);
1435 return free_fail_stack_return (fs);
1436 }
1437 }
1438
1439 /* Proceed to next node. */
1440 cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1441 &idx, cur_node,
1442 &eps_via_nodes, fs);
1443
1444 if (__glibc_unlikely (cur_node < 0))
1445 {
1446 if (__glibc_unlikely (cur_node == -2))
1447 {
1448 re_node_set_free (&eps_via_nodes);
1449 regmatch_list_free (&prev_match);
1450 free_fail_stack_return (fs);
1451 return REG_ESPACE;
1452 }
1453 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1454 prev_idx_match, &eps_via_nodes);
1455 if (cur_node < 0)
1456 {
1457 re_node_set_free (&eps_via_nodes);
1458 regmatch_list_free (&prev_match);
1459 free_fail_stack_return (fs);
1460 return REG_NOMATCH;
1461 }
1462 }
1463 }
1464 re_node_set_free (&eps_via_nodes);
1465 regmatch_list_free (&prev_match);
1466 return free_fail_stack_return (fs);
1467 }
1468
1469 static reg_errcode_t
1470 free_fail_stack_return (struct re_fail_stack_t *fs)
1471 {
1472 if (fs)
1473 {
1474 Idx fs_idx;
1475 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1476 {
1477 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1478 re_free (fs->stack[fs_idx].regs);
1479 }
1480 re_free (fs->stack);
1481 }
1482 return REG_NOERROR;
1483 }
1484
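/* Record in PMATCH the effect of arriving at CUR_NODE at string index
   CUR_IDX: OP_OPEN_SUBEXP records the start of the corresponding register,
   OP_CLOSE_SUBEXP its end, with PREV_IDX_MATCH used to undo empty matches
   of optional subexpressions.  */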
1485 static void
1486 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1487 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1488 {
1489 int type = dfa->nodes[cur_node].type;
1490 if (type == OP_OPEN_SUBEXP)
1491 {
1492 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1493
1494 /* We are at the first node of this sub expression. */
1495 if (reg_num < nmatch)
1496 {
1497 pmatch[reg_num].rm_so = cur_idx;
1498 pmatch[reg_num].rm_eo = -1;
1499 }
1500 }
1501 else if (type == OP_CLOSE_SUBEXP)
1502 {
1503 /* We are at the last node of this sub expression. */
1504 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1505 if (reg_num < nmatch)
1506 {
1507 if (pmatch[reg_num].rm_so < cur_idx)
1508 {
1509 pmatch[reg_num].rm_eo = cur_idx;
1510 /* This is a non-empty match or we are not inside an optional
1511 subexpression. Accept this right away. */
1512 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1513 }
1514 else
1515 {
1516 if (dfa->nodes[cur_node].opt_subexp
1517 && prev_idx_match[reg_num].rm_so != -1)
1518 /* We transited through an empty match for an optional
1519 subexpression, like (a?)*, and this is not the subexp's
1520 first match. Copy back the old content of the registers
1521 so that matches of an inner subexpression are undone as
1522 well, like in ((a?))*. */
1523 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1524 else
1525 /* We completed a subexpression, but it may be part of
1526 an optional one, so do not update PREV_IDX_MATCH. */
1527 pmatch[reg_num].rm_eo = cur_idx;
1528 }
1529 }
1530 }
1531 }
1532
1533 /* This function checks the STATE_LOG from SCTX->last_str_idx down to 0
1534 and sifts the nodes in each state according to the following rules.
1535 The updated state_log will be written to STATE_LOG.
1536 
1537 Rules: We throw away the node 'a' in the STATE_LOG[STR_IDX] if...
1538 1. When STR_IDX == MATCH_LAST (the last index in the state_log):
1539 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1540 the LAST_NODE, we throw away the node 'a'.
1541 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1542 string 's' and transits to 'b':
1543 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1544 away the node 'a'.
1545 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1546 thrown away, we throw away the node 'a'.
1547 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transits to 'b':
1548 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1549 node 'a'.
1550 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1551 we throw away the node 'a'. */
1552
1553 #define STATE_NODE_CONTAINS(state,node) \
1554 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1555
1556 static reg_errcode_t
1557 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1558 {
1559 reg_errcode_t err;
1560 int null_cnt = 0;
1561 Idx str_idx = sctx->last_str_idx;
1562 re_node_set cur_dest;
1563
1564 DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1565
1566 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1567 transit to the last_node and the last_node itself. */
1568 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1569 if (__glibc_unlikely (err != REG_NOERROR))
1570 return err;
1571 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1572 if (__glibc_unlikely (err != REG_NOERROR))
1573 goto free_return;
1574
1575 /* Then check each state in the state_log. */
1576 while (str_idx > 0)
1577 {
1578 /* Update counters. */
1579 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1580 if (null_cnt > mctx->max_mb_elem_len)
1581 {
1582 memset (sctx->sifted_states, '\0',
1583 sizeof (re_dfastate_t *) * str_idx);
1584 re_node_set_free (&cur_dest);
1585 return REG_NOERROR;
1586 }
1587 re_node_set_empty (&cur_dest);
1588 --str_idx;
1589
1590 if (mctx->state_log[str_idx])
1591 {
1592 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1593 if (__glibc_unlikely (err != REG_NOERROR))
1594 goto free_return;
1595 }
1596
1597 /* Add all the nodes which satisfy the following conditions:
1598 - It can epsilon transit to a node in CUR_DEST.
1599 - It is in CUR_SRC.
1600 And update state_log. */
1601 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1602 if (__glibc_unlikely (err != REG_NOERROR))
1603 goto free_return;
1604 }
1605 err = REG_NOERROR;
1606 free_return:
1607 re_node_set_free (&cur_dest);
1608 return err;
1609 }
1610
1611 static reg_errcode_t
1612 __attribute_warn_unused_result__
1613 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1614 Idx str_idx, re_node_set *cur_dest)
1615 {
1616 const re_dfa_t *const dfa = mctx->dfa;
1617 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1618 Idx i;
1619
1620 /* Then build the next sifted state.
1621 We build the next sifted state on 'cur_dest', and update
1622 'sifted_states[str_idx]' with 'cur_dest'.
1623 Note:
1624 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1625 'cur_src' points to the node_set of the old 'state_log[str_idx]'
1626 (with the epsilon nodes pre-filtered out). */
1627 for (i = 0; i < cur_src->nelem; i++)
1628 {
1629 Idx prev_node = cur_src->elems[i];
1630 int naccepted = 0;
1631 bool ok;
1632 DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1633
1634 #ifdef RE_ENABLE_I18N
1635 /* If the node may accept a multibyte character. */
1636 if (dfa->nodes[prev_node].accept_mb)
1637 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1638 str_idx, sctx->last_str_idx);
1639 #endif /* RE_ENABLE_I18N */
1640
1641 /* We don't check backreferences here.
1642 See update_cur_sifted_state(). */
1643 if (!naccepted
1644 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1645 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1646 dfa->nexts[prev_node]))
1647 naccepted = 1;
1648
1649 if (naccepted == 0)
1650 continue;
1651
1652 if (sctx->limits.nelem)
1653 {
1654 Idx to_idx = str_idx + naccepted;
1655 if (check_dst_limits (mctx, &sctx->limits,
1656 dfa->nexts[prev_node], to_idx,
1657 prev_node, str_idx))
1658 continue;
1659 }
1660 ok = re_node_set_insert (cur_dest, prev_node);
1661 if (__glibc_unlikely (! ok))
1662 return REG_ESPACE;
1663 }
1664
1665 return REG_NOERROR;
1666 }
1667
1668 /* Helper functions. */
1669
1670 static reg_errcode_t
1671 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1672 {
1673 Idx top = mctx->state_log_top;
1674
1675 if ((next_state_log_idx >= mctx->input.bufs_len
1676 && mctx->input.bufs_len < mctx->input.len)
1677 || (next_state_log_idx >= mctx->input.valid_len
1678 && mctx->input.valid_len < mctx->input.len))
1679 {
1680 reg_errcode_t err;
1681 err = extend_buffers (mctx, next_state_log_idx + 1);
1682 if (__glibc_unlikely (err != REG_NOERROR))
1683 return err;
1684 }
1685
1686 if (top < next_state_log_idx)
1687 {
1688 memset (mctx->state_log + top + 1, '\0',
1689 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1690 mctx->state_log_top = next_state_log_idx;
1691 }
1692 return REG_NOERROR;
1693 }
1694
1695 static reg_errcode_t
1696 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1697 re_dfastate_t **src, Idx num)
1698 {
1699 Idx st_idx;
1700 reg_errcode_t err;
1701 for (st_idx = 0; st_idx < num; ++st_idx)
1702 {
1703 if (dst[st_idx] == NULL)
1704 dst[st_idx] = src[st_idx];
1705 else if (src[st_idx] != NULL)
1706 {
1707 re_node_set merged_set;
1708 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1709 &src[st_idx]->nodes);
1710 if (__glibc_unlikely (err != REG_NOERROR))
1711 return err;
1712 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1713 re_node_set_free (&merged_set);
1714 if (__glibc_unlikely (err != REG_NOERROR))
1715 return err;
1716 }
1717 }
1718 return REG_NOERROR;
1719 }
1720
1721 static reg_errcode_t
1722 update_cur_sifted_state (const re_match_context_t *mctx,
1723 re_sift_context_t *sctx, Idx str_idx,
1724 re_node_set *dest_nodes)
1725 {
1726 const re_dfa_t *const dfa = mctx->dfa;
1727 reg_errcode_t err = REG_NOERROR;
1728 const re_node_set *candidates;
1729 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1730 : &mctx->state_log[str_idx]->nodes);
1731
1732 if (dest_nodes->nelem == 0)
1733 sctx->sifted_states[str_idx] = NULL;
1734 else
1735 {
1736 if (candidates)
1737 {
1738 /* At first, add the nodes which can epsilon transit to a node in
1739 DEST_NODE. */
1740 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1741 if (__glibc_unlikely (err != REG_NOERROR))
1742 return err;
1743
1744 /* Then, check the limitations in the current sift_context. */
1745 if (sctx->limits.nelem)
1746 {
1747 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1748 mctx->bkref_ents, str_idx);
1749 if (__glibc_unlikely (err != REG_NOERROR))
1750 return err;
1751 }
1752 }
1753
1754 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1755 if (__glibc_unlikely (err != REG_NOERROR))
1756 return err;
1757 }
1758
1759 if (candidates && mctx->state_log[str_idx]->has_backref)
1760 {
1761 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1762 if (__glibc_unlikely (err != REG_NOERROR))
1763 return err;
1764 }
1765 return REG_NOERROR;
1766 }
1767
1768 static reg_errcode_t
1769 __attribute_warn_unused_result__
1770 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1771 const re_node_set *candidates)
1772 {
1773 reg_errcode_t err = REG_NOERROR;
1774 Idx i;
1775
1776 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1777 if (__glibc_unlikely (err != REG_NOERROR))
1778 return err;
1779
1780 if (!state->inveclosure.alloc)
1781 {
1782 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1783 if (__glibc_unlikely (err != REG_NOERROR))
1784 return REG_ESPACE;
1785 for (i = 0; i < dest_nodes->nelem; i++)
1786 {
1787 err = re_node_set_merge (&state->inveclosure,
1788 dfa->inveclosures + dest_nodes->elems[i]);
1789 if (__glibc_unlikely (err != REG_NOERROR))
1790 return REG_ESPACE;
1791 }
1792 }
1793 return re_node_set_add_intersect (dest_nodes, candidates,
1794 &state->inveclosure);
1795 }
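/* The inverse epsilon closure built above is cached in STATE->inveclosure,
   so later sifts that reach the same DFA state skip the per-node merge loop
   and go straight to re_node_set_add_intersect ().  */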
1796
1797 static reg_errcode_t
1798 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1799 const re_node_set *candidates)
1800 {
1801 Idx ecl_idx;
1802 reg_errcode_t err;
1803 re_node_set *inv_eclosure = dfa->inveclosures + node;
1804 re_node_set except_nodes;
1805 re_node_set_init_empty (&except_nodes);
1806 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1807 {
1808 Idx cur_node = inv_eclosure->elems[ecl_idx];
1809 if (cur_node == node)
1810 continue;
1811 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1812 {
1813 Idx edst1 = dfa->edests[cur_node].elems[0];
1814 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1815 ? dfa->edests[cur_node].elems[1] : -1);
1816 if ((!re_node_set_contains (inv_eclosure, edst1)
1817 && re_node_set_contains (dest_nodes, edst1))
1818 || (edst2 > 0
1819 && !re_node_set_contains (inv_eclosure, edst2)
1820 && re_node_set_contains (dest_nodes, edst2)))
1821 {
1822 err = re_node_set_add_intersect (&except_nodes, candidates,
1823 dfa->inveclosures + cur_node);
1824 if (__glibc_unlikely (err != REG_NOERROR))
1825 {
1826 re_node_set_free (&except_nodes);
1827 return err;
1828 }
1829 }
1830 }
1831 }
1832 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1833 {
1834 Idx cur_node = inv_eclosure->elems[ecl_idx];
1835 if (!re_node_set_contains (&except_nodes, cur_node))
1836 {
1837 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1838 re_node_set_remove_at (dest_nodes, idx);
1839 }
1840 }
1841 re_node_set_free (&except_nodes);
1842 return REG_NOERROR;
1843 }
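/* In other words, sub_epsilon_src_nodes () removes from DEST_NODES every
   node that can reach NODE by epsilon transitions, except the ones collected
   in EXCEPT_NODES above, which are still needed to reach some other epsilon
   destination that remains in DEST_NODES.  */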
1844
1845 static bool
1846 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1847 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1848 {
1849 const re_dfa_t *const dfa = mctx->dfa;
1850 Idx lim_idx, src_pos, dst_pos;
1851
1852 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1853 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1854 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1855 {
1856 Idx subexp_idx;
1857 struct re_backref_cache_entry *ent;
1858 ent = mctx->bkref_ents + limits->elems[lim_idx];
1859 subexp_idx = dfa->nodes[ent->node].opr.idx;
1860
1861 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1862 subexp_idx, dst_node, dst_idx,
1863 dst_bkref_idx);
1864 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1865 subexp_idx, src_node, src_idx,
1866 src_bkref_idx);
1867
1868 /* In case of:
1869 <src> <dst> ( <subexp> )
1870 ( <subexp> ) <src> <dst>
1871 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1872 if (src_pos == dst_pos)
1873 continue; /* This is an unrelated limitation. */
1874 else
1875 return true;
1876 }
1877 return false;
1878 }
1879
1880 static int
1881 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1882 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1883 {
1884 const re_dfa_t *const dfa = mctx->dfa;
1885 const re_node_set *eclosures = dfa->eclosures + from_node;
1886 Idx node_idx;
1887
1888 /* Else, we are on the boundary: examine the nodes on the epsilon
1889 closure. */
1890 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1891 {
1892 Idx node = eclosures->elems[node_idx];
1893 switch (dfa->nodes[node].type)
1894 {
1895 case OP_BACK_REF:
1896 if (bkref_idx != -1)
1897 {
1898 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1899 do
1900 {
1901 Idx dst;
1902 int cpos;
1903
1904 if (ent->node != node)
1905 continue;
1906
1907 if (subexp_idx < BITSET_WORD_BITS
1908 && !(ent->eps_reachable_subexps_map
1909 & ((bitset_word_t) 1 << subexp_idx)))
1910 continue;
1911
1912 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1913 OP_CLOSE_SUBEXP cases below. But, if the
1914 destination node is the same node as the source
1915 node, don't recurse because it would cause an
1916 infinite loop: a regex that exhibits this behavior
1917 is ()\1*\1* */
1918 dst = dfa->edests[node].elems[0];
1919 if (dst == from_node)
1920 {
1921 if (boundaries & 1)
1922 return -1;
1923 else /* if (boundaries & 2) */
1924 return 0;
1925 }
1926
1927 cpos =
1928 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1929 dst, bkref_idx);
1930 if (cpos == -1 /* && (boundaries & 1) */)
1931 return -1;
1932 if (cpos == 0 && (boundaries & 2))
1933 return 0;
1934
1935 if (subexp_idx < BITSET_WORD_BITS)
1936 ent->eps_reachable_subexps_map
1937 &= ~((bitset_word_t) 1 << subexp_idx);
1938 }
1939 while (ent++->more);
1940 }
1941 break;
1942
1943 case OP_OPEN_SUBEXP:
1944 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1945 return -1;
1946 break;
1947
1948 case OP_CLOSE_SUBEXP:
1949 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1950 return 0;
1951 break;
1952
1953 default:
1954 break;
1955 }
1956 }
1957
1958 return (boundaries & 2) ? 1 : 0;
1959 }
1960
1961 static int
1962 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1963 Idx subexp_idx, Idx from_node, Idx str_idx,
1964 Idx bkref_idx)
1965 {
1966 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1967 int boundaries;
1968
1969 /* If we are outside the range of the subexpression, return -1 or 1. */
1970 if (str_idx < lim->subexp_from)
1971 return -1;
1972
1973 if (lim->subexp_to < str_idx)
1974 return 1;
1975
1976 /* If we are within the subexpression, return 0. */
1977 boundaries = (str_idx == lim->subexp_from);
1978 boundaries |= (str_idx == lim->subexp_to) << 1;
1979 if (boundaries == 0)
1980 return 0;
1981
1982 /* Else, examine epsilon closure. */
1983 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1984 from_node, bkref_idx);
1985 }
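/* The value returned above encodes where (FROM_NODE, STR_IDX) sits relative
   to the subexpression recorded in cache entry LIMIT: -1 means before it,
   1 means after it, and 0 means inside it; positions exactly on a boundary
   are resolved by check_dst_limits_calc_pos_1 () from the epsilon closure of
   FROM_NODE.  check_dst_limits () treats a transition whose source and
   destination get the same position as unrelated to that limit.  */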
1986
1987 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1988 which are against limitations from DEST_NODES. */
1989
1990 static reg_errcode_t
1991 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1992 const re_node_set *candidates, re_node_set *limits,
1993 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
1994 {
1995 reg_errcode_t err;
1996 Idx node_idx, lim_idx;
1997
1998 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1999 {
2000 Idx subexp_idx;
2001 struct re_backref_cache_entry *ent;
2002 ent = bkref_ents + limits->elems[lim_idx];
2003
2004 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2005 continue; /* This is an unrelated limitation. */
2006
2007 subexp_idx = dfa->nodes[ent->node].opr.idx;
2008 if (ent->subexp_to == str_idx)
2009 {
2010 Idx ops_node = -1;
2011 Idx cls_node = -1;
2012 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2013 {
2014 Idx node = dest_nodes->elems[node_idx];
2015 re_token_type_t type = dfa->nodes[node].type;
2016 if (type == OP_OPEN_SUBEXP
2017 && subexp_idx == dfa->nodes[node].opr.idx)
2018 ops_node = node;
2019 else if (type == OP_CLOSE_SUBEXP
2020 && subexp_idx == dfa->nodes[node].opr.idx)
2021 cls_node = node;
2022 }
2023
2024 /* Check the limitation of the open subexpression. */
2025 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2026 if (ops_node >= 0)
2027 {
2028 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2029 candidates);
2030 if (__glibc_unlikely (err != REG_NOERROR))
2031 return err;
2032 }
2033
2034 /* Check the limitation of the close subexpression. */
2035 if (cls_node >= 0)
2036 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2037 {
2038 Idx node = dest_nodes->elems[node_idx];
2039 if (!re_node_set_contains (dfa->inveclosures + node,
2040 cls_node)
2041 && !re_node_set_contains (dfa->eclosures + node,
2042 cls_node))
2043 {
2044 /* It is against this limitation.
2045 Remove it from the current sifted state. */
2046 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2047 candidates);
2048 if (__glibc_unlikely (err != REG_NOERROR))
2049 return err;
2050 --node_idx;
2051 }
2052 }
2053 }
2054 else /* (ent->subexp_to != str_idx) */
2055 {
2056 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2057 {
2058 Idx node = dest_nodes->elems[node_idx];
2059 re_token_type_t type = dfa->nodes[node].type;
2060 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2061 {
2062 if (subexp_idx != dfa->nodes[node].opr.idx)
2063 continue;
2064 /* It is against this limitation.
2065 Remove it from the current sifted state. */
2066 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2067 candidates);
2068 if (__glibc_unlikely (err != REG_NOERROR))
2069 return err;
2070 }
2071 }
2072 }
2073 }
2074 return REG_NOERROR;
2075 }
2076
2077 static reg_errcode_t
2078 __attribute_warn_unused_result__
2079 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2080 Idx str_idx, const re_node_set *candidates)
2081 {
2082 const re_dfa_t *const dfa = mctx->dfa;
2083 reg_errcode_t err;
2084 Idx node_idx, node;
2085 re_sift_context_t local_sctx;
2086 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2087
2088 if (first_idx == -1)
2089 return REG_NOERROR;
2090
2091 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2092
2093 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2094 {
2095 Idx enabled_idx;
2096 re_token_type_t type;
2097 struct re_backref_cache_entry *entry;
2098 node = candidates->elems[node_idx];
2099 type = dfa->nodes[node].type;
2100 /* Avoid infinite loop for the REs like "()\1+". */
2101 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2102 continue;
2103 if (type != OP_BACK_REF)
2104 continue;
2105
2106 entry = mctx->bkref_ents + first_idx;
2107 enabled_idx = first_idx;
2108 do
2109 {
2110 Idx subexp_len;
2111 Idx to_idx;
2112 Idx dst_node;
2113 bool ok;
2114 re_dfastate_t *cur_state;
2115
2116 if (entry->node != node)
2117 continue;
2118 subexp_len = entry->subexp_to - entry->subexp_from;
2119 to_idx = str_idx + subexp_len;
2120 dst_node = (subexp_len ? dfa->nexts[node]
2121 : dfa->edests[node].elems[0]);
2122
2123 if (to_idx > sctx->last_str_idx
2124 || sctx->sifted_states[to_idx] == NULL
2125 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2126 || check_dst_limits (mctx, &sctx->limits, node,
2127 str_idx, dst_node, to_idx))
2128 continue;
2129
2130 if (local_sctx.sifted_states == NULL)
2131 {
2132 local_sctx = *sctx;
2133 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2134 if (__glibc_unlikely (err != REG_NOERROR))
2135 goto free_return;
2136 }
2137 local_sctx.last_node = node;
2138 local_sctx.last_str_idx = str_idx;
2139 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2140 if (__glibc_unlikely (! ok))
2141 {
2142 err = REG_ESPACE;
2143 goto free_return;
2144 }
2145 cur_state = local_sctx.sifted_states[str_idx];
2146 err = sift_states_backward (mctx, &local_sctx);
2147 if (__glibc_unlikely (err != REG_NOERROR))
2148 goto free_return;
2149 if (sctx->limited_states != NULL)
2150 {
2151 err = merge_state_array (dfa, sctx->limited_states,
2152 local_sctx.sifted_states,
2153 str_idx + 1);
2154 if (__glibc_unlikely (err != REG_NOERROR))
2155 goto free_return;
2156 }
2157 local_sctx.sifted_states[str_idx] = cur_state;
2158 re_node_set_remove (&local_sctx.limits, enabled_idx);
2159
2160 /* mctx->bkref_ents may have changed, reload the pointer. */
2161 entry = mctx->bkref_ents + enabled_idx;
2162 }
2163 while (enabled_idx++, entry++->more);
2164 }
2165 err = REG_NOERROR;
2166 free_return:
2167 if (local_sctx.sifted_states != NULL)
2168 {
2169 re_node_set_free (&local_sctx.limits);
2170 }
2171
2172 return err;
2173 }
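/* Each backreference candidate above is validated by re-running
   sift_states_backward () on a copy of the context (LOCAL_SCTX) with the
   matching cache entry temporarily inserted into its limits; when
   SCTX->limited_states is set, the states surviving under that limit are
   merged into it with merge_state_array ().  */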
2174
2175
2176 #ifdef RE_ENABLE_I18N
2177 static int
2178 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2179 Idx node_idx, Idx str_idx, Idx max_str_idx)
2180 {
2181 const re_dfa_t *const dfa = mctx->dfa;
2182 int naccepted;
2183 /* Check whether the node can accept "multi byte". */
2184 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2185 if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2186 && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2187 dfa->nexts[node_idx]))
2188 /* Either the node can't accept the "multi byte", or the
2189 destination was already thrown away; in both cases the node
2190 can't accept the current input "multi byte". */
2191 naccepted = 0;
2192 /* Otherwise, the node can certainly accept
2193 'naccepted' bytes of input. */
2194 return naccepted;
2195 }
2196 #endif /* RE_ENABLE_I18N */
2197
2198
2199 /* Functions for state transition. */
2200
2201 /* Return the next state to which the current state STATE will transit by
2202 accepting the current input byte, and update STATE_LOG if necessary.
2203 Return NULL on failure.
2204 If STATE can accept a multibyte char/collating element/back reference,
2205 update the destination of STATE_LOG. */
2206
2207 static re_dfastate_t *
2208 __attribute_warn_unused_result__
2209 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2210 re_dfastate_t *state)
2211 {
2212 re_dfastate_t **trtable;
2213 unsigned char ch;
2214
2215 #ifdef RE_ENABLE_I18N
2216 /* If the current state can accept multibyte. */
2217 if (__glibc_unlikely (state->accept_mb))
2218 {
2219 *err = transit_state_mb (mctx, state);
2220 if (__glibc_unlikely (*err != REG_NOERROR))
2221 return NULL;
2222 }
2223 #endif /* RE_ENABLE_I18N */
2224
2225 /* Then decide the next state with the single byte. */
2226 #if 0
2227 if (0)
2228 /* don't use transition table */
2229 return transit_state_sb (err, mctx, state);
2230 #endif
2231
2232 /* Use transition table */
2233 ch = re_string_fetch_byte (&mctx->input);
2234 for (;;)
2235 {
2236 trtable = state->trtable;
2237 if (__glibc_likely (trtable != NULL))
2238 return trtable[ch];
2239
2240 trtable = state->word_trtable;
2241 if (__glibc_likely (trtable != NULL))
2242 {
2243 unsigned int context;
2244 context
2245 = re_string_context_at (&mctx->input,
2246 re_string_cur_idx (&mctx->input) - 1,
2247 mctx->eflags);
2248 if (IS_WORD_CONTEXT (context))
2249 return trtable[ch + SBC_MAX];
2250 else
2251 return trtable[ch];
2252 }
2253
2254 if (!build_trtable (mctx->dfa, state))
2255 {
2256 *err = REG_ESPACE;
2257 return NULL;
2258 }
2259
2260 /* Retry, we now have a transition table. */
2261 }
2262 }
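/* The fast path above is a plain array lookup: STATE->trtable is indexed by
   the input byte, while STATE->word_trtable holds two 256-entry halves and
   is indexed by CH + SBC_MAX when the preceding context is a word context.
   build_trtable () fills whichever table is needed the first time the state
   is reached.  */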
2263
2264 /* Update the state_log if we need to. */
2265 static re_dfastate_t *
2266 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2267 re_dfastate_t *next_state)
2268 {
2269 const re_dfa_t *const dfa = mctx->dfa;
2270 Idx cur_idx = re_string_cur_idx (&mctx->input);
2271
2272 if (cur_idx > mctx->state_log_top)
2273 {
2274 mctx->state_log[cur_idx] = next_state;
2275 mctx->state_log_top = cur_idx;
2276 }
2277 else if (mctx->state_log[cur_idx] == 0)
2278 {
2279 mctx->state_log[cur_idx] = next_state;
2280 }
2281 else
2282 {
2283 re_dfastate_t *pstate;
2284 unsigned int context;
2285 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2286 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2287 the destination of a multibyte char/collating element/
2288 back reference. Then the next state is the union set of
2289 these destinations and the results of the transition table. */
2290 pstate = mctx->state_log[cur_idx];
2291 log_nodes = pstate->entrance_nodes;
2292 if (next_state != NULL)
2293 {
2294 table_nodes = next_state->entrance_nodes;
2295 *err = re_node_set_init_union (&next_nodes, table_nodes,
2296 log_nodes);
2297 if (__glibc_unlikely (*err != REG_NOERROR))
2298 return NULL;
2299 }
2300 else
2301 next_nodes = *log_nodes;
2302 /* Note: We already added the nodes of the initial state,
2303 so we don't need to add them here. */
2304
2305 context = re_string_context_at (&mctx->input,
2306 re_string_cur_idx (&mctx->input) - 1,
2307 mctx->eflags);
2308 next_state = mctx->state_log[cur_idx]
2309 = re_acquire_state_context (err, dfa, &next_nodes, context);
2310 /* We don't need to check errors here, since the return value of
2311 this function is next_state and ERR is already set. */
2312
2313 if (table_nodes != NULL)
2314 re_node_set_free (&next_nodes);
2315 }
2316
2317 if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2318 {
2319 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2320 later. We must check them here, since the back references in the
2321 next state might use them. */
2322 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2323 cur_idx);
2324 if (__glibc_unlikely (*err != REG_NOERROR))
2325 return NULL;
2326
2327 /* If the next state has back references. */
2328 if (next_state->has_backref)
2329 {
2330 *err = transit_state_bkref (mctx, &next_state->nodes);
2331 if (__glibc_unlikely (*err != REG_NOERROR))
2332 return NULL;
2333 next_state = mctx->state_log[cur_idx];
2334 }
2335 }
2336
2337 return next_state;
2338 }
2339
2340 /* Skip bytes in the input that correspond to part of a
2341 multi-byte match, then look in the log for a state
2342 from which to restart matching. */
2343 static re_dfastate_t *
2344 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2345 {
2346 re_dfastate_t *cur_state;
2347 do
2348 {
2349 Idx max = mctx->state_log_top;
2350 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2351
2352 do
2353 {
2354 if (++cur_str_idx > max)
2355 return NULL;
2356 re_string_skip_bytes (&mctx->input, 1);
2357 }
2358 while (mctx->state_log[cur_str_idx] == NULL);
2359
2360 cur_state = merge_state_with_log (err, mctx, NULL);
2361 }
2362 while (*err == REG_NOERROR && cur_state == NULL);
2363 return cur_state;
2364 }
2365
2366 /* Helper functions for transit_state. */
2367
2368 /* From the node set CUR_NODES, pick up the nodes whose types are
2369 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2370 expression, and register them so they can be used later when evaluating
2371 the corresponding back references. */
2372
2373 static reg_errcode_t
2374 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2375 Idx str_idx)
2376 {
2377 const re_dfa_t *const dfa = mctx->dfa;
2378 Idx node_idx;
2379 reg_errcode_t err;
2380
2381 /* TODO: This isn't efficient.
2382 Because there might be more than one node whose type is
2383 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2384 nodes.
2385 E.g. RE: (a){2} */
2386 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2387 {
2388 Idx node = cur_nodes->elems[node_idx];
2389 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2390 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2391 && (dfa->used_bkref_map
2392 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2393 {
2394 err = match_ctx_add_subtop (mctx, node, str_idx);
2395 if (__glibc_unlikely (err != REG_NOERROR))
2396 return err;
2397 }
2398 }
2399 return REG_NOERROR;
2400 }
2401
2402 #if 0
2403 /* Return the next state to which the current state STATE will transit by
2404 accepting the current input byte. Return NULL on failure. */
2405
2406 static re_dfastate_t *
2407 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2408 re_dfastate_t *state)
2409 {
2410 const re_dfa_t *const dfa = mctx->dfa;
2411 re_node_set next_nodes;
2412 re_dfastate_t *next_state;
2413 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2414 unsigned int context;
2415
2416 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2417 if (__glibc_unlikely (*err != REG_NOERROR))
2418 return NULL;
2419 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2420 {
2421 Idx cur_node = state->nodes.elems[node_cnt];
2422 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2423 {
2424 *err = re_node_set_merge (&next_nodes,
2425 dfa->eclosures + dfa->nexts[cur_node]);
2426 if (__glibc_unlikely (*err != REG_NOERROR))
2427 {
2428 re_node_set_free (&next_nodes);
2429 return NULL;
2430 }
2431 }
2432 }
2433 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2434 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2435 /* We don't need to check errors here, since the return value of
2436 this function is next_state and ERR is already set. */
2437
2438 re_node_set_free (&next_nodes);
2439 re_string_skip_bytes (&mctx->input, 1);
2440 return next_state;
2441 }
2442 #endif
2443
2444 #ifdef RE_ENABLE_I18N
2445 static reg_errcode_t
2446 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2447 {
2448 const re_dfa_t *const dfa = mctx->dfa;
2449 reg_errcode_t err;
2450 Idx i;
2451
2452 for (i = 0; i < pstate->nodes.nelem; ++i)
2453 {
2454 re_node_set dest_nodes, *new_nodes;
2455 Idx cur_node_idx = pstate->nodes.elems[i];
2456 int naccepted;
2457 Idx dest_idx;
2458 unsigned int context;
2459 re_dfastate_t *dest_state;
2460
2461 if (!dfa->nodes[cur_node_idx].accept_mb)
2462 continue;
2463
2464 if (dfa->nodes[cur_node_idx].constraint)
2465 {
2466 context = re_string_context_at (&mctx->input,
2467 re_string_cur_idx (&mctx->input),
2468 mctx->eflags);
2469 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2470 context))
2471 continue;
2472 }
2473
2474 /* How many bytes can the node accept? */
2475 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2476 re_string_cur_idx (&mctx->input));
2477 if (naccepted == 0)
2478 continue;
2479
2480 /* The node can accept 'naccepted' bytes. */
2481 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2482 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2483 : mctx->max_mb_elem_len);
2484 err = clean_state_log_if_needed (mctx, dest_idx);
2485 if (__glibc_unlikely (err != REG_NOERROR))
2486 return err;
2487 DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2488 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2489
2490 dest_state = mctx->state_log[dest_idx];
2491 if (dest_state == NULL)
2492 dest_nodes = *new_nodes;
2493 else
2494 {
2495 err = re_node_set_init_union (&dest_nodes,
2496 dest_state->entrance_nodes, new_nodes);
2497 if (__glibc_unlikely (err != REG_NOERROR))
2498 return err;
2499 }
2500 context = re_string_context_at (&mctx->input, dest_idx - 1,
2501 mctx->eflags);
2502 mctx->state_log[dest_idx]
2503 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2504 if (dest_state != NULL)
2505 re_node_set_free (&dest_nodes);
2506 if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2507 && err != REG_NOERROR))
2508 return err;
2509 }
2510 return REG_NOERROR;
2511 }
2512 #endif /* RE_ENABLE_I18N */
2513
2514 static reg_errcode_t
2515 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2516 {
2517 const re_dfa_t *const dfa = mctx->dfa;
2518 reg_errcode_t err;
2519 Idx i;
2520 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2521
2522 for (i = 0; i < nodes->nelem; ++i)
2523 {
2524 Idx dest_str_idx, prev_nelem, bkc_idx;
2525 Idx node_idx = nodes->elems[i];
2526 unsigned int context;
2527 const re_token_t *node = dfa->nodes + node_idx;
2528 re_node_set *new_dest_nodes;
2529
2530 /* Check whether 'node' is a backreference or not. */
2531 if (node->type != OP_BACK_REF)
2532 continue;
2533
2534 if (node->constraint)
2535 {
2536 context = re_string_context_at (&mctx->input, cur_str_idx,
2537 mctx->eflags);
2538 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2539 continue;
2540 }
2541
2542 /* 'node' is a backreference.
2543 Check the substring which the referenced subexpression matched. */
2544 bkc_idx = mctx->nbkref_ents;
2545 err = get_subexp (mctx, node_idx, cur_str_idx);
2546 if (__glibc_unlikely (err != REG_NOERROR))
2547 goto free_return;
2548
2549 /* And add the epsilon closure of the backreference (which is
2550 'new_dest_nodes') to the appropriate state_log entry. */
2551 DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2552 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2553 {
2554 Idx subexp_len;
2555 re_dfastate_t *dest_state;
2556 struct re_backref_cache_entry *bkref_ent;
2557 bkref_ent = mctx->bkref_ents + bkc_idx;
2558 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2559 continue;
2560 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2561 new_dest_nodes = (subexp_len == 0
2562 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2563 : dfa->eclosures + dfa->nexts[node_idx]);
2564 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2565 - bkref_ent->subexp_from);
2566 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2567 mctx->eflags);
2568 dest_state = mctx->state_log[dest_str_idx];
2569 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2570 : mctx->state_log[cur_str_idx]->nodes.nelem);
2571 /* Add 'new_dest_node' to state_log. */
2572 if (dest_state == NULL)
2573 {
2574 mctx->state_log[dest_str_idx]
2575 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2576 context);
2577 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2578 && err != REG_NOERROR))
2579 goto free_return;
2580 }
2581 else
2582 {
2583 re_node_set dest_nodes;
2584 err = re_node_set_init_union (&dest_nodes,
2585 dest_state->entrance_nodes,
2586 new_dest_nodes);
2587 if (__glibc_unlikely (err != REG_NOERROR))
2588 {
2589 re_node_set_free (&dest_nodes);
2590 goto free_return;
2591 }
2592 mctx->state_log[dest_str_idx]
2593 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2594 re_node_set_free (&dest_nodes);
2595 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2596 && err != REG_NOERROR))
2597 goto free_return;
2598 }
2599 /* We need to check recursively if the backreference can epsilon
2600 transit. */
2601 if (subexp_len == 0
2602 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2603 {
2604 err = check_subexp_matching_top (mctx, new_dest_nodes,
2605 cur_str_idx);
2606 if (__glibc_unlikely (err != REG_NOERROR))
2607 goto free_return;
2608 err = transit_state_bkref (mctx, new_dest_nodes);
2609 if (__glibc_unlikely (err != REG_NOERROR))
2610 goto free_return;
2611 }
2612 }
2613 }
2614 err = REG_NOERROR;
2615 free_return:
2616 return err;
2617 }
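/* The caller-visible effect of the backreference machinery above can be
   exercised through the public POSIX interface alone.  The following is an
   illustrative, self-contained sketch (kept under "#if 0", like
   transit_state_sb above, so it stays out of the build); the pattern and
   input string are arbitrary examples.  */
#if 0
#include <regex.h>
#include <stdio.h>

int
main (void)
{
  regex_t re;
  regmatch_t m[2];

  /* BRE: a group of 'a's, a literal 'b', then the same group again
     via the backreference \1.  */
  if (regcomp (&re, "\\(a*\\)b\\1", 0) != 0)
    return 1;

  /* On "aabaa" this should report the whole match [0,5) and
     group 1 as [0,2).  */
  if (regexec (&re, "aabaa", 2, m, 0) == 0)
    printf ("match [%d,%d), group 1 [%d,%d)\n",
            (int) m[0].rm_so, (int) m[0].rm_eo,
            (int) m[1].rm_so, (int) m[1].rm_eo);

  regfree (&re);
  return 0;
}
#endif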
2618
2619 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2620 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2621 Note that we might collect inappropriate candidates here.
2622 However, the cost of checking them strictly here is too high, so we
2623 delay that checking until prune_impossible_nodes(). */
2624
2625 static reg_errcode_t
2626 __attribute_warn_unused_result__
2627 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2628 {
2629 const re_dfa_t *const dfa = mctx->dfa;
2630 Idx subexp_num, sub_top_idx;
2631 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2632 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2633 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2634 if (cache_idx != -1)
2635 {
2636 const struct re_backref_cache_entry *entry
2637 = mctx->bkref_ents + cache_idx;
2638 do
2639 if (entry->node == bkref_node)
2640 return REG_NOERROR; /* We already checked it. */
2641 while (entry++->more);
2642 }
2643
2644 subexp_num = dfa->nodes[bkref_node].opr.idx;
2645
2646 /* For each sub expression */
2647 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2648 {
2649 reg_errcode_t err;
2650 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2651 re_sub_match_last_t *sub_last;
2652 Idx sub_last_idx, sl_str, bkref_str_off;
2653
2654 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2655 continue; /* It isn't related. */
2656
2657 sl_str = sub_top->str_idx;
2658 bkref_str_off = bkref_str_idx;
2659 /* At first, check the last node of sub expressions we already
2660 evaluated. */
2661 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2662 {
2663 regoff_t sl_str_diff;
2664 sub_last = sub_top->lasts[sub_last_idx];
2665 sl_str_diff = sub_last->str_idx - sl_str;
2666 /* Does the string matched by the sub expression match the substring
2667 at the back reference? */
2668 if (sl_str_diff > 0)
2669 {
2670 if (__glibc_unlikely (bkref_str_off + sl_str_diff
2671 > mctx->input.valid_len))
2672 {
2673 /* Not enough chars for a successful match. */
2674 if (bkref_str_off + sl_str_diff > mctx->input.len)
2675 break;
2676
2677 err = clean_state_log_if_needed (mctx,
2678 bkref_str_off
2679 + sl_str_diff);
2680 if (__glibc_unlikely (err != REG_NOERROR))
2681 return err;
2682 buf = (const char *) re_string_get_buffer (&mctx->input);
2683 }
2684 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2685 /* We don't need to search this sub expression any more. */
2686 break;
2687 }
2688 bkref_str_off += sl_str_diff;
2689 sl_str += sl_str_diff;
2690 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2691 bkref_str_idx);
2692
2693 /* Reload buf, since the preceding call might have reallocated
2694 the buffer. */
2695 buf = (const char *) re_string_get_buffer (&mctx->input);
2696
2697 if (err == REG_NOMATCH)
2698 continue;
2699 if (__glibc_unlikely (err != REG_NOERROR))
2700 return err;
2701 }
2702
2703 if (sub_last_idx < sub_top->nlasts)
2704 continue;
2705 if (sub_last_idx > 0)
2706 ++sl_str;
2707 /* Then, search for the other last nodes of the sub expression. */
2708 for (; sl_str <= bkref_str_idx; ++sl_str)
2709 {
2710 Idx cls_node;
2711 regoff_t sl_str_off;
2712 const re_node_set *nodes;
2713 sl_str_off = sl_str - sub_top->str_idx;
2714 /* Does the string matched by the sub expression match the substring
2715 at the back reference? */
2716 if (sl_str_off > 0)
2717 {
2718 if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2719 {
2720 /* If we are at the end of the input, we cannot match. */
2721 if (bkref_str_off >= mctx->input.len)
2722 break;
2723
2724 err = extend_buffers (mctx, bkref_str_off + 1);
2725 if (__glibc_unlikely (err != REG_NOERROR))
2726 return err;
2727
2728 buf = (const char *) re_string_get_buffer (&mctx->input);
2729 }
2730 if (buf [bkref_str_off++] != buf[sl_str - 1])
2731 break; /* We don't need to search this sub expression
2732 any more. */
2733 }
2734 if (mctx->state_log[sl_str] == NULL)
2735 continue;
2736 /* Does this state have a ')' of the sub expression? */
2737 nodes = &mctx->state_log[sl_str]->nodes;
2738 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2739 OP_CLOSE_SUBEXP);
2740 if (cls_node == -1)
2741 continue; /* No. */
2742 if (sub_top->path == NULL)
2743 {
2744 sub_top->path = calloc (sizeof (state_array_t),
2745 sl_str - sub_top->str_idx + 1);
2746 if (sub_top->path == NULL)
2747 return REG_ESPACE;
2748 }
2749 /* Can the OP_OPEN_SUBEXP node arrive at the OP_CLOSE_SUBEXP node
2750 in the current context? */
2751 err = check_arrival (mctx, sub_top->path, sub_top->node,
2752 sub_top->str_idx, cls_node, sl_str,
2753 OP_CLOSE_SUBEXP);
2754 if (err == REG_NOMATCH)
2755 continue;
2756 if (__glibc_unlikely (err != REG_NOERROR))
2757 return err;
2758 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2759 if (__glibc_unlikely (sub_last == NULL))
2760 return REG_ESPACE;
2761 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2762 bkref_str_idx);
2763 buf = (const char *) re_string_get_buffer (&mctx->input);
2764 if (err == REG_NOMATCH)
2765 continue;
2766 if (__glibc_unlikely (err != REG_NOERROR))
2767 return err;
2768 }
2769 }
2770 return REG_NOERROR;
2771 }
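/* get_subexp () thus works in two phases for each matching SUB_TOP: it first
   re-uses the closing positions already recorded in SUB_TOP->lasts, checking
   with memcmp () that the text they matched also appears at the back
   reference; only if that first pass runs to completion does it scan forward
   for new OP_CLOSE_SUBEXP positions with find_subexp_node (), verify each
   with check_arrival (), and record it with match_ctx_add_sublast ().  */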
2772
2773 /* Helper functions for get_subexp(). */
2774
2775 /* Check whether SUB_LAST can arrive at the back reference BKREF_NODE at
2776 BKREF_STR. If it can, register the sub expression delimited by SUB_TOP
2777 and SUB_LAST. */
2778
2779 static reg_errcode_t
2780 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2781 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2782 {
2783 reg_errcode_t err;
2784 Idx to_idx;
2785 /* Can the subexpression arrive at the back reference? */
2786 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2787 sub_last->str_idx, bkref_node, bkref_str,
2788 OP_OPEN_SUBEXP);
2789 if (err != REG_NOERROR)
2790 return err;
2791 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2792 sub_last->str_idx);
2793 if (__glibc_unlikely (err != REG_NOERROR))
2794 return err;
2795 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2796 return clean_state_log_if_needed (mctx, to_idx);
2797 }
2798
2799 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2800 Search '(' if FL_OPEN, or search ')' otherwise.
2801 TODO: This function isn't efficient...
2802 Because there might be more than one node whose type is
2803 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2804 nodes.
2805 E.g. RE: (a){2} */
2806
2807 static Idx
2808 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2809 Idx subexp_idx, int type)
2810 {
2811 Idx cls_idx;
2812 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2813 {
2814 Idx cls_node = nodes->elems[cls_idx];
2815 const re_token_t *node = dfa->nodes + cls_node;
2816 if (node->type == type
2817 && node->opr.idx == subexp_idx)
2818 return cls_node;
2819 }
2820 return -1;
2821 }
2822
2823 /* Check whether the node TOP_NODE at TOP_STR can arrive at the node
2824 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2825 heavily reused.
2826 Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2827 REG_ESPACE if memory is exhausted. */
2828
2829 static reg_errcode_t
2830 __attribute_warn_unused_result__
2831 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2832 Idx top_str, Idx last_node, Idx last_str, int type)
2833 {
2834 const re_dfa_t *const dfa = mctx->dfa;
2835 reg_errcode_t err = REG_NOERROR;
2836 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2837 re_dfastate_t *cur_state = NULL;
2838 re_node_set *cur_nodes, next_nodes;
2839 re_dfastate_t **backup_state_log;
2840 unsigned int context;
2841
2842 subexp_num = dfa->nodes[top_node].opr.idx;
2843 /* Extend the buffer if we need to. */
2844 if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2845 {
2846 re_dfastate_t **new_array;
2847 Idx old_alloc = path->alloc;
2848 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2849 Idx new_alloc;
2850 if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2851 return REG_ESPACE;
2852 new_alloc = old_alloc + incr_alloc;
2853 if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2854 return REG_ESPACE;
2855 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2856 if (__glibc_unlikely (new_array == NULL))
2857 return REG_ESPACE;
2858 path->array = new_array;
2859 path->alloc = new_alloc;
2860 memset (new_array + old_alloc, '\0',
2861 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2862 }
2863
2864 str_idx = path->next_idx ? path->next_idx : top_str;
2865
2866 /* Temporarily modify MCTX. */
2867 backup_state_log = mctx->state_log;
2868 backup_cur_idx = mctx->input.cur_idx;
2869 mctx->state_log = path->array;
2870 mctx->input.cur_idx = str_idx;
2871
2872 /* Setup initial node set. */
2873 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2874 if (str_idx == top_str)
2875 {
2876 err = re_node_set_init_1 (&next_nodes, top_node);
2877 if (__glibc_unlikely (err != REG_NOERROR))
2878 return err;
2879 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2880 if (__glibc_unlikely (err != REG_NOERROR))
2881 {
2882 re_node_set_free (&next_nodes);
2883 return err;
2884 }
2885 }
2886 else
2887 {
2888 cur_state = mctx->state_log[str_idx];
2889 if (cur_state && cur_state->has_backref)
2890 {
2891 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2892 if (__glibc_unlikely (err != REG_NOERROR))
2893 return err;
2894 }
2895 else
2896 re_node_set_init_empty (&next_nodes);
2897 }
2898 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2899 {
2900 if (next_nodes.nelem)
2901 {
2902 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2903 subexp_num, type);
2904 if (__glibc_unlikely (err != REG_NOERROR))
2905 {
2906 re_node_set_free (&next_nodes);
2907 return err;
2908 }
2909 }
2910 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2911 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2912 {
2913 re_node_set_free (&next_nodes);
2914 return err;
2915 }
2916 mctx->state_log[str_idx] = cur_state;
2917 }
2918
2919 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2920 {
2921 re_node_set_empty (&next_nodes);
2922 if (mctx->state_log[str_idx + 1])
2923 {
2924 err = re_node_set_merge (&next_nodes,
2925 &mctx->state_log[str_idx + 1]->nodes);
2926 if (__glibc_unlikely (err != REG_NOERROR))
2927 {
2928 re_node_set_free (&next_nodes);
2929 return err;
2930 }
2931 }
2932 if (cur_state)
2933 {
2934 err = check_arrival_add_next_nodes (mctx, str_idx,
2935 &cur_state->non_eps_nodes,
2936 &next_nodes);
2937 if (__glibc_unlikely (err != REG_NOERROR))
2938 {
2939 re_node_set_free (&next_nodes);
2940 return err;
2941 }
2942 }
2943 ++str_idx;
2944 if (next_nodes.nelem)
2945 {
2946 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2947 if (__glibc_unlikely (err != REG_NOERROR))
2948 {
2949 re_node_set_free (&next_nodes);
2950 return err;
2951 }
2952 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2953 subexp_num, type);
2954 if (__glibc_unlikely (err != REG_NOERROR))
2955 {
2956 re_node_set_free (&next_nodes);
2957 return err;
2958 }
2959 }
2960 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2961 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2962 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2963 {
2964 re_node_set_free (&next_nodes);
2965 return err;
2966 }
2967 mctx->state_log[str_idx] = cur_state;
2968 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2969 }
2970 re_node_set_free (&next_nodes);
2971 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2972 : &mctx->state_log[last_str]->nodes);
2973 path->next_idx = str_idx;
2974
2975 /* Fix MCTX. */
2976 mctx->state_log = backup_state_log;
2977 mctx->input.cur_idx = backup_cur_idx;
2978
2979 /* Then check whether the current node set has the node LAST_NODE. */
2980 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2981 return REG_NOERROR;
2982
2983 return REG_NOMATCH;
2984 }
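/* check_arrival () is essentially a bounded re-run of the forward matcher on
   PATH->array instead of MCTX->state_log: on its first call it seeds the
   state at TOP_STR with TOP_NODE (later calls resume from PATH->next_idx),
   advances one position at a time expanding epsilon closures and cached
   backreferences, and succeeds iff LAST_NODE appears in the state computed
   for LAST_STR.  The NULL_CNT guard stops the walk after more than
   MCTX->max_mb_elem_len consecutive empty states.  */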
2985
2986 /* Helper functions for check_arrival. */
2987
2988 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2989 to NEXT_NODES.
2990 TODO: This function is similar to the functions transit_state*();
2991 however, this function does much additional work.
2992 Can't we unify them? */
2993
2994 static reg_errcode_t
2995 __attribute_warn_unused_result__
2996 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
2997 re_node_set *cur_nodes, re_node_set *next_nodes)
2998 {
2999 const re_dfa_t *const dfa = mctx->dfa;
3000 bool ok;
3001 Idx cur_idx;
3002 #ifdef RE_ENABLE_I18N
3003 reg_errcode_t err = REG_NOERROR;
3004 #endif
3005 re_node_set union_set;
3006 re_node_set_init_empty (&union_set);
3007 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3008 {
3009 int naccepted = 0;
3010 Idx cur_node = cur_nodes->elems[cur_idx];
3011 DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
3012
3013 #ifdef RE_ENABLE_I18N
3014 /* If the node may accept "multi byte". */
3015 if (dfa->nodes[cur_node].accept_mb)
3016 {
3017 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3018 str_idx);
3019 if (naccepted > 1)
3020 {
3021 re_dfastate_t *dest_state;
3022 Idx next_node = dfa->nexts[cur_node];
3023 Idx next_idx = str_idx + naccepted;
3024 dest_state = mctx->state_log[next_idx];
3025 re_node_set_empty (&union_set);
3026 if (dest_state)
3027 {
3028 err = re_node_set_merge (&union_set, &dest_state->nodes);
3029 if (__glibc_unlikely (err != REG_NOERROR))
3030 {
3031 re_node_set_free (&union_set);
3032 return err;
3033 }
3034 }
3035 ok = re_node_set_insert (&union_set, next_node);
3036 if (__glibc_unlikely (! ok))
3037 {
3038 re_node_set_free (&union_set);
3039 return REG_ESPACE;
3040 }
3041 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3042 &union_set);
3043 if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3044 && err != REG_NOERROR))
3045 {
3046 re_node_set_free (&union_set);
3047 return err;
3048 }
3049 }
3050 }
3051 #endif /* RE_ENABLE_I18N */
3052 if (naccepted
3053 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3054 {
3055 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3056 if (__glibc_unlikely (! ok))
3057 {
3058 re_node_set_free (&union_set);
3059 return REG_ESPACE;
3060 }
3061 }
3062 }
3063 re_node_set_free (&union_set);
3064 return REG_NOERROR;
3065 }
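/* Note that the multibyte branch above updates MCTX->state_log directly:
   while check_arrival () is running, MCTX->state_log temporarily points at
   PATH->array, so the states recorded here land in the private path buffer
   rather than in the real state log.  */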
3066
3067 /* For all the nodes in CUR_NODES, add their epsilon closures to
3068 CUR_NODES, but exclude the nodes which are:
3069 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3070 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3071 */
3072
3073 static reg_errcode_t
3074 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3075 Idx ex_subexp, int type)
3076 {
3077 reg_errcode_t err;
3078 Idx idx, outside_node;
3079 re_node_set new_nodes;
3080 DEBUG_ASSERT (cur_nodes->nelem);
3081 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3082 if (__glibc_unlikely (err != REG_NOERROR))
3083 return err;
3084 /* Create a new node set NEW_NODES with the nodes which are epsilon
3085 closures of the nodes in CUR_NODES. */
3086
3087 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3088 {
3089 Idx cur_node = cur_nodes->elems[idx];
3090 const re_node_set *eclosure = dfa->eclosures + cur_node;
3091 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3092 if (outside_node == -1)
3093 {
3094 /* There are no problematic nodes, just merge them. */
3095 err = re_node_set_merge (&new_nodes, eclosure);
3096 if (__glibc_unlikely (err != REG_NOERROR))
3097 {
3098 re_node_set_free (&new_nodes);
3099 return err;
3100 }
3101 }
3102 else
3103 {
3104 /* There are problematic nodes, re-calculate incrementally. */
3105 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3106 ex_subexp, type);
3107 if (__glibc_unlikely (err != REG_NOERROR))
3108 {
3109 re_node_set_free (&new_nodes);
3110 return err;
3111 }
3112 }
3113 }
3114 re_node_set_free (cur_nodes);
3115 *cur_nodes = new_nodes;
3116 return REG_NOERROR;
3117 }
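/* When find_subexp_node () reports that a node's epsilon closure crosses the
   boundary of subexpression EX_SUBEXP, that closure is rebuilt node by node
   with check_arrival_expand_ecl_sub (), which stops at the offending
   OP_OPEN_SUBEXP/OP_CLOSE_SUBEXP node instead of merging the whole closure.  */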
3118
3119 /* Helper function for check_arrival_expand_ecl.
3120 Check incrementally the epsilon closure of TARGET, and if it isn't
3121 problematic append it to DST_NODES. */
3122
3123 static reg_errcode_t
3124 __attribute_warn_unused_result__
3125 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3126 Idx target, Idx ex_subexp, int type)
3127 {
3128 Idx cur_node;
3129 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3130 {
3131 bool ok;
3132
3133 if (dfa->nodes[cur_node].type == type
3134 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3135 {
3136 if (type == OP_CLOSE_SUBEXP)
3137 {
3138 ok = re_node_set_insert (dst_nodes, cur_node);
3139 if (__glibc_unlikely (! ok))
3140 return REG_ESPACE;
3141 }
3142 break;
3143 }
3144 ok = re_node_set_insert (dst_nodes, cur_node);
3145 if (__glibc_unlikely (! ok))
3146 return REG_ESPACE;
3147 if (dfa->edests[cur_node].nelem == 0)
3148 break;
3149 if (dfa->edests[cur_node].nelem == 2)
3150 {
3151 reg_errcode_t err;
3152 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3153 dfa->edests[cur_node].elems[1],
3154 ex_subexp, type);
3155 if (__glibc_unlikely (err != REG_NOERROR))
3156 return err;
3157 }
3158 cur_node = dfa->edests[cur_node].elems[0];
3159 }
3160 return REG_NOERROR;
3161 }
3162
3163
3164 /* For all the back references in the current state, calculate the
3165 destination of the back references by the appropriate entry
3166 in MCTX->BKREF_ENTS. */
3167
3168 static reg_errcode_t
3169 __attribute_warn_unused_result__
3170 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3171 Idx cur_str, Idx subexp_num, int type)
3172 {
3173 const re_dfa_t *const dfa = mctx->dfa;
3174 reg_errcode_t err;
3175 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3176 struct re_backref_cache_entry *ent;
3177
3178 if (cache_idx_start == -1)
3179 return REG_NOERROR;
3180
3181 restart:
3182 ent = mctx->bkref_ents + cache_idx_start;
3183 do
3184 {
3185 Idx to_idx, next_node;
3186
3187 /* Is this entry ENT appropriate? */
3188 if (!re_node_set_contains (cur_nodes, ent->node))
3189 continue; /* No. */
3190
3191 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3192 /* Calculate the destination of the back reference, and append it
3193 to MCTX->STATE_LOG. */
3194 if (to_idx == cur_str)
3195 {
3196 /* The backreference did an epsilon transit, so we must re-check all
3197 the nodes in the current state. */
3198 re_node_set new_dests;
3199 reg_errcode_t err2, err3;
3200 next_node = dfa->edests[ent->node].elems[0];
3201 if (re_node_set_contains (cur_nodes, next_node))
3202 continue;
3203 err = re_node_set_init_1 (&new_dests, next_node);
3204 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3205 err3 = re_node_set_merge (cur_nodes, &new_dests);
3206 re_node_set_free (&new_dests);
3207 if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3208 || err3 != REG_NOERROR))
3209 {
3210 err = (err != REG_NOERROR ? err
3211 : (err2 != REG_NOERROR ? err2 : err3));
3212 return err;
3213 }
3214 /* TODO: It is still inefficient... */
3215 goto restart;
3216 }
3217 else
3218 {
3219 re_node_set union_set;
3220 next_node = dfa->nexts[ent->node];
3221 if (mctx->state_log[to_idx])
3222 {
3223 bool ok;
3224 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3225 next_node))
3226 continue;
3227 err = re_node_set_init_copy (&union_set,
3228 &mctx->state_log[to_idx]->nodes);
3229 ok = re_node_set_insert (&union_set, next_node);
3230 if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3231 {
3232 re_node_set_free (&union_set);
3233 err = err != REG_NOERROR ? err : REG_ESPACE;
3234 return err;
3235 }
3236 }
3237 else
3238 {
3239 err = re_node_set_init_1 (&union_set, next_node);
3240 if (__glibc_unlikely (err != REG_NOERROR))
3241 return err;
3242 }
3243 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3244 re_node_set_free (&union_set);
3245 if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3246 && err != REG_NOERROR))
3247 return err;
3248 }
3249 }
3250 while (ent++->more);
3251 return REG_NOERROR;
3252 }
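/* A cache entry whose matched substring is empty (TO_IDX == CUR_STR) behaves
   like an epsilon transition: its destination is expanded and merged into
   CUR_NODES and the scan restarts from the top, since the merge may make
   further entries applicable.  A non-empty entry simply adds
   DFA->nexts[ent->node] to the state logged at TO_IDX.  */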
3253
3254 /* Build transition table for the state.
3255 Return true if successful. */
3256
3257 static bool __attribute_noinline__
3258 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3259 {
3260 reg_errcode_t err;
3261 Idx i, j;
3262 int ch;
3263 bool need_word_trtable = false;
3264 bitset_word_t elem, mask;
3265 Idx ndests; /* Number of the destination states from 'state'. */
3266 re_dfastate_t **trtable;
3267 re_dfastate_t *dest_states[SBC_MAX];
3268 re_dfastate_t *dest_states_word[SBC_MAX];
3269 re_dfastate_t *dest_states_nl[SBC_MAX];
3270 re_node_set follows;
3271 bitset_t acceptable;
3272
3273 /* We build DFA states which correspond to the destination nodes
3274 from 'state'. 'dests_node[i]' represents the nodes which i-th
3275 destination state contains, and 'dests_ch[i]' represents the
3276 characters which i-th destination state accepts. */
3277 re_node_set dests_node[SBC_MAX];
3278 bitset_t dests_ch[SBC_MAX];
3279
3280 /* Initialize transition table. */
3281 state->word_trtable = state->trtable = NULL;
3282
3283 /* At first, group all nodes belonging to 'state' into several
3284 destinations. */
3285 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3286 if (__glibc_unlikely (ndests <= 0))
3287 {
3288 /* Return false in case of an error, true otherwise. */
3289 if (ndests == 0)
3290 {
3291 state->trtable = (re_dfastate_t **)
3292 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3293 if (__glibc_unlikely (state->trtable == NULL))
3294 return false;
3295 return true;
3296 }
3297 return false;
3298 }
3299
3300 err = re_node_set_alloc (&follows, ndests + 1);
3301 if (__glibc_unlikely (err != REG_NOERROR))
3302 {
3303 out_free:
3304 re_node_set_free (&follows);
3305 for (i = 0; i < ndests; ++i)
3306 re_node_set_free (dests_node + i);
3307 return false;
3308 }
3309
3310 bitset_empty (acceptable);
3311
3312 /* Then build the states for all destinations. */
3313 for (i = 0; i < ndests; ++i)
3314 {
3315 Idx next_node;
3316 re_node_set_empty (&follows);
3317 /* Merge the follows of this destination's nodes. */
3318 for (j = 0; j < dests_node[i].nelem; ++j)
3319 {
3320 next_node = dfa->nexts[dests_node[i].elems[j]];
3321 if (next_node != -1)
3322 {
3323 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3324 if (__glibc_unlikely (err != REG_NOERROR))
3325 goto out_free;
3326 }
3327 }
3328 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3329 if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3330 goto out_free;
3331 /* If the new state has context constraint,
3332 build appropriate states for these contexts. */
3333 if (dest_states[i]->has_constraint)
3334 {
3335 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3336 CONTEXT_WORD);
3337 if (__glibc_unlikely (dest_states_word[i] == NULL
3338 && err != REG_NOERROR))
3339 goto out_free;
3340
3341 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3342 need_word_trtable = true;
3343
3344 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3345 CONTEXT_NEWLINE);
3346 if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3347 goto out_free;
3348 }
3349 else
3350 {
3351 dest_states_word[i] = dest_states[i];
3352 dest_states_nl[i] = dest_states[i];
3353 }
3354 bitset_merge (acceptable, dests_ch[i]);
3355 }
3356
3357 if (!__glibc_unlikely (need_word_trtable))
3358 {
3359 /* We don't care about whether the following character is a word
3360 character, or we are in a single-byte character set so we can
3361 discern by looking at the character code: allocate a
3362 256-entry transition table. */
3363 trtable = state->trtable =
3364 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3365 if (__glibc_unlikely (trtable == NULL))
3366 goto out_free;
3367
3368 /* For all characters ch...: */
3369 for (i = 0; i < BITSET_WORDS; ++i)
3370 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3371 elem;
3372 mask <<= 1, elem >>= 1, ++ch)
3373 if (__glibc_unlikely (elem & 1))
3374 {
3375 /* There must be exactly one destination which accepts
3376 character ch. See group_nodes_into_DFAstates. */
3377 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3378 ;
3379
3380 /* The j-th destination accepts ch; use the word state if ch is a word char. */
3381 if (dfa->word_char[i] & mask)
3382 trtable[ch] = dest_states_word[j];
3383 else
3384 trtable[ch] = dest_states[j];
3385 }
3386 }
3387 else
3388 {
3389 /* We care about whether the following character is a word
3390 character, and we are in a multi-byte character set: discern
3391 by looking at the character code: build two 256-entry
3392 transition tables, one starting at trtable[0] and one
3393 starting at trtable[SBC_MAX]. */
3394 trtable = state->word_trtable =
3395 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3396 if (__glibc_unlikely (trtable == NULL))
3397 goto out_free;
3398
3399 /* For all characters ch...: */
3400 for (i = 0; i < BITSET_WORDS; ++i)
3401 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3402 elem;
3403 mask <<= 1, elem >>= 1, ++ch)
3404 if (__glibc_unlikely (elem & 1))
3405 {
3406 /* There must be exactly one destination which accepts
3407 character ch. See group_nodes_into_DFAstates. */
3408 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3409 ;
3410
3411 /* The j-th destination accepts ch; fill both the non-word and word context entries. */
3412 trtable[ch] = dest_states[j];
3413 trtable[ch + SBC_MAX] = dest_states_word[j];
3414 }
3415 }
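/* In the two-table case built above, entries [0, SBC_MAX) hold the states
   acquired for a non-word context and entries [SBC_MAX, 2 * SBC_MAX) the
   CONTEXT_WORD states for the same byte; the single-table case needs no such
   split because the word-ness of ch is decided directly from dfa->word_char. */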
3416
3417 /* Newline. */
3418 if (bitset_contain (acceptable, NEWLINE_CHAR))
3419 {
3420 /* The current state accepts the newline character. */
3421 for (j = 0; j < ndests; ++j)
3422 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3423 {
3424 /* The j-th destination accepts the newline character. */
3425 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3426 if (need_word_trtable)
3427 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3428 /* There must be only one destination which accepts
3429 newline. See group_nodes_into_DFAstates. */
3430 break;
3431 }
3432 }
3433
3434 re_node_set_free (&follows);
3435 for (i = 0; i < ndests; ++i)
3436 re_node_set_free (dests_node + i);
3437 return true;
3438 }
3439
3440 /* Group all nodes belonging to STATE into several destinations.
3441 Then for all destinations, set the nodes belonging to the destination
3442 to DESTS_NODE[i] and set the characters accepted by the destination
3443 to DESTS_CH[i]. Return the number of destinations if successful,
3444 -1 on internal error. */
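/* A small worked example of the splitting below, with hypothetical character
   sets: suppose destination 0 holds node n1 accepting {a..z} and the next node
   n2 accepts {a..e, 0..9}. The intersection {a..e} stays in destination 0,
   which gains n2; the remainder {f..z} becomes a new destination holding only
   n1; and the unconsumed {0..9} finally becomes another new destination
   holding only n2. */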
3445
3446 static Idx
3447 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3448 re_node_set *dests_node, bitset_t *dests_ch)
3449 {
3450 reg_errcode_t err;
3451 bool ok;
3452 Idx i, j, k;
3453 Idx ndests; /* Number of destinations from 'state'. */
3454 bitset_t accepts; /* Characters a node can accept. */
3455 const re_node_set *cur_nodes = &state->nodes;
3456 bitset_empty (accepts);
3457 ndests = 0;
3458
3459 /* For all the nodes belonging to 'state', */
3460 for (i = 0; i < cur_nodes->nelem; ++i)
3461 {
3462 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3463 re_token_type_t type = node->type;
3464 unsigned int constraint = node->constraint;
3465
3466 /* Enumerate all the single-byte characters this node can accept. */
3467 if (type == CHARACTER)
3468 bitset_set (accepts, node->opr.c);
3469 else if (type == SIMPLE_BRACKET)
3470 {
3471 bitset_merge (accepts, node->opr.sbcset);
3472 }
3473 else if (type == OP_PERIOD)
3474 {
3475 #ifdef RE_ENABLE_I18N
3476 if (dfa->mb_cur_max > 1)
3477 bitset_merge (accepts, dfa->sb_char);
3478 else
3479 #endif
3480 bitset_set_all (accepts);
3481 if (!(dfa->syntax & RE_DOT_NEWLINE))
3482 bitset_clear (accepts, '\n');
3483 if (dfa->syntax & RE_DOT_NOT_NULL)
3484 bitset_clear (accepts, '\0');
3485 }
3486 #ifdef RE_ENABLE_I18N
3487 else if (type == OP_UTF8_PERIOD)
3488 {
3489 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3490 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3491 else
3492 bitset_merge (accepts, utf8_sb_map);
3493 if (!(dfa->syntax & RE_DOT_NEWLINE))
3494 bitset_clear (accepts, '\n');
3495 if (dfa->syntax & RE_DOT_NOT_NULL)
3496 bitset_clear (accepts, '\0');
3497 }
3498 #endif
3499 else
3500 continue;
3501
3502 /* Check 'accepts' and sift out the characters which do not
3503 match the context. */
3504 if (constraint)
3505 {
3506 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3507 {
3508 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3509 bitset_empty (accepts);
3510 if (accepts_newline)
3511 bitset_set (accepts, NEWLINE_CHAR);
3512 else
3513 continue;
3514 }
3515 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3516 {
3517 bitset_empty (accepts);
3518 continue;
3519 }
3520
3521 if (constraint & NEXT_WORD_CONSTRAINT)
3522 {
3523 bitset_word_t any_set = 0;
3524 if (type == CHARACTER && !node->word_char)
3525 {
3526 bitset_empty (accepts);
3527 continue;
3528 }
3529 #ifdef RE_ENABLE_I18N
3530 if (dfa->mb_cur_max > 1)
3531 for (j = 0; j < BITSET_WORDS; ++j)
3532 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3533 else
3534 #endif
3535 for (j = 0; j < BITSET_WORDS; ++j)
3536 any_set |= (accepts[j] &= dfa->word_char[j]);
3537 if (!any_set)
3538 continue;
3539 }
3540 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3541 {
3542 bitset_word_t any_set = 0;
3543 if (type == CHARACTER && node->word_char)
3544 {
3545 bitset_empty (accepts);
3546 continue;
3547 }
3548 #ifdef RE_ENABLE_I18N
3549 if (dfa->mb_cur_max > 1)
3550 for (j = 0; j < BITSET_WORDS; ++j)
3551 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3552 else
3553 #endif
3554 for (j = 0; j < BITSET_WORDS; ++j)
3555 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3556 if (!any_set)
3557 continue;
3558 }
3559 }
3560
3561 /* Then divide 'accepts' into DFA states, or create a new
3562 state. Above, we make sure that accepts is not empty. */
3563 for (j = 0; j < ndests; ++j)
3564 {
3565 bitset_t intersec; /* Intersection sets, see below. */
3566 bitset_t remains;
3567 /* Flags, see below. */
3568 bitset_word_t has_intersec, not_subset, not_consumed;
3569
3570 /* Optimization, skip if this state doesn't accept the character. */
3571 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3572 continue;
3573
3574 /* Enumerate the intersection set of this state and 'accepts'. */
3575 has_intersec = 0;
3576 for (k = 0; k < BITSET_WORDS; ++k)
3577 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3578 /* And skip if the intersection set is empty. */
3579 if (!has_intersec)
3580 continue;
3581
3582 /* Then check if this state is a subset of 'accepts'. */
3583 not_subset = not_consumed = 0;
3584 for (k = 0; k < BITSET_WORDS; ++k)
3585 {
3586 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3587 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3588 }
3589
3590 /* If this state isn't a subset of 'accepts', create a
3591 new group state, which has the 'remains'. */
3592 if (not_subset)
3593 {
3594 bitset_copy (dests_ch[ndests], remains);
3595 bitset_copy (dests_ch[j], intersec);
3596 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3597 if (__glibc_unlikely (err != REG_NOERROR))
3598 goto error_return;
3599 ++ndests;
3600 }
3601
3602 /* Put the position in the current group. */
3603 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3604 if (__glibc_unlikely (! ok))
3605 goto error_return;
3606
3607 /* If all characters are consumed, go to next node. */
3608 if (!not_consumed)
3609 break;
3610 }
3611 /* Some characters remain, create a new group. */
3612 if (j == ndests)
3613 {
3614 bitset_copy (dests_ch[ndests], accepts);
3615 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3616 if (__glibc_unlikely (err != REG_NOERROR))
3617 goto error_return;
3618 ++ndests;
3619 bitset_empty (accepts);
3620 }
3621 }
3622 assume (ndests <= SBC_MAX);
3623 return ndests;
3624 error_return:
3625 for (j = 0; j < ndests; ++j)
3626 re_node_set_free (dests_node + j);
3627 return -1;
3628 }
3629
3630 #ifdef RE_ENABLE_I18N
3631 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3632 Return the number of bytes the node accepts, or 0 if it does not accept.
3633 STR_IDX is the current index of the input string.
3634
3635 This function handles nodes which can accept one character, or one
3636 collating element such as '.' or '[a-z]', as opposed to the other
3637 nodes, which can accept only one byte. */
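/* For example, in a UTF-8 locale an OP_UTF8_PERIOD node facing the 3-byte
   character U+20AC (0xE2 0x82 0xAC) returns 3, while the early returns below
   yield 0 for ordinary single-byte characters, which are presumably handled
   by the single-byte paths elsewhere. */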
3638
3639 # ifdef _LIBC
3640 # include <locale/weight.h>
3641 # endif
3642
3643 static int
3644 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3645 const re_string_t *input, Idx str_idx)
3646 {
3647 const re_token_t *node = dfa->nodes + node_idx;
3648 int char_len, elem_len;
3649 Idx i;
3650
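/* A sketch of the UTF-8 handling below: lead bytes below 0xC2 are rejected
   (ASCII, continuation bytes, and the overlong leads 0xC0/0xC1); 0xC2..0xDF
   start 2-byte sequences, 0xE0..0xEF 3-byte, 0xF0..0xF7 4-byte, and the
   obsolete leads 0xF8..0xFB and 0xFC..0xFD 5- and 6-byte sequences. The
   checks on the second byte 'd' reject overlong encodings, and every
   continuation byte must lie in 0x80..0xBF. */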
3651 if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3652 {
3653 unsigned char c = re_string_byte_at (input, str_idx), d;
3654 if (__glibc_likely (c < 0xc2))
3655 return 0;
3656
3657 if (str_idx + 2 > input->len)
3658 return 0;
3659
3660 d = re_string_byte_at (input, str_idx + 1);
3661 if (c < 0xe0)
3662 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3663 else if (c < 0xf0)
3664 {
3665 char_len = 3;
3666 if (c == 0xe0 && d < 0xa0)
3667 return 0;
3668 }
3669 else if (c < 0xf8)
3670 {
3671 char_len = 4;
3672 if (c == 0xf0 && d < 0x90)
3673 return 0;
3674 }
3675 else if (c < 0xfc)
3676 {
3677 char_len = 5;
3678 if (c == 0xf8 && d < 0x88)
3679 return 0;
3680 }
3681 else if (c < 0xfe)
3682 {
3683 char_len = 6;
3684 if (c == 0xfc && d < 0x84)
3685 return 0;
3686 }
3687 else
3688 return 0;
3689
3690 if (str_idx + char_len > input->len)
3691 return 0;
3692
3693 for (i = 1; i < char_len; ++i)
3694 {
3695 d = re_string_byte_at (input, str_idx + i);
3696 if (d < 0x80 || d > 0xbf)
3697 return 0;
3698 }
3699 return char_len;
3700 }
3701
3702 char_len = re_string_char_size_at (input, str_idx);
3703 if (node->type == OP_PERIOD)
3704 {
3705 if (char_len <= 1)
3706 return 0;
3707 /* FIXME: I don't think this 'if' is needed, as both '\n'
3708 and '\0' have char_len == 1. */
3709 /* '.' accepts any one character except the following two cases. */
3710 if ((!(dfa->syntax & RE_DOT_NEWLINE)
3711 && re_string_byte_at (input, str_idx) == '\n')
3712 || ((dfa->syntax & RE_DOT_NOT_NULL)
3713 && re_string_byte_at (input, str_idx) == '\0'))
3714 return 0;
3715 return char_len;
3716 }
3717
3718 elem_len = re_string_elem_size_at (input, str_idx);
3719 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3720 return 0;
3721
3722 if (node->type == COMPLEX_BRACKET)
3723 {
3724 const re_charset_t *cset = node->opr.mbcset;
3725 # ifdef _LIBC
3726 const unsigned char *pin
3727 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3728 Idx j;
3729 uint32_t nrules;
3730 # endif /* _LIBC */
3731 int match_len = 0;
3732 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3733 ? re_string_wchar_at (input, str_idx) : 0);
3734
3735 /* match with multibyte character? */
3736 for (i = 0; i < cset->nmbchars; ++i)
3737 if (wc == cset->mbchars[i])
3738 {
3739 match_len = char_len;
3740 goto check_node_accept_bytes_match;
3741 }
3742 /* match with character_class? */
3743 for (i = 0; i < cset->nchar_classes; ++i)
3744 {
3745 wctype_t wt = cset->char_classes[i];
3746 if (__iswctype (wc, wt))
3747 {
3748 match_len = char_len;
3749 goto check_node_accept_bytes_match;
3750 }
3751 }
3752
3753 # ifdef _LIBC
3754 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3755 if (nrules != 0)
3756 {
3757 unsigned int in_collseq = 0;
3758 const int32_t *table, *indirect;
3759 const unsigned char *weights, *extra;
3760 const char *collseqwc;
3761
3762 /* match with collating_symbol? */
3763 if (cset->ncoll_syms)
3764 extra = (const unsigned char *)
3765 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3766 for (i = 0; i < cset->ncoll_syms; ++i)
3767 {
3768 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3769 /* Compare the length of the input collating element with the
3770 length of the current collating element. */
3771 if (*coll_sym != elem_len)
3772 continue;
3773 /* Compare each byte. */
3774 for (j = 0; j < *coll_sym; j++)
3775 if (pin[j] != coll_sym[1 + j])
3776 break;
3777 if (j == *coll_sym)
3778 {
3779 /* Match if every byte is equal. */
3780 match_len = j;
3781 goto check_node_accept_bytes_match;
3782 }
3783 }
3784
3785 if (cset->nranges)
3786 {
3787 if (elem_len <= char_len)
3788 {
3789 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3790 in_collseq = __collseq_table_lookup (collseqwc, wc);
3791 }
3792 else
3793 in_collseq = find_collation_sequence_value (pin, elem_len);
3794 }
3795 /* match with range expression? */
3796 /* FIXME: Implement rational ranges here, too. */
3797 for (i = 0; i < cset->nranges; ++i)
3798 if (cset->range_starts[i] <= in_collseq
3799 && in_collseq <= cset->range_ends[i])
3800 {
3801 match_len = elem_len;
3802 goto check_node_accept_bytes_match;
3803 }
3804
3805 /* match with equivalence_class? */
3806 if (cset->nequiv_classes)
3807 {
3808 const unsigned char *cp = pin;
3809 table = (const int32_t *)
3810 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3811 weights = (const unsigned char *)
3812 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3813 extra = (const unsigned char *)
3814 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3815 indirect = (const int32_t *)
3816 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3817 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3818 int32_t rule = idx >> 24;
3819 idx &= 0xffffff;
3820 if (idx > 0)
3821 {
3822 size_t weight_len = weights[idx];
3823 for (i = 0; i < cset->nequiv_classes; ++i)
3824 {
3825 int32_t equiv_class_idx = cset->equiv_classes[i];
3826 int32_t equiv_class_rule = equiv_class_idx >> 24;
3827 equiv_class_idx &= 0xffffff;
3828 if (weights[equiv_class_idx] == weight_len
3829 && equiv_class_rule == rule
3830 && memcmp (weights + idx + 1,
3831 weights + equiv_class_idx + 1,
3832 weight_len) == 0)
3833 {
3834 match_len = elem_len;
3835 goto check_node_accept_bytes_match;
3836 }
3837 }
3838 }
3839 }
3840 }
3841 else
3842 # endif /* _LIBC */
3843 {
3844 /* match with range expression? */
3845 for (i = 0; i < cset->nranges; ++i)
3846 {
3847 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3848 {
3849 match_len = char_len;
3850 goto check_node_accept_bytes_match;
3851 }
3852 }
3853 }
3854 check_node_accept_bytes_match:
3855 if (!cset->non_match)
3856 return match_len;
3857 else
3858 {
3859 if (match_len > 0)
3860 return 0;
3861 else
3862 return (elem_len > char_len) ? elem_len : char_len;
3863 }
3864 }
3865 return 0;
3866 }
3867
3868 # ifdef _LIBC
3869 static unsigned int
3870 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3871 {
3872 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3873 if (nrules == 0)
3874 {
3875 if (mbs_len == 1)
3876 {
3877 /* No valid character. Match it as a single byte character. */
3878 const unsigned char *collseq = (const unsigned char *)
3879 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3880 return collseq[mbs[0]];
3881 }
3882 return UINT_MAX;
3883 }
3884 else
3885 {
3886 int32_t idx;
3887 const unsigned char *extra = (const unsigned char *)
3888 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3889 int32_t extrasize = (const unsigned char *)
3890 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3891
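/* The walk below implies this per-element layout in 'extra' (a sketch, not
   verified against the locale data format): a one-byte name length and the
   name, a one-byte byte-sequence length and the bytes themselves, padding to
   a 4-byte boundary, a 32-bit value, a 32-bit wide-character count followed
   by that many 32-bit wide characters, and finally the 32-bit collation
   sequence value returned when the byte sequence matches MBS. */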
3892 for (idx = 0; idx < extrasize;)
3893 {
3894 int mbs_cnt;
3895 bool found = false;
3896 int32_t elem_mbs_len;
3897 /* Skip the name of the collating element. */
3898 idx = idx + extra[idx] + 1;
3899 elem_mbs_len = extra[idx++];
3900 if (mbs_len == elem_mbs_len)
3901 {
3902 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3903 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3904 break;
3905 if (mbs_cnt == elem_mbs_len)
3906 /* Found the entry. */
3907 found = true;
3908 }
3909 /* Skip the byte sequence of the collating element. */
3910 idx += elem_mbs_len;
3911 /* Adjust for the alignment. */
3912 idx = (idx + 3) & ~3;
3913 /* Skip the collation sequence value. */
3914 idx += sizeof (uint32_t);
3915 /* Skip the wide char sequence of the collating element. */
3916 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3917 /* If we found the entry, return the sequence value. */
3918 if (found)
3919 return *(uint32_t *) (extra + idx);
3920 /* Skip the collation sequence value. */
3921 idx += sizeof (uint32_t);
3922 }
3923 return UINT_MAX;
3924 }
3925 }
3926 # endif /* _LIBC */
3927 #endif /* RE_ENABLE_I18N */
3928
3929 /* Check whether NODE accepts the byte which is the IDX-th
3930 byte of the INPUT. */
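/* For instance, a CHARACTER node for 'a' accepts the byte only if it is 'a'
   and, when the node carries a constraint, only if the context at IDX
   satisfies it. */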
3931
3932 static bool
3933 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3934 Idx idx)
3935 {
3936 unsigned char ch;
3937 ch = re_string_byte_at (&mctx->input, idx);
3938 switch (node->type)
3939 {
3940 case CHARACTER:
3941 if (node->opr.c != ch)
3942 return false;
3943 break;
3944
3945 case SIMPLE_BRACKET:
3946 if (!bitset_contain (node->opr.sbcset, ch))
3947 return false;
3948 break;
3949
3950 #ifdef RE_ENABLE_I18N
3951 case OP_UTF8_PERIOD:
3952 if (ch >= ASCII_CHARS)
3953 return false;
3954 FALLTHROUGH;
3955 #endif
3956 case OP_PERIOD:
3957 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3958 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3959 return false;
3960 break;
3961
3962 default:
3963 return false;
3964 }
3965
3966 if (node->constraint)
3967 {
3968 /* The node has constraints. Check whether the current context
3969 satisfies the constraints. */
3970 unsigned int context = re_string_context_at (&mctx->input, idx,
3971 mctx->eflags);
3972 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3973 return false;
3974 }
3975
3976 return true;
3977 }
3978
3979 /* Extend the buffers if they have run out. */
3980
3981 static reg_errcode_t
3982 __attribute_warn_unused_result__
3983 extend_buffers (re_match_context_t *mctx, int min_len)
3984 {
3985 reg_errcode_t ret;
3986 re_string_t *pstr = &mctx->input;
3987
3988 /* Avoid overflow. */
3989 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3990 <= pstr->bufs_len))
3991 return REG_ESPACE;
3992
3993 /* Double the lengths of the buffers (capping at the string length), but allocate at least MIN_LEN. */
3994 ret = re_string_realloc_buffers (pstr,
3995 MAX (min_len,
3996 MIN (pstr->len, pstr->bufs_len * 2)));
3997 if (__glibc_unlikely (ret != REG_NOERROR))
3998 return ret;
3999
4000 if (mctx->state_log != NULL)
4001 {
4002 /* And double the length of state_log. */
4003 /* XXX We have no indication of the size of this buffer. If this
4004 allocation fails we have no indication that the state_log array
4005 does not have the right size. */
4006 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4007 pstr->bufs_len + 1);
4008 if (__glibc_unlikely (new_array == NULL))
4009 return REG_ESPACE;
4010 mctx->state_log = new_array;
4011 }
4012
4013 /* Then reconstruct the buffers. */
4014 if (pstr->icase)
4015 {
4016 #ifdef RE_ENABLE_I18N
4017 if (pstr->mb_cur_max > 1)
4018 {
4019 ret = build_wcs_upper_buffer (pstr);
4020 if (__glibc_unlikely (ret != REG_NOERROR))
4021 return ret;
4022 }
4023 else
4024 #endif /* RE_ENABLE_I18N */
4025 build_upper_buffer (pstr);
4026 }
4027 else
4028 {
4029 #ifdef RE_ENABLE_I18N
4030 if (pstr->mb_cur_max > 1)
4031 build_wcs_buffer (pstr);
4032 else
4033 #endif /* RE_ENABLE_I18N */
4034 {
4035 if (pstr->trans != NULL)
4036 re_string_translate_buffer (pstr);
4037 }
4038 }
4039 return REG_NOERROR;
4040 }
4041
4042
4043 /* Functions for matching context. */
4044
4045 /* Initialize MCTX. */
4046
4047 static reg_errcode_t
4048 __attribute_warn_unused_result__
4049 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4050 {
4051 mctx->eflags = eflags;
4052 mctx->match_last = -1;
4053 if (n > 0)
4054 {
4055 /* Avoid overflow. */
4056 size_t max_object_size =
4057 MAX (sizeof (struct re_backref_cache_entry),
4058 sizeof (re_sub_match_top_t *));
4059 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4060 return REG_ESPACE;
4061
4062 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4063 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4064 if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4065 return REG_ESPACE;
4066 }
4067 /* Already zeroed by the caller.
4068 else
4069 mctx->bkref_ents = NULL;
4070 mctx->nbkref_ents = 0;
4071 mctx->nsub_tops = 0; */
4072 mctx->abkref_ents = n;
4073 mctx->max_mb_elem_len = 1;
4074 mctx->asub_tops = n;
4075 return REG_NOERROR;
4076 }
4077
4078 /* Clean the entries which depend on the current input in MCTX.
4079 This function must be invoked when the matcher changes the start index
4080 of the input, or changes the input string. */
4081
4082 static void
4083 match_ctx_clean (re_match_context_t *mctx)
4084 {
4085 Idx st_idx;
4086 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4087 {
4088 Idx sl_idx;
4089 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4090 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4091 {
4092 re_sub_match_last_t *last = top->lasts[sl_idx];
4093 re_free (last->path.array);
4094 re_free (last);
4095 }
4096 re_free (top->lasts);
4097 if (top->path)
4098 {
4099 re_free (top->path->array);
4100 re_free (top->path);
4101 }
4102 re_free (top);
4103 }
4104
4105 mctx->nsub_tops = 0;
4106 mctx->nbkref_ents = 0;
4107 }
4108
4109 /* Free all the memory associated with MCTX. */
4110
4111 static void
4112 match_ctx_free (re_match_context_t *mctx)
4113 {
4114 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4115 match_ctx_clean (mctx);
4116 re_free (mctx->sub_tops);
4117 re_free (mctx->bkref_ents);
4118 }
4119
4120 /* Add a new backreference entry to MCTX.
4121 Note that we assume that the caller never calls this function with a
4122 duplicate entry, and always calls it with a STR_IDX that is not smaller
4123 than that of any existing entry. */
4124
4125 static reg_errcode_t
4126 __attribute_warn_unused_result__
4127 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4128 Idx to)
4129 {
4130 if (mctx->nbkref_ents >= mctx->abkref_ents)
4131 {
4132 struct re_backref_cache_entry* new_entry;
4133 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4134 mctx->abkref_ents * 2);
4135 if (__glibc_unlikely (new_entry == NULL))
4136 {
4137 re_free (mctx->bkref_ents);
4138 return REG_ESPACE;
4139 }
4140 mctx->bkref_ents = new_entry;
4141 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4142 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4143 mctx->abkref_ents *= 2;
4144 }
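/* Setting 'more' on the previous entry records that the next entry shares
   its str_idx, so that callers can presumably walk all entries for one
   string position in sequence. */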
4145 if (mctx->nbkref_ents > 0
4146 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4147 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4148
4149 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4150 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4151 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4152 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4153
4154 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4155 If bit N is clear, it means that this entry won't epsilon-transition to
4156 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4157 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4158 such node.
4159
4160 A backreference does not epsilon-transition unless it is empty, so set
4161 to all zeros if FROM != TO. */
4162 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4163 = (from == to ? -1 : 0);
4164
4165 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4166 if (mctx->max_mb_elem_len < to - from)
4167 mctx->max_mb_elem_len = to - from;
4168 return REG_NOERROR;
4169 }
4170
4171 /* Return the first entry with the same str_idx, or -1 if none is
4172 found. Note that MCTX->BKREF_ENTS is already sorted by str_idx. */
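/* The lookup below is a lower-bound binary search: it narrows [left, right)
   to the leftmost entry whose str_idx is not less than STR_IDX, then checks
   whether that entry's str_idx is exactly STR_IDX. */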
4173
4174 static Idx
4175 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4176 {
4177 Idx left, right, mid, last;
4178 last = right = mctx->nbkref_ents;
4179 for (left = 0; left < right;)
4180 {
4181 mid = (left + right) / 2;
4182 if (mctx->bkref_ents[mid].str_idx < str_idx)
4183 left = mid + 1;
4184 else
4185 right = mid;
4186 }
4187 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4188 return left;
4189 else
4190 return -1;
4191 }
4192
4193 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4194 at STR_IDX. */
4195
4196 static reg_errcode_t
4197 __attribute_warn_unused_result__
4198 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4199 {
4200 DEBUG_ASSERT (mctx->sub_tops != NULL);
4201 DEBUG_ASSERT (mctx->asub_tops > 0);
4202 if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4203 {
4204 Idx new_asub_tops = mctx->asub_tops * 2;
4205 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4206 re_sub_match_top_t *,
4207 new_asub_tops);
4208 if (__glibc_unlikely (new_array == NULL))
4209 return REG_ESPACE;
4210 mctx->sub_tops = new_array;
4211 mctx->asub_tops = new_asub_tops;
4212 }
4213 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4214 if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4215 return REG_ESPACE;
4216 mctx->sub_tops[mctx->nsub_tops]->node = node;
4217 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4218 return REG_NOERROR;
4219 }
4220
4221 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4222 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4223 Return the new entry if successful, NULL if memory is exhausted. */
4224
4225 static re_sub_match_last_t *
4226 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4227 {
4228 re_sub_match_last_t *new_entry;
4229 if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4230 {
4231 Idx new_alasts = 2 * subtop->alasts + 1;
4232 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4233 re_sub_match_last_t *,
4234 new_alasts);
4235 if (__glibc_unlikely (new_array == NULL))
4236 return NULL;
4237 subtop->lasts = new_array;
4238 subtop->alasts = new_alasts;
4239 }
4240 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4241 if (__glibc_likely (new_entry != NULL))
4242 {
4243 subtop->lasts[subtop->nlasts] = new_entry;
4244 new_entry->node = node;
4245 new_entry->str_idx = str_idx;
4246 ++subtop->nlasts;
4247 }
4248 return new_entry;
4249 }
4250
4251 static void
4252 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4253 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4254 {
4255 sctx->sifted_states = sifted_sts;
4256 sctx->limited_states = limited_sts;
4257 sctx->last_node = last_node;
4258 sctx->last_str_idx = last_str_idx;
4259 re_node_set_init_empty (&sctx->limits);
4260 }
4261