1
2 /* this is very old and has massive numbers of compiler warnings. */
3 /* Maybe try upgrading to a newer version such as */
4 /* https://fossies.org/dox/glibc-2.24/regexec_8c_source.html */
5
6
7 /* Extended regular expression matching and search library.
8 Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc.
9 This file is part of the GNU C Library.
10 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
11
12 The GNU C Library is free software; you can redistribute it and/or
13 modify it under the terms of the GNU Lesser General Public
14 License as published by the Free Software Foundation; either
15 version 2.1 of the License, or (at your option) any later version.
16
17 The GNU C Library is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 Lesser General Public License for more details.
21
22 You should have received a copy of the GNU Lesser General Public
23 License along with the GNU C Library; if not, write to the Free
24 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
25 02111-1307 USA. */
26
27 /* Added 12/2016 to remove warning: */
28 /* incompatible implicit declaration of built-in function 'alloca' */
29 #if __WIN32__
30 #include <malloc.h>
31 #else
32 #include <alloca.h>
33 #endif
34
35 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
36 int n) internal_function;
37 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
38 static void match_ctx_free (re_match_context_t *cache) internal_function;
39 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
40 int str_idx, int from, int to)
41 internal_function;
42 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
43 internal_function;
44 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
45 int str_idx) internal_function;
46 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
47 int node, int str_idx)
48 internal_function;
49 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
50 re_dfastate_t **limited_sts, int last_node,
51 int last_str_idx)
52 internal_function;
53 static reg_errcode_t re_search_internal (const regex_t *preg,
54 const char *string, int length,
55 int start, int range, int stop,
56 size_t nmatch, regmatch_t pmatch[],
57 int eflags) internal_function;
58 static int re_search_2_stub (struct re_pattern_buffer *bufp,
59 const char *string1, int length1,
60 const char *string2, int length2,
61 int start, int range, struct re_registers *regs,
62 int stop, int ret_len) internal_function;
63 static int re_search_stub (struct re_pattern_buffer *bufp,
64 const char *string, int length, int start,
65 int range, int stop, struct re_registers *regs,
66 int ret_len) internal_function;
67 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
68 int nregs, int regs_allocated) internal_function;
69 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
70 internal_function;
71 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
72 int *p_match_first) internal_function;
73 static int check_halt_state_context (const re_match_context_t *mctx,
74 const re_dfastate_t *state, int idx)
75 internal_function;
76 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
77 regmatch_t *prev_idx_match, int cur_node,
78 int cur_idx, int nmatch) internal_function;
79 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
80 int str_idx, int dest_node, int nregs,
81 regmatch_t *regs,
82 re_node_set *eps_via_nodes)
83 internal_function;
84 static reg_errcode_t set_regs (const regex_t *preg,
85 const re_match_context_t *mctx,
86 size_t nmatch, regmatch_t *pmatch,
87 int fl_backtrack) internal_function;
88 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
89 internal_function;
90
91 #ifdef RE_ENABLE_I18N
92 static int sift_states_iter_mb (const re_match_context_t *mctx,
93 re_sift_context_t *sctx,
94 int node_idx, int str_idx, int max_str_idx)
95 internal_function;
96 #endif /* RE_ENABLE_I18N */
97 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
98 re_sift_context_t *sctx)
99 internal_function;
100 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
101 re_sift_context_t *sctx, int str_idx,
102 re_node_set *cur_dest)
103 internal_function;
104 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
105 re_sift_context_t *sctx,
106 int str_idx,
107 re_node_set *dest_nodes)
108 internal_function;
109 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
110 re_node_set *dest_nodes,
111 const re_node_set *candidates)
112 internal_function;
113 static int check_dst_limits (const re_match_context_t *mctx,
114 re_node_set *limits,
115 int dst_node, int dst_idx, int src_node,
116 int src_idx) internal_function;
117 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
118 int boundaries, int subexp_idx,
119 int from_node, int bkref_idx)
120 internal_function;
121 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
122 int limit, int subexp_idx,
123 int node, int str_idx,
124 int bkref_idx) internal_function;
125 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
126 re_node_set *dest_nodes,
127 const re_node_set *candidates,
128 re_node_set *limits,
129 struct re_backref_cache_entry *bkref_ents,
130 int str_idx) internal_function;
131 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
132 re_sift_context_t *sctx,
133 int str_idx, const re_node_set *candidates)
134 internal_function;
135 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
136 re_dfastate_t **dst,
137 re_dfastate_t **src, int num)
138 internal_function;
139 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
140 re_match_context_t *mctx) internal_function;
141 static re_dfastate_t *transit_state (reg_errcode_t *err,
142 re_match_context_t *mctx,
143 re_dfastate_t *state) internal_function;
144 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
145 re_match_context_t *mctx,
146 re_dfastate_t *next_state)
147 internal_function;
148 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
149 re_node_set *cur_nodes,
150 int str_idx) internal_function;
151 #if 0
152 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
153 re_match_context_t *mctx,
154 re_dfastate_t *pstate)
155 internal_function;
156 #endif
157 #ifdef RE_ENABLE_I18N
158 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
159 re_dfastate_t *pstate)
160 internal_function;
161 #endif /* RE_ENABLE_I18N */
162 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
163 const re_node_set *nodes)
164 internal_function;
165 static reg_errcode_t get_subexp (re_match_context_t *mctx,
166 int bkref_node, int bkref_str_idx)
167 internal_function;
168 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
169 const re_sub_match_top_t *sub_top,
170 re_sub_match_last_t *sub_last,
171 int bkref_node, int bkref_str)
172 internal_function;
173 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
174 int subexp_idx, int type) internal_function;
175 static reg_errcode_t check_arrival (re_match_context_t *mctx,
176 state_array_t *path, int top_node,
177 int top_str, int last_node, int last_str,
178 int type) internal_function;
179 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
180 int str_idx,
181 re_node_set *cur_nodes,
182 re_node_set *next_nodes)
183 internal_function;
184 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
185 re_node_set *cur_nodes,
186 int ex_subexp, int type)
187 internal_function;
188 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
189 re_node_set *dst_nodes,
190 int target, int ex_subexp,
191 int type) internal_function;
192 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
193 re_node_set *cur_nodes, int cur_str,
194 int subexp_num, int type)
195 internal_function;
196 static int build_trtable (const re_dfa_t *dfa,
197 re_dfastate_t *state) internal_function;
198 #ifdef RE_ENABLE_I18N
199 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
200 const re_string_t *input, int idx)
201 internal_function;
202 # ifdef _LIBC
203 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
204 size_t name_len)
205 internal_function;
206 # endif /* _LIBC */
207 #endif /* RE_ENABLE_I18N */
208 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
209 const re_dfastate_t *state,
210 re_node_set *states_node,
211 bitset_t *states_ch) internal_function;
212 static int check_node_accept (const re_match_context_t *mctx,
213 const re_token_t *node, int idx)
214 internal_function;
215 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
216 internal_function;
217
218 /* Entry point for POSIX code. */
219
220 /* regexec searches for a given pattern, specified by PREG, in the
221 string STRING.
222
223 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
224 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
225 least NMATCH elements, and we set them to the offsets of the
226 corresponding matched substrings.
227
228 EFLAGS specifies `execution flags' which affect matching: if
229 REG_NOTBOL is set, then ^ does not match at the beginning of the
230 string; if REG_NOTEOL is set, then $ does not match at the end.
231
232 We return 0 if we find a match and REG_NOMATCH if not. */
233
234 int
regexec(preg,string,nmatch,pmatch,eflags)235 regexec (preg, string, nmatch, pmatch, eflags)
236 const regex_t *__restrict preg;
237 const char *__restrict string;
238 size_t nmatch;
239 regmatch_t pmatch[];
240 int eflags;
241 {
242 reg_errcode_t err;
243 int start, length;
244 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
245 (void)dfa;
246
247 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
248 return REG_BADPAT;
249
250 if (eflags & REG_STARTEND)
251 {
252 start = pmatch[0].rm_so;
253 length = pmatch[0].rm_eo;
254 }
255 else
256 {
257 start = 0;
258 length = strlen (string);
259 }
260
261 __libc_lock_lock (dfa->lock);
262 if (preg->no_sub)
263 err = re_search_internal (preg, string, length, start, length - start,
264 length, 0, NULL, eflags);
265 else
266 err = re_search_internal (preg, string, length, start, length - start,
267 length, nmatch, pmatch, eflags);
268 __libc_lock_unlock (dfa->lock);
269 return err != REG_NOERROR;
270 }
271
272 #ifdef _LIBC
273 # include <shlib-compat.h>
274 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
275
276 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
277 __typeof__ (__regexec) __compat_regexec;
278
279 int
280 attribute_compat_text_section
__compat_regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)281 __compat_regexec (const regex_t *__restrict preg,
282 const char *__restrict string, size_t nmatch,
283 regmatch_t pmatch[], int eflags)
284 {
285 return regexec (preg, string, nmatch, pmatch,
286 eflags & (REG_NOTBOL | REG_NOTEOL));
287 }
288 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
289 # endif
290 #endif
291
292 /* Entry points for GNU code. */
293
294 /* re_match, re_search, re_match_2, re_search_2
295
296 The former two functions operate on STRING with length LENGTH,
297 while the later two operate on concatenation of STRING1 and STRING2
298 with lengths LENGTH1 and LENGTH2, respectively.
299
300 re_match() matches the compiled pattern in BUFP against the string,
301 starting at index START.
302
303 re_search() first tries matching at index START, then it tries to match
304 starting from index START + 1, and so on. The last start position tried
305 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
306 way as re_match().)
307
308 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
309 the first STOP characters of the concatenation of the strings should be
310 concerned.
311
312 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
313 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
314 computed relative to the concatenation, not relative to the individual
315 strings.)
316
317 On success, re_match* functions return the length of the match, re_search*
318 return the position of the start of the match. Return value -1 means no
319 match was found and -2 indicates an internal error. */
320
321 int
re_match(bufp,string,length,start,regs)322 re_match (bufp, string, length, start, regs)
323 struct re_pattern_buffer *bufp;
324 const char *string;
325 int length, start;
326 struct re_registers *regs;
327 {
328 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
329 }
330 #ifdef _LIBC
331 weak_alias (__re_match, re_match)
332 #endif
333
334 int
335 re_search (bufp, string, length, start, range, regs)
336 struct re_pattern_buffer *bufp;
337 const char *string;
338 int length, start, range;
339 struct re_registers *regs;
340 {
341 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
342 }
343 #ifdef _LIBC
344 weak_alias (__re_search, re_search)
345 #endif
346
347 int
348 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
349 struct re_pattern_buffer *bufp;
350 const char *string1, *string2;
351 int length1, length2, start, stop;
352 struct re_registers *regs;
353 {
354 return re_search_2_stub (bufp, string1, length1, string2, length2,
355 start, 0, regs, stop, 1);
356 }
357 #ifdef _LIBC
358 weak_alias (__re_match_2, re_match_2)
359 #endif
360
361 int
362 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
363 struct re_pattern_buffer *bufp;
364 const char *string1, *string2;
365 int length1, length2, start, range, stop;
366 struct re_registers *regs;
367 {
368 return re_search_2_stub (bufp, string1, length1, string2, length2,
369 start, range, regs, stop, 0);
370 }
371 #ifdef _LIBC
372 weak_alias (__re_search_2, re_search_2)
373 #endif
374
375 static int
376 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
377 stop, ret_len)
378 struct re_pattern_buffer *bufp;
379 const char *string1, *string2;
380 int length1, length2, start, range, stop, ret_len;
381 struct re_registers *regs;
382 {
383 const char *str;
384 int rval;
385 int len = length1 + length2;
386 int free_str = 0;
387
388 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
389 return -2;
390
391 /* Concatenate the strings. */
392 if (length2 > 0)
393 if (length1 > 0)
394 {
395 char *s = re_malloc (char, len);
396
397 if (BE (s == NULL, 0))
398 return -2;
399 #ifdef _LIBC
400 memcpy (__mempcpy (s, string1, length1), string2, length2);
401 #else
402 memcpy (s, string1, length1);
403 memcpy (s + length1, string2, length2);
404 #endif
405 str = s;
406 free_str = 1;
407 }
408 else
409 str = string2;
410 else
411 str = string1;
412
413 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
414 ret_len);
415 if (free_str)
416 re_free ((char *) str);
417 return rval;
418 }
419
420 /* The parameters have the same meaning as those of re_search.
421 Additional parameters:
422 If RET_LEN is nonzero the length of the match is returned (re_match style);
423 otherwise the position of the match is returned. */
424
425 static int
re_search_stub(bufp,string,length,start,range,stop,regs,ret_len)426 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
427 struct re_pattern_buffer *bufp;
428 const char *string;
429 int length, start, range, stop, ret_len;
430 struct re_registers *regs;
431 {
432 reg_errcode_t result;
433 regmatch_t *pmatch;
434 int nregs, rval;
435 int eflags = 0;
436 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
437 (void)dfa;
438
439 /* Check for out-of-range. */
440 if (BE (start < 0 || start > length, 0))
441 return -1;
442 if (BE (start + range > length, 0))
443 range = length - start;
444 else if (BE (start + range < 0, 0))
445 range = -start;
446
447 __libc_lock_lock (dfa->lock);
448
449 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
450 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
451
452 /* Compile fastmap if we haven't yet. */
453 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
454 re_compile_fastmap (bufp);
455
456 if (BE (bufp->no_sub, 0))
457 regs = NULL;
458
459 /* We need at least 1 register. */
460 if (regs == NULL)
461 nregs = 1;
462 else if (BE (bufp->regs_allocated == REGS_FIXED &&
463 regs->num_regs < bufp->re_nsub + 1, 0))
464 {
465 nregs = regs->num_regs;
466 if (BE (nregs < 1, 0))
467 {
468 /* Nothing can be copied to regs. */
469 regs = NULL;
470 nregs = 1;
471 }
472 }
473 else
474 nregs = bufp->re_nsub + 1;
475 pmatch = re_malloc (regmatch_t, nregs);
476 if (BE (pmatch == NULL, 0))
477 {
478 rval = -2;
479 goto out;
480 }
481
482 result = re_search_internal (bufp, string, length, start, range, stop,
483 nregs, pmatch, eflags);
484
485 rval = 0;
486
487 /* I hope we needn't fill ther regs with -1's when no match was found. */
488 if (result != REG_NOERROR)
489 rval = -1;
490 else if (regs != NULL)
491 {
492 /* If caller wants register contents data back, copy them. */
493 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
494 bufp->regs_allocated);
495 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
496 rval = -2;
497 }
498
499 if (BE (rval == 0, 1))
500 {
501 if (ret_len)
502 {
503 assert (pmatch[0].rm_so == start);
504 rval = pmatch[0].rm_eo - start;
505 }
506 else
507 rval = pmatch[0].rm_so;
508 }
509 re_free (pmatch);
510 out:
511 __libc_lock_unlock (dfa->lock);
512 return rval;
513 }
514
515 static unsigned
re_copy_regs(regs,pmatch,nregs,regs_allocated)516 re_copy_regs (regs, pmatch, nregs, regs_allocated)
517 struct re_registers *regs;
518 regmatch_t *pmatch;
519 int nregs, regs_allocated;
520 {
521 int rval = REGS_REALLOCATE;
522 int i;
523 int need_regs = nregs + 1;
524 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
525 uses. */
526
527 /* Have the register data arrays been allocated? */
528 if (regs_allocated == REGS_UNALLOCATED)
529 { /* No. So allocate them with malloc. */
530 regs->start = re_malloc (regoff_t, need_regs);
531 regs->end = re_malloc (regoff_t, need_regs);
532 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
533 return REGS_UNALLOCATED;
534 regs->num_regs = need_regs;
535 }
536 else if (regs_allocated == REGS_REALLOCATE)
537 { /* Yes. If we need more elements than were already
538 allocated, reallocate them. If we need fewer, just
539 leave it alone. */
540 if (BE (need_regs > regs->num_regs, 0))
541 {
542 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
543 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
544 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
545 return REGS_UNALLOCATED;
546 regs->start = new_start;
547 regs->end = new_end;
548 regs->num_regs = need_regs;
549 }
550 }
551 else
552 {
553 assert (regs_allocated == REGS_FIXED);
554 /* This function may not be called with REGS_FIXED and nregs too big. */
555 assert (regs->num_regs >= nregs);
556 rval = REGS_FIXED;
557 }
558
559 /* Copy the regs. */
560 for (i = 0; i < nregs; ++i)
561 {
562 regs->start[i] = pmatch[i].rm_so;
563 regs->end[i] = pmatch[i].rm_eo;
564 }
565 for ( ; i < regs->num_regs; ++i)
566 regs->start[i] = regs->end[i] = -1;
567
568 return rval;
569 }
570
571 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
572 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
573 this memory for recording register information. STARTS and ENDS
574 must be allocated using the malloc library routine, and must each
575 be at least NUM_REGS * sizeof (regoff_t) bytes long.
576
577 If NUM_REGS == 0, then subsequent matches should allocate their own
578 register data.
579
580 Unless this function is called, the first search or match using
581 PATTERN_BUFFER will allocate its own register data, without
582 freeing the old data. */
583
584 void
re_set_registers(bufp,regs,num_regs,starts,ends)585 re_set_registers (bufp, regs, num_regs, starts, ends)
586 struct re_pattern_buffer *bufp;
587 struct re_registers *regs;
588 unsigned num_regs;
589 regoff_t *starts, *ends;
590 {
591 if (num_regs)
592 {
593 bufp->regs_allocated = REGS_REALLOCATE;
594 regs->num_regs = num_regs;
595 regs->start = starts;
596 regs->end = ends;
597 }
598 else
599 {
600 bufp->regs_allocated = REGS_UNALLOCATED;
601 regs->num_regs = 0;
602 regs->start = regs->end = (regoff_t *) 0;
603 }
604 }
605 #ifdef _LIBC
606 weak_alias (__re_set_registers, re_set_registers)
607 #endif
608
609 /* Entry points compatible with 4.2 BSD regex library. We don't define
610 them unless specifically requested. */
611
612 #if defined _REGEX_RE_COMP || defined _LIBC
613 int
614 # ifdef _LIBC
615 weak_function
616 # endif
617 re_exec (s)
618 const char *s;
619 {
620 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
621 }
622 #endif /* _REGEX_RE_COMP */
623
624 /* Internal entry point. */
625
626 /* Searches for a compiled pattern PREG in the string STRING, whose
627 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
628 mingings with regexec. START, and RANGE have the same meanings
629 with re_search.
630 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
631 otherwise return the error code.
632 Note: We assume front end functions already check ranges.
633 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
634
635 static reg_errcode_t
re_search_internal(preg,string,length,start,range,stop,nmatch,pmatch,eflags)636 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
637 eflags)
638 const regex_t *preg;
639 const char *string;
640 int length, start, range, stop, eflags;
641 size_t nmatch;
642 regmatch_t pmatch[];
643 {
644 reg_errcode_t err;
645 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
646 int left_lim, right_lim, incr;
647 int fl_longest_match, match_first, match_kind, match_last = -1;
648 int extra_nmatch;
649 int sb, ch;
650 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
651 re_match_context_t mctx = { .dfa = dfa };
652 #else
653 re_match_context_t mctx;
654 #endif
655 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
656 && range && !preg->can_be_null) ? preg->fastmap : NULL;
657 RE_TRANSLATE_TYPE t = preg->translate;
658
659 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
660 memset (&mctx, '\0', sizeof (re_match_context_t));
661 mctx.dfa = dfa;
662 #endif
663
664 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
665 nmatch -= extra_nmatch;
666
667 /* Check if the DFA haven't been compiled. */
668 if (BE (preg->used == 0 || dfa->init_state == NULL
669 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
670 || dfa->init_state_begbuf == NULL, 0))
671 return REG_NOMATCH;
672
673 #ifdef DEBUG
674 /* We assume front-end functions already check them. */
675 assert (start + range >= 0 && start + range <= length);
676 #endif
677
678 /* If initial states with non-begbuf contexts have no elements,
679 the regex must be anchored. If preg->newline_anchor is set,
680 we'll never use init_state_nl, so do not check it. */
681 if (dfa->init_state->nodes.nelem == 0
682 && dfa->init_state_word->nodes.nelem == 0
683 && (dfa->init_state_nl->nodes.nelem == 0
684 || !preg->newline_anchor))
685 {
686 if (start != 0 && start + range != 0)
687 return REG_NOMATCH;
688 start = range = 0;
689 }
690
691 /* We must check the longest matching, if nmatch > 0. */
692 fl_longest_match = (nmatch != 0 || dfa->nbackref);
693
694 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
695 preg->translate, preg->syntax & RE_ICASE, dfa);
696 if (BE (err != REG_NOERROR, 0))
697 goto free_return;
698 mctx.input.stop = stop;
699 mctx.input.raw_stop = stop;
700 mctx.input.newline_anchor = preg->newline_anchor;
701
702 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
703 if (BE (err != REG_NOERROR, 0))
704 goto free_return;
705
706 /* We will log all the DFA states through which the dfa pass,
707 if nmatch > 1, or this dfa has "multibyte node", which is a
708 back-reference or a node which can accept multibyte character or
709 multi character collating element. */
710 if (nmatch > 1 || dfa->has_mb_node)
711 {
712 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
713 if (BE (mctx.state_log == NULL, 0))
714 {
715 err = REG_ESPACE;
716 goto free_return;
717 }
718 }
719 else
720 mctx.state_log = NULL;
721
722 match_first = start;
723 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
724 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
725
726 /* Check incrementally whether of not the input string match. */
727 incr = (range < 0) ? -1 : 1;
728 left_lim = (range < 0) ? start + range : start;
729 right_lim = (range < 0) ? start : start + range;
730 sb = dfa->mb_cur_max == 1;
731 match_kind =
732 (fastmap
733 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
734 | (range >= 0 ? 2 : 0)
735 | (t != NULL ? 1 : 0))
736 : 8);
737
738 for (;; match_first += incr)
739 {
740 err = REG_NOMATCH;
741 if (match_first < left_lim || right_lim < match_first)
742 goto free_return;
743
744 /* Advance as rapidly as possible through the string, until we
745 find a plausible place to start matching. This may be done
746 with varying efficiency, so there are various possibilities:
747 only the most common of them are specialized, in order to
748 save on code size. We use a switch statement for speed. */
749 switch (match_kind)
750 {
751 case 8:
752 /* No fastmap. */
753 break;
754
755 case 7:
756 /* Fastmap with single-byte translation, match forward. */
757 while (BE (match_first < right_lim, 1)
758 && !fastmap[t[(unsigned char) string[match_first]]])
759 ++match_first;
760 goto forward_match_found_start_or_reached_end;
761
762 case 6:
763 /* Fastmap without translation, match forward. */
764 while (BE (match_first < right_lim, 1)
765 && !fastmap[(unsigned char) string[match_first]])
766 ++match_first;
767
768 forward_match_found_start_or_reached_end:
769 if (BE (match_first == right_lim, 0))
770 {
771 ch = match_first >= length
772 ? 0 : (unsigned char) string[match_first];
773 if (!fastmap[t ? t[ch] : ch])
774 goto free_return;
775 }
776 break;
777
778 case 4:
779 case 5:
780 /* Fastmap without multi-byte translation, match backwards. */
781 while (match_first >= left_lim)
782 {
783 ch = match_first >= length
784 ? 0 : (unsigned char) string[match_first];
785 if (fastmap[t ? t[ch] : ch])
786 break;
787 --match_first;
788 }
789 if (match_first < left_lim)
790 goto free_return;
791 break;
792
793 default:
794 /* In this case, we can't determine easily the current byte,
795 since it might be a component byte of a multibyte
796 character. Then we use the constructed buffer instead. */
797 for (;;)
798 {
799 /* If MATCH_FIRST is out of the valid range, reconstruct the
800 buffers. */
801 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
802 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
803 {
804 err = re_string_reconstruct (&mctx.input, match_first,
805 eflags);
806 if (BE (err != REG_NOERROR, 0))
807 goto free_return;
808
809 offset = match_first - mctx.input.raw_mbs_idx;
810 }
811 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
812 Note that MATCH_FIRST must not be smaller than 0. */
813 ch = (match_first >= length
814 ? 0 : re_string_byte_at (&mctx.input, offset));
815 if (fastmap[ch])
816 break;
817 match_first += incr;
818 if (match_first < left_lim || match_first > right_lim)
819 {
820 err = REG_NOMATCH;
821 goto free_return;
822 }
823 }
824 break;
825 }
826
827 /* Reconstruct the buffers so that the matcher can assume that
828 the matching starts from the beginning of the buffer. */
829 err = re_string_reconstruct (&mctx.input, match_first, eflags);
830 if (BE (err != REG_NOERROR, 0))
831 goto free_return;
832
833 #ifdef RE_ENABLE_I18N
834 /* Don't consider this char as a possible match start if it part,
835 yet isn't the head, of a multibyte character. */
836 if (!sb && !re_string_first_byte (&mctx.input, 0))
837 continue;
838 #endif
839
840 /* It seems to be appropriate one, then use the matcher. */
841 /* We assume that the matching starts from 0. */
842 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
843 match_last = check_matching (&mctx, fl_longest_match,
844 range >= 0 ? &match_first : NULL);
845 if (match_last != -1)
846 {
847 if (BE (match_last == -2, 0))
848 {
849 err = REG_ESPACE;
850 goto free_return;
851 }
852 else
853 {
854 mctx.match_last = match_last;
855 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
856 {
857 re_dfastate_t *pstate = mctx.state_log[match_last];
858 mctx.last_node = check_halt_state_context (&mctx, pstate,
859 match_last);
860 }
861 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
862 || dfa->nbackref)
863 {
864 err = prune_impossible_nodes (&mctx);
865 if (err == REG_NOERROR)
866 break;
867 if (BE (err != REG_NOMATCH, 0))
868 goto free_return;
869 match_last = -1;
870 }
871 else
872 break; /* We found a match. */
873 }
874 }
875
876 match_ctx_clean (&mctx);
877 }
878
879 #ifdef DEBUG
880 assert (match_last != -1);
881 assert (err == REG_NOERROR);
882 #endif
883
884 /* Set pmatch[] if we need. */
885 if (nmatch > 0)
886 {
887 int reg_idx;
888
889 /* Initialize registers. */
890 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
891 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
892
893 /* Set the points where matching start/end. */
894 pmatch[0].rm_so = 0;
895 pmatch[0].rm_eo = mctx.match_last;
896
897 if (!preg->no_sub && nmatch > 1)
898 {
899 err = set_regs (preg, &mctx, nmatch, pmatch,
900 dfa->has_plural_match && dfa->nbackref > 0);
901 if (BE (err != REG_NOERROR, 0))
902 goto free_return;
903 }
904
905 /* At last, add the offset to the each registers, since we slided
906 the buffers so that we could assume that the matching starts
907 from 0. */
908 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
909 if (pmatch[reg_idx].rm_so != -1)
910 {
911 #ifdef RE_ENABLE_I18N
912 if (BE (mctx.input.offsets_needed != 0, 0))
913 {
914 pmatch[reg_idx].rm_so =
915 (pmatch[reg_idx].rm_so == mctx.input.valid_len
916 ? mctx.input.valid_raw_len
917 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
918 pmatch[reg_idx].rm_eo =
919 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
920 ? mctx.input.valid_raw_len
921 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
922 }
923 #else
924 assert (mctx.input.offsets_needed == 0);
925 #endif
926 pmatch[reg_idx].rm_so += match_first;
927 pmatch[reg_idx].rm_eo += match_first;
928 }
929 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
930 {
931 pmatch[nmatch + reg_idx].rm_so = -1;
932 pmatch[nmatch + reg_idx].rm_eo = -1;
933 }
934
935 if (dfa->subexp_map)
936 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
937 if (dfa->subexp_map[reg_idx] != reg_idx)
938 {
939 pmatch[reg_idx + 1].rm_so
940 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
941 pmatch[reg_idx + 1].rm_eo
942 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
943 }
944 }
945
946 free_return:
947 re_free (mctx.state_log);
948 if (dfa->nbackref)
949 match_ctx_free (&mctx);
950 re_string_destruct (&mctx.input);
951 return err;
952 }
953
954 static reg_errcode_t
prune_impossible_nodes(mctx)955 prune_impossible_nodes (mctx)
956 re_match_context_t *mctx;
957 {
958 const re_dfa_t *const dfa = mctx->dfa;
959 int halt_node, match_last;
960 reg_errcode_t ret;
961 re_dfastate_t **sifted_states;
962 re_dfastate_t **lim_states = NULL;
963 re_sift_context_t sctx;
964 #ifdef DEBUG
965 assert (mctx->state_log != NULL);
966 #endif
967 match_last = mctx->match_last;
968 halt_node = mctx->last_node;
969 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
970 if (BE (sifted_states == NULL, 0))
971 {
972 ret = REG_ESPACE;
973 goto free_return;
974 }
975 if (dfa->nbackref)
976 {
977 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
978 if (BE (lim_states == NULL, 0))
979 {
980 ret = REG_ESPACE;
981 goto free_return;
982 }
983 while (1)
984 {
985 memset (lim_states, '\0',
986 sizeof (re_dfastate_t *) * (match_last + 1));
987 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
988 match_last);
989 ret = sift_states_backward (mctx, &sctx);
990 re_node_set_free (&sctx.limits);
991 if (BE (ret != REG_NOERROR, 0))
992 goto free_return;
993 if (sifted_states[0] != NULL || lim_states[0] != NULL)
994 break;
995 do
996 {
997 --match_last;
998 if (match_last < 0)
999 {
1000 ret = REG_NOMATCH;
1001 goto free_return;
1002 }
1003 } while (mctx->state_log[match_last] == NULL
1004 || !mctx->state_log[match_last]->halt);
1005 halt_node = check_halt_state_context (mctx,
1006 mctx->state_log[match_last],
1007 match_last);
1008 }
1009 ret = merge_state_array (dfa, sifted_states, lim_states,
1010 match_last + 1);
1011 re_free (lim_states);
1012 lim_states = NULL;
1013 if (BE (ret != REG_NOERROR, 0))
1014 goto free_return;
1015 }
1016 else
1017 {
1018 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1019 ret = sift_states_backward (mctx, &sctx);
1020 re_node_set_free (&sctx.limits);
1021 if (BE (ret != REG_NOERROR, 0))
1022 goto free_return;
1023 }
1024 re_free (mctx->state_log);
1025 mctx->state_log = sifted_states;
1026 sifted_states = NULL;
1027 mctx->last_node = halt_node;
1028 mctx->match_last = match_last;
1029 ret = REG_NOERROR;
1030 free_return:
1031 re_free (sifted_states);
1032 re_free (lim_states);
1033 return ret;
1034 }
1035
1036 /* Acquire an initial state and return it.
1037 We must select appropriate initial state depending on the context,
1038 since initial states may have constraints like "\<", "^", etc.. */
1039
1040 static inline re_dfastate_t *
1041 __attribute ((always_inline)) internal_function
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,int idx)1042 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1043 int idx)
1044 {
1045 const re_dfa_t *const dfa = mctx->dfa;
1046 if (dfa->init_state->has_constraint)
1047 {
1048 unsigned int context;
1049 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1050 if (IS_WORD_CONTEXT (context))
1051 return dfa->init_state_word;
1052 else if (IS_ORDINARY_CONTEXT (context))
1053 return dfa->init_state;
1054 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1055 return dfa->init_state_begbuf;
1056 else if (IS_NEWLINE_CONTEXT (context))
1057 return dfa->init_state_nl;
1058 else if (IS_BEGBUF_CONTEXT (context))
1059 {
1060 /* It is relatively rare case, then calculate on demand. */
1061 return re_acquire_state_context (err, dfa,
1062 dfa->init_state->entrance_nodes,
1063 context);
1064 }
1065 else
1066 /* Must not happen? */
1067 return dfa->init_state;
1068 }
1069 else
1070 return dfa->init_state;
1071 }
1072
1073 /* Check whether the regular expression match input string INPUT or not,
1074 and return the index where the matching end, return -1 if not match,
1075 or return -2 in case of an error.
1076 FL_LONGEST_MATCH means we want the POSIX longest matching.
1077 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1078 next place where we may want to try matching.
1079 Note that the matcher assume that the maching starts from the current
1080 index of the buffer. */
1081
1082 static int
1083 internal_function
check_matching(re_match_context_t * mctx,int fl_longest_match,int * p_match_first)1084 check_matching (re_match_context_t *mctx, int fl_longest_match,
1085 int *p_match_first)
1086 {
1087 const re_dfa_t *const dfa = mctx->dfa;
1088 reg_errcode_t err;
1089 int match = 0;
1090 int match_last = -1;
1091 int cur_str_idx = re_string_cur_idx (&mctx->input);
1092 re_dfastate_t *cur_state;
1093 int at_init_state = p_match_first != NULL;
1094 int next_start_idx = cur_str_idx;
1095
1096 err = REG_NOERROR;
1097 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1098 /* An initial state must not be NULL (invalid). */
1099 if (BE (cur_state == NULL, 0))
1100 {
1101 assert (err == REG_ESPACE);
1102 return -2;
1103 }
1104
1105 if (mctx->state_log != NULL)
1106 {
1107 mctx->state_log[cur_str_idx] = cur_state;
1108
1109 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1110 later. E.g. Processing back references. */
1111 if (BE (dfa->nbackref, 0))
1112 {
1113 at_init_state = 0;
1114 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1115 if (BE (err != REG_NOERROR, 0))
1116 return err;
1117
1118 if (cur_state->has_backref)
1119 {
1120 err = transit_state_bkref (mctx, &cur_state->nodes);
1121 if (BE (err != REG_NOERROR, 0))
1122 return err;
1123 }
1124 }
1125 }
1126
1127 /* If the RE accepts NULL string. */
1128 if (BE (cur_state->halt, 0))
1129 {
1130 if (!cur_state->has_constraint
1131 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1132 {
1133 if (!fl_longest_match)
1134 return cur_str_idx;
1135 else
1136 {
1137 match_last = cur_str_idx;
1138 match = 1;
1139 }
1140 }
1141 }
1142
1143 while (!re_string_eoi (&mctx->input))
1144 {
1145 re_dfastate_t *old_state = cur_state;
1146 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1147
1148 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1149 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1150 && mctx->input.valid_len < mctx->input.len))
1151 {
1152 err = extend_buffers (mctx);
1153 if (BE (err != REG_NOERROR, 0))
1154 {
1155 assert (err == REG_ESPACE);
1156 return -2;
1157 }
1158 }
1159
1160 cur_state = transit_state (&err, mctx, cur_state);
1161 if (mctx->state_log != NULL)
1162 cur_state = merge_state_with_log (&err, mctx, cur_state);
1163
1164 if (cur_state == NULL)
1165 {
1166 /* Reached the invalid state or an error. Try to recover a valid
1167 state using the state log, if available and if we have not
1168 already found a valid (even if not the longest) match. */
1169 if (BE (err != REG_NOERROR, 0))
1170 return -2;
1171
1172 if (mctx->state_log == NULL
1173 || (match && !fl_longest_match)
1174 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1175 break;
1176 }
1177
1178 if (BE (at_init_state, 0))
1179 {
1180 if (old_state == cur_state)
1181 next_start_idx = next_char_idx;
1182 else
1183 at_init_state = 0;
1184 }
1185
1186 if (cur_state->halt)
1187 {
1188 /* Reached a halt state.
1189 Check the halt state can satisfy the current context. */
1190 if (!cur_state->has_constraint
1191 || check_halt_state_context (mctx, cur_state,
1192 re_string_cur_idx (&mctx->input)))
1193 {
1194 /* We found an appropriate halt state. */
1195 match_last = re_string_cur_idx (&mctx->input);
1196 match = 1;
1197
1198 /* We found a match, do not modify match_first below. */
1199 p_match_first = NULL;
1200 if (!fl_longest_match)
1201 break;
1202 }
1203 }
1204 }
1205
1206 if (p_match_first)
1207 *p_match_first += next_start_idx;
1208
1209 return match_last;
1210 }
1211
1212 /* Check NODE match the current context. */
1213
1214 static int
1215 internal_function
check_halt_node_context(const re_dfa_t * dfa,int node,unsigned int context)1216 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1217 {
1218 re_token_type_t type = dfa->nodes[node].type;
1219 unsigned int constraint = dfa->nodes[node].constraint;
1220 if (type != END_OF_RE)
1221 return 0;
1222 if (!constraint)
1223 return 1;
1224 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1225 return 0;
1226 return 1;
1227 }
1228
1229 /* Check the halt state STATE match the current context.
1230 Return 0 if not match, if the node, STATE has, is a halt node and
1231 match the context, return the node. */
1232
1233 static int
1234 internal_function
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,int idx)1235 check_halt_state_context (const re_match_context_t *mctx,
1236 const re_dfastate_t *state, int idx)
1237 {
1238 int i;
1239 unsigned int context;
1240 #ifdef DEBUG
1241 assert (state->halt);
1242 #endif
1243 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1244 for (i = 0; i < state->nodes.nelem; ++i)
1245 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1246 return state->nodes.elems[i];
1247 return 0;
1248 }
1249
1250 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1251 corresponding to the DFA).
1252 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1253 of errors. */
1254
1255 static int
1256 internal_function
proceed_next_node(const re_match_context_t * mctx,int nregs,regmatch_t * regs,int * pidx,int node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)1257 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1258 int *pidx, int node, re_node_set *eps_via_nodes,
1259 struct re_fail_stack_t *fs)
1260 {
1261 const re_dfa_t *const dfa = mctx->dfa;
1262 int i, err;
1263 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1264 {
1265 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1266 re_node_set *edests = &dfa->edests[node];
1267 int dest_node;
1268 err = re_node_set_insert (eps_via_nodes, node);
1269 if (BE (err < 0, 0))
1270 return -2;
1271 /* Pick up a valid destination, or return -1 if none is found. */
1272 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1273 {
1274 int candidate = edests->elems[i];
1275 if (!re_node_set_contains (cur_nodes, candidate))
1276 continue;
1277 if (dest_node == -1)
1278 dest_node = candidate;
1279
1280 else
1281 {
1282 /* In order to avoid infinite loop like "(a*)*", return the second
1283 epsilon-transition if the first was already considered. */
1284 if (re_node_set_contains (eps_via_nodes, dest_node))
1285 return candidate;
1286
1287 /* Otherwise, push the second epsilon-transition on the fail stack. */
1288 else if (fs != NULL
1289 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1290 eps_via_nodes))
1291 return -2;
1292
1293 /* We know we are going to exit. */
1294 break;
1295 }
1296 }
1297 return dest_node;
1298 }
1299 else
1300 {
1301 int naccepted = 0;
1302 re_token_type_t type = dfa->nodes[node].type;
1303
1304 #ifdef RE_ENABLE_I18N
1305 if (dfa->nodes[node].accept_mb)
1306 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1307 else
1308 #endif /* RE_ENABLE_I18N */
1309 if (type == OP_BACK_REF)
1310 {
1311 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1312 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1313 if (fs != NULL)
1314 {
1315 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1316 return -1;
1317 else if (naccepted)
1318 {
1319 char *buf = (char *) re_string_get_buffer (&mctx->input);
1320 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1321 naccepted) != 0)
1322 return -1;
1323 }
1324 }
1325
1326 if (naccepted == 0)
1327 {
1328 int dest_node;
1329 err = re_node_set_insert (eps_via_nodes, node);
1330 if (BE (err < 0, 0))
1331 return -2;
1332 dest_node = dfa->edests[node].elems[0];
1333 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1334 dest_node))
1335 return dest_node;
1336 }
1337 }
1338
1339 if (naccepted != 0
1340 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1341 {
1342 int dest_node = dfa->nexts[node];
1343 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1344 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1345 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1346 dest_node)))
1347 return -1;
1348 re_node_set_empty (eps_via_nodes);
1349 return dest_node;
1350 }
1351 }
1352 return -1;
1353 }
1354
1355 static reg_errcode_t
1356 internal_function
push_fail_stack(struct re_fail_stack_t * fs,int str_idx,int dest_node,int nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1357 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1358 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1359 {
1360 reg_errcode_t err;
1361 int num = fs->num++;
1362 if (fs->num == fs->alloc)
1363 {
1364 struct re_fail_stack_ent_t *new_array;
1365 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1366 * fs->alloc * 2));
1367 if (new_array == NULL)
1368 return REG_ESPACE;
1369 fs->alloc *= 2;
1370 fs->stack = new_array;
1371 }
1372 fs->stack[num].idx = str_idx;
1373 fs->stack[num].node = dest_node;
1374 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1375 if (fs->stack[num].regs == NULL)
1376 return REG_ESPACE;
1377 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1378 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1379 return err;
1380 }
1381
1382 static int
1383 internal_function
pop_fail_stack(struct re_fail_stack_t * fs,int * pidx,int nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1384 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1385 regmatch_t *regs, re_node_set *eps_via_nodes)
1386 {
1387 int num = --fs->num;
1388 assert (num >= 0);
1389 *pidx = fs->stack[num].idx;
1390 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1391 re_node_set_free (eps_via_nodes);
1392 re_free (fs->stack[num].regs);
1393 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1394 return fs->stack[num].node;
1395 }
1396
1397 /* Set the positions where the subexpressions are starts/ends to registers
1398 PMATCH.
1399 Note: We assume that pmatch[0] is already set, and
1400 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1401
1402 static reg_errcode_t
1403 internal_function
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,int fl_backtrack)1404 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1405 regmatch_t *pmatch, int fl_backtrack)
1406 {
1407 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1408 int idx, cur_node;
1409 re_node_set eps_via_nodes;
1410 struct re_fail_stack_t *fs;
1411 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1412 regmatch_t *prev_idx_match;
1413 int prev_idx_match_malloced = 0;
1414
1415 #ifdef DEBUG
1416 assert (nmatch > 1);
1417 assert (mctx->state_log != NULL);
1418 #endif
1419 if (fl_backtrack)
1420 {
1421 fs = &fs_body;
1422 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1423 if (fs->stack == NULL)
1424 return REG_ESPACE;
1425 }
1426 else
1427 fs = NULL;
1428
1429 cur_node = dfa->init_node;
1430 re_node_set_init_empty (&eps_via_nodes);
1431
1432 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1433 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1434 else
1435 {
1436 prev_idx_match = re_malloc (regmatch_t, nmatch);
1437 if (prev_idx_match == NULL)
1438 {
1439 free_fail_stack_return (fs);
1440 return REG_ESPACE;
1441 }
1442 prev_idx_match_malloced = 1;
1443 }
1444 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1445
1446 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1447 {
1448 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1449
1450 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1451 {
1452 int reg_idx;
1453 if (fs)
1454 {
1455 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1456 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1457 break;
1458 if (reg_idx == nmatch)
1459 {
1460 re_node_set_free (&eps_via_nodes);
1461 if (prev_idx_match_malloced)
1462 re_free (prev_idx_match);
1463 return free_fail_stack_return (fs);
1464 }
1465 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1466 &eps_via_nodes);
1467 }
1468 else
1469 {
1470 re_node_set_free (&eps_via_nodes);
1471 if (prev_idx_match_malloced)
1472 re_free (prev_idx_match);
1473 return REG_NOERROR;
1474 }
1475 }
1476
1477 /* Proceed to next node. */
1478 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1479 &eps_via_nodes, fs);
1480
1481 if (BE (cur_node < 0, 0))
1482 {
1483 if (BE (cur_node == -2, 0))
1484 {
1485 re_node_set_free (&eps_via_nodes);
1486 if (prev_idx_match_malloced)
1487 re_free (prev_idx_match);
1488 free_fail_stack_return (fs);
1489 return REG_ESPACE;
1490 }
1491 if (fs)
1492 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1493 &eps_via_nodes);
1494 else
1495 {
1496 re_node_set_free (&eps_via_nodes);
1497 if (prev_idx_match_malloced)
1498 re_free (prev_idx_match);
1499 return REG_NOMATCH;
1500 }
1501 }
1502 }
1503 re_node_set_free (&eps_via_nodes);
1504 if (prev_idx_match_malloced)
1505 re_free (prev_idx_match);
1506 return free_fail_stack_return (fs);
1507 }
1508
1509 static reg_errcode_t
1510 internal_function
free_fail_stack_return(struct re_fail_stack_t * fs)1511 free_fail_stack_return (struct re_fail_stack_t *fs)
1512 {
1513 if (fs)
1514 {
1515 int fs_idx;
1516 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1517 {
1518 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1519 re_free (fs->stack[fs_idx].regs);
1520 }
1521 re_free (fs->stack);
1522 }
1523 return REG_NOERROR;
1524 }
1525
1526 static void
1527 internal_function
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,int cur_node,int cur_idx,int nmatch)1528 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1529 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1530 {
1531 int type = dfa->nodes[cur_node].type;
1532 if (type == OP_OPEN_SUBEXP)
1533 {
1534 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1535
1536 /* We are at the first node of this sub expression. */
1537 if (reg_num < nmatch)
1538 {
1539 pmatch[reg_num].rm_so = cur_idx;
1540 pmatch[reg_num].rm_eo = -1;
1541 }
1542 }
1543 else if (type == OP_CLOSE_SUBEXP)
1544 {
1545 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1546 if (reg_num < nmatch)
1547 {
1548 /* We are at the last node of this sub expression. */
1549 if (pmatch[reg_num].rm_so < cur_idx)
1550 {
1551 pmatch[reg_num].rm_eo = cur_idx;
1552 /* This is a non-empty match or we are not inside an optional
1553 subexpression. Accept this right away. */
1554 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1555 }
1556 else
1557 {
1558 if (dfa->nodes[cur_node].opt_subexp
1559 && prev_idx_match[reg_num].rm_so != -1)
1560 /* We transited through an empty match for an optional
1561 subexpression, like (a?)*, and this is not the subexp's
1562 first match. Copy back the old content of the registers
1563 so that matches of an inner subexpression are undone as
1564 well, like in ((a?))*. */
1565 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1566 else
1567 /* We completed a subexpression, but it may be part of
1568 an optional one, so do not update PREV_IDX_MATCH. */
1569 pmatch[reg_num].rm_eo = cur_idx;
1570 }
1571 }
1572 }
1573 }
1574
1575 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1576 and sift the nodes in each states according to the following rules.
1577 Updated state_log will be wrote to STATE_LOG.
1578
1579 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1580 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1581 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1582 the LAST_NODE, we throw away the node `a'.
1583 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1584 string `s' and transit to `b':
1585 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1586 away the node `a'.
1587 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1588 thrown away, we throw away the node `a'.
1589 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1590 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1591 node `a'.
1592 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1593 we throw away the node `a'. */
1594
1595 #define STATE_NODE_CONTAINS(state,node) \
1596 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1597
1598 static reg_errcode_t
1599 internal_function
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)1600 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1601 {
1602 reg_errcode_t err;
1603 int null_cnt = 0;
1604 int str_idx = sctx->last_str_idx;
1605 re_node_set cur_dest;
1606
1607 #ifdef DEBUG
1608 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1609 #endif
1610
1611 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1612 transit to the last_node and the last_node itself. */
1613 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1614 if (BE (err != REG_NOERROR, 0))
1615 return err;
1616 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1617 if (BE (err != REG_NOERROR, 0))
1618 goto free_return;
1619
1620 /* Then check each states in the state_log. */
1621 while (str_idx > 0)
1622 {
1623 /* Update counters. */
1624 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1625 if (null_cnt > mctx->max_mb_elem_len)
1626 {
1627 memset (sctx->sifted_states, '\0',
1628 sizeof (re_dfastate_t *) * str_idx);
1629 re_node_set_free (&cur_dest);
1630 return REG_NOERROR;
1631 }
1632 re_node_set_empty (&cur_dest);
1633 --str_idx;
1634
1635 if (mctx->state_log[str_idx])
1636 {
1637 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1638 if (BE (err != REG_NOERROR, 0))
1639 goto free_return;
1640 }
1641
1642 /* Add all the nodes which satisfy the following conditions:
1643 - It can epsilon transit to a node in CUR_DEST.
1644 - It is in CUR_SRC.
1645 And update state_log. */
1646 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1647 if (BE (err != REG_NOERROR, 0))
1648 goto free_return;
1649 }
1650 err = REG_NOERROR;
1651 free_return:
1652 re_node_set_free (&cur_dest);
1653 return err;
1654 }
1655
1656 static reg_errcode_t
1657 internal_function
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,re_node_set * cur_dest)1658 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1659 int str_idx, re_node_set *cur_dest)
1660 {
1661 const re_dfa_t *const dfa = mctx->dfa;
1662 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1663 int i;
1664
1665 /* Then build the next sifted state.
1666 We build the next sifted state on `cur_dest', and update
1667 `sifted_states[str_idx]' with `cur_dest'.
1668 Note:
1669 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1670 `cur_src' points the node_set of the old `state_log[str_idx]'
1671 (with the epsilon nodes pre-filtered out). */
1672 for (i = 0; i < cur_src->nelem; i++)
1673 {
1674 int prev_node = cur_src->elems[i];
1675 int naccepted = 0;
1676 int ret;
1677
1678 #ifdef DEBUG
1679 re_token_type_t type = dfa->nodes[prev_node].type;
1680 assert (!IS_EPSILON_NODE (type));
1681 #endif
1682 #ifdef RE_ENABLE_I18N
1683 /* If the node may accept `multi byte'. */
1684 if (dfa->nodes[prev_node].accept_mb)
1685 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1686 str_idx, sctx->last_str_idx);
1687 #endif /* RE_ENABLE_I18N */
1688
1689 /* We don't check backreferences here.
1690 See update_cur_sifted_state(). */
1691 if (!naccepted
1692 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1693 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1694 dfa->nexts[prev_node]))
1695 naccepted = 1;
1696
1697 if (naccepted == 0)
1698 continue;
1699
1700 if (sctx->limits.nelem)
1701 {
1702 int to_idx = str_idx + naccepted;
1703 if (check_dst_limits (mctx, &sctx->limits,
1704 dfa->nexts[prev_node], to_idx,
1705 prev_node, str_idx))
1706 continue;
1707 }
1708 ret = re_node_set_insert (cur_dest, prev_node);
1709 if (BE (ret == -1, 0))
1710 return REG_ESPACE;
1711 }
1712
1713 return REG_NOERROR;
1714 }
1715
1716 /* Helper functions. */
1717
1718 static reg_errcode_t
1719 internal_function
clean_state_log_if_needed(re_match_context_t * mctx,int next_state_log_idx)1720 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1721 {
1722 int top = mctx->state_log_top;
1723
1724 if (next_state_log_idx >= mctx->input.bufs_len
1725 || (next_state_log_idx >= mctx->input.valid_len
1726 && mctx->input.valid_len < mctx->input.len))
1727 {
1728 reg_errcode_t err;
1729 err = extend_buffers (mctx);
1730 if (BE (err != REG_NOERROR, 0))
1731 return err;
1732 }
1733
1734 if (top < next_state_log_idx)
1735 {
1736 memset (mctx->state_log + top + 1, '\0',
1737 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1738 mctx->state_log_top = next_state_log_idx;
1739 }
1740 return REG_NOERROR;
1741 }
1742
1743 static reg_errcode_t
1744 internal_function
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,int num)1745 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1746 re_dfastate_t **src, int num)
1747 {
1748 int st_idx;
1749 reg_errcode_t err;
1750 for (st_idx = 0; st_idx < num; ++st_idx)
1751 {
1752 if (dst[st_idx] == NULL)
1753 dst[st_idx] = src[st_idx];
1754 else if (src[st_idx] != NULL)
1755 {
1756 re_node_set merged_set;
1757 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1758 &src[st_idx]->nodes);
1759 if (BE (err != REG_NOERROR, 0))
1760 return err;
1761 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1762 re_node_set_free (&merged_set);
1763 if (BE (err != REG_NOERROR, 0))
1764 return err;
1765 }
1766 }
1767 return REG_NOERROR;
1768 }
1769
1770 static reg_errcode_t
1771 internal_function
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,re_node_set * dest_nodes)1772 update_cur_sifted_state (const re_match_context_t *mctx,
1773 re_sift_context_t *sctx, int str_idx,
1774 re_node_set *dest_nodes)
1775 {
1776 const re_dfa_t *const dfa = mctx->dfa;
1777 reg_errcode_t err = REG_NOERROR;
1778 const re_node_set *candidates;
1779 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1780 : &mctx->state_log[str_idx]->nodes);
1781
1782 if (dest_nodes->nelem == 0)
1783 sctx->sifted_states[str_idx] = NULL;
1784 else
1785 {
1786 if (candidates)
1787 {
1788 /* At first, add the nodes which can epsilon transit to a node in
1789 DEST_NODE. */
1790 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1791 if (BE (err != REG_NOERROR, 0))
1792 return err;
1793
1794 /* Then, check the limitations in the current sift_context. */
1795 if (sctx->limits.nelem)
1796 {
1797 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1798 mctx->bkref_ents, str_idx);
1799 if (BE (err != REG_NOERROR, 0))
1800 return err;
1801 }
1802 }
1803
1804 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1805 if (BE (err != REG_NOERROR, 0))
1806 return err;
1807 }
1808
1809 if (candidates && mctx->state_log[str_idx]->has_backref)
1810 {
1811 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1812 if (BE (err != REG_NOERROR, 0))
1813 return err;
1814 }
1815 return REG_NOERROR;
1816 }
1817
1818 static reg_errcode_t
1819 internal_function
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)1820 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1821 const re_node_set *candidates)
1822 {
1823 reg_errcode_t err = REG_NOERROR;
1824 int i;
1825
1826 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1827 if (BE (err != REG_NOERROR, 0))
1828 return err;
1829
1830 if (!state->inveclosure.alloc)
1831 {
1832 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1833 if (BE (err != REG_NOERROR, 0))
1834 return REG_ESPACE;
1835 for (i = 0; i < dest_nodes->nelem; i++)
1836 re_node_set_merge (&state->inveclosure,
1837 dfa->inveclosures + dest_nodes->elems[i]);
1838 }
1839 return re_node_set_add_intersect (dest_nodes, candidates,
1840 &state->inveclosure);
1841 }
1842
1843 static reg_errcode_t
1844 internal_function
sub_epsilon_src_nodes(const re_dfa_t * dfa,int node,re_node_set * dest_nodes,const re_node_set * candidates)1845 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1846 const re_node_set *candidates)
1847 {
1848 int ecl_idx;
1849 reg_errcode_t err;
1850 re_node_set *inv_eclosure = dfa->inveclosures + node;
1851 re_node_set except_nodes;
1852 re_node_set_init_empty (&except_nodes);
1853 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1854 {
1855 int cur_node = inv_eclosure->elems[ecl_idx];
1856 if (cur_node == node)
1857 continue;
1858 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1859 {
1860 int edst1 = dfa->edests[cur_node].elems[0];
1861 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1862 ? dfa->edests[cur_node].elems[1] : -1);
1863 if ((!re_node_set_contains (inv_eclosure, edst1)
1864 && re_node_set_contains (dest_nodes, edst1))
1865 || (edst2 > 0
1866 && !re_node_set_contains (inv_eclosure, edst2)
1867 && re_node_set_contains (dest_nodes, edst2)))
1868 {
1869 err = re_node_set_add_intersect (&except_nodes, candidates,
1870 dfa->inveclosures + cur_node);
1871 if (BE (err != REG_NOERROR, 0))
1872 {
1873 re_node_set_free (&except_nodes);
1874 return err;
1875 }
1876 }
1877 }
1878 }
1879 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1880 {
1881 int cur_node = inv_eclosure->elems[ecl_idx];
1882 if (!re_node_set_contains (&except_nodes, cur_node))
1883 {
1884 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1885 re_node_set_remove_at (dest_nodes, idx);
1886 }
1887 }
1888 re_node_set_free (&except_nodes);
1889 return REG_NOERROR;
1890 }
1891
1892 static int
1893 internal_function
check_dst_limits(const re_match_context_t * mctx,re_node_set * limits,int dst_node,int dst_idx,int src_node,int src_idx)1894 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1895 int dst_node, int dst_idx, int src_node, int src_idx)
1896 {
1897 const re_dfa_t *const dfa = mctx->dfa;
1898 int lim_idx, src_pos, dst_pos;
1899
1900 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1901 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1902 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1903 {
1904 int subexp_idx;
1905 struct re_backref_cache_entry *ent;
1906 ent = mctx->bkref_ents + limits->elems[lim_idx];
1907 subexp_idx = dfa->nodes[ent->node].opr.idx;
1908
1909 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1910 subexp_idx, dst_node, dst_idx,
1911 dst_bkref_idx);
1912 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1913 subexp_idx, src_node, src_idx,
1914 src_bkref_idx);
1915
1916 /* In case of:
1917 <src> <dst> ( <subexp> )
1918 ( <subexp> ) <src> <dst>
1919 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1920 if (src_pos == dst_pos)
1921 continue; /* This is unrelated limitation. */
1922 else
1923 return 1;
1924 }
1925 return 0;
1926 }
1927
1928 static int
1929 internal_function
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,int subexp_idx,int from_node,int bkref_idx)1930 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1931 int subexp_idx, int from_node, int bkref_idx)
1932 {
1933 const re_dfa_t *const dfa = mctx->dfa;
1934 const re_node_set *eclosures = dfa->eclosures + from_node;
1935 int node_idx;
1936
1937 /* Else, we are on the boundary: examine the nodes on the epsilon
1938 closure. */
1939 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1940 {
1941 int node = eclosures->elems[node_idx];
1942 switch (dfa->nodes[node].type)
1943 {
1944 case OP_BACK_REF:
1945 if (bkref_idx != -1)
1946 {
1947 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1948 do
1949 {
1950 int dst, cpos;
1951
1952 if (ent->node != node)
1953 continue;
1954
1955 if (subexp_idx < BITSET_WORD_BITS
1956 && !(ent->eps_reachable_subexps_map
1957 & ((bitset_word_t) 1 << subexp_idx)))
1958 continue;
1959
1960 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1961 OP_CLOSE_SUBEXP cases below. But, if the
1962 destination node is the same node as the source
1963 node, don't recurse because it would cause an
1964 infinite loop: a regex that exhibits this behavior
1965 is ()\1*\1* */
1966 dst = dfa->edests[node].elems[0];
1967 if (dst == from_node)
1968 {
1969 if (boundaries & 1)
1970 return -1;
1971 else /* if (boundaries & 2) */
1972 return 0;
1973 }
1974
1975 cpos =
1976 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1977 dst, bkref_idx);
1978 if (cpos == -1 /* && (boundaries & 1) */)
1979 return -1;
1980 if (cpos == 0 && (boundaries & 2))
1981 return 0;
1982
1983 if (subexp_idx < BITSET_WORD_BITS)
1984 ent->eps_reachable_subexps_map
1985 &= ~((bitset_word_t) 1 << subexp_idx);
1986 }
1987 while (ent++->more);
1988 }
1989 break;
1990
1991 case OP_OPEN_SUBEXP:
1992 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1993 return -1;
1994 break;
1995
1996 case OP_CLOSE_SUBEXP:
1997 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1998 return 0;
1999 break;
2000
2001 default:
2002 break;
2003 }
2004 }
2005
2006 return (boundaries & 2) ? 1 : 0;
2007 }
2008
2009 static int
2010 internal_function
check_dst_limits_calc_pos(const re_match_context_t * mctx,int limit,int subexp_idx,int from_node,int str_idx,int bkref_idx)2011 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2012 int subexp_idx, int from_node, int str_idx,
2013 int bkref_idx)
2014 {
2015 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2016 int boundaries;
2017
2018 /* If we are outside the range of the subexpression, return -1 or 1. */
2019 if (str_idx < lim->subexp_from)
2020 return -1;
2021
2022 if (lim->subexp_to < str_idx)
2023 return 1;
2024
2025 /* If we are within the subexpression, return 0. */
2026 boundaries = (str_idx == lim->subexp_from);
2027 boundaries |= (str_idx == lim->subexp_to) << 1;
2028 if (boundaries == 0)
2029 return 0;
2030
2031 /* Else, examine epsilon closure. */
2032 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2033 from_node, bkref_idx);
2034 }
2035
2036 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2037 which are against limitations from DEST_NODES. */
2038
2039 static reg_errcode_t
2040 internal_function
check_subexp_limits(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,int str_idx)2041 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2042 const re_node_set *candidates, re_node_set *limits,
2043 struct re_backref_cache_entry *bkref_ents, int str_idx)
2044 {
2045 reg_errcode_t err;
2046 int node_idx, lim_idx;
2047
2048 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2049 {
2050 int subexp_idx;
2051 struct re_backref_cache_entry *ent;
2052 ent = bkref_ents + limits->elems[lim_idx];
2053
2054 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2055 continue; /* This is unrelated limitation. */
2056
2057 subexp_idx = dfa->nodes[ent->node].opr.idx;
2058 if (ent->subexp_to == str_idx)
2059 {
2060 int ops_node = -1;
2061 int cls_node = -1;
2062 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2063 {
2064 int node = dest_nodes->elems[node_idx];
2065 re_token_type_t type = dfa->nodes[node].type;
2066 if (type == OP_OPEN_SUBEXP
2067 && subexp_idx == dfa->nodes[node].opr.idx)
2068 ops_node = node;
2069 else if (type == OP_CLOSE_SUBEXP
2070 && subexp_idx == dfa->nodes[node].opr.idx)
2071 cls_node = node;
2072 }
2073
2074 /* Check the limitation of the open subexpression. */
2075 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2076 if (ops_node >= 0)
2077 {
2078 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2079 candidates);
2080 if (BE (err != REG_NOERROR, 0))
2081 return err;
2082 }
2083
2084 /* Check the limitation of the close subexpression. */
2085 if (cls_node >= 0)
2086 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2087 {
2088 int node = dest_nodes->elems[node_idx];
2089 if (!re_node_set_contains (dfa->inveclosures + node,
2090 cls_node)
2091 && !re_node_set_contains (dfa->eclosures + node,
2092 cls_node))
2093 {
2094 /* It is against this limitation.
2095 Remove it form the current sifted state. */
2096 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2097 candidates);
2098 if (BE (err != REG_NOERROR, 0))
2099 return err;
2100 --node_idx;
2101 }
2102 }
2103 }
2104 else /* (ent->subexp_to != str_idx) */
2105 {
2106 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2107 {
2108 int node = dest_nodes->elems[node_idx];
2109 re_token_type_t type = dfa->nodes[node].type;
2110 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2111 {
2112 if (subexp_idx != dfa->nodes[node].opr.idx)
2113 continue;
2114 /* It is against this limitation.
2115 Remove it form the current sifted state. */
2116 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2117 candidates);
2118 if (BE (err != REG_NOERROR, 0))
2119 return err;
2120 }
2121 }
2122 }
2123 }
2124 return REG_NOERROR;
2125 }
2126
2127 static reg_errcode_t
2128 internal_function
sift_states_bkref(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,const re_node_set * candidates)2129 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2130 int str_idx, const re_node_set *candidates)
2131 {
2132 const re_dfa_t *const dfa = mctx->dfa;
2133 reg_errcode_t err;
2134 int node_idx, node;
2135 re_sift_context_t local_sctx;
2136 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2137
2138 if (first_idx == -1)
2139 return REG_NOERROR;
2140
2141 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2142
2143 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2144 {
2145 int enabled_idx;
2146 re_token_type_t type;
2147 struct re_backref_cache_entry *entry;
2148 node = candidates->elems[node_idx];
2149 type = dfa->nodes[node].type;
2150 /* Avoid infinite loop for the REs like "()\1+". */
2151 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2152 continue;
2153 if (type != OP_BACK_REF)
2154 continue;
2155
2156 entry = mctx->bkref_ents + first_idx;
2157 enabled_idx = first_idx;
2158 do
2159 {
2160 int subexp_len;
2161 int to_idx;
2162 int dst_node;
2163 int ret;
2164 re_dfastate_t *cur_state;
2165
2166 if (entry->node != node)
2167 continue;
2168 subexp_len = entry->subexp_to - entry->subexp_from;
2169 to_idx = str_idx + subexp_len;
2170 dst_node = (subexp_len ? dfa->nexts[node]
2171 : dfa->edests[node].elems[0]);
2172
2173 if (to_idx > sctx->last_str_idx
2174 || sctx->sifted_states[to_idx] == NULL
2175 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2176 || check_dst_limits (mctx, &sctx->limits, node,
2177 str_idx, dst_node, to_idx))
2178 continue;
2179
2180 if (local_sctx.sifted_states == NULL)
2181 {
2182 local_sctx = *sctx;
2183 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2184 if (BE (err != REG_NOERROR, 0))
2185 goto free_return;
2186 }
2187 local_sctx.last_node = node;
2188 local_sctx.last_str_idx = str_idx;
2189 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2190 if (BE (ret < 0, 0))
2191 {
2192 err = REG_ESPACE;
2193 goto free_return;
2194 }
2195 cur_state = local_sctx.sifted_states[str_idx];
2196 err = sift_states_backward (mctx, &local_sctx);
2197 if (BE (err != REG_NOERROR, 0))
2198 goto free_return;
2199 if (sctx->limited_states != NULL)
2200 {
2201 err = merge_state_array (dfa, sctx->limited_states,
2202 local_sctx.sifted_states,
2203 str_idx + 1);
2204 if (BE (err != REG_NOERROR, 0))
2205 goto free_return;
2206 }
2207 local_sctx.sifted_states[str_idx] = cur_state;
2208 re_node_set_remove (&local_sctx.limits, enabled_idx);
2209
2210 /* mctx->bkref_ents may have changed, reload the pointer. */
2211 entry = mctx->bkref_ents + enabled_idx;
2212 }
2213 while (enabled_idx++, entry++->more);
2214 }
2215 err = REG_NOERROR;
2216 free_return:
2217 if (local_sctx.sifted_states != NULL)
2218 {
2219 re_node_set_free (&local_sctx.limits);
2220 }
2221
2222 return err;
2223 }
2224
2225
2226 #ifdef RE_ENABLE_I18N
2227 static int
2228 internal_function
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,int node_idx,int str_idx,int max_str_idx)2229 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2230 int node_idx, int str_idx, int max_str_idx)
2231 {
2232 const re_dfa_t *const dfa = mctx->dfa;
2233 int naccepted;
2234 /* Check the node can accept `multi byte'. */
2235 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2236 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2237 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2238 dfa->nexts[node_idx]))
2239 /* The node can't accept the `multi byte', or the
2240 destination was already thrown away, then the node
2241 could't accept the current input `multi byte'. */
2242 naccepted = 0;
2243 /* Otherwise, it is sure that the node could accept
2244 `naccepted' bytes input. */
2245 return naccepted;
2246 }
2247 #endif /* RE_ENABLE_I18N */
2248
2249
2250 /* Functions for state transition. */
2251
2252 /* Return the next state to which the current state STATE will transit by
2253 accepting the current input byte, and update STATE_LOG if necessary.
2254 If STATE can accept a multibyte char/collating element/back reference
2255 update the destination of STATE_LOG. */
2256
2257 static re_dfastate_t *
2258 internal_function
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)2259 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2260 re_dfastate_t *state)
2261 {
2262 re_dfastate_t **trtable;
2263 unsigned char ch;
2264
2265 #ifdef RE_ENABLE_I18N
2266 /* If the current state can accept multibyte. */
2267 if (BE (state->accept_mb, 0))
2268 {
2269 *err = transit_state_mb (mctx, state);
2270 if (BE (*err != REG_NOERROR, 0))
2271 return NULL;
2272 }
2273 #endif /* RE_ENABLE_I18N */
2274
2275 /* Then decide the next state with the single byte. */
2276 #if 0
2277 if (0)
2278 /* don't use transition table */
2279 return transit_state_sb (err, mctx, state);
2280 #endif
2281
2282 /* Use transition table */
2283 ch = re_string_fetch_byte (&mctx->input);
2284 for (;;)
2285 {
2286 trtable = state->trtable;
2287 if (BE (trtable != NULL, 1))
2288 return trtable[ch];
2289
2290 trtable = state->word_trtable;
2291 if (BE (trtable != NULL, 1))
2292 {
2293 unsigned int context;
2294 context
2295 = re_string_context_at (&mctx->input,
2296 re_string_cur_idx (&mctx->input) - 1,
2297 mctx->eflags);
2298 if (IS_WORD_CONTEXT (context))
2299 return trtable[ch + SBC_MAX];
2300 else
2301 return trtable[ch];
2302 }
2303
2304 if (!build_trtable (mctx->dfa, state))
2305 {
2306 *err = REG_ESPACE;
2307 return NULL;
2308 }
2309
2310 /* Retry, we now have a transition table. */
2311 }
2312 }
2313
2314 /* Update the state_log if we need */
2315 re_dfastate_t *
2316 internal_function
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)2317 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2318 re_dfastate_t *next_state)
2319 {
2320 const re_dfa_t *const dfa = mctx->dfa;
2321 int cur_idx = re_string_cur_idx (&mctx->input);
2322
2323 if (cur_idx > mctx->state_log_top)
2324 {
2325 mctx->state_log[cur_idx] = next_state;
2326 mctx->state_log_top = cur_idx;
2327 }
2328 else if (mctx->state_log[cur_idx] == 0)
2329 {
2330 mctx->state_log[cur_idx] = next_state;
2331 }
2332 else
2333 {
2334 re_dfastate_t *pstate;
2335 unsigned int context;
2336 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2337 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2338 the destination of a multibyte char/collating element/
2339 back reference. Then the next state is the union set of
2340 these destinations and the results of the transition table. */
2341 pstate = mctx->state_log[cur_idx];
2342 log_nodes = pstate->entrance_nodes;
2343 if (next_state != NULL)
2344 {
2345 table_nodes = next_state->entrance_nodes;
2346 *err = re_node_set_init_union (&next_nodes, table_nodes,
2347 log_nodes);
2348 if (BE (*err != REG_NOERROR, 0))
2349 return NULL;
2350 }
2351 else
2352 next_nodes = *log_nodes;
2353 /* Note: We already add the nodes of the initial state,
2354 then we don't need to add them here. */
2355
2356 context = re_string_context_at (&mctx->input,
2357 re_string_cur_idx (&mctx->input) - 1,
2358 mctx->eflags);
2359 next_state = mctx->state_log[cur_idx]
2360 = re_acquire_state_context (err, dfa, &next_nodes, context);
2361 /* We don't need to check errors here, since the return value of
2362 this function is next_state and ERR is already set. */
2363
2364 if (table_nodes != NULL)
2365 re_node_set_free (&next_nodes);
2366 }
2367
2368 if (BE (dfa->nbackref, 0) && next_state != NULL)
2369 {
2370 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2371 later. We must check them here, since the back references in the
2372 next state might use them. */
2373 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2374 cur_idx);
2375 if (BE (*err != REG_NOERROR, 0))
2376 return NULL;
2377
2378 /* If the next state has back references. */
2379 if (next_state->has_backref)
2380 {
2381 *err = transit_state_bkref (mctx, &next_state->nodes);
2382 if (BE (*err != REG_NOERROR, 0))
2383 return NULL;
2384 next_state = mctx->state_log[cur_idx];
2385 }
2386 }
2387
2388 return next_state;
2389 }
2390
2391 /* Skip bytes in the input that correspond to part of a
2392 multi-byte match, then look in the log for a state
2393 from which to restart matching. */
2394 re_dfastate_t *
2395 internal_function
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2396 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2397 {
2398 re_dfastate_t *cur_state;
2399 do
2400 {
2401 int max = mctx->state_log_top;
2402 int cur_str_idx = re_string_cur_idx (&mctx->input);
2403
2404 do
2405 {
2406 if (++cur_str_idx > max)
2407 return NULL;
2408 re_string_skip_bytes (&mctx->input, 1);
2409 }
2410 while (mctx->state_log[cur_str_idx] == NULL);
2411
2412 cur_state = merge_state_with_log (err, mctx, NULL);
2413 }
2414 while (*err == REG_NOERROR && cur_state == NULL);
2415 return cur_state;
2416 }
2417
2418 /* Helper functions for transit_state. */
2419
2420 /* From the node set CUR_NODES, pick up the nodes whose types are
2421 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2422 expression. And register them to use them later for evaluating the
2423 correspoding back references. */
2424
2425 static reg_errcode_t
2426 internal_function
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,int str_idx)2427 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2428 int str_idx)
2429 {
2430 const re_dfa_t *const dfa = mctx->dfa;
2431 int node_idx;
2432 reg_errcode_t err;
2433
2434 /* TODO: This isn't efficient.
2435 Because there might be more than one nodes whose types are
2436 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2437 nodes.
2438 E.g. RE: (a){2} */
2439 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2440 {
2441 int node = cur_nodes->elems[node_idx];
2442 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2443 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2444 && (dfa->used_bkref_map
2445 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2446 {
2447 err = match_ctx_add_subtop (mctx, node, str_idx);
2448 if (BE (err != REG_NOERROR, 0))
2449 return err;
2450 }
2451 }
2452 return REG_NOERROR;
2453 }
2454
2455 #if 0
2456 /* Return the next state to which the current state STATE will transit by
2457 accepting the current input byte. */
2458
2459 static re_dfastate_t *
2460 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2461 re_dfastate_t *state)
2462 {
2463 const re_dfa_t *const dfa = mctx->dfa;
2464 re_node_set next_nodes;
2465 re_dfastate_t *next_state;
2466 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2467 unsigned int context;
2468
2469 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2470 if (BE (*err != REG_NOERROR, 0))
2471 return NULL;
2472 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2473 {
2474 int cur_node = state->nodes.elems[node_cnt];
2475 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2476 {
2477 *err = re_node_set_merge (&next_nodes,
2478 dfa->eclosures + dfa->nexts[cur_node]);
2479 if (BE (*err != REG_NOERROR, 0))
2480 {
2481 re_node_set_free (&next_nodes);
2482 return NULL;
2483 }
2484 }
2485 }
2486 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2487 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2488 /* We don't need to check errors here, since the return value of
2489 this function is next_state and ERR is already set. */
2490
2491 re_node_set_free (&next_nodes);
2492 re_string_skip_bytes (&mctx->input, 1);
2493 return next_state;
2494 }
2495 #endif
2496
2497 #ifdef RE_ENABLE_I18N
2498 static reg_errcode_t
2499 internal_function
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)2500 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2501 {
2502 const re_dfa_t *const dfa = mctx->dfa;
2503 reg_errcode_t err;
2504 int i;
2505
2506 for (i = 0; i < pstate->nodes.nelem; ++i)
2507 {
2508 re_node_set dest_nodes, *new_nodes;
2509 int cur_node_idx = pstate->nodes.elems[i];
2510 int naccepted, dest_idx;
2511 unsigned int context;
2512 re_dfastate_t *dest_state;
2513
2514 if (!dfa->nodes[cur_node_idx].accept_mb)
2515 continue;
2516
2517 if (dfa->nodes[cur_node_idx].constraint)
2518 {
2519 context = re_string_context_at (&mctx->input,
2520 re_string_cur_idx (&mctx->input),
2521 mctx->eflags);
2522 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2523 context))
2524 continue;
2525 }
2526
2527 /* How many bytes the node can accept? */
2528 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2529 re_string_cur_idx (&mctx->input));
2530 if (naccepted == 0)
2531 continue;
2532
2533 /* The node can accepts `naccepted' bytes. */
2534 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2535 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2536 : mctx->max_mb_elem_len);
2537 err = clean_state_log_if_needed (mctx, dest_idx);
2538 if (BE (err != REG_NOERROR, 0))
2539 return err;
2540 #ifdef DEBUG
2541 assert (dfa->nexts[cur_node_idx] != -1);
2542 #endif
2543 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2544
2545 dest_state = mctx->state_log[dest_idx];
2546 if (dest_state == NULL)
2547 dest_nodes = *new_nodes;
2548 else
2549 {
2550 err = re_node_set_init_union (&dest_nodes,
2551 dest_state->entrance_nodes, new_nodes);
2552 if (BE (err != REG_NOERROR, 0))
2553 return err;
2554 }
2555 context = re_string_context_at (&mctx->input, dest_idx - 1,
2556 mctx->eflags);
2557 mctx->state_log[dest_idx]
2558 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2559 if (dest_state != NULL)
2560 re_node_set_free (&dest_nodes);
2561 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2562 return err;
2563 }
2564 return REG_NOERROR;
2565 }
2566 #endif /* RE_ENABLE_I18N */
2567
2568 static reg_errcode_t
2569 internal_function
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)2570 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2571 {
2572 const re_dfa_t *const dfa = mctx->dfa;
2573 reg_errcode_t err;
2574 int i;
2575 int cur_str_idx = re_string_cur_idx (&mctx->input);
2576
2577 for (i = 0; i < nodes->nelem; ++i)
2578 {
2579 int dest_str_idx, prev_nelem, bkc_idx;
2580 int node_idx = nodes->elems[i];
2581 unsigned int context;
2582 const re_token_t *node = dfa->nodes + node_idx;
2583 re_node_set *new_dest_nodes;
2584
2585 /* Check whether `node' is a backreference or not. */
2586 if (node->type != OP_BACK_REF)
2587 continue;
2588
2589 if (node->constraint)
2590 {
2591 context = re_string_context_at (&mctx->input, cur_str_idx,
2592 mctx->eflags);
2593 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2594 continue;
2595 }
2596
2597 /* `node' is a backreference.
2598 Check the substring which the substring matched. */
2599 bkc_idx = mctx->nbkref_ents;
2600 err = get_subexp (mctx, node_idx, cur_str_idx);
2601 if (BE (err != REG_NOERROR, 0))
2602 goto free_return;
2603
2604 /* And add the epsilon closures (which is `new_dest_nodes') of
2605 the backreference to appropriate state_log. */
2606 #ifdef DEBUG
2607 assert (dfa->nexts[node_idx] != -1);
2608 #endif
2609 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2610 {
2611 int subexp_len;
2612 re_dfastate_t *dest_state;
2613 struct re_backref_cache_entry *bkref_ent;
2614 bkref_ent = mctx->bkref_ents + bkc_idx;
2615 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2616 continue;
2617 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2618 new_dest_nodes = (subexp_len == 0
2619 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2620 : dfa->eclosures + dfa->nexts[node_idx]);
2621 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2622 - bkref_ent->subexp_from);
2623 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2624 mctx->eflags);
2625 dest_state = mctx->state_log[dest_str_idx];
2626 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2627 : mctx->state_log[cur_str_idx]->nodes.nelem);
2628 /* Add `new_dest_node' to state_log. */
2629 if (dest_state == NULL)
2630 {
2631 mctx->state_log[dest_str_idx]
2632 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2633 context);
2634 if (BE (mctx->state_log[dest_str_idx] == NULL
2635 && err != REG_NOERROR, 0))
2636 goto free_return;
2637 }
2638 else
2639 {
2640 re_node_set dest_nodes;
2641 err = re_node_set_init_union (&dest_nodes,
2642 dest_state->entrance_nodes,
2643 new_dest_nodes);
2644 if (BE (err != REG_NOERROR, 0))
2645 {
2646 re_node_set_free (&dest_nodes);
2647 goto free_return;
2648 }
2649 mctx->state_log[dest_str_idx]
2650 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2651 re_node_set_free (&dest_nodes);
2652 if (BE (mctx->state_log[dest_str_idx] == NULL
2653 && err != REG_NOERROR, 0))
2654 goto free_return;
2655 }
2656 /* We need to check recursively if the backreference can epsilon
2657 transit. */
2658 if (subexp_len == 0
2659 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2660 {
2661 err = check_subexp_matching_top (mctx, new_dest_nodes,
2662 cur_str_idx);
2663 if (BE (err != REG_NOERROR, 0))
2664 goto free_return;
2665 err = transit_state_bkref (mctx, new_dest_nodes);
2666 if (BE (err != REG_NOERROR, 0))
2667 goto free_return;
2668 }
2669 }
2670 }
2671 err = REG_NOERROR;
2672 free_return:
2673 return err;
2674 }
2675
2676 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2677 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2678 Note that we might collect inappropriate candidates here.
2679 However, the cost of checking them strictly here is too high, then we
2680 delay these checking for prune_impossible_nodes(). */
2681
2682 static reg_errcode_t
2683 internal_function
get_subexp(re_match_context_t * mctx,int bkref_node,int bkref_str_idx)2684 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2685 {
2686 const re_dfa_t *const dfa = mctx->dfa;
2687 int subexp_num, sub_top_idx;
2688 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2689 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2690 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2691 if (cache_idx != -1)
2692 {
2693 const struct re_backref_cache_entry *entry
2694 = mctx->bkref_ents + cache_idx;
2695 do
2696 if (entry->node == bkref_node)
2697 return REG_NOERROR; /* We already checked it. */
2698 while (entry++->more);
2699 }
2700
2701 subexp_num = dfa->nodes[bkref_node].opr.idx;
2702
2703 /* For each sub expression */
2704 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2705 {
2706 reg_errcode_t err;
2707 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2708 re_sub_match_last_t *sub_last;
2709 int sub_last_idx, sl_str, bkref_str_off;
2710
2711 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2712 continue; /* It isn't related. */
2713
2714 sl_str = sub_top->str_idx;
2715 bkref_str_off = bkref_str_idx;
2716 /* At first, check the last node of sub expressions we already
2717 evaluated. */
2718 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2719 {
2720 int sl_str_diff;
2721 sub_last = sub_top->lasts[sub_last_idx];
2722 sl_str_diff = sub_last->str_idx - sl_str;
2723 /* The matched string by the sub expression match with the substring
2724 at the back reference? */
2725 if (sl_str_diff > 0)
2726 {
2727 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2728 {
2729 /* Not enough chars for a successful match. */
2730 if (bkref_str_off + sl_str_diff > mctx->input.len)
2731 break;
2732
2733 err = clean_state_log_if_needed (mctx,
2734 bkref_str_off
2735 + sl_str_diff);
2736 if (BE (err != REG_NOERROR, 0))
2737 return err;
2738 buf = (const char *) re_string_get_buffer (&mctx->input);
2739 }
2740 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2741 /* We don't need to search this sub expression any more. */
2742 break;
2743 }
2744 bkref_str_off += sl_str_diff;
2745 sl_str += sl_str_diff;
2746 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2747 bkref_str_idx);
2748
2749 /* Reload buf, since the preceding call might have reallocated
2750 the buffer. */
2751 buf = (const char *) re_string_get_buffer (&mctx->input);
2752
2753 if (err == REG_NOMATCH)
2754 continue;
2755 if (BE (err != REG_NOERROR, 0))
2756 return err;
2757 }
2758
2759 if (sub_last_idx < sub_top->nlasts)
2760 continue;
2761 if (sub_last_idx > 0)
2762 ++sl_str;
2763 /* Then, search for the other last nodes of the sub expression. */
2764 for (; sl_str <= bkref_str_idx; ++sl_str)
2765 {
2766 int cls_node, sl_str_off;
2767 const re_node_set *nodes;
2768 sl_str_off = sl_str - sub_top->str_idx;
2769 /* The matched string by the sub expression match with the substring
2770 at the back reference? */
2771 if (sl_str_off > 0)
2772 {
2773 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2774 {
2775 /* If we are at the end of the input, we cannot match. */
2776 if (bkref_str_off >= mctx->input.len)
2777 break;
2778
2779 err = extend_buffers (mctx);
2780 if (BE (err != REG_NOERROR, 0))
2781 return err;
2782
2783 buf = (const char *) re_string_get_buffer (&mctx->input);
2784 }
2785 if (buf [bkref_str_off++] != buf[sl_str - 1])
2786 break; /* We don't need to search this sub expression
2787 any more. */
2788 }
2789 if (mctx->state_log[sl_str] == NULL)
2790 continue;
2791 /* Does this state have a ')' of the sub expression? */
2792 nodes = &mctx->state_log[sl_str]->nodes;
2793 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2794 OP_CLOSE_SUBEXP);
2795 if (cls_node == -1)
2796 continue; /* No. */
2797 if (sub_top->path == NULL)
2798 {
2799 sub_top->path = calloc (sizeof (state_array_t),
2800 sl_str - sub_top->str_idx + 1);
2801 if (sub_top->path == NULL)
2802 return REG_ESPACE;
2803 }
2804 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2805 in the current context? */
2806 err = check_arrival (mctx, sub_top->path, sub_top->node,
2807 sub_top->str_idx, cls_node, sl_str,
2808 OP_CLOSE_SUBEXP);
2809 if (err == REG_NOMATCH)
2810 continue;
2811 if (BE (err != REG_NOERROR, 0))
2812 return err;
2813 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2814 if (BE (sub_last == NULL, 0))
2815 return REG_ESPACE;
2816 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2817 bkref_str_idx);
2818 if (err == REG_NOMATCH)
2819 continue;
2820 }
2821 }
2822 return REG_NOERROR;
2823 }
2824
2825 /* Helper functions for get_subexp(). */
2826
2827 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2828 If it can arrive, register the sub expression expressed with SUB_TOP
2829 and SUB_LAST. */
2830
2831 static reg_errcode_t
2832 internal_function
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,int bkref_node,int bkref_str)2833 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2834 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2835 {
2836 reg_errcode_t err;
2837 int to_idx;
2838 /* Can the subexpression arrive the back reference? */
2839 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2840 sub_last->str_idx, bkref_node, bkref_str,
2841 OP_OPEN_SUBEXP);
2842 if (err != REG_NOERROR)
2843 return err;
2844 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2845 sub_last->str_idx);
2846 if (BE (err != REG_NOERROR, 0))
2847 return err;
2848 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2849 return clean_state_log_if_needed (mctx, to_idx);
2850 }
2851
2852 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2853 Search '(' if FL_OPEN, or search ')' otherwise.
2854 TODO: This function isn't efficient...
2855 Because there might be more than one nodes whose types are
2856 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2857 nodes.
2858 E.g. RE: (a){2} */
2859
2860 static int
2861 internal_function
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,int subexp_idx,int type)2862 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2863 int subexp_idx, int type)
2864 {
2865 int cls_idx;
2866 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2867 {
2868 int cls_node = nodes->elems[cls_idx];
2869 const re_token_t *node = dfa->nodes + cls_node;
2870 if (node->type == type
2871 && node->opr.idx == subexp_idx)
2872 return cls_node;
2873 }
2874 return -1;
2875 }
2876
2877 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2878 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2879 heavily reused.
2880 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2881
2882 static reg_errcode_t
2883 internal_function
check_arrival(re_match_context_t * mctx,state_array_t * path,int top_node,int top_str,int last_node,int last_str,int type)2884 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2885 int top_str, int last_node, int last_str, int type)
2886 {
2887 const re_dfa_t *const dfa = mctx->dfa;
2888 reg_errcode_t err = REG_NOERROR;
2889 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2890 re_dfastate_t *cur_state = NULL;
2891 re_node_set *cur_nodes, next_nodes;
2892 re_dfastate_t **backup_state_log;
2893 unsigned int context;
2894
2895 subexp_num = dfa->nodes[top_node].opr.idx;
2896 /* Extend the buffer if we need. */
2897 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2898 {
2899 re_dfastate_t **new_array;
2900 int old_alloc = path->alloc;
2901 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2902 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2903 if (BE (new_array == NULL, 0))
2904 {
2905 path->alloc = old_alloc;
2906 return REG_ESPACE;
2907 }
2908 path->array = new_array;
2909 memset (new_array + old_alloc, '\0',
2910 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2911 }
2912
2913 // Original:
2914 // str_idx = path->next_idx ?: top_str;
2915 // Copied following from another version when cleaning up compiler warnings.
2916 str_idx = path->next_idx ? path->next_idx : top_str;
2917
2918 /* Temporary modify MCTX. */
2919 backup_state_log = mctx->state_log;
2920 backup_cur_idx = mctx->input.cur_idx;
2921 mctx->state_log = path->array;
2922 mctx->input.cur_idx = str_idx;
2923
2924 /* Setup initial node set. */
2925 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2926 if (str_idx == top_str)
2927 {
2928 err = re_node_set_init_1 (&next_nodes, top_node);
2929 if (BE (err != REG_NOERROR, 0))
2930 return err;
2931 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2932 if (BE (err != REG_NOERROR, 0))
2933 {
2934 re_node_set_free (&next_nodes);
2935 return err;
2936 }
2937 }
2938 else
2939 {
2940 cur_state = mctx->state_log[str_idx];
2941 if (cur_state && cur_state->has_backref)
2942 {
2943 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2944 if (BE (err != REG_NOERROR, 0))
2945 return err;
2946 }
2947 else
2948 re_node_set_init_empty (&next_nodes);
2949 }
2950 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2951 {
2952 if (next_nodes.nelem)
2953 {
2954 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2955 subexp_num, type);
2956 if (BE (err != REG_NOERROR, 0))
2957 {
2958 re_node_set_free (&next_nodes);
2959 return err;
2960 }
2961 }
2962 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2963 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2964 {
2965 re_node_set_free (&next_nodes);
2966 return err;
2967 }
2968 mctx->state_log[str_idx] = cur_state;
2969 }
2970
2971 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2972 {
2973 re_node_set_empty (&next_nodes);
2974 if (mctx->state_log[str_idx + 1])
2975 {
2976 err = re_node_set_merge (&next_nodes,
2977 &mctx->state_log[str_idx + 1]->nodes);
2978 if (BE (err != REG_NOERROR, 0))
2979 {
2980 re_node_set_free (&next_nodes);
2981 return err;
2982 }
2983 }
2984 if (cur_state)
2985 {
2986 err = check_arrival_add_next_nodes (mctx, str_idx,
2987 &cur_state->non_eps_nodes,
2988 &next_nodes);
2989 if (BE (err != REG_NOERROR, 0))
2990 {
2991 re_node_set_free (&next_nodes);
2992 return err;
2993 }
2994 }
2995 ++str_idx;
2996 if (next_nodes.nelem)
2997 {
2998 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2999 if (BE (err != REG_NOERROR, 0))
3000 {
3001 re_node_set_free (&next_nodes);
3002 return err;
3003 }
3004 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3005 subexp_num, type);
3006 if (BE (err != REG_NOERROR, 0))
3007 {
3008 re_node_set_free (&next_nodes);
3009 return err;
3010 }
3011 }
3012 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3013 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3014 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3015 {
3016 re_node_set_free (&next_nodes);
3017 return err;
3018 }
3019 mctx->state_log[str_idx] = cur_state;
3020 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3021 }
3022 re_node_set_free (&next_nodes);
3023 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3024 : &mctx->state_log[last_str]->nodes);
3025 path->next_idx = str_idx;
3026
3027 /* Fix MCTX. */
3028 mctx->state_log = backup_state_log;
3029 mctx->input.cur_idx = backup_cur_idx;
3030
3031 /* Then check the current node set has the node LAST_NODE. */
3032 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3033 return REG_NOERROR;
3034
3035 return REG_NOMATCH;
3036 }
3037
3038 /* Helper functions for check_arrival. */
3039
3040 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3041 to NEXT_NODES.
3042 TODO: This function is similar to the functions transit_state*(),
3043 however this function has many additional works.
3044 Can't we unify them? */
3045
3046 static reg_errcode_t
3047 internal_function
check_arrival_add_next_nodes(re_match_context_t * mctx,int str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)3048 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3049 re_node_set *cur_nodes, re_node_set *next_nodes)
3050 {
3051 const re_dfa_t *const dfa = mctx->dfa;
3052 int result;
3053 int cur_idx;
3054 #ifdef RE_ENABLE_I18N
3055 reg_errcode_t err = REG_NOERROR;
3056 #endif
3057 re_node_set union_set;
3058 re_node_set_init_empty (&union_set);
3059 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3060 {
3061 int naccepted = 0;
3062 int cur_node = cur_nodes->elems[cur_idx];
3063 #ifdef DEBUG
3064 re_token_type_t type = dfa->nodes[cur_node].type;
3065 assert (!IS_EPSILON_NODE (type));
3066 #endif
3067 #ifdef RE_ENABLE_I18N
3068 /* If the node may accept `multi byte'. */
3069 if (dfa->nodes[cur_node].accept_mb)
3070 {
3071 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3072 str_idx);
3073 if (naccepted > 1)
3074 {
3075 re_dfastate_t *dest_state;
3076 int next_node = dfa->nexts[cur_node];
3077 int next_idx = str_idx + naccepted;
3078 dest_state = mctx->state_log[next_idx];
3079 re_node_set_empty (&union_set);
3080 if (dest_state)
3081 {
3082 err = re_node_set_merge (&union_set, &dest_state->nodes);
3083 if (BE (err != REG_NOERROR, 0))
3084 {
3085 re_node_set_free (&union_set);
3086 return err;
3087 }
3088 }
3089 result = re_node_set_insert (&union_set, next_node);
3090 if (BE (result < 0, 0))
3091 {
3092 re_node_set_free (&union_set);
3093 return REG_ESPACE;
3094 }
3095 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3096 &union_set);
3097 if (BE (mctx->state_log[next_idx] == NULL
3098 && err != REG_NOERROR, 0))
3099 {
3100 re_node_set_free (&union_set);
3101 return err;
3102 }
3103 }
3104 }
3105 #endif /* RE_ENABLE_I18N */
3106 if (naccepted
3107 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3108 {
3109 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3110 if (BE (result < 0, 0))
3111 {
3112 re_node_set_free (&union_set);
3113 return REG_ESPACE;
3114 }
3115 }
3116 }
3117 re_node_set_free (&union_set);
3118 return REG_NOERROR;
3119 }
3120
3121 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3122 CUR_NODES, however exclude the nodes which are:
3123 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3124 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3125 */
3126
3127 static reg_errcode_t
3128 internal_function
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,int ex_subexp,int type)3129 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3130 int ex_subexp, int type)
3131 {
3132 reg_errcode_t err;
3133 int idx, outside_node;
3134 re_node_set new_nodes;
3135 #ifdef DEBUG
3136 assert (cur_nodes->nelem);
3137 #endif
3138 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3139 if (BE (err != REG_NOERROR, 0))
3140 return err;
3141 /* Create a new node set NEW_NODES with the nodes which are epsilon
3142 closures of the node in CUR_NODES. */
3143
3144 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3145 {
3146 int cur_node = cur_nodes->elems[idx];
3147 const re_node_set *eclosure = dfa->eclosures + cur_node;
3148 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3149 if (outside_node == -1)
3150 {
3151 /* There are no problematic nodes, just merge them. */
3152 err = re_node_set_merge (&new_nodes, eclosure);
3153 if (BE (err != REG_NOERROR, 0))
3154 {
3155 re_node_set_free (&new_nodes);
3156 return err;
3157 }
3158 }
3159 else
3160 {
3161 /* There are problematic nodes, re-calculate incrementally. */
3162 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3163 ex_subexp, type);
3164 if (BE (err != REG_NOERROR, 0))
3165 {
3166 re_node_set_free (&new_nodes);
3167 return err;
3168 }
3169 }
3170 }
3171 re_node_set_free (cur_nodes);
3172 *cur_nodes = new_nodes;
3173 return REG_NOERROR;
3174 }
3175
3176 /* Helper function for check_arrival_expand_ecl.
3177 Check incrementally the epsilon closure of TARGET, and if it isn't
3178 problematic append it to DST_NODES. */
3179
3180 static reg_errcode_t
3181 internal_function
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,int target,int ex_subexp,int type)3182 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3183 int target, int ex_subexp, int type)
3184 {
3185 int cur_node;
3186 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3187 {
3188 int err;
3189
3190 if (dfa->nodes[cur_node].type == type
3191 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3192 {
3193 if (type == OP_CLOSE_SUBEXP)
3194 {
3195 err = re_node_set_insert (dst_nodes, cur_node);
3196 if (BE (err == -1, 0))
3197 return REG_ESPACE;
3198 }
3199 break;
3200 }
3201 err = re_node_set_insert (dst_nodes, cur_node);
3202 if (BE (err == -1, 0))
3203 return REG_ESPACE;
3204 if (dfa->edests[cur_node].nelem == 0)
3205 break;
3206 if (dfa->edests[cur_node].nelem == 2)
3207 {
3208 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3209 dfa->edests[cur_node].elems[1],
3210 ex_subexp, type);
3211 if (BE (err != REG_NOERROR, 0))
3212 return err;
3213 }
3214 cur_node = dfa->edests[cur_node].elems[0];
3215 }
3216 return REG_NOERROR;
3217 }
3218
3219
3220 /* For all the back references in the current state, calculate the
3221 destination of the back references by the appropriate entry
3222 in MCTX->BKREF_ENTS. */
3223
3224 static reg_errcode_t
3225 internal_function
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,int cur_str,int subexp_num,int type)3226 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3227 int cur_str, int subexp_num, int type)
3228 {
3229 const re_dfa_t *const dfa = mctx->dfa;
3230 reg_errcode_t err;
3231 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3232 struct re_backref_cache_entry *ent;
3233
3234 if (cache_idx_start == -1)
3235 return REG_NOERROR;
3236
3237 restart:
3238 ent = mctx->bkref_ents + cache_idx_start;
3239 do
3240 {
3241 int to_idx, next_node;
3242
3243 /* Is this entry ENT is appropriate? */
3244 if (!re_node_set_contains (cur_nodes, ent->node))
3245 continue; /* No. */
3246
3247 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3248 /* Calculate the destination of the back reference, and append it
3249 to MCTX->STATE_LOG. */
3250 if (to_idx == cur_str)
3251 {
3252 /* The backreference did epsilon transit, we must re-check all the
3253 node in the current state. */
3254 re_node_set new_dests;
3255 reg_errcode_t err2, err3;
3256 next_node = dfa->edests[ent->node].elems[0];
3257 if (re_node_set_contains (cur_nodes, next_node))
3258 continue;
3259 err = re_node_set_init_1 (&new_dests, next_node);
3260 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3261 err3 = re_node_set_merge (cur_nodes, &new_dests);
3262 re_node_set_free (&new_dests);
3263 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3264 || err3 != REG_NOERROR, 0))
3265 {
3266 err = (err != REG_NOERROR ? err
3267 : (err2 != REG_NOERROR ? err2 : err3));
3268 return err;
3269 }
3270 /* TODO: It is still inefficient... */
3271 goto restart;
3272 }
3273 else
3274 {
3275 re_node_set union_set;
3276 next_node = dfa->nexts[ent->node];
3277 if (mctx->state_log[to_idx])
3278 {
3279 int ret;
3280 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3281 next_node))
3282 continue;
3283 err = re_node_set_init_copy (&union_set,
3284 &mctx->state_log[to_idx]->nodes);
3285 ret = re_node_set_insert (&union_set, next_node);
3286 if (BE (err != REG_NOERROR || ret < 0, 0))
3287 {
3288 re_node_set_free (&union_set);
3289 err = err != REG_NOERROR ? err : REG_ESPACE;
3290 return err;
3291 }
3292 }
3293 else
3294 {
3295 err = re_node_set_init_1 (&union_set, next_node);
3296 if (BE (err != REG_NOERROR, 0))
3297 return err;
3298 }
3299 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3300 re_node_set_free (&union_set);
3301 if (BE (mctx->state_log[to_idx] == NULL
3302 && err != REG_NOERROR, 0))
3303 return err;
3304 }
3305 }
3306 while (ent++->more);
3307 return REG_NOERROR;
3308 }
3309
3310 /* Build transition table for the state.
3311 Return 1 if succeeded, otherwise return NULL. */
3312
3313 static int
3314 internal_function
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)3315 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3316 {
3317 reg_errcode_t err;
3318 int i, j, ch, need_word_trtable = 0;
3319 bitset_word_t elem, mask;
3320 bool dests_node_malloced = false;
3321 bool dest_states_malloced = false;
3322 int ndests; /* Number of the destination states from `state'. */
3323 re_dfastate_t **trtable;
3324 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3325 re_node_set follows, *dests_node;
3326 bitset_t *dests_ch;
3327 bitset_t acceptable;
3328
3329 struct dests_alloc
3330 {
3331 re_node_set dests_node[SBC_MAX];
3332 bitset_t dests_ch[SBC_MAX];
3333 } *dests_alloc;
3334
3335 /* We build DFA states which corresponds to the destination nodes
3336 from `state'. `dests_node[i]' represents the nodes which i-th
3337 destination state contains, and `dests_ch[i]' represents the
3338 characters which i-th destination state accepts. */
3339 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3340 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3341 else
3342 {
3343 dests_alloc = re_malloc (struct dests_alloc, 1);
3344 if (BE (dests_alloc == NULL, 0))
3345 return 0;
3346 dests_node_malloced = true;
3347 }
3348 dests_node = dests_alloc->dests_node;
3349 dests_ch = dests_alloc->dests_ch;
3350
3351 /* Initialize transiton table. */
3352 state->word_trtable = state->trtable = NULL;
3353
3354 /* At first, group all nodes belonging to `state' into several
3355 destinations. */
3356 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3357 if (BE (ndests <= 0, 0))
3358 {
3359 if (dests_node_malloced)
3360 free (dests_alloc);
3361 /* Return 0 in case of an error, 1 otherwise. */
3362 if (ndests == 0)
3363 {
3364 state->trtable = (re_dfastate_t **)
3365 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3366 return 1;
3367 }
3368 return 0;
3369 }
3370
3371 err = re_node_set_alloc (&follows, ndests + 1);
3372 if (BE (err != REG_NOERROR, 0))
3373 goto out_free;
3374
3375 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3376 + ndests * 3 * sizeof (re_dfastate_t *)))
3377 dest_states = (re_dfastate_t **)
3378 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3379 else
3380 {
3381 dest_states = (re_dfastate_t **)
3382 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3383 if (BE (dest_states == NULL, 0))
3384 {
3385 out_free:
3386 if (dest_states_malloced)
3387 free (dest_states);
3388 re_node_set_free (&follows);
3389 for (i = 0; i < ndests; ++i)
3390 re_node_set_free (dests_node + i);
3391 if (dests_node_malloced)
3392 free (dests_alloc);
3393 return 0;
3394 }
3395 dest_states_malloced = true;
3396 }
3397 dest_states_word = dest_states + ndests;
3398 dest_states_nl = dest_states_word + ndests;
3399 bitset_empty (acceptable);
3400
3401 /* Then build the states for all destinations. */
3402 for (i = 0; i < ndests; ++i)
3403 {
3404 int next_node;
3405 re_node_set_empty (&follows);
3406 /* Merge the follows of this destination states. */
3407 for (j = 0; j < dests_node[i].nelem; ++j)
3408 {
3409 next_node = dfa->nexts[dests_node[i].elems[j]];
3410 if (next_node != -1)
3411 {
3412 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3413 if (BE (err != REG_NOERROR, 0))
3414 goto out_free;
3415 }
3416 }
3417 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3418 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3419 goto out_free;
3420 /* If the new state has context constraint,
3421 build appropriate states for these contexts. */
3422 if (dest_states[i]->has_constraint)
3423 {
3424 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3425 CONTEXT_WORD);
3426 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3427 goto out_free;
3428
3429 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3430 need_word_trtable = 1;
3431
3432 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3433 CONTEXT_NEWLINE);
3434 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3435 goto out_free;
3436 }
3437 else
3438 {
3439 dest_states_word[i] = dest_states[i];
3440 dest_states_nl[i] = dest_states[i];
3441 }
3442 bitset_merge (acceptable, dests_ch[i]);
3443 }
3444
3445 if (!BE (need_word_trtable, 0))
3446 {
3447 /* We don't care about whether the following character is a word
3448 character, or we are in a single-byte character set so we can
3449 discern by looking at the character code: allocate a
3450 256-entry transition table. */
3451 trtable = state->trtable =
3452 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3453 if (BE (trtable == NULL, 0))
3454 goto out_free;
3455
3456 /* For all characters ch...: */
3457 for (i = 0; i < BITSET_WORDS; ++i)
3458 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3459 elem;
3460 mask <<= 1, elem >>= 1, ++ch)
3461 if (BE (elem & 1, 0))
3462 {
3463 /* There must be exactly one destination which accepts
3464 character ch. See group_nodes_into_DFAstates. */
3465 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3466 ;
3467
3468 /* j-th destination accepts the word character ch. */
3469 if (dfa->word_char[i] & mask)
3470 trtable[ch] = dest_states_word[j];
3471 else
3472 trtable[ch] = dest_states[j];
3473 }
3474 }
3475 else
3476 {
3477 /* We care about whether the following character is a word
3478 character, and we are in a multi-byte character set: discern
3479 by looking at the character code: build two 256-entry
3480 transition tables, one starting at trtable[0] and one
3481 starting at trtable[SBC_MAX]. */
3482 trtable = state->word_trtable =
3483 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3484 if (BE (trtable == NULL, 0))
3485 goto out_free;
3486
3487 /* For all characters ch...: */
3488 for (i = 0; i < BITSET_WORDS; ++i)
3489 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3490 elem;
3491 mask <<= 1, elem >>= 1, ++ch)
3492 if (BE (elem & 1, 0))
3493 {
3494 /* There must be exactly one destination which accepts
3495 character ch. See group_nodes_into_DFAstates. */
3496 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3497 ;
3498
3499 /* j-th destination accepts the word character ch. */
3500 trtable[ch] = dest_states[j];
3501 trtable[ch + SBC_MAX] = dest_states_word[j];
3502 }
3503 }
3504
3505 /* new line */
3506 if (bitset_contain (acceptable, NEWLINE_CHAR))
3507 {
3508 /* The current state accepts newline character. */
3509 for (j = 0; j < ndests; ++j)
3510 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3511 {
3512 /* k-th destination accepts newline character. */
3513 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3514 if (need_word_trtable)
3515 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3516 /* There must be only one destination which accepts
3517 newline. See group_nodes_into_DFAstates. */
3518 break;
3519 }
3520 }
3521
3522 if (dest_states_malloced)
3523 free (dest_states);
3524
3525 re_node_set_free (&follows);
3526 for (i = 0; i < ndests; ++i)
3527 re_node_set_free (dests_node + i);
3528
3529 if (dests_node_malloced)
3530 free (dests_alloc);
3531
3532 return 1;
3533 }
3534
3535 /* Group all nodes belonging to STATE into several destinations.
3536 Then for all destinations, set the nodes belonging to the destination
3537 to DESTS_NODE[i] and set the characters accepted by the destination
3538 to DEST_CH[i]. This function return the number of destinations. */
3539
3540 static int
3541 internal_function
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)3542 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3543 re_node_set *dests_node, bitset_t *dests_ch)
3544 {
3545 reg_errcode_t err;
3546 int result;
3547 int i, j, k;
3548 int ndests; /* Number of the destinations from `state'. */
3549 bitset_t accepts; /* Characters a node can accept. */
3550 const re_node_set *cur_nodes = &state->nodes;
3551 bitset_empty (accepts);
3552 ndests = 0;
3553
3554 /* For all the nodes belonging to `state', */
3555 for (i = 0; i < cur_nodes->nelem; ++i)
3556 {
3557 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3558 re_token_type_t type = node->type;
3559 unsigned int constraint = node->constraint;
3560
3561 /* Enumerate all single byte character this node can accept. */
3562 if (type == CHARACTER)
3563 bitset_set (accepts, node->opr.c);
3564 else if (type == SIMPLE_BRACKET)
3565 {
3566 bitset_merge (accepts, node->opr.sbcset);
3567 }
3568 else if (type == OP_PERIOD)
3569 {
3570 #ifdef RE_ENABLE_I18N
3571 if (dfa->mb_cur_max > 1)
3572 bitset_merge (accepts, dfa->sb_char);
3573 else
3574 #endif
3575 bitset_set_all (accepts);
3576 if (!(dfa->syntax & RE_DOT_NEWLINE))
3577 bitset_clear (accepts, '\n');
3578 if (dfa->syntax & RE_DOT_NOT_NULL)
3579 bitset_clear (accepts, '\0');
3580 }
3581 #ifdef RE_ENABLE_I18N
3582 else if (type == OP_UTF8_PERIOD)
3583 {
3584 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3585 if (!(dfa->syntax & RE_DOT_NEWLINE))
3586 bitset_clear (accepts, '\n');
3587 if (dfa->syntax & RE_DOT_NOT_NULL)
3588 bitset_clear (accepts, '\0');
3589 }
3590 #endif
3591 else
3592 continue;
3593
3594 /* Check the `accepts' and sift the characters which are not
3595 match it the context. */
3596 if (constraint)
3597 {
3598 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3599 {
3600 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3601 bitset_empty (accepts);
3602 if (accepts_newline)
3603 bitset_set (accepts, NEWLINE_CHAR);
3604 else
3605 continue;
3606 }
3607 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3608 {
3609 bitset_empty (accepts);
3610 continue;
3611 }
3612
3613 if (constraint & NEXT_WORD_CONSTRAINT)
3614 {
3615 bitset_word_t any_set = 0;
3616 if (type == CHARACTER && !node->word_char)
3617 {
3618 bitset_empty (accepts);
3619 continue;
3620 }
3621 #ifdef RE_ENABLE_I18N
3622 if (dfa->mb_cur_max > 1)
3623 for (j = 0; j < BITSET_WORDS; ++j)
3624 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3625 else
3626 #endif
3627 for (j = 0; j < BITSET_WORDS; ++j)
3628 any_set |= (accepts[j] &= dfa->word_char[j]);
3629 if (!any_set)
3630 continue;
3631 }
3632 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3633 {
3634 bitset_word_t any_set = 0;
3635 if (type == CHARACTER && node->word_char)
3636 {
3637 bitset_empty (accepts);
3638 continue;
3639 }
3640 #ifdef RE_ENABLE_I18N
3641 if (dfa->mb_cur_max > 1)
3642 for (j = 0; j < BITSET_WORDS; ++j)
3643 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3644 else
3645 #endif
3646 for (j = 0; j < BITSET_WORDS; ++j)
3647 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3648 if (!any_set)
3649 continue;
3650 }
3651 }
3652
3653 /* Then divide `accepts' into DFA states, or create a new
3654 state. Above, we make sure that accepts is not empty. */
3655 for (j = 0; j < ndests; ++j)
3656 {
3657 bitset_t intersec; /* Intersection sets, see below. */
3658 bitset_t remains;
3659 /* Flags, see below. */
3660 bitset_word_t has_intersec, not_subset, not_consumed;
3661
3662 /* Optimization, skip if this state doesn't accept the character. */
3663 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3664 continue;
3665
3666 /* Enumerate the intersection set of this state and `accepts'. */
3667 has_intersec = 0;
3668 for (k = 0; k < BITSET_WORDS; ++k)
3669 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3670 /* And skip if the intersection set is empty. */
3671 if (!has_intersec)
3672 continue;
3673
3674 /* Then check if this state is a subset of `accepts'. */
3675 not_subset = not_consumed = 0;
3676 for (k = 0; k < BITSET_WORDS; ++k)
3677 {
3678 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3679 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3680 }
3681
3682 /* If this state isn't a subset of `accepts', create a
3683 new group state, which has the `remains'. */
3684 if (not_subset)
3685 {
3686 bitset_copy (dests_ch[ndests], remains);
3687 bitset_copy (dests_ch[j], intersec);
3688 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3689 if (BE (err != REG_NOERROR, 0))
3690 goto error_return;
3691 ++ndests;
3692 }
3693
3694 /* Put the position in the current group. */
3695 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3696 if (BE (result < 0, 0))
3697 goto error_return;
3698
3699 /* If all characters are consumed, go to next node. */
3700 if (!not_consumed)
3701 break;
3702 }
3703 /* Some characters remain, create a new group. */
3704 if (j == ndests)
3705 {
3706 bitset_copy (dests_ch[ndests], accepts);
3707 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3708 if (BE (err != REG_NOERROR, 0))
3709 goto error_return;
3710 ++ndests;
3711 bitset_empty (accepts);
3712 }
3713 }
3714 return ndests;
3715 error_return:
3716 for (j = 0; j < ndests; ++j)
3717 re_node_set_free (dests_node + j);
3718 return -1;
3719 }
3720
3721 #ifdef RE_ENABLE_I18N
3722 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3723 Return the number of the bytes the node accepts.
3724 STR_IDX is the current index of the input string.
3725
3726 This function handles the nodes which can accept one character, or
3727 one collating element like '.', '[a-z]', opposite to the other nodes
3728 can only accept one byte. */
3729
3730 static int
3731 internal_function
check_node_accept_bytes(const re_dfa_t * dfa,int node_idx,const re_string_t * input,int str_idx)3732 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3733 const re_string_t *input, int str_idx)
3734 {
3735 const re_token_t *node = dfa->nodes + node_idx;
3736 int char_len, elem_len;
3737 int i;
3738
3739 if (BE (node->type == OP_UTF8_PERIOD, 0))
3740 {
3741 unsigned char c = re_string_byte_at (input, str_idx), d;
3742 if (BE (c < 0xc2, 1))
3743 return 0;
3744
3745 if (str_idx + 2 > input->len)
3746 return 0;
3747
3748 d = re_string_byte_at (input, str_idx + 1);
3749 if (c < 0xe0)
3750 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3751 else if (c < 0xf0)
3752 {
3753 char_len = 3;
3754 if (c == 0xe0 && d < 0xa0)
3755 return 0;
3756 }
3757 else if (c < 0xf8)
3758 {
3759 char_len = 4;
3760 if (c == 0xf0 && d < 0x90)
3761 return 0;
3762 }
3763 else if (c < 0xfc)
3764 {
3765 char_len = 5;
3766 if (c == 0xf8 && d < 0x88)
3767 return 0;
3768 }
3769 else if (c < 0xfe)
3770 {
3771 char_len = 6;
3772 if (c == 0xfc && d < 0x84)
3773 return 0;
3774 }
3775 else
3776 return 0;
3777
3778 if (str_idx + char_len > input->len)
3779 return 0;
3780
3781 for (i = 1; i < char_len; ++i)
3782 {
3783 d = re_string_byte_at (input, str_idx + i);
3784 if (d < 0x80 || d > 0xbf)
3785 return 0;
3786 }
3787 return char_len;
3788 }
3789
3790 char_len = re_string_char_size_at (input, str_idx);
3791 if (node->type == OP_PERIOD)
3792 {
3793 if (char_len <= 1)
3794 return 0;
3795 /* FIXME: I don't think this if is needed, as both '\n'
3796 and '\0' are char_len == 1. */
3797 /* '.' accepts any one character except the following two cases. */
3798 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3799 re_string_byte_at (input, str_idx) == '\n') ||
3800 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3801 re_string_byte_at (input, str_idx) == '\0'))
3802 return 0;
3803 return char_len;
3804 }
3805
3806 elem_len = re_string_elem_size_at (input, str_idx);
3807 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3808 return 0;
3809
3810 if (node->type == COMPLEX_BRACKET)
3811 {
3812 const re_charset_t *cset = node->opr.mbcset;
3813 # ifdef _LIBC
3814 const unsigned char *pin
3815 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3816 int j;
3817 uint32_t nrules;
3818 # endif /* _LIBC */
3819 int match_len = 0;
3820 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3821 ? re_string_wchar_at (input, str_idx) : 0);
3822
3823 /* match with multibyte character? */
3824 for (i = 0; i < cset->nmbchars; ++i)
3825 if (wc == cset->mbchars[i])
3826 {
3827 match_len = char_len;
3828 goto check_node_accept_bytes_match;
3829 }
3830 /* match with character_class? */
3831 for (i = 0; i < cset->nchar_classes; ++i)
3832 {
3833 wctype_t wt = cset->char_classes[i];
3834 if (__iswctype (wc, wt))
3835 {
3836 match_len = char_len;
3837 goto check_node_accept_bytes_match;
3838 }
3839 }
3840
3841 # ifdef _LIBC
3842 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3843 if (nrules != 0)
3844 {
3845 unsigned int in_collseq = 0;
3846 const int32_t *table, *indirect;
3847 const unsigned char *weights, *extra;
3848 const char *collseqwc;
3849 /* This #include defines a local function! */
3850 # include <locale/weight.h>
3851
3852 /* match with collating_symbol? */
3853 if (cset->ncoll_syms)
3854 extra = (const unsigned char *)
3855 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3856 for (i = 0; i < cset->ncoll_syms; ++i)
3857 {
3858 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3859 /* Compare the length of input collating element and
3860 the length of current collating element. */
3861 if (*coll_sym != elem_len)
3862 continue;
3863 /* Compare each bytes. */
3864 for (j = 0; j < *coll_sym; j++)
3865 if (pin[j] != coll_sym[1 + j])
3866 break;
3867 if (j == *coll_sym)
3868 {
3869 /* Match if every bytes is equal. */
3870 match_len = j;
3871 goto check_node_accept_bytes_match;
3872 }
3873 }
3874
3875 if (cset->nranges)
3876 {
3877 if (elem_len <= char_len)
3878 {
3879 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3880 in_collseq = __collseq_table_lookup (collseqwc, wc);
3881 }
3882 else
3883 in_collseq = find_collation_sequence_value (pin, elem_len);
3884 }
3885 /* match with range expression? */
3886 for (i = 0; i < cset->nranges; ++i)
3887 if (cset->range_starts[i] <= in_collseq
3888 && in_collseq <= cset->range_ends[i])
3889 {
3890 match_len = elem_len;
3891 goto check_node_accept_bytes_match;
3892 }
3893
3894 /* match with equivalence_class? */
3895 if (cset->nequiv_classes)
3896 {
3897 const unsigned char *cp = pin;
3898 table = (const int32_t *)
3899 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3900 weights = (const unsigned char *)
3901 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3902 extra = (const unsigned char *)
3903 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3904 indirect = (const int32_t *)
3905 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3906 int32_t idx = findidx (&cp);
3907 if (idx > 0)
3908 for (i = 0; i < cset->nequiv_classes; ++i)
3909 {
3910 int32_t equiv_class_idx = cset->equiv_classes[i];
3911 size_t weight_len = weights[idx & 0xffffff];
3912 if (weight_len == weights[equiv_class_idx & 0xffffff]
3913 && (idx >> 24) == (equiv_class_idx >> 24))
3914 {
3915 int cnt = 0;
3916
3917 idx &= 0xffffff;
3918 equiv_class_idx &= 0xffffff;
3919
3920 while (cnt <= weight_len
3921 && (weights[equiv_class_idx + 1 + cnt]
3922 == weights[idx + 1 + cnt]))
3923 ++cnt;
3924 if (cnt > weight_len)
3925 {
3926 match_len = elem_len;
3927 goto check_node_accept_bytes_match;
3928 }
3929 }
3930 }
3931 }
3932 }
3933 else
3934 # endif /* _LIBC */
3935 {
3936 /* match with range expression? */
3937 #if __GNUC__ >= 2
3938 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3939 #else
3940 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3941 cmp_buf[2] = wc;
3942 #endif
3943 for (i = 0; i < cset->nranges; ++i)
3944 {
3945 cmp_buf[0] = cset->range_starts[i];
3946 cmp_buf[4] = cset->range_ends[i];
3947 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3948 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3949 {
3950 match_len = char_len;
3951 goto check_node_accept_bytes_match;
3952 }
3953 }
3954 }
3955 check_node_accept_bytes_match:
3956 if (!cset->non_match)
3957 return match_len;
3958 else
3959 {
3960 if (match_len > 0)
3961 return 0;
3962 else
3963 return (elem_len > char_len) ? elem_len : char_len;
3964 }
3965 }
3966 return 0;
3967 }
3968
3969 # ifdef _LIBC
3970 static unsigned int
3971 internal_function
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)3972 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3973 {
3974 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3975 if (nrules == 0)
3976 {
3977 if (mbs_len == 1)
3978 {
3979 /* No valid character. Match it as a single byte character. */
3980 const unsigned char *collseq = (const unsigned char *)
3981 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3982 return collseq[mbs[0]];
3983 }
3984 return UINT_MAX;
3985 }
3986 else
3987 {
3988 int32_t idx;
3989 const unsigned char *extra = (const unsigned char *)
3990 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3991 int32_t extrasize = (const unsigned char *)
3992 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3993
3994 for (idx = 0; idx < extrasize;)
3995 {
3996 int mbs_cnt, found = 0;
3997 int32_t elem_mbs_len;
3998 /* Skip the name of collating element name. */
3999 idx = idx + extra[idx] + 1;
4000 elem_mbs_len = extra[idx++];
4001 if (mbs_len == elem_mbs_len)
4002 {
4003 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4004 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4005 break;
4006 if (mbs_cnt == elem_mbs_len)
4007 /* Found the entry. */
4008 found = 1;
4009 }
4010 /* Skip the byte sequence of the collating element. */
4011 idx += elem_mbs_len;
4012 /* Adjust for the alignment. */
4013 idx = (idx + 3) & ~3;
4014 /* Skip the collation sequence value. */
4015 idx += sizeof (uint32_t);
4016 /* Skip the wide char sequence of the collating element. */
4017 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4018 /* If we found the entry, return the sequence value. */
4019 if (found)
4020 return *(uint32_t *) (extra + idx);
4021 /* Skip the collation sequence value. */
4022 idx += sizeof (uint32_t);
4023 }
4024 return UINT_MAX;
4025 }
4026 }
4027 # endif /* _LIBC */
4028 #endif /* RE_ENABLE_I18N */
4029
4030 /* Check whether the node accepts the byte which is IDX-th
4031 byte of the INPUT. */
4032
4033 static int
4034 internal_function
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,int idx)4035 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4036 int idx)
4037 {
4038 unsigned char ch;
4039 ch = re_string_byte_at (&mctx->input, idx);
4040 switch (node->type)
4041 {
4042 case CHARACTER:
4043 if (node->opr.c != ch)
4044 return 0;
4045 break;
4046
4047 case SIMPLE_BRACKET:
4048 if (!bitset_contain (node->opr.sbcset, ch))
4049 return 0;
4050 break;
4051
4052 #ifdef RE_ENABLE_I18N
4053 case OP_UTF8_PERIOD:
4054 if (ch >= 0x80)
4055 return 0;
4056 /* FALLTHROUGH */
4057 #endif
4058 case OP_PERIOD:
4059 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4060 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4061 return 0;
4062 break;
4063
4064 default:
4065 return 0;
4066 }
4067
4068 if (node->constraint)
4069 {
4070 /* The node has constraints. Check whether the current context
4071 satisfies the constraints. */
4072 unsigned int context = re_string_context_at (&mctx->input, idx,
4073 mctx->eflags);
4074 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4075 return 0;
4076 }
4077
4078 return 1;
4079 }
4080
4081 /* Extend the buffers, if the buffers have run out. */
4082
4083 static reg_errcode_t
4084 internal_function
extend_buffers(re_match_context_t * mctx)4085 extend_buffers (re_match_context_t *mctx)
4086 {
4087 reg_errcode_t ret;
4088 re_string_t *pstr = &mctx->input;
4089
4090 /* Double the lengthes of the buffers. */
4091 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4092 if (BE (ret != REG_NOERROR, 0))
4093 return ret;
4094
4095 if (mctx->state_log != NULL)
4096 {
4097 /* And double the length of state_log. */
4098 /* XXX We have no indication of the size of this buffer. If this
4099 allocation fail we have no indication that the state_log array
4100 does not have the right size. */
4101 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4102 pstr->bufs_len + 1);
4103 if (BE (new_array == NULL, 0))
4104 return REG_ESPACE;
4105 mctx->state_log = new_array;
4106 }
4107
4108 /* Then reconstruct the buffers. */
4109 if (pstr->icase)
4110 {
4111 #ifdef RE_ENABLE_I18N
4112 if (pstr->mb_cur_max > 1)
4113 {
4114 ret = build_wcs_upper_buffer (pstr);
4115 if (BE (ret != REG_NOERROR, 0))
4116 return ret;
4117 }
4118 else
4119 #endif /* RE_ENABLE_I18N */
4120 build_upper_buffer (pstr);
4121 }
4122 else
4123 {
4124 #ifdef RE_ENABLE_I18N
4125 if (pstr->mb_cur_max > 1)
4126 build_wcs_buffer (pstr);
4127 else
4128 #endif /* RE_ENABLE_I18N */
4129 {
4130 if (pstr->trans != NULL)
4131 re_string_translate_buffer (pstr);
4132 }
4133 }
4134 return REG_NOERROR;
4135 }
4136
4137
4138 /* Functions for matching context. */
4139
4140 /* Initialize MCTX. */
4141
4142 static reg_errcode_t
4143 internal_function
match_ctx_init(re_match_context_t * mctx,int eflags,int n)4144 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4145 {
4146 mctx->eflags = eflags;
4147 mctx->match_last = -1;
4148 if (n > 0)
4149 {
4150 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4151 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4152 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4153 return REG_ESPACE;
4154 }
4155 /* Already zero-ed by the caller.
4156 else
4157 mctx->bkref_ents = NULL;
4158 mctx->nbkref_ents = 0;
4159 mctx->nsub_tops = 0; */
4160 mctx->abkref_ents = n;
4161 mctx->max_mb_elem_len = 1;
4162 mctx->asub_tops = n;
4163 return REG_NOERROR;
4164 }
4165
4166 /* Clean the entries which depend on the current input in MCTX.
4167 This function must be invoked when the matcher changes the start index
4168 of the input, or changes the input string. */
4169
4170 static void
4171 internal_function
match_ctx_clean(re_match_context_t * mctx)4172 match_ctx_clean (re_match_context_t *mctx)
4173 {
4174 int st_idx;
4175 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4176 {
4177 int sl_idx;
4178 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4179 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4180 {
4181 re_sub_match_last_t *last = top->lasts[sl_idx];
4182 re_free (last->path.array);
4183 re_free (last);
4184 }
4185 re_free (top->lasts);
4186 if (top->path)
4187 {
4188 re_free (top->path->array);
4189 re_free (top->path);
4190 }
4191 free (top);
4192 }
4193
4194 mctx->nsub_tops = 0;
4195 mctx->nbkref_ents = 0;
4196 }
4197
4198 /* Free all the memory associated with MCTX. */
4199
4200 static void
4201 internal_function
match_ctx_free(re_match_context_t * mctx)4202 match_ctx_free (re_match_context_t *mctx)
4203 {
4204 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4205 match_ctx_clean (mctx);
4206 re_free (mctx->sub_tops);
4207 re_free (mctx->bkref_ents);
4208 }
4209
4210 /* Add a new backreference entry to MCTX.
4211 Note that we assume that caller never call this function with duplicate
4212 entry, and call with STR_IDX which isn't smaller than any existing entry.
4213 */
4214
4215 static reg_errcode_t
4216 internal_function
match_ctx_add_entry(re_match_context_t * mctx,int node,int str_idx,int from,int to)4217 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4218 int to)
4219 {
4220 if (mctx->nbkref_ents >= mctx->abkref_ents)
4221 {
4222 struct re_backref_cache_entry* new_entry;
4223 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4224 mctx->abkref_ents * 2);
4225 if (BE (new_entry == NULL, 0))
4226 {
4227 re_free (mctx->bkref_ents);
4228 return REG_ESPACE;
4229 }
4230 mctx->bkref_ents = new_entry;
4231 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4232 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4233 mctx->abkref_ents *= 2;
4234 }
4235 if (mctx->nbkref_ents > 0
4236 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4237 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4238
4239 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4240 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4241 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4242 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4243
4244 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4245 If bit N is clear, means that this entry won't epsilon-transition to
4246 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4247 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4248 such node.
4249
4250 A backreference does not epsilon-transition unless it is empty, so set
4251 to all zeros if FROM != TO. */
4252 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4253 = (from == to ? ~0 : 0);
4254
4255 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4256 if (mctx->max_mb_elem_len < to - from)
4257 mctx->max_mb_elem_len = to - from;
4258 return REG_NOERROR;
4259 }
4260
4261 /* Search for the first entry which has the same str_idx, or -1 if none is
4262 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4263
4264 static int
4265 internal_function
search_cur_bkref_entry(const re_match_context_t * mctx,int str_idx)4266 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4267 {
4268 int left, right, mid, last;
4269 last = right = mctx->nbkref_ents;
4270 for (left = 0; left < right;)
4271 {
4272 mid = (left + right) / 2;
4273 if (mctx->bkref_ents[mid].str_idx < str_idx)
4274 left = mid + 1;
4275 else
4276 right = mid;
4277 }
4278 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4279 return left;
4280 else
4281 return -1;
4282 }
4283
4284 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4285 at STR_IDX. */
4286
4287 static reg_errcode_t
4288 internal_function
match_ctx_add_subtop(re_match_context_t * mctx,int node,int str_idx)4289 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4290 {
4291 #ifdef DEBUG
4292 assert (mctx->sub_tops != NULL);
4293 assert (mctx->asub_tops > 0);
4294 #endif
4295 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4296 {
4297 int new_asub_tops = mctx->asub_tops * 2;
4298 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4299 re_sub_match_top_t *,
4300 new_asub_tops);
4301 if (BE (new_array == NULL, 0))
4302 return REG_ESPACE;
4303 mctx->sub_tops = new_array;
4304 mctx->asub_tops = new_asub_tops;
4305 }
4306 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4307 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4308 return REG_ESPACE;
4309 mctx->sub_tops[mctx->nsub_tops]->node = node;
4310 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4311 return REG_NOERROR;
4312 }
4313
4314 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4315 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4316
4317 static re_sub_match_last_t *
4318 internal_function
match_ctx_add_sublast(re_sub_match_top_t * subtop,int node,int str_idx)4319 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4320 {
4321 re_sub_match_last_t *new_entry;
4322 if (BE (subtop->nlasts == subtop->alasts, 0))
4323 {
4324 int new_alasts = 2 * subtop->alasts + 1;
4325 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4326 re_sub_match_last_t *,
4327 new_alasts);
4328 if (BE (new_array == NULL, 0))
4329 return NULL;
4330 subtop->lasts = new_array;
4331 subtop->alasts = new_alasts;
4332 }
4333 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4334 if (BE (new_entry != NULL, 1))
4335 {
4336 subtop->lasts[subtop->nlasts] = new_entry;
4337 new_entry->node = node;
4338 new_entry->str_idx = str_idx;
4339 ++subtop->nlasts;
4340 }
4341 return new_entry;
4342 }
4343
4344 static void
4345 internal_function
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,int last_node,int last_str_idx)4346 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4347 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4348 {
4349 sctx->sifted_states = sifted_sts;
4350 sctx->limited_states = limited_sts;
4351 sctx->last_node = last_node;
4352 sctx->last_str_idx = last_str_idx;
4353 re_node_set_init_empty (&sctx->limits);
4354 }
4355