1 
2 /* this is very old and has massive numbers of compiler warnings. */
3 /* Maybe try upgrading to a newer version such as */
4 /* https://fossies.org/dox/glibc-2.24/regexec_8c_source.html */
5 
6 
7 /* Extended regular expression matching and search library.
8    Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc.
9    This file is part of the GNU C Library.
10    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
11 
12    The GNU C Library is free software; you can redistribute it and/or
13    modify it under the terms of the GNU Lesser General Public
14    License as published by the Free Software Foundation; either
15    version 2.1 of the License, or (at your option) any later version.
16 
17    The GNU C Library is distributed in the hope that it will be useful,
18    but WITHOUT ANY WARRANTY; without even the implied warranty of
19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20    Lesser General Public License for more details.
21 
22    You should have received a copy of the GNU Lesser General Public
23    License along with the GNU C Library; if not, write to the Free
24    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
25    02111-1307 USA.  */
26 
27 /* Added 12/2016 to remove warning: */
28 /* incompatible implicit declaration of built-in function 'alloca' */
29 #if __WIN32__
30 #include <malloc.h>
31 #else
32 #include <alloca.h>
33 #endif
34 
35 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
36 				     int n) internal_function;
37 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
38 static void match_ctx_free (re_match_context_t *cache) internal_function;
39 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
40 					  int str_idx, int from, int to)
41      internal_function;
42 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
43      internal_function;
44 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
45 					   int str_idx) internal_function;
46 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
47 						   int node, int str_idx)
48      internal_function;
49 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
50 			   re_dfastate_t **limited_sts, int last_node,
51 			   int last_str_idx)
52      internal_function;
53 static reg_errcode_t re_search_internal (const regex_t *preg,
54 					 const char *string, int length,
55 					 int start, int range, int stop,
56 					 size_t nmatch, regmatch_t pmatch[],
57 					 int eflags) internal_function;
58 static int re_search_2_stub (struct re_pattern_buffer *bufp,
59 			     const char *string1, int length1,
60 			     const char *string2, int length2,
61 			     int start, int range, struct re_registers *regs,
62 			     int stop, int ret_len) internal_function;
63 static int re_search_stub (struct re_pattern_buffer *bufp,
64 			   const char *string, int length, int start,
65 			   int range, int stop, struct re_registers *regs,
66 			   int ret_len) internal_function;
67 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
68 			      int nregs, int regs_allocated) internal_function;
69 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
70      internal_function;
71 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
72 			   int *p_match_first) internal_function;
73 static int check_halt_state_context (const re_match_context_t *mctx,
74 				     const re_dfastate_t *state, int idx)
75      internal_function;
76 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
77 			 regmatch_t *prev_idx_match, int cur_node,
78 			 int cur_idx, int nmatch) internal_function;
79 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
80 				      int str_idx, int dest_node, int nregs,
81 				      regmatch_t *regs,
82 				      re_node_set *eps_via_nodes)
83      internal_function;
84 static reg_errcode_t set_regs (const regex_t *preg,
85 			       const re_match_context_t *mctx,
86 			       size_t nmatch, regmatch_t *pmatch,
87 			       int fl_backtrack) internal_function;
88 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
89      internal_function;
90 
91 #ifdef RE_ENABLE_I18N
92 static int sift_states_iter_mb (const re_match_context_t *mctx,
93 				re_sift_context_t *sctx,
94 				int node_idx, int str_idx, int max_str_idx)
95      internal_function;
96 #endif /* RE_ENABLE_I18N */
97 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
98 					   re_sift_context_t *sctx)
99      internal_function;
100 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
101 					  re_sift_context_t *sctx, int str_idx,
102 					  re_node_set *cur_dest)
103      internal_function;
104 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
105 					      re_sift_context_t *sctx,
106 					      int str_idx,
107 					      re_node_set *dest_nodes)
108      internal_function;
109 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
110 					    re_node_set *dest_nodes,
111 					    const re_node_set *candidates)
112      internal_function;
113 static int check_dst_limits (const re_match_context_t *mctx,
114 			     re_node_set *limits,
115 			     int dst_node, int dst_idx, int src_node,
116 			     int src_idx) internal_function;
117 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
118 					int boundaries, int subexp_idx,
119 					int from_node, int bkref_idx)
120      internal_function;
121 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
122 				      int limit, int subexp_idx,
123 				      int node, int str_idx,
124 				      int bkref_idx) internal_function;
125 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
126 					  re_node_set *dest_nodes,
127 					  const re_node_set *candidates,
128 					  re_node_set *limits,
129 					  struct re_backref_cache_entry *bkref_ents,
130 					  int str_idx) internal_function;
131 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
132 					re_sift_context_t *sctx,
133 					int str_idx, const re_node_set *candidates)
134      internal_function;
135 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
136 					re_dfastate_t **dst,
137 					re_dfastate_t **src, int num)
138      internal_function;
139 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
140 					 re_match_context_t *mctx) internal_function;
141 static re_dfastate_t *transit_state (reg_errcode_t *err,
142 				     re_match_context_t *mctx,
143 				     re_dfastate_t *state) internal_function;
144 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
145 					    re_match_context_t *mctx,
146 					    re_dfastate_t *next_state)
147      internal_function;
148 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
149 						re_node_set *cur_nodes,
150 						int str_idx) internal_function;
151 #if 0
152 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
153 					re_match_context_t *mctx,
154 					re_dfastate_t *pstate)
155      internal_function;
156 #endif
157 #ifdef RE_ENABLE_I18N
158 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
159 				       re_dfastate_t *pstate)
160      internal_function;
161 #endif /* RE_ENABLE_I18N */
162 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
163 					  const re_node_set *nodes)
164      internal_function;
165 static reg_errcode_t get_subexp (re_match_context_t *mctx,
166 				 int bkref_node, int bkref_str_idx)
167      internal_function;
168 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
169 				     const re_sub_match_top_t *sub_top,
170 				     re_sub_match_last_t *sub_last,
171 				     int bkref_node, int bkref_str)
172      internal_function;
173 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
174 			     int subexp_idx, int type) internal_function;
175 static reg_errcode_t check_arrival (re_match_context_t *mctx,
176 				    state_array_t *path, int top_node,
177 				    int top_str, int last_node, int last_str,
178 				    int type) internal_function;
179 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
180 						   int str_idx,
181 						   re_node_set *cur_nodes,
182 						   re_node_set *next_nodes)
183      internal_function;
184 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
185 					       re_node_set *cur_nodes,
186 					       int ex_subexp, int type)
187      internal_function;
188 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
189 						   re_node_set *dst_nodes,
190 						   int target, int ex_subexp,
191 						   int type) internal_function;
192 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
193 					 re_node_set *cur_nodes, int cur_str,
194 					 int subexp_num, int type)
195      internal_function;
196 static int build_trtable (const re_dfa_t *dfa,
197 			  re_dfastate_t *state) internal_function;
198 #ifdef RE_ENABLE_I18N
199 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
200 				    const re_string_t *input, int idx)
201      internal_function;
202 # ifdef _LIBC
203 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
204 						   size_t name_len)
205      internal_function;
206 # endif /* _LIBC */
207 #endif /* RE_ENABLE_I18N */
208 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
209 				       const re_dfastate_t *state,
210 				       re_node_set *states_node,
211 				       bitset_t *states_ch) internal_function;
212 static int check_node_accept (const re_match_context_t *mctx,
213 			      const re_token_t *node, int idx)
214      internal_function;
215 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
216      internal_function;
217 
218 /* Entry point for POSIX code.  */
219 
220 /* regexec searches for a given pattern, specified by PREG, in the
221    string STRING.
222 
223    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
224    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
225    least NMATCH elements, and we set them to the offsets of the
226    corresponding matched substrings.
227 
228    EFLAGS specifies `execution flags' which affect matching: if
229    REG_NOTBOL is set, then ^ does not match at the beginning of the
230    string; if REG_NOTEOL is set, then $ does not match at the end.
231 
232    We return 0 if we find a match and REG_NOMATCH if not.  */
233 
234 int
regexec(preg,string,nmatch,pmatch,eflags)235 regexec (preg, string, nmatch, pmatch, eflags)
236     const regex_t *__restrict preg;
237     const char *__restrict string;
238     size_t nmatch;
239     regmatch_t pmatch[];
240     int eflags;
241 {
242   reg_errcode_t err;
243   int start, length;
244   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
245   (void)dfa;
246 
247   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
248     return REG_BADPAT;
249 
250   if (eflags & REG_STARTEND)
251     {
252       start = pmatch[0].rm_so;
253       length = pmatch[0].rm_eo;
254     }
255   else
256     {
257       start = 0;
258       length = strlen (string);
259     }
260 
261   __libc_lock_lock (dfa->lock);
262   if (preg->no_sub)
263     err = re_search_internal (preg, string, length, start, length - start,
264 			      length, 0, NULL, eflags);
265   else
266     err = re_search_internal (preg, string, length, start, length - start,
267 			      length, nmatch, pmatch, eflags);
268   __libc_lock_unlock (dfa->lock);
269   return err != REG_NOERROR;
270 }
271 
272 #ifdef _LIBC
273 # include <shlib-compat.h>
274 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
275 
276 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
277 __typeof__ (__regexec) __compat_regexec;
278 
279 int
280 attribute_compat_text_section
__compat_regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)281 __compat_regexec (const regex_t *__restrict preg,
282 		  const char *__restrict string, size_t nmatch,
283 		  regmatch_t pmatch[], int eflags)
284 {
285   return regexec (preg, string, nmatch, pmatch,
286 		  eflags & (REG_NOTBOL | REG_NOTEOL));
287 }
288 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
289 # endif
290 #endif
291 
292 /* Entry points for GNU code.  */
293 
294 /* re_match, re_search, re_match_2, re_search_2
295 
296    The former two functions operate on STRING with length LENGTH,
297    while the later two operate on concatenation of STRING1 and STRING2
298    with lengths LENGTH1 and LENGTH2, respectively.
299 
300    re_match() matches the compiled pattern in BUFP against the string,
301    starting at index START.
302 
303    re_search() first tries matching at index START, then it tries to match
304    starting from index START + 1, and so on.  The last start position tried
305    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
306    way as re_match().)
307 
308    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
309    the first STOP characters of the concatenation of the strings should be
310    concerned.
311 
312    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
313    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
314    computed relative to the concatenation, not relative to the individual
315    strings.)
316 
317    On success, re_match* functions return the length of the match, re_search*
318    return the position of the start of the match.  Return value -1 means no
319    match was found and -2 indicates an internal error.  */
320 
321 int
re_match(bufp,string,length,start,regs)322 re_match (bufp, string, length, start, regs)
323     struct re_pattern_buffer *bufp;
324     const char *string;
325     int length, start;
326     struct re_registers *regs;
327 {
328   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
329 }
330 #ifdef _LIBC
331 weak_alias (__re_match, re_match)
332 #endif
333 
334 int
335 re_search (bufp, string, length, start, range, regs)
336     struct re_pattern_buffer *bufp;
337     const char *string;
338     int length, start, range;
339     struct re_registers *regs;
340 {
341   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
342 }
343 #ifdef _LIBC
344 weak_alias (__re_search, re_search)
345 #endif
346 
347 int
348 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
349     struct re_pattern_buffer *bufp;
350     const char *string1, *string2;
351     int length1, length2, start, stop;
352     struct re_registers *regs;
353 {
354   return re_search_2_stub (bufp, string1, length1, string2, length2,
355 			   start, 0, regs, stop, 1);
356 }
357 #ifdef _LIBC
358 weak_alias (__re_match_2, re_match_2)
359 #endif
360 
361 int
362 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
363     struct re_pattern_buffer *bufp;
364     const char *string1, *string2;
365     int length1, length2, start, range, stop;
366     struct re_registers *regs;
367 {
368   return re_search_2_stub (bufp, string1, length1, string2, length2,
369 			   start, range, regs, stop, 0);
370 }
371 #ifdef _LIBC
372 weak_alias (__re_search_2, re_search_2)
373 #endif
374 
375 static int
376 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
377 		  stop, ret_len)
378     struct re_pattern_buffer *bufp;
379     const char *string1, *string2;
380     int length1, length2, start, range, stop, ret_len;
381     struct re_registers *regs;
382 {
383   const char *str;
384   int rval;
385   int len = length1 + length2;
386   int free_str = 0;
387 
388   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
389     return -2;
390 
391   /* Concatenate the strings.  */
392   if (length2 > 0)
393     if (length1 > 0)
394       {
395 	char *s = re_malloc (char, len);
396 
397 	if (BE (s == NULL, 0))
398 	  return -2;
399 #ifdef _LIBC
400 	memcpy (__mempcpy (s, string1, length1), string2, length2);
401 #else
402 	memcpy (s, string1, length1);
403 	memcpy (s + length1, string2, length2);
404 #endif
405 	str = s;
406 	free_str = 1;
407       }
408     else
409       str = string2;
410   else
411     str = string1;
412 
413   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
414 			 ret_len);
415   if (free_str)
416     re_free ((char *) str);
417   return rval;
418 }
419 
420 /* The parameters have the same meaning as those of re_search.
421    Additional parameters:
422    If RET_LEN is nonzero the length of the match is returned (re_match style);
423    otherwise the position of the match is returned.  */
424 
425 static int
re_search_stub(bufp,string,length,start,range,stop,regs,ret_len)426 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
427     struct re_pattern_buffer *bufp;
428     const char *string;
429     int length, start, range, stop, ret_len;
430     struct re_registers *regs;
431 {
432   reg_errcode_t result;
433   regmatch_t *pmatch;
434   int nregs, rval;
435   int eflags = 0;
436   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
437   (void)dfa;
438 
439   /* Check for out-of-range.  */
440   if (BE (start < 0 || start > length, 0))
441     return -1;
442   if (BE (start + range > length, 0))
443     range = length - start;
444   else if (BE (start + range < 0, 0))
445     range = -start;
446 
447   __libc_lock_lock (dfa->lock);
448 
449   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
450   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
451 
452   /* Compile fastmap if we haven't yet.  */
453   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
454     re_compile_fastmap (bufp);
455 
456   if (BE (bufp->no_sub, 0))
457     regs = NULL;
458 
459   /* We need at least 1 register.  */
460   if (regs == NULL)
461     nregs = 1;
462   else if (BE (bufp->regs_allocated == REGS_FIXED &&
463 	       regs->num_regs < bufp->re_nsub + 1, 0))
464     {
465       nregs = regs->num_regs;
466       if (BE (nregs < 1, 0))
467 	{
468 	  /* Nothing can be copied to regs.  */
469 	  regs = NULL;
470 	  nregs = 1;
471 	}
472     }
473   else
474     nregs = bufp->re_nsub + 1;
475   pmatch = re_malloc (regmatch_t, nregs);
476   if (BE (pmatch == NULL, 0))
477     {
478       rval = -2;
479       goto out;
480     }
481 
482   result = re_search_internal (bufp, string, length, start, range, stop,
483 			       nregs, pmatch, eflags);
484 
485   rval = 0;
486 
487   /* I hope we needn't fill ther regs with -1's when no match was found.  */
488   if (result != REG_NOERROR)
489     rval = -1;
490   else if (regs != NULL)
491     {
492       /* If caller wants register contents data back, copy them.  */
493       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
494 					   bufp->regs_allocated);
495       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
496 	rval = -2;
497     }
498 
499   if (BE (rval == 0, 1))
500     {
501       if (ret_len)
502 	{
503 	  assert (pmatch[0].rm_so == start);
504 	  rval = pmatch[0].rm_eo - start;
505 	}
506       else
507 	rval = pmatch[0].rm_so;
508     }
509   re_free (pmatch);
510  out:
511   __libc_lock_unlock (dfa->lock);
512   return rval;
513 }
514 
515 static unsigned
re_copy_regs(regs,pmatch,nregs,regs_allocated)516 re_copy_regs (regs, pmatch, nregs, regs_allocated)
517     struct re_registers *regs;
518     regmatch_t *pmatch;
519     int nregs, regs_allocated;
520 {
521   int rval = REGS_REALLOCATE;
522   int i;
523   int need_regs = nregs + 1;
524   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
525      uses.  */
526 
527   /* Have the register data arrays been allocated?  */
528   if (regs_allocated == REGS_UNALLOCATED)
529     { /* No.  So allocate them with malloc.  */
530       regs->start = re_malloc (regoff_t, need_regs);
531       regs->end = re_malloc (regoff_t, need_regs);
532       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
533 	return REGS_UNALLOCATED;
534       regs->num_regs = need_regs;
535     }
536   else if (regs_allocated == REGS_REALLOCATE)
537     { /* Yes.  If we need more elements than were already
538 	 allocated, reallocate them.  If we need fewer, just
539 	 leave it alone.  */
540       if (BE (need_regs > regs->num_regs, 0))
541 	{
542 	  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
543 	  regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
544 	  if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
545 	    return REGS_UNALLOCATED;
546 	  regs->start = new_start;
547 	  regs->end = new_end;
548 	  regs->num_regs = need_regs;
549 	}
550     }
551   else
552     {
553       assert (regs_allocated == REGS_FIXED);
554       /* This function may not be called with REGS_FIXED and nregs too big.  */
555       assert (regs->num_regs >= nregs);
556       rval = REGS_FIXED;
557     }
558 
559   /* Copy the regs.  */
560   for (i = 0; i < nregs; ++i)
561     {
562       regs->start[i] = pmatch[i].rm_so;
563       regs->end[i] = pmatch[i].rm_eo;
564     }
565   for ( ; i < regs->num_regs; ++i)
566     regs->start[i] = regs->end[i] = -1;
567 
568   return rval;
569 }
570 
571 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
572    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
573    this memory for recording register information.  STARTS and ENDS
574    must be allocated using the malloc library routine, and must each
575    be at least NUM_REGS * sizeof (regoff_t) bytes long.
576 
577    If NUM_REGS == 0, then subsequent matches should allocate their own
578    register data.
579 
580    Unless this function is called, the first search or match using
581    PATTERN_BUFFER will allocate its own register data, without
582    freeing the old data.  */
583 
584 void
re_set_registers(bufp,regs,num_regs,starts,ends)585 re_set_registers (bufp, regs, num_regs, starts, ends)
586     struct re_pattern_buffer *bufp;
587     struct re_registers *regs;
588     unsigned num_regs;
589     regoff_t *starts, *ends;
590 {
591   if (num_regs)
592     {
593       bufp->regs_allocated = REGS_REALLOCATE;
594       regs->num_regs = num_regs;
595       regs->start = starts;
596       regs->end = ends;
597     }
598   else
599     {
600       bufp->regs_allocated = REGS_UNALLOCATED;
601       regs->num_regs = 0;
602       regs->start = regs->end = (regoff_t *) 0;
603     }
604 }
605 #ifdef _LIBC
606 weak_alias (__re_set_registers, re_set_registers)
607 #endif
608 
609 /* Entry points compatible with 4.2 BSD regex library.  We don't define
610    them unless specifically requested.  */
611 
612 #if defined _REGEX_RE_COMP || defined _LIBC
613 int
614 # ifdef _LIBC
615 weak_function
616 # endif
617 re_exec (s)
618      const char *s;
619 {
620   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
621 }
622 #endif /* _REGEX_RE_COMP */
623 
624 /* Internal entry point.  */
625 
626 /* Searches for a compiled pattern PREG in the string STRING, whose
627    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
628    mingings with regexec.  START, and RANGE have the same meanings
629    with re_search.
630    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
631    otherwise return the error code.
632    Note: We assume front end functions already check ranges.
633    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
634 
635 static reg_errcode_t
re_search_internal(preg,string,length,start,range,stop,nmatch,pmatch,eflags)636 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
637 		    eflags)
638     const regex_t *preg;
639     const char *string;
640     int length, start, range, stop, eflags;
641     size_t nmatch;
642     regmatch_t pmatch[];
643 {
644   reg_errcode_t err;
645   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
646   int left_lim, right_lim, incr;
647   int fl_longest_match, match_first, match_kind, match_last = -1;
648   int extra_nmatch;
649   int sb, ch;
650 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
651   re_match_context_t mctx = { .dfa = dfa };
652 #else
653   re_match_context_t mctx;
654 #endif
655   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
656 		   && range && !preg->can_be_null) ? preg->fastmap : NULL;
657   RE_TRANSLATE_TYPE t = preg->translate;
658 
659 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
660   memset (&mctx, '\0', sizeof (re_match_context_t));
661   mctx.dfa = dfa;
662 #endif
663 
664   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
665   nmatch -= extra_nmatch;
666 
667   /* Check if the DFA haven't been compiled.  */
668   if (BE (preg->used == 0 || dfa->init_state == NULL
669 	  || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
670 	  || dfa->init_state_begbuf == NULL, 0))
671     return REG_NOMATCH;
672 
673 #ifdef DEBUG
674   /* We assume front-end functions already check them.  */
675   assert (start + range >= 0 && start + range <= length);
676 #endif
677 
678   /* If initial states with non-begbuf contexts have no elements,
679      the regex must be anchored.  If preg->newline_anchor is set,
680      we'll never use init_state_nl, so do not check it.  */
681   if (dfa->init_state->nodes.nelem == 0
682       && dfa->init_state_word->nodes.nelem == 0
683       && (dfa->init_state_nl->nodes.nelem == 0
684 	  || !preg->newline_anchor))
685     {
686       if (start != 0 && start + range != 0)
687         return REG_NOMATCH;
688       start = range = 0;
689     }
690 
691   /* We must check the longest matching, if nmatch > 0.  */
692   fl_longest_match = (nmatch != 0 || dfa->nbackref);
693 
694   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
695 			    preg->translate, preg->syntax & RE_ICASE, dfa);
696   if (BE (err != REG_NOERROR, 0))
697     goto free_return;
698   mctx.input.stop = stop;
699   mctx.input.raw_stop = stop;
700   mctx.input.newline_anchor = preg->newline_anchor;
701 
702   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
703   if (BE (err != REG_NOERROR, 0))
704     goto free_return;
705 
706   /* We will log all the DFA states through which the dfa pass,
707      if nmatch > 1, or this dfa has "multibyte node", which is a
708      back-reference or a node which can accept multibyte character or
709      multi character collating element.  */
710   if (nmatch > 1 || dfa->has_mb_node)
711     {
712       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
713       if (BE (mctx.state_log == NULL, 0))
714 	{
715 	  err = REG_ESPACE;
716 	  goto free_return;
717 	}
718     }
719   else
720     mctx.state_log = NULL;
721 
722   match_first = start;
723   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
724 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
725 
726   /* Check incrementally whether of not the input string match.  */
727   incr = (range < 0) ? -1 : 1;
728   left_lim = (range < 0) ? start + range : start;
729   right_lim = (range < 0) ? start : start + range;
730   sb = dfa->mb_cur_max == 1;
731   match_kind =
732     (fastmap
733      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
734 	| (range >= 0 ? 2 : 0)
735 	| (t != NULL ? 1 : 0))
736      : 8);
737 
738   for (;; match_first += incr)
739     {
740       err = REG_NOMATCH;
741       if (match_first < left_lim || right_lim < match_first)
742 	goto free_return;
743 
744       /* Advance as rapidly as possible through the string, until we
745 	 find a plausible place to start matching.  This may be done
746 	 with varying efficiency, so there are various possibilities:
747 	 only the most common of them are specialized, in order to
748 	 save on code size.  We use a switch statement for speed.  */
749       switch (match_kind)
750 	{
751 	case 8:
752 	  /* No fastmap.  */
753 	  break;
754 
755 	case 7:
756 	  /* Fastmap with single-byte translation, match forward.  */
757 	  while (BE (match_first < right_lim, 1)
758 		 && !fastmap[t[(unsigned char) string[match_first]]])
759 	    ++match_first;
760 	  goto forward_match_found_start_or_reached_end;
761 
762 	case 6:
763 	  /* Fastmap without translation, match forward.  */
764 	  while (BE (match_first < right_lim, 1)
765 		 && !fastmap[(unsigned char) string[match_first]])
766 	    ++match_first;
767 
768 	forward_match_found_start_or_reached_end:
769 	  if (BE (match_first == right_lim, 0))
770 	    {
771 	      ch = match_first >= length
772 		       ? 0 : (unsigned char) string[match_first];
773 	      if (!fastmap[t ? t[ch] : ch])
774 		goto free_return;
775 	    }
776 	  break;
777 
778 	case 4:
779 	case 5:
780 	  /* Fastmap without multi-byte translation, match backwards.  */
781 	  while (match_first >= left_lim)
782 	    {
783 	      ch = match_first >= length
784 		       ? 0 : (unsigned char) string[match_first];
785 	      if (fastmap[t ? t[ch] : ch])
786 		break;
787 	      --match_first;
788 	    }
789 	  if (match_first < left_lim)
790 	    goto free_return;
791 	  break;
792 
793 	default:
794 	  /* In this case, we can't determine easily the current byte,
795 	     since it might be a component byte of a multibyte
796 	     character.  Then we use the constructed buffer instead.  */
797 	  for (;;)
798 	    {
799 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
800 		 buffers.  */
801 	      unsigned int offset = match_first - mctx.input.raw_mbs_idx;
802 	      if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
803 		{
804 		  err = re_string_reconstruct (&mctx.input, match_first,
805 					       eflags);
806 		  if (BE (err != REG_NOERROR, 0))
807 		    goto free_return;
808 
809 		  offset = match_first - mctx.input.raw_mbs_idx;
810 		}
811 	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
812 		 Note that MATCH_FIRST must not be smaller than 0.  */
813 	      ch = (match_first >= length
814 		    ? 0 : re_string_byte_at (&mctx.input, offset));
815 	      if (fastmap[ch])
816 		break;
817 	      match_first += incr;
818 	      if (match_first < left_lim || match_first > right_lim)
819 	        {
820 	          err = REG_NOMATCH;
821 	          goto free_return;
822 	        }
823 	    }
824 	  break;
825 	}
826 
827       /* Reconstruct the buffers so that the matcher can assume that
828 	 the matching starts from the beginning of the buffer.  */
829       err = re_string_reconstruct (&mctx.input, match_first, eflags);
830       if (BE (err != REG_NOERROR, 0))
831 	goto free_return;
832 
833 #ifdef RE_ENABLE_I18N
834      /* Don't consider this char as a possible match start if it part,
835 	yet isn't the head, of a multibyte character.  */
836       if (!sb && !re_string_first_byte (&mctx.input, 0))
837 	continue;
838 #endif
839 
840       /* It seems to be appropriate one, then use the matcher.  */
841       /* We assume that the matching starts from 0.  */
842       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
843       match_last = check_matching (&mctx, fl_longest_match,
844 				   range >= 0 ? &match_first : NULL);
845       if (match_last != -1)
846 	{
847 	  if (BE (match_last == -2, 0))
848 	    {
849 	      err = REG_ESPACE;
850 	      goto free_return;
851 	    }
852 	  else
853 	    {
854 	      mctx.match_last = match_last;
855 	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
856 		{
857 		  re_dfastate_t *pstate = mctx.state_log[match_last];
858 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
859 							     match_last);
860 		}
861 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
862 		  || dfa->nbackref)
863 		{
864 		  err = prune_impossible_nodes (&mctx);
865 		  if (err == REG_NOERROR)
866 		    break;
867 		  if (BE (err != REG_NOMATCH, 0))
868 		    goto free_return;
869 		  match_last = -1;
870 		}
871 	      else
872 		break; /* We found a match.  */
873 	    }
874 	}
875 
876       match_ctx_clean (&mctx);
877     }
878 
879 #ifdef DEBUG
880   assert (match_last != -1);
881   assert (err == REG_NOERROR);
882 #endif
883 
884   /* Set pmatch[] if we need.  */
885   if (nmatch > 0)
886     {
887       int reg_idx;
888 
889       /* Initialize registers.  */
890       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
891 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
892 
893       /* Set the points where matching start/end.  */
894       pmatch[0].rm_so = 0;
895       pmatch[0].rm_eo = mctx.match_last;
896 
897       if (!preg->no_sub && nmatch > 1)
898 	{
899 	  err = set_regs (preg, &mctx, nmatch, pmatch,
900 			  dfa->has_plural_match && dfa->nbackref > 0);
901 	  if (BE (err != REG_NOERROR, 0))
902 	    goto free_return;
903 	}
904 
905       /* At last, add the offset to the each registers, since we slided
906 	 the buffers so that we could assume that the matching starts
907 	 from 0.  */
908       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
909 	if (pmatch[reg_idx].rm_so != -1)
910 	  {
911 #ifdef RE_ENABLE_I18N
912 	    if (BE (mctx.input.offsets_needed != 0, 0))
913 	      {
914 		pmatch[reg_idx].rm_so =
915 		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
916 		   ? mctx.input.valid_raw_len
917 		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
918 		pmatch[reg_idx].rm_eo =
919 		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
920 		   ? mctx.input.valid_raw_len
921 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
922 	      }
923 #else
924 	    assert (mctx.input.offsets_needed == 0);
925 #endif
926 	    pmatch[reg_idx].rm_so += match_first;
927 	    pmatch[reg_idx].rm_eo += match_first;
928 	  }
929       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
930 	{
931 	  pmatch[nmatch + reg_idx].rm_so = -1;
932 	  pmatch[nmatch + reg_idx].rm_eo = -1;
933 	}
934 
935       if (dfa->subexp_map)
936         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
937           if (dfa->subexp_map[reg_idx] != reg_idx)
938             {
939               pmatch[reg_idx + 1].rm_so
940                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
941               pmatch[reg_idx + 1].rm_eo
942                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
943             }
944     }
945 
946  free_return:
947   re_free (mctx.state_log);
948   if (dfa->nbackref)
949     match_ctx_free (&mctx);
950   re_string_destruct (&mctx.input);
951   return err;
952 }
953 
954 static reg_errcode_t
prune_impossible_nodes(mctx)955 prune_impossible_nodes (mctx)
956      re_match_context_t *mctx;
957 {
958   const re_dfa_t *const dfa = mctx->dfa;
959   int halt_node, match_last;
960   reg_errcode_t ret;
961   re_dfastate_t **sifted_states;
962   re_dfastate_t **lim_states = NULL;
963   re_sift_context_t sctx;
964 #ifdef DEBUG
965   assert (mctx->state_log != NULL);
966 #endif
967   match_last = mctx->match_last;
968   halt_node = mctx->last_node;
969   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
970   if (BE (sifted_states == NULL, 0))
971     {
972       ret = REG_ESPACE;
973       goto free_return;
974     }
975   if (dfa->nbackref)
976     {
977       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
978       if (BE (lim_states == NULL, 0))
979 	{
980 	  ret = REG_ESPACE;
981 	  goto free_return;
982 	}
983       while (1)
984 	{
985 	  memset (lim_states, '\0',
986 		  sizeof (re_dfastate_t *) * (match_last + 1));
987 	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
988 			 match_last);
989 	  ret = sift_states_backward (mctx, &sctx);
990 	  re_node_set_free (&sctx.limits);
991 	  if (BE (ret != REG_NOERROR, 0))
992 	      goto free_return;
993 	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
994 	    break;
995 	  do
996 	    {
997 	      --match_last;
998 	      if (match_last < 0)
999 		{
1000 		  ret = REG_NOMATCH;
1001 		  goto free_return;
1002 		}
1003 	    } while (mctx->state_log[match_last] == NULL
1004 		     || !mctx->state_log[match_last]->halt);
1005 	  halt_node = check_halt_state_context (mctx,
1006 						mctx->state_log[match_last],
1007 						match_last);
1008 	}
1009       ret = merge_state_array (dfa, sifted_states, lim_states,
1010 			       match_last + 1);
1011       re_free (lim_states);
1012       lim_states = NULL;
1013       if (BE (ret != REG_NOERROR, 0))
1014 	goto free_return;
1015     }
1016   else
1017     {
1018       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1019       ret = sift_states_backward (mctx, &sctx);
1020       re_node_set_free (&sctx.limits);
1021       if (BE (ret != REG_NOERROR, 0))
1022 	goto free_return;
1023     }
1024   re_free (mctx->state_log);
1025   mctx->state_log = sifted_states;
1026   sifted_states = NULL;
1027   mctx->last_node = halt_node;
1028   mctx->match_last = match_last;
1029   ret = REG_NOERROR;
1030  free_return:
1031   re_free (sifted_states);
1032   re_free (lim_states);
1033   return ret;
1034 }
1035 
1036 /* Acquire an initial state and return it.
1037    We must select appropriate initial state depending on the context,
1038    since initial states may have constraints like "\<", "^", etc..  */
1039 
1040 static inline re_dfastate_t *
1041 __attribute ((always_inline)) internal_function
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,int idx)1042 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1043 			    int idx)
1044 {
1045   const re_dfa_t *const dfa = mctx->dfa;
1046   if (dfa->init_state->has_constraint)
1047     {
1048       unsigned int context;
1049       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1050       if (IS_WORD_CONTEXT (context))
1051 	return dfa->init_state_word;
1052       else if (IS_ORDINARY_CONTEXT (context))
1053 	return dfa->init_state;
1054       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1055 	return dfa->init_state_begbuf;
1056       else if (IS_NEWLINE_CONTEXT (context))
1057 	return dfa->init_state_nl;
1058       else if (IS_BEGBUF_CONTEXT (context))
1059 	{
1060 	  /* It is relatively rare case, then calculate on demand.  */
1061 	  return re_acquire_state_context (err, dfa,
1062 					   dfa->init_state->entrance_nodes,
1063 					   context);
1064 	}
1065       else
1066 	/* Must not happen?  */
1067 	return dfa->init_state;
1068     }
1069   else
1070     return dfa->init_state;
1071 }
1072 
1073 /* Check whether the regular expression match input string INPUT or not,
1074    and return the index where the matching end, return -1 if not match,
1075    or return -2 in case of an error.
1076    FL_LONGEST_MATCH means we want the POSIX longest matching.
1077    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1078    next place where we may want to try matching.
1079    Note that the matcher assume that the maching starts from the current
1080    index of the buffer.  */
1081 
1082 static int
1083 internal_function
check_matching(re_match_context_t * mctx,int fl_longest_match,int * p_match_first)1084 check_matching (re_match_context_t *mctx, int fl_longest_match,
1085 		int *p_match_first)
1086 {
1087   const re_dfa_t *const dfa = mctx->dfa;
1088   reg_errcode_t err;
1089   int match = 0;
1090   int match_last = -1;
1091   int cur_str_idx = re_string_cur_idx (&mctx->input);
1092   re_dfastate_t *cur_state;
1093   int at_init_state = p_match_first != NULL;
1094   int next_start_idx = cur_str_idx;
1095 
1096   err = REG_NOERROR;
1097   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1098   /* An initial state must not be NULL (invalid).  */
1099   if (BE (cur_state == NULL, 0))
1100     {
1101       assert (err == REG_ESPACE);
1102       return -2;
1103     }
1104 
1105   if (mctx->state_log != NULL)
1106     {
1107       mctx->state_log[cur_str_idx] = cur_state;
1108 
1109       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1110 	 later.  E.g. Processing back references.  */
1111       if (BE (dfa->nbackref, 0))
1112 	{
1113 	  at_init_state = 0;
1114 	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1115 	  if (BE (err != REG_NOERROR, 0))
1116 	    return err;
1117 
1118 	  if (cur_state->has_backref)
1119 	    {
1120 	      err = transit_state_bkref (mctx, &cur_state->nodes);
1121 	      if (BE (err != REG_NOERROR, 0))
1122 	        return err;
1123 	    }
1124 	}
1125     }
1126 
1127   /* If the RE accepts NULL string.  */
1128   if (BE (cur_state->halt, 0))
1129     {
1130       if (!cur_state->has_constraint
1131 	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
1132 	{
1133 	  if (!fl_longest_match)
1134 	    return cur_str_idx;
1135 	  else
1136 	    {
1137 	      match_last = cur_str_idx;
1138 	      match = 1;
1139 	    }
1140 	}
1141     }
1142 
1143   while (!re_string_eoi (&mctx->input))
1144     {
1145       re_dfastate_t *old_state = cur_state;
1146       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1147 
1148       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1149           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1150               && mctx->input.valid_len < mctx->input.len))
1151         {
1152           err = extend_buffers (mctx);
1153           if (BE (err != REG_NOERROR, 0))
1154 	    {
1155 	      assert (err == REG_ESPACE);
1156 	      return -2;
1157 	    }
1158         }
1159 
1160       cur_state = transit_state (&err, mctx, cur_state);
1161       if (mctx->state_log != NULL)
1162 	cur_state = merge_state_with_log (&err, mctx, cur_state);
1163 
1164       if (cur_state == NULL)
1165 	{
1166 	  /* Reached the invalid state or an error.  Try to recover a valid
1167 	     state using the state log, if available and if we have not
1168 	     already found a valid (even if not the longest) match.  */
1169 	  if (BE (err != REG_NOERROR, 0))
1170 	    return -2;
1171 
1172 	  if (mctx->state_log == NULL
1173 	      || (match && !fl_longest_match)
1174 	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
1175 	    break;
1176 	}
1177 
1178       if (BE (at_init_state, 0))
1179 	{
1180 	  if (old_state == cur_state)
1181 	    next_start_idx = next_char_idx;
1182 	  else
1183 	    at_init_state = 0;
1184 	}
1185 
1186       if (cur_state->halt)
1187 	{
1188 	  /* Reached a halt state.
1189 	     Check the halt state can satisfy the current context.  */
1190 	  if (!cur_state->has_constraint
1191 	      || check_halt_state_context (mctx, cur_state,
1192 					   re_string_cur_idx (&mctx->input)))
1193 	    {
1194 	      /* We found an appropriate halt state.  */
1195 	      match_last = re_string_cur_idx (&mctx->input);
1196 	      match = 1;
1197 
1198 	      /* We found a match, do not modify match_first below.  */
1199 	      p_match_first = NULL;
1200 	      if (!fl_longest_match)
1201 		break;
1202 	    }
1203 	}
1204     }
1205 
1206   if (p_match_first)
1207     *p_match_first += next_start_idx;
1208 
1209   return match_last;
1210 }
1211 
1212 /* Check NODE match the current context.  */
1213 
1214 static int
1215 internal_function
check_halt_node_context(const re_dfa_t * dfa,int node,unsigned int context)1216 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1217 {
1218   re_token_type_t type = dfa->nodes[node].type;
1219   unsigned int constraint = dfa->nodes[node].constraint;
1220   if (type != END_OF_RE)
1221     return 0;
1222   if (!constraint)
1223     return 1;
1224   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1225     return 0;
1226   return 1;
1227 }
1228 
1229 /* Check the halt state STATE match the current context.
1230    Return 0 if not match, if the node, STATE has, is a halt node and
1231    match the context, return the node.  */
1232 
1233 static int
1234 internal_function
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,int idx)1235 check_halt_state_context (const re_match_context_t *mctx,
1236 			  const re_dfastate_t *state, int idx)
1237 {
1238   int i;
1239   unsigned int context;
1240 #ifdef DEBUG
1241   assert (state->halt);
1242 #endif
1243   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1244   for (i = 0; i < state->nodes.nelem; ++i)
1245     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1246       return state->nodes.elems[i];
1247   return 0;
1248 }
1249 
1250 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1251    corresponding to the DFA).
1252    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1253    of errors.  */
1254 
1255 static int
1256 internal_function
proceed_next_node(const re_match_context_t * mctx,int nregs,regmatch_t * regs,int * pidx,int node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)1257 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1258 		   int *pidx, int node, re_node_set *eps_via_nodes,
1259 		   struct re_fail_stack_t *fs)
1260 {
1261   const re_dfa_t *const dfa = mctx->dfa;
1262   int i, err;
1263   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1264     {
1265       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1266       re_node_set *edests = &dfa->edests[node];
1267       int dest_node;
1268       err = re_node_set_insert (eps_via_nodes, node);
1269       if (BE (err < 0, 0))
1270 	return -2;
1271       /* Pick up a valid destination, or return -1 if none is found.  */
1272       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1273 	{
1274 	  int candidate = edests->elems[i];
1275 	  if (!re_node_set_contains (cur_nodes, candidate))
1276 	    continue;
1277           if (dest_node == -1)
1278 	    dest_node = candidate;
1279 
1280           else
1281 	    {
1282 	      /* In order to avoid infinite loop like "(a*)*", return the second
1283 	         epsilon-transition if the first was already considered.  */
1284 	      if (re_node_set_contains (eps_via_nodes, dest_node))
1285 	        return candidate;
1286 
1287 	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
1288 	      else if (fs != NULL
1289 		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1290 				           eps_via_nodes))
1291 		return -2;
1292 
1293 	      /* We know we are going to exit.  */
1294 	      break;
1295 	    }
1296 	}
1297       return dest_node;
1298     }
1299   else
1300     {
1301       int naccepted = 0;
1302       re_token_type_t type = dfa->nodes[node].type;
1303 
1304 #ifdef RE_ENABLE_I18N
1305       if (dfa->nodes[node].accept_mb)
1306 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1307       else
1308 #endif /* RE_ENABLE_I18N */
1309       if (type == OP_BACK_REF)
1310 	{
1311 	  int subexp_idx = dfa->nodes[node].opr.idx + 1;
1312 	  naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1313 	  if (fs != NULL)
1314 	    {
1315 	      if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1316 		return -1;
1317 	      else if (naccepted)
1318 		{
1319 		  char *buf = (char *) re_string_get_buffer (&mctx->input);
1320 		  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1321 			      naccepted) != 0)
1322 		    return -1;
1323 		}
1324 	    }
1325 
1326 	  if (naccepted == 0)
1327 	    {
1328 	      int dest_node;
1329 	      err = re_node_set_insert (eps_via_nodes, node);
1330 	      if (BE (err < 0, 0))
1331 		return -2;
1332 	      dest_node = dfa->edests[node].elems[0];
1333 	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1334 					dest_node))
1335 		return dest_node;
1336 	    }
1337 	}
1338 
1339       if (naccepted != 0
1340 	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
1341 	{
1342 	  int dest_node = dfa->nexts[node];
1343 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1344 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1345 		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1346 					       dest_node)))
1347 	    return -1;
1348 	  re_node_set_empty (eps_via_nodes);
1349 	  return dest_node;
1350 	}
1351     }
1352   return -1;
1353 }
1354 
1355 static reg_errcode_t
1356 internal_function
push_fail_stack(struct re_fail_stack_t * fs,int str_idx,int dest_node,int nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1357 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1358 		 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1359 {
1360   reg_errcode_t err;
1361   int num = fs->num++;
1362   if (fs->num == fs->alloc)
1363     {
1364       struct re_fail_stack_ent_t *new_array;
1365       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1366 				       * fs->alloc * 2));
1367       if (new_array == NULL)
1368 	return REG_ESPACE;
1369       fs->alloc *= 2;
1370       fs->stack = new_array;
1371     }
1372   fs->stack[num].idx = str_idx;
1373   fs->stack[num].node = dest_node;
1374   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1375   if (fs->stack[num].regs == NULL)
1376     return REG_ESPACE;
1377   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1378   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1379   return err;
1380 }
1381 
1382 static int
1383 internal_function
pop_fail_stack(struct re_fail_stack_t * fs,int * pidx,int nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1384 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1385 		regmatch_t *regs, re_node_set *eps_via_nodes)
1386 {
1387   int num = --fs->num;
1388   assert (num >= 0);
1389   *pidx = fs->stack[num].idx;
1390   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1391   re_node_set_free (eps_via_nodes);
1392   re_free (fs->stack[num].regs);
1393   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1394   return fs->stack[num].node;
1395 }
1396 
1397 /* Set the positions where the subexpressions are starts/ends to registers
1398    PMATCH.
1399    Note: We assume that pmatch[0] is already set, and
1400    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1401 
1402 static reg_errcode_t
1403 internal_function
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,int fl_backtrack)1404 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1405 	  regmatch_t *pmatch, int fl_backtrack)
1406 {
1407   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1408   int idx, cur_node;
1409   re_node_set eps_via_nodes;
1410   struct re_fail_stack_t *fs;
1411   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1412   regmatch_t *prev_idx_match;
1413   int prev_idx_match_malloced = 0;
1414 
1415 #ifdef DEBUG
1416   assert (nmatch > 1);
1417   assert (mctx->state_log != NULL);
1418 #endif
1419   if (fl_backtrack)
1420     {
1421       fs = &fs_body;
1422       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1423       if (fs->stack == NULL)
1424 	return REG_ESPACE;
1425     }
1426   else
1427     fs = NULL;
1428 
1429   cur_node = dfa->init_node;
1430   re_node_set_init_empty (&eps_via_nodes);
1431 
1432   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1433     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1434   else
1435     {
1436       prev_idx_match = re_malloc (regmatch_t, nmatch);
1437       if (prev_idx_match == NULL)
1438 	{
1439 	  free_fail_stack_return (fs);
1440 	  return REG_ESPACE;
1441 	}
1442       prev_idx_match_malloced = 1;
1443     }
1444   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1445 
1446   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1447     {
1448       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1449 
1450       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1451 	{
1452 	  int reg_idx;
1453 	  if (fs)
1454 	    {
1455 	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1456 		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1457 		  break;
1458 	      if (reg_idx == nmatch)
1459 		{
1460 		  re_node_set_free (&eps_via_nodes);
1461 		  if (prev_idx_match_malloced)
1462 		    re_free (prev_idx_match);
1463 		  return free_fail_stack_return (fs);
1464 		}
1465 	      cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1466 					 &eps_via_nodes);
1467 	    }
1468 	  else
1469 	    {
1470 	      re_node_set_free (&eps_via_nodes);
1471 	      if (prev_idx_match_malloced)
1472 		re_free (prev_idx_match);
1473 	      return REG_NOERROR;
1474 	    }
1475 	}
1476 
1477       /* Proceed to next node.  */
1478       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1479 				    &eps_via_nodes, fs);
1480 
1481       if (BE (cur_node < 0, 0))
1482 	{
1483 	  if (BE (cur_node == -2, 0))
1484 	    {
1485 	      re_node_set_free (&eps_via_nodes);
1486 	      if (prev_idx_match_malloced)
1487 		re_free (prev_idx_match);
1488 	      free_fail_stack_return (fs);
1489 	      return REG_ESPACE;
1490 	    }
1491 	  if (fs)
1492 	    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1493 				       &eps_via_nodes);
1494 	  else
1495 	    {
1496 	      re_node_set_free (&eps_via_nodes);
1497 	      if (prev_idx_match_malloced)
1498 		re_free (prev_idx_match);
1499 	      return REG_NOMATCH;
1500 	    }
1501 	}
1502     }
1503   re_node_set_free (&eps_via_nodes);
1504   if (prev_idx_match_malloced)
1505     re_free (prev_idx_match);
1506   return free_fail_stack_return (fs);
1507 }
1508 
1509 static reg_errcode_t
1510 internal_function
free_fail_stack_return(struct re_fail_stack_t * fs)1511 free_fail_stack_return (struct re_fail_stack_t *fs)
1512 {
1513   if (fs)
1514     {
1515       int fs_idx;
1516       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1517 	{
1518 	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1519 	  re_free (fs->stack[fs_idx].regs);
1520 	}
1521       re_free (fs->stack);
1522     }
1523   return REG_NOERROR;
1524 }
1525 
1526 static void
1527 internal_function
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,int cur_node,int cur_idx,int nmatch)1528 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1529 	     regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1530 {
1531   int type = dfa->nodes[cur_node].type;
1532   if (type == OP_OPEN_SUBEXP)
1533     {
1534       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1535 
1536       /* We are at the first node of this sub expression.  */
1537       if (reg_num < nmatch)
1538 	{
1539 	  pmatch[reg_num].rm_so = cur_idx;
1540 	  pmatch[reg_num].rm_eo = -1;
1541 	}
1542     }
1543   else if (type == OP_CLOSE_SUBEXP)
1544     {
1545       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1546       if (reg_num < nmatch)
1547 	{
1548 	  /* We are at the last node of this sub expression.  */
1549 	  if (pmatch[reg_num].rm_so < cur_idx)
1550 	    {
1551 	      pmatch[reg_num].rm_eo = cur_idx;
1552 	      /* This is a non-empty match or we are not inside an optional
1553 		 subexpression.  Accept this right away.  */
1554 	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1555 	    }
1556 	  else
1557 	    {
1558 	      if (dfa->nodes[cur_node].opt_subexp
1559 		  && prev_idx_match[reg_num].rm_so != -1)
1560 		/* We transited through an empty match for an optional
1561 		   subexpression, like (a?)*, and this is not the subexp's
1562 		   first match.  Copy back the old content of the registers
1563 		   so that matches of an inner subexpression are undone as
1564 		   well, like in ((a?))*.  */
1565 		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1566 	      else
1567 		/* We completed a subexpression, but it may be part of
1568 		   an optional one, so do not update PREV_IDX_MATCH.  */
1569 		pmatch[reg_num].rm_eo = cur_idx;
1570 	    }
1571 	}
1572     }
1573 }
1574 
1575 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1576    and sift the nodes in each states according to the following rules.
1577    Updated state_log will be wrote to STATE_LOG.
1578 
1579    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1580      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1581 	If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1582 	the LAST_NODE, we throw away the node `a'.
1583      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1584 	string `s' and transit to `b':
1585 	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1586 	   away the node `a'.
1587 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1588 	    thrown away, we throw away the node `a'.
1589      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1590 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1591 	   node `a'.
1592 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1593 	    we throw away the node `a'.  */
1594 
1595 #define STATE_NODE_CONTAINS(state,node) \
1596   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1597 
1598 static reg_errcode_t
1599 internal_function
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)1600 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1601 {
1602   reg_errcode_t err;
1603   int null_cnt = 0;
1604   int str_idx = sctx->last_str_idx;
1605   re_node_set cur_dest;
1606 
1607 #ifdef DEBUG
1608   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1609 #endif
1610 
1611   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1612      transit to the last_node and the last_node itself.  */
1613   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1614   if (BE (err != REG_NOERROR, 0))
1615     return err;
1616   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1617   if (BE (err != REG_NOERROR, 0))
1618     goto free_return;
1619 
1620   /* Then check each states in the state_log.  */
1621   while (str_idx > 0)
1622     {
1623       /* Update counters.  */
1624       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1625       if (null_cnt > mctx->max_mb_elem_len)
1626 	{
1627 	  memset (sctx->sifted_states, '\0',
1628 		  sizeof (re_dfastate_t *) * str_idx);
1629 	  re_node_set_free (&cur_dest);
1630 	  return REG_NOERROR;
1631 	}
1632       re_node_set_empty (&cur_dest);
1633       --str_idx;
1634 
1635       if (mctx->state_log[str_idx])
1636 	{
1637 	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1638           if (BE (err != REG_NOERROR, 0))
1639 	    goto free_return;
1640 	}
1641 
1642       /* Add all the nodes which satisfy the following conditions:
1643 	 - It can epsilon transit to a node in CUR_DEST.
1644 	 - It is in CUR_SRC.
1645 	 And update state_log.  */
1646       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1647       if (BE (err != REG_NOERROR, 0))
1648 	goto free_return;
1649     }
1650   err = REG_NOERROR;
1651  free_return:
1652   re_node_set_free (&cur_dest);
1653   return err;
1654 }
1655 
1656 static reg_errcode_t
1657 internal_function
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,re_node_set * cur_dest)1658 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1659 		     int str_idx, re_node_set *cur_dest)
1660 {
1661   const re_dfa_t *const dfa = mctx->dfa;
1662   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1663   int i;
1664 
1665   /* Then build the next sifted state.
1666      We build the next sifted state on `cur_dest', and update
1667      `sifted_states[str_idx]' with `cur_dest'.
1668      Note:
1669      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1670      `cur_src' points the node_set of the old `state_log[str_idx]'
1671      (with the epsilon nodes pre-filtered out).  */
1672   for (i = 0; i < cur_src->nelem; i++)
1673     {
1674       int prev_node = cur_src->elems[i];
1675       int naccepted = 0;
1676       int ret;
1677 
1678 #ifdef DEBUG
1679       re_token_type_t type = dfa->nodes[prev_node].type;
1680       assert (!IS_EPSILON_NODE (type));
1681 #endif
1682 #ifdef RE_ENABLE_I18N
1683       /* If the node may accept `multi byte'.  */
1684       if (dfa->nodes[prev_node].accept_mb)
1685 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1686 					 str_idx, sctx->last_str_idx);
1687 #endif /* RE_ENABLE_I18N */
1688 
1689       /* We don't check backreferences here.
1690 	 See update_cur_sifted_state().  */
1691       if (!naccepted
1692 	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1693 	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1694 				  dfa->nexts[prev_node]))
1695 	naccepted = 1;
1696 
1697       if (naccepted == 0)
1698 	continue;
1699 
1700       if (sctx->limits.nelem)
1701 	{
1702 	  int to_idx = str_idx + naccepted;
1703 	  if (check_dst_limits (mctx, &sctx->limits,
1704 				dfa->nexts[prev_node], to_idx,
1705 				prev_node, str_idx))
1706 	    continue;
1707 	}
1708       ret = re_node_set_insert (cur_dest, prev_node);
1709       if (BE (ret == -1, 0))
1710 	return REG_ESPACE;
1711     }
1712 
1713   return REG_NOERROR;
1714 }
1715 
1716 /* Helper functions.  */
1717 
1718 static reg_errcode_t
1719 internal_function
clean_state_log_if_needed(re_match_context_t * mctx,int next_state_log_idx)1720 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1721 {
1722   int top = mctx->state_log_top;
1723 
1724   if (next_state_log_idx >= mctx->input.bufs_len
1725       || (next_state_log_idx >= mctx->input.valid_len
1726 	  && mctx->input.valid_len < mctx->input.len))
1727     {
1728       reg_errcode_t err;
1729       err = extend_buffers (mctx);
1730       if (BE (err != REG_NOERROR, 0))
1731 	return err;
1732     }
1733 
1734   if (top < next_state_log_idx)
1735     {
1736       memset (mctx->state_log + top + 1, '\0',
1737 	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1738       mctx->state_log_top = next_state_log_idx;
1739     }
1740   return REG_NOERROR;
1741 }
1742 
1743 static reg_errcode_t
1744 internal_function
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,int num)1745 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1746 		   re_dfastate_t **src, int num)
1747 {
1748   int st_idx;
1749   reg_errcode_t err;
1750   for (st_idx = 0; st_idx < num; ++st_idx)
1751     {
1752       if (dst[st_idx] == NULL)
1753 	dst[st_idx] = src[st_idx];
1754       else if (src[st_idx] != NULL)
1755 	{
1756 	  re_node_set merged_set;
1757 	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1758 					&src[st_idx]->nodes);
1759 	  if (BE (err != REG_NOERROR, 0))
1760 	    return err;
1761 	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1762 	  re_node_set_free (&merged_set);
1763 	  if (BE (err != REG_NOERROR, 0))
1764 	    return err;
1765 	}
1766     }
1767   return REG_NOERROR;
1768 }
1769 
1770 static reg_errcode_t
1771 internal_function
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,re_node_set * dest_nodes)1772 update_cur_sifted_state (const re_match_context_t *mctx,
1773 			 re_sift_context_t *sctx, int str_idx,
1774 			 re_node_set *dest_nodes)
1775 {
1776   const re_dfa_t *const dfa = mctx->dfa;
1777   reg_errcode_t err = REG_NOERROR;
1778   const re_node_set *candidates;
1779   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1780 		: &mctx->state_log[str_idx]->nodes);
1781 
1782   if (dest_nodes->nelem == 0)
1783     sctx->sifted_states[str_idx] = NULL;
1784   else
1785     {
1786       if (candidates)
1787 	{
1788 	  /* At first, add the nodes which can epsilon transit to a node in
1789 	     DEST_NODE.  */
1790 	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1791 	  if (BE (err != REG_NOERROR, 0))
1792 	    return err;
1793 
1794 	  /* Then, check the limitations in the current sift_context.  */
1795 	  if (sctx->limits.nelem)
1796 	    {
1797 	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1798 					 mctx->bkref_ents, str_idx);
1799 	      if (BE (err != REG_NOERROR, 0))
1800 		return err;
1801 	    }
1802 	}
1803 
1804       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1805       if (BE (err != REG_NOERROR, 0))
1806 	return err;
1807     }
1808 
1809   if (candidates && mctx->state_log[str_idx]->has_backref)
1810     {
1811       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1812       if (BE (err != REG_NOERROR, 0))
1813 	return err;
1814     }
1815   return REG_NOERROR;
1816 }
1817 
1818 static reg_errcode_t
1819 internal_function
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)1820 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1821 		       const re_node_set *candidates)
1822 {
1823   reg_errcode_t err = REG_NOERROR;
1824   int i;
1825 
1826   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1827   if (BE (err != REG_NOERROR, 0))
1828     return err;
1829 
1830   if (!state->inveclosure.alloc)
1831     {
1832       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1833       if (BE (err != REG_NOERROR, 0))
1834         return REG_ESPACE;
1835       for (i = 0; i < dest_nodes->nelem; i++)
1836         re_node_set_merge (&state->inveclosure,
1837 			   dfa->inveclosures + dest_nodes->elems[i]);
1838     }
1839   return re_node_set_add_intersect (dest_nodes, candidates,
1840 				    &state->inveclosure);
1841 }
1842 
1843 static reg_errcode_t
1844 internal_function
sub_epsilon_src_nodes(const re_dfa_t * dfa,int node,re_node_set * dest_nodes,const re_node_set * candidates)1845 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1846 		       const re_node_set *candidates)
1847 {
1848     int ecl_idx;
1849     reg_errcode_t err;
1850     re_node_set *inv_eclosure = dfa->inveclosures + node;
1851     re_node_set except_nodes;
1852     re_node_set_init_empty (&except_nodes);
1853     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1854       {
1855 	int cur_node = inv_eclosure->elems[ecl_idx];
1856 	if (cur_node == node)
1857 	  continue;
1858 	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1859 	  {
1860 	    int edst1 = dfa->edests[cur_node].elems[0];
1861 	    int edst2 = ((dfa->edests[cur_node].nelem > 1)
1862 			 ? dfa->edests[cur_node].elems[1] : -1);
1863 	    if ((!re_node_set_contains (inv_eclosure, edst1)
1864 		 && re_node_set_contains (dest_nodes, edst1))
1865 		|| (edst2 > 0
1866 		    && !re_node_set_contains (inv_eclosure, edst2)
1867 		    && re_node_set_contains (dest_nodes, edst2)))
1868 	      {
1869 		err = re_node_set_add_intersect (&except_nodes, candidates,
1870 						 dfa->inveclosures + cur_node);
1871 		if (BE (err != REG_NOERROR, 0))
1872 		  {
1873 		    re_node_set_free (&except_nodes);
1874 		    return err;
1875 		  }
1876 	      }
1877 	  }
1878       }
1879     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1880       {
1881 	int cur_node = inv_eclosure->elems[ecl_idx];
1882 	if (!re_node_set_contains (&except_nodes, cur_node))
1883 	  {
1884 	    int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1885 	    re_node_set_remove_at (dest_nodes, idx);
1886 	  }
1887       }
1888     re_node_set_free (&except_nodes);
1889     return REG_NOERROR;
1890 }
1891 
1892 static int
1893 internal_function
check_dst_limits(const re_match_context_t * mctx,re_node_set * limits,int dst_node,int dst_idx,int src_node,int src_idx)1894 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1895 		  int dst_node, int dst_idx, int src_node, int src_idx)
1896 {
1897   const re_dfa_t *const dfa = mctx->dfa;
1898   int lim_idx, src_pos, dst_pos;
1899 
1900   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1901   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1902   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1903     {
1904       int subexp_idx;
1905       struct re_backref_cache_entry *ent;
1906       ent = mctx->bkref_ents + limits->elems[lim_idx];
1907       subexp_idx = dfa->nodes[ent->node].opr.idx;
1908 
1909       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1910 					   subexp_idx, dst_node, dst_idx,
1911 					   dst_bkref_idx);
1912       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1913 					   subexp_idx, src_node, src_idx,
1914 					   src_bkref_idx);
1915 
1916       /* In case of:
1917 	 <src> <dst> ( <subexp> )
1918 	 ( <subexp> ) <src> <dst>
1919 	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1920       if (src_pos == dst_pos)
1921 	continue; /* This is unrelated limitation.  */
1922       else
1923 	return 1;
1924     }
1925   return 0;
1926 }
1927 
1928 static int
1929 internal_function
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,int subexp_idx,int from_node,int bkref_idx)1930 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1931 			     int subexp_idx, int from_node, int bkref_idx)
1932 {
1933   const re_dfa_t *const dfa = mctx->dfa;
1934   const re_node_set *eclosures = dfa->eclosures + from_node;
1935   int node_idx;
1936 
1937   /* Else, we are on the boundary: examine the nodes on the epsilon
1938      closure.  */
1939   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1940     {
1941       int node = eclosures->elems[node_idx];
1942       switch (dfa->nodes[node].type)
1943 	{
1944 	case OP_BACK_REF:
1945 	  if (bkref_idx != -1)
1946 	    {
1947 	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1948 	      do
1949 	        {
1950 		  int dst, cpos;
1951 
1952 		  if (ent->node != node)
1953 		    continue;
1954 
1955 		  if (subexp_idx < BITSET_WORD_BITS
1956 		      && !(ent->eps_reachable_subexps_map
1957 			   & ((bitset_word_t) 1 << subexp_idx)))
1958 		    continue;
1959 
1960 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1961 		     OP_CLOSE_SUBEXP cases below.  But, if the
1962 		     destination node is the same node as the source
1963 		     node, don't recurse because it would cause an
1964 		     infinite loop: a regex that exhibits this behavior
1965 		     is ()\1*\1*  */
1966 		  dst = dfa->edests[node].elems[0];
1967 		  if (dst == from_node)
1968 		    {
1969 		      if (boundaries & 1)
1970 		        return -1;
1971 		      else /* if (boundaries & 2) */
1972 		        return 0;
1973 		    }
1974 
1975 		  cpos =
1976 		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1977 						 dst, bkref_idx);
1978 		  if (cpos == -1 /* && (boundaries & 1) */)
1979 		    return -1;
1980 		  if (cpos == 0 && (boundaries & 2))
1981 		    return 0;
1982 
1983 		  if (subexp_idx < BITSET_WORD_BITS)
1984 		    ent->eps_reachable_subexps_map
1985 		      &= ~((bitset_word_t) 1 << subexp_idx);
1986 	        }
1987 	      while (ent++->more);
1988 	    }
1989 	  break;
1990 
1991 	case OP_OPEN_SUBEXP:
1992 	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1993 	    return -1;
1994 	  break;
1995 
1996 	case OP_CLOSE_SUBEXP:
1997 	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1998 	    return 0;
1999 	  break;
2000 
2001 	default:
2002 	    break;
2003 	}
2004     }
2005 
2006   return (boundaries & 2) ? 1 : 0;
2007 }
2008 
2009 static int
2010 internal_function
check_dst_limits_calc_pos(const re_match_context_t * mctx,int limit,int subexp_idx,int from_node,int str_idx,int bkref_idx)2011 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2012 			   int subexp_idx, int from_node, int str_idx,
2013 			   int bkref_idx)
2014 {
2015   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2016   int boundaries;
2017 
2018   /* If we are outside the range of the subexpression, return -1 or 1.  */
2019   if (str_idx < lim->subexp_from)
2020     return -1;
2021 
2022   if (lim->subexp_to < str_idx)
2023     return 1;
2024 
2025   /* If we are within the subexpression, return 0.  */
2026   boundaries = (str_idx == lim->subexp_from);
2027   boundaries |= (str_idx == lim->subexp_to) << 1;
2028   if (boundaries == 0)
2029     return 0;
2030 
2031   /* Else, examine epsilon closure.  */
2032   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2033 				      from_node, bkref_idx);
2034 }
2035 
2036 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2037    which are against limitations from DEST_NODES. */
2038 
2039 static reg_errcode_t
2040 internal_function
check_subexp_limits(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,int str_idx)2041 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2042 		     const re_node_set *candidates, re_node_set *limits,
2043 		     struct re_backref_cache_entry *bkref_ents, int str_idx)
2044 {
2045   reg_errcode_t err;
2046   int node_idx, lim_idx;
2047 
2048   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2049     {
2050       int subexp_idx;
2051       struct re_backref_cache_entry *ent;
2052       ent = bkref_ents + limits->elems[lim_idx];
2053 
2054       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2055 	continue; /* This is unrelated limitation.  */
2056 
2057       subexp_idx = dfa->nodes[ent->node].opr.idx;
2058       if (ent->subexp_to == str_idx)
2059 	{
2060 	  int ops_node = -1;
2061 	  int cls_node = -1;
2062 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2063 	    {
2064 	      int node = dest_nodes->elems[node_idx];
2065 	      re_token_type_t type = dfa->nodes[node].type;
2066 	      if (type == OP_OPEN_SUBEXP
2067 		  && subexp_idx == dfa->nodes[node].opr.idx)
2068 		ops_node = node;
2069 	      else if (type == OP_CLOSE_SUBEXP
2070 		       && subexp_idx == dfa->nodes[node].opr.idx)
2071 		cls_node = node;
2072 	    }
2073 
2074 	  /* Check the limitation of the open subexpression.  */
2075 	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2076 	  if (ops_node >= 0)
2077 	    {
2078 	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2079 					   candidates);
2080 	      if (BE (err != REG_NOERROR, 0))
2081 		return err;
2082 	    }
2083 
2084 	  /* Check the limitation of the close subexpression.  */
2085 	  if (cls_node >= 0)
2086 	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2087 	      {
2088 		int node = dest_nodes->elems[node_idx];
2089 		if (!re_node_set_contains (dfa->inveclosures + node,
2090 					   cls_node)
2091 		    && !re_node_set_contains (dfa->eclosures + node,
2092 					      cls_node))
2093 		  {
2094 		    /* It is against this limitation.
2095 		       Remove it form the current sifted state.  */
2096 		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2097 						 candidates);
2098 		    if (BE (err != REG_NOERROR, 0))
2099 		      return err;
2100 		    --node_idx;
2101 		  }
2102 	      }
2103 	}
2104       else /* (ent->subexp_to != str_idx)  */
2105 	{
2106 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2107 	    {
2108 	      int node = dest_nodes->elems[node_idx];
2109 	      re_token_type_t type = dfa->nodes[node].type;
2110 	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2111 		{
2112 		  if (subexp_idx != dfa->nodes[node].opr.idx)
2113 		    continue;
2114 		  /* It is against this limitation.
2115 		     Remove it form the current sifted state.  */
2116 		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2117 					       candidates);
2118 		  if (BE (err != REG_NOERROR, 0))
2119 		    return err;
2120 		}
2121 	    }
2122 	}
2123     }
2124   return REG_NOERROR;
2125 }
2126 
2127 static reg_errcode_t
2128 internal_function
sift_states_bkref(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,const re_node_set * candidates)2129 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2130 		   int str_idx, const re_node_set *candidates)
2131 {
2132   const re_dfa_t *const dfa = mctx->dfa;
2133   reg_errcode_t err;
2134   int node_idx, node;
2135   re_sift_context_t local_sctx;
2136   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2137 
2138   if (first_idx == -1)
2139     return REG_NOERROR;
2140 
2141   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2142 
2143   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2144     {
2145       int enabled_idx;
2146       re_token_type_t type;
2147       struct re_backref_cache_entry *entry;
2148       node = candidates->elems[node_idx];
2149       type = dfa->nodes[node].type;
2150       /* Avoid infinite loop for the REs like "()\1+".  */
2151       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2152 	continue;
2153       if (type != OP_BACK_REF)
2154 	continue;
2155 
2156       entry = mctx->bkref_ents + first_idx;
2157       enabled_idx = first_idx;
2158       do
2159 	{
2160 	  int subexp_len;
2161 	  int to_idx;
2162 	  int dst_node;
2163 	  int ret;
2164 	  re_dfastate_t *cur_state;
2165 
2166 	  if (entry->node != node)
2167 	    continue;
2168 	  subexp_len = entry->subexp_to - entry->subexp_from;
2169 	  to_idx = str_idx + subexp_len;
2170 	  dst_node = (subexp_len ? dfa->nexts[node]
2171 		      : dfa->edests[node].elems[0]);
2172 
2173 	  if (to_idx > sctx->last_str_idx
2174 	      || sctx->sifted_states[to_idx] == NULL
2175 	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2176 	      || check_dst_limits (mctx, &sctx->limits, node,
2177 				   str_idx, dst_node, to_idx))
2178 	    continue;
2179 
2180 	  if (local_sctx.sifted_states == NULL)
2181 	    {
2182 	      local_sctx = *sctx;
2183 	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2184 	      if (BE (err != REG_NOERROR, 0))
2185 		goto free_return;
2186 	    }
2187 	  local_sctx.last_node = node;
2188 	  local_sctx.last_str_idx = str_idx;
2189 	  ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2190 	  if (BE (ret < 0, 0))
2191 	    {
2192 	      err = REG_ESPACE;
2193 	      goto free_return;
2194 	    }
2195 	  cur_state = local_sctx.sifted_states[str_idx];
2196 	  err = sift_states_backward (mctx, &local_sctx);
2197 	  if (BE (err != REG_NOERROR, 0))
2198 	    goto free_return;
2199 	  if (sctx->limited_states != NULL)
2200 	    {
2201 	      err = merge_state_array (dfa, sctx->limited_states,
2202 				       local_sctx.sifted_states,
2203 				       str_idx + 1);
2204 	      if (BE (err != REG_NOERROR, 0))
2205 		goto free_return;
2206 	    }
2207 	  local_sctx.sifted_states[str_idx] = cur_state;
2208 	  re_node_set_remove (&local_sctx.limits, enabled_idx);
2209 
2210 	  /* mctx->bkref_ents may have changed, reload the pointer.  */
2211           entry = mctx->bkref_ents + enabled_idx;
2212 	}
2213       while (enabled_idx++, entry++->more);
2214     }
2215   err = REG_NOERROR;
2216  free_return:
2217   if (local_sctx.sifted_states != NULL)
2218     {
2219       re_node_set_free (&local_sctx.limits);
2220     }
2221 
2222   return err;
2223 }
2224 
2225 
2226 #ifdef RE_ENABLE_I18N
2227 static int
2228 internal_function
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,int node_idx,int str_idx,int max_str_idx)2229 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2230 		     int node_idx, int str_idx, int max_str_idx)
2231 {
2232   const re_dfa_t *const dfa = mctx->dfa;
2233   int naccepted;
2234   /* Check the node can accept `multi byte'.  */
2235   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2236   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2237       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2238 			    dfa->nexts[node_idx]))
2239     /* The node can't accept the `multi byte', or the
2240        destination was already thrown away, then the node
2241        could't accept the current input `multi byte'.   */
2242     naccepted = 0;
2243   /* Otherwise, it is sure that the node could accept
2244      `naccepted' bytes input.  */
2245   return naccepted;
2246 }
2247 #endif /* RE_ENABLE_I18N */
2248 
2249 
2250 /* Functions for state transition.  */
2251 
2252 /* Return the next state to which the current state STATE will transit by
2253    accepting the current input byte, and update STATE_LOG if necessary.
2254    If STATE can accept a multibyte char/collating element/back reference
2255    update the destination of STATE_LOG.  */
2256 
2257 static re_dfastate_t *
2258 internal_function
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)2259 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2260 	       re_dfastate_t *state)
2261 {
2262   re_dfastate_t **trtable;
2263   unsigned char ch;
2264 
2265 #ifdef RE_ENABLE_I18N
2266   /* If the current state can accept multibyte.  */
2267   if (BE (state->accept_mb, 0))
2268     {
2269       *err = transit_state_mb (mctx, state);
2270       if (BE (*err != REG_NOERROR, 0))
2271 	return NULL;
2272     }
2273 #endif /* RE_ENABLE_I18N */
2274 
2275   /* Then decide the next state with the single byte.  */
2276 #if 0
2277   if (0)
2278     /* don't use transition table  */
2279     return transit_state_sb (err, mctx, state);
2280 #endif
2281 
2282   /* Use transition table  */
2283   ch = re_string_fetch_byte (&mctx->input);
2284   for (;;)
2285     {
2286       trtable = state->trtable;
2287       if (BE (trtable != NULL, 1))
2288 	return trtable[ch];
2289 
2290       trtable = state->word_trtable;
2291       if (BE (trtable != NULL, 1))
2292         {
2293 	  unsigned int context;
2294 	  context
2295 	    = re_string_context_at (&mctx->input,
2296 				    re_string_cur_idx (&mctx->input) - 1,
2297 				    mctx->eflags);
2298 	  if (IS_WORD_CONTEXT (context))
2299 	    return trtable[ch + SBC_MAX];
2300 	  else
2301 	    return trtable[ch];
2302 	}
2303 
2304       if (!build_trtable (mctx->dfa, state))
2305 	{
2306 	  *err = REG_ESPACE;
2307 	  return NULL;
2308 	}
2309 
2310       /* Retry, we now have a transition table.  */
2311     }
2312 }
2313 
2314 /* Update the state_log if we need */
2315 re_dfastate_t *
2316 internal_function
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)2317 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2318 		      re_dfastate_t *next_state)
2319 {
2320   const re_dfa_t *const dfa = mctx->dfa;
2321   int cur_idx = re_string_cur_idx (&mctx->input);
2322 
2323   if (cur_idx > mctx->state_log_top)
2324     {
2325       mctx->state_log[cur_idx] = next_state;
2326       mctx->state_log_top = cur_idx;
2327     }
2328   else if (mctx->state_log[cur_idx] == 0)
2329     {
2330       mctx->state_log[cur_idx] = next_state;
2331     }
2332   else
2333     {
2334       re_dfastate_t *pstate;
2335       unsigned int context;
2336       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2337       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2338          the destination of a multibyte char/collating element/
2339          back reference.  Then the next state is the union set of
2340          these destinations and the results of the transition table.  */
2341       pstate = mctx->state_log[cur_idx];
2342       log_nodes = pstate->entrance_nodes;
2343       if (next_state != NULL)
2344         {
2345           table_nodes = next_state->entrance_nodes;
2346           *err = re_node_set_init_union (&next_nodes, table_nodes,
2347 					     log_nodes);
2348           if (BE (*err != REG_NOERROR, 0))
2349 	    return NULL;
2350         }
2351       else
2352         next_nodes = *log_nodes;
2353       /* Note: We already add the nodes of the initial state,
2354 	 then we don't need to add them here.  */
2355 
2356       context = re_string_context_at (&mctx->input,
2357 				      re_string_cur_idx (&mctx->input) - 1,
2358 				      mctx->eflags);
2359       next_state = mctx->state_log[cur_idx]
2360         = re_acquire_state_context (err, dfa, &next_nodes, context);
2361       /* We don't need to check errors here, since the return value of
2362          this function is next_state and ERR is already set.  */
2363 
2364       if (table_nodes != NULL)
2365         re_node_set_free (&next_nodes);
2366     }
2367 
2368   if (BE (dfa->nbackref, 0) && next_state != NULL)
2369     {
2370       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2371 	 later.  We must check them here, since the back references in the
2372 	 next state might use them.  */
2373       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2374 					cur_idx);
2375       if (BE (*err != REG_NOERROR, 0))
2376 	return NULL;
2377 
2378       /* If the next state has back references.  */
2379       if (next_state->has_backref)
2380 	{
2381 	  *err = transit_state_bkref (mctx, &next_state->nodes);
2382 	  if (BE (*err != REG_NOERROR, 0))
2383 	    return NULL;
2384 	  next_state = mctx->state_log[cur_idx];
2385 	}
2386     }
2387 
2388   return next_state;
2389 }
2390 
2391 /* Skip bytes in the input that correspond to part of a
2392    multi-byte match, then look in the log for a state
2393    from which to restart matching.  */
2394 re_dfastate_t *
2395 internal_function
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2396 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2397 {
2398   re_dfastate_t *cur_state;
2399   do
2400     {
2401       int max = mctx->state_log_top;
2402       int cur_str_idx = re_string_cur_idx (&mctx->input);
2403 
2404       do
2405 	{
2406           if (++cur_str_idx > max)
2407             return NULL;
2408           re_string_skip_bytes (&mctx->input, 1);
2409 	}
2410       while (mctx->state_log[cur_str_idx] == NULL);
2411 
2412       cur_state = merge_state_with_log (err, mctx, NULL);
2413     }
2414   while (*err == REG_NOERROR && cur_state == NULL);
2415   return cur_state;
2416 }
2417 
2418 /* Helper functions for transit_state.  */
2419 
2420 /* From the node set CUR_NODES, pick up the nodes whose types are
2421    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2422    expression. And register them to use them later for evaluating the
2423    correspoding back references.  */
2424 
2425 static reg_errcode_t
2426 internal_function
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,int str_idx)2427 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2428 			   int str_idx)
2429 {
2430   const re_dfa_t *const dfa = mctx->dfa;
2431   int node_idx;
2432   reg_errcode_t err;
2433 
2434   /* TODO: This isn't efficient.
2435 	   Because there might be more than one nodes whose types are
2436 	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2437 	   nodes.
2438 	   E.g. RE: (a){2}  */
2439   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2440     {
2441       int node = cur_nodes->elems[node_idx];
2442       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2443 	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2444 	  && (dfa->used_bkref_map
2445 	      & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2446 	{
2447 	  err = match_ctx_add_subtop (mctx, node, str_idx);
2448 	  if (BE (err != REG_NOERROR, 0))
2449 	    return err;
2450 	}
2451     }
2452   return REG_NOERROR;
2453 }
2454 
2455 #if 0
2456 /* Return the next state to which the current state STATE will transit by
2457    accepting the current input byte.  */
2458 
2459 static re_dfastate_t *
2460 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2461 		  re_dfastate_t *state)
2462 {
2463   const re_dfa_t *const dfa = mctx->dfa;
2464   re_node_set next_nodes;
2465   re_dfastate_t *next_state;
2466   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2467   unsigned int context;
2468 
2469   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2470   if (BE (*err != REG_NOERROR, 0))
2471     return NULL;
2472   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2473     {
2474       int cur_node = state->nodes.elems[node_cnt];
2475       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2476 	{
2477 	  *err = re_node_set_merge (&next_nodes,
2478 				    dfa->eclosures + dfa->nexts[cur_node]);
2479 	  if (BE (*err != REG_NOERROR, 0))
2480 	    {
2481 	      re_node_set_free (&next_nodes);
2482 	      return NULL;
2483 	    }
2484 	}
2485     }
2486   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2487   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2488   /* We don't need to check errors here, since the return value of
2489      this function is next_state and ERR is already set.  */
2490 
2491   re_node_set_free (&next_nodes);
2492   re_string_skip_bytes (&mctx->input, 1);
2493   return next_state;
2494 }
2495 #endif
2496 
2497 #ifdef RE_ENABLE_I18N
2498 static reg_errcode_t
2499 internal_function
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)2500 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2501 {
2502   const re_dfa_t *const dfa = mctx->dfa;
2503   reg_errcode_t err;
2504   int i;
2505 
2506   for (i = 0; i < pstate->nodes.nelem; ++i)
2507     {
2508       re_node_set dest_nodes, *new_nodes;
2509       int cur_node_idx = pstate->nodes.elems[i];
2510       int naccepted, dest_idx;
2511       unsigned int context;
2512       re_dfastate_t *dest_state;
2513 
2514       if (!dfa->nodes[cur_node_idx].accept_mb)
2515         continue;
2516 
2517       if (dfa->nodes[cur_node_idx].constraint)
2518 	{
2519 	  context = re_string_context_at (&mctx->input,
2520 					  re_string_cur_idx (&mctx->input),
2521 					  mctx->eflags);
2522 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2523 					   context))
2524 	    continue;
2525 	}
2526 
2527       /* How many bytes the node can accept?  */
2528       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2529 					   re_string_cur_idx (&mctx->input));
2530       if (naccepted == 0)
2531 	continue;
2532 
2533       /* The node can accepts `naccepted' bytes.  */
2534       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2535       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2536 			       : mctx->max_mb_elem_len);
2537       err = clean_state_log_if_needed (mctx, dest_idx);
2538       if (BE (err != REG_NOERROR, 0))
2539 	return err;
2540 #ifdef DEBUG
2541       assert (dfa->nexts[cur_node_idx] != -1);
2542 #endif
2543       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2544 
2545       dest_state = mctx->state_log[dest_idx];
2546       if (dest_state == NULL)
2547 	dest_nodes = *new_nodes;
2548       else
2549 	{
2550 	  err = re_node_set_init_union (&dest_nodes,
2551 					dest_state->entrance_nodes, new_nodes);
2552 	  if (BE (err != REG_NOERROR, 0))
2553 	    return err;
2554 	}
2555       context = re_string_context_at (&mctx->input, dest_idx - 1,
2556 				      mctx->eflags);
2557       mctx->state_log[dest_idx]
2558 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2559       if (dest_state != NULL)
2560 	re_node_set_free (&dest_nodes);
2561       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2562 	return err;
2563     }
2564   return REG_NOERROR;
2565 }
2566 #endif /* RE_ENABLE_I18N */
2567 
2568 static reg_errcode_t
2569 internal_function
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)2570 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2571 {
2572   const re_dfa_t *const dfa = mctx->dfa;
2573   reg_errcode_t err;
2574   int i;
2575   int cur_str_idx = re_string_cur_idx (&mctx->input);
2576 
2577   for (i = 0; i < nodes->nelem; ++i)
2578     {
2579       int dest_str_idx, prev_nelem, bkc_idx;
2580       int node_idx = nodes->elems[i];
2581       unsigned int context;
2582       const re_token_t *node = dfa->nodes + node_idx;
2583       re_node_set *new_dest_nodes;
2584 
2585       /* Check whether `node' is a backreference or not.  */
2586       if (node->type != OP_BACK_REF)
2587 	continue;
2588 
2589       if (node->constraint)
2590 	{
2591 	  context = re_string_context_at (&mctx->input, cur_str_idx,
2592 					  mctx->eflags);
2593 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2594 	    continue;
2595 	}
2596 
2597       /* `node' is a backreference.
2598 	 Check the substring which the substring matched.  */
2599       bkc_idx = mctx->nbkref_ents;
2600       err = get_subexp (mctx, node_idx, cur_str_idx);
2601       if (BE (err != REG_NOERROR, 0))
2602 	goto free_return;
2603 
2604       /* And add the epsilon closures (which is `new_dest_nodes') of
2605 	 the backreference to appropriate state_log.  */
2606 #ifdef DEBUG
2607       assert (dfa->nexts[node_idx] != -1);
2608 #endif
2609       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2610 	{
2611 	  int subexp_len;
2612 	  re_dfastate_t *dest_state;
2613 	  struct re_backref_cache_entry *bkref_ent;
2614 	  bkref_ent = mctx->bkref_ents + bkc_idx;
2615 	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2616 	    continue;
2617 	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2618 	  new_dest_nodes = (subexp_len == 0
2619 			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2620 			    : dfa->eclosures + dfa->nexts[node_idx]);
2621 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2622 			  - bkref_ent->subexp_from);
2623 	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2624 					  mctx->eflags);
2625 	  dest_state = mctx->state_log[dest_str_idx];
2626 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2627 			: mctx->state_log[cur_str_idx]->nodes.nelem);
2628 	  /* Add `new_dest_node' to state_log.  */
2629 	  if (dest_state == NULL)
2630 	    {
2631 	      mctx->state_log[dest_str_idx]
2632 		= re_acquire_state_context (&err, dfa, new_dest_nodes,
2633 					    context);
2634 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2635 		      && err != REG_NOERROR, 0))
2636 		goto free_return;
2637 	    }
2638 	  else
2639 	    {
2640 	      re_node_set dest_nodes;
2641 	      err = re_node_set_init_union (&dest_nodes,
2642 					    dest_state->entrance_nodes,
2643 					    new_dest_nodes);
2644 	      if (BE (err != REG_NOERROR, 0))
2645 		{
2646 		  re_node_set_free (&dest_nodes);
2647 		  goto free_return;
2648 		}
2649 	      mctx->state_log[dest_str_idx]
2650 		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2651 	      re_node_set_free (&dest_nodes);
2652 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2653 		      && err != REG_NOERROR, 0))
2654 		goto free_return;
2655 	    }
2656 	  /* We need to check recursively if the backreference can epsilon
2657 	     transit.  */
2658 	  if (subexp_len == 0
2659 	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2660 	    {
2661 	      err = check_subexp_matching_top (mctx, new_dest_nodes,
2662 					       cur_str_idx);
2663 	      if (BE (err != REG_NOERROR, 0))
2664 		goto free_return;
2665 	      err = transit_state_bkref (mctx, new_dest_nodes);
2666 	      if (BE (err != REG_NOERROR, 0))
2667 		goto free_return;
2668 	    }
2669 	}
2670     }
2671   err = REG_NOERROR;
2672  free_return:
2673   return err;
2674 }
2675 
2676 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2677    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2678    Note that we might collect inappropriate candidates here.
2679    However, the cost of checking them strictly here is too high, then we
2680    delay these checking for prune_impossible_nodes().  */
2681 
2682 static reg_errcode_t
2683 internal_function
get_subexp(re_match_context_t * mctx,int bkref_node,int bkref_str_idx)2684 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2685 {
2686   const re_dfa_t *const dfa = mctx->dfa;
2687   int subexp_num, sub_top_idx;
2688   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2689   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2690   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2691   if (cache_idx != -1)
2692     {
2693       const struct re_backref_cache_entry *entry
2694 	= mctx->bkref_ents + cache_idx;
2695       do
2696         if (entry->node == bkref_node)
2697 	  return REG_NOERROR; /* We already checked it.  */
2698       while (entry++->more);
2699     }
2700 
2701   subexp_num = dfa->nodes[bkref_node].opr.idx;
2702 
2703   /* For each sub expression  */
2704   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2705     {
2706       reg_errcode_t err;
2707       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2708       re_sub_match_last_t *sub_last;
2709       int sub_last_idx, sl_str, bkref_str_off;
2710 
2711       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2712 	continue; /* It isn't related.  */
2713 
2714       sl_str = sub_top->str_idx;
2715       bkref_str_off = bkref_str_idx;
2716       /* At first, check the last node of sub expressions we already
2717 	 evaluated.  */
2718       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2719 	{
2720 	  int sl_str_diff;
2721 	  sub_last = sub_top->lasts[sub_last_idx];
2722 	  sl_str_diff = sub_last->str_idx - sl_str;
2723 	  /* The matched string by the sub expression match with the substring
2724 	     at the back reference?  */
2725 	  if (sl_str_diff > 0)
2726 	    {
2727 	      if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2728 		{
2729 		  /* Not enough chars for a successful match.  */
2730 		  if (bkref_str_off + sl_str_diff > mctx->input.len)
2731 		    break;
2732 
2733 		  err = clean_state_log_if_needed (mctx,
2734 						   bkref_str_off
2735 						   + sl_str_diff);
2736 		  if (BE (err != REG_NOERROR, 0))
2737 		    return err;
2738 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2739 		}
2740 	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2741 		/* We don't need to search this sub expression any more.  */
2742 		break;
2743 	    }
2744 	  bkref_str_off += sl_str_diff;
2745 	  sl_str += sl_str_diff;
2746 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2747 				bkref_str_idx);
2748 
2749 	  /* Reload buf, since the preceding call might have reallocated
2750 	     the buffer.  */
2751 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2752 
2753 	  if (err == REG_NOMATCH)
2754 	    continue;
2755 	  if (BE (err != REG_NOERROR, 0))
2756 	    return err;
2757 	}
2758 
2759       if (sub_last_idx < sub_top->nlasts)
2760 	continue;
2761       if (sub_last_idx > 0)
2762 	++sl_str;
2763       /* Then, search for the other last nodes of the sub expression.  */
2764       for (; sl_str <= bkref_str_idx; ++sl_str)
2765 	{
2766 	  int cls_node, sl_str_off;
2767 	  const re_node_set *nodes;
2768 	  sl_str_off = sl_str - sub_top->str_idx;
2769 	  /* The matched string by the sub expression match with the substring
2770 	     at the back reference?  */
2771 	  if (sl_str_off > 0)
2772 	    {
2773 	      if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2774 		{
2775 		  /* If we are at the end of the input, we cannot match.  */
2776 		  if (bkref_str_off >= mctx->input.len)
2777 		    break;
2778 
2779 		  err = extend_buffers (mctx);
2780 		  if (BE (err != REG_NOERROR, 0))
2781 		    return err;
2782 
2783 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2784 		}
2785 	      if (buf [bkref_str_off++] != buf[sl_str - 1])
2786 		break; /* We don't need to search this sub expression
2787 			  any more.  */
2788 	    }
2789 	  if (mctx->state_log[sl_str] == NULL)
2790 	    continue;
2791 	  /* Does this state have a ')' of the sub expression?  */
2792 	  nodes = &mctx->state_log[sl_str]->nodes;
2793 	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
2794 				       OP_CLOSE_SUBEXP);
2795 	  if (cls_node == -1)
2796 	    continue; /* No.  */
2797 	  if (sub_top->path == NULL)
2798 	    {
2799 	      sub_top->path = calloc (sizeof (state_array_t),
2800 				      sl_str - sub_top->str_idx + 1);
2801 	      if (sub_top->path == NULL)
2802 		return REG_ESPACE;
2803 	    }
2804 	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2805 	     in the current context?  */
2806 	  err = check_arrival (mctx, sub_top->path, sub_top->node,
2807 			       sub_top->str_idx, cls_node, sl_str,
2808 			       OP_CLOSE_SUBEXP);
2809 	  if (err == REG_NOMATCH)
2810 	      continue;
2811 	  if (BE (err != REG_NOERROR, 0))
2812 	      return err;
2813 	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2814 	  if (BE (sub_last == NULL, 0))
2815 	    return REG_ESPACE;
2816 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2817 				bkref_str_idx);
2818 	  if (err == REG_NOMATCH)
2819 	    continue;
2820 	}
2821     }
2822   return REG_NOERROR;
2823 }
2824 
2825 /* Helper functions for get_subexp().  */
2826 
2827 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2828    If it can arrive, register the sub expression expressed with SUB_TOP
2829    and SUB_LAST.  */
2830 
2831 static reg_errcode_t
2832 internal_function
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,int bkref_node,int bkref_str)2833 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2834 		re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2835 {
2836   reg_errcode_t err;
2837   int to_idx;
2838   /* Can the subexpression arrive the back reference?  */
2839   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2840 		       sub_last->str_idx, bkref_node, bkref_str,
2841 		       OP_OPEN_SUBEXP);
2842   if (err != REG_NOERROR)
2843     return err;
2844   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2845 			     sub_last->str_idx);
2846   if (BE (err != REG_NOERROR, 0))
2847     return err;
2848   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2849   return clean_state_log_if_needed (mctx, to_idx);
2850 }
2851 
2852 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2853    Search '(' if FL_OPEN, or search ')' otherwise.
2854    TODO: This function isn't efficient...
2855 	 Because there might be more than one nodes whose types are
2856 	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2857 	 nodes.
2858 	 E.g. RE: (a){2}  */
2859 
2860 static int
2861 internal_function
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,int subexp_idx,int type)2862 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2863 		  int subexp_idx, int type)
2864 {
2865   int cls_idx;
2866   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2867     {
2868       int cls_node = nodes->elems[cls_idx];
2869       const re_token_t *node = dfa->nodes + cls_node;
2870       if (node->type == type
2871 	  && node->opr.idx == subexp_idx)
2872 	return cls_node;
2873     }
2874   return -1;
2875 }
2876 
2877 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2878    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2879    heavily reused.
2880    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2881 
2882 static reg_errcode_t
2883 internal_function
check_arrival(re_match_context_t * mctx,state_array_t * path,int top_node,int top_str,int last_node,int last_str,int type)2884 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2885 	       int top_str, int last_node, int last_str, int type)
2886 {
2887   const re_dfa_t *const dfa = mctx->dfa;
2888   reg_errcode_t err = REG_NOERROR;
2889   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2890   re_dfastate_t *cur_state = NULL;
2891   re_node_set *cur_nodes, next_nodes;
2892   re_dfastate_t **backup_state_log;
2893   unsigned int context;
2894 
2895   subexp_num = dfa->nodes[top_node].opr.idx;
2896   /* Extend the buffer if we need.  */
2897   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2898     {
2899       re_dfastate_t **new_array;
2900       int old_alloc = path->alloc;
2901       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2902       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2903       if (BE (new_array == NULL, 0))
2904 	{
2905 	  path->alloc = old_alloc;
2906 	  return REG_ESPACE;
2907 	}
2908       path->array = new_array;
2909       memset (new_array + old_alloc, '\0',
2910 	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2911     }
2912 
2913   // Original:
2914   // str_idx = path->next_idx ?: top_str;
2915   // Copied following from another version when cleaning up compiler warnings.
2916   str_idx = path->next_idx ? path->next_idx : top_str;
2917 
2918   /* Temporary modify MCTX.  */
2919   backup_state_log = mctx->state_log;
2920   backup_cur_idx = mctx->input.cur_idx;
2921   mctx->state_log = path->array;
2922   mctx->input.cur_idx = str_idx;
2923 
2924   /* Setup initial node set.  */
2925   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2926   if (str_idx == top_str)
2927     {
2928       err = re_node_set_init_1 (&next_nodes, top_node);
2929       if (BE (err != REG_NOERROR, 0))
2930 	return err;
2931       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2932       if (BE (err != REG_NOERROR, 0))
2933 	{
2934 	  re_node_set_free (&next_nodes);
2935 	  return err;
2936 	}
2937     }
2938   else
2939     {
2940       cur_state = mctx->state_log[str_idx];
2941       if (cur_state && cur_state->has_backref)
2942 	{
2943 	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2944 	  if (BE (err != REG_NOERROR, 0))
2945 	    return err;
2946 	}
2947       else
2948 	re_node_set_init_empty (&next_nodes);
2949     }
2950   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2951     {
2952       if (next_nodes.nelem)
2953 	{
2954 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2955 				    subexp_num, type);
2956 	  if (BE (err != REG_NOERROR, 0))
2957 	    {
2958 	      re_node_set_free (&next_nodes);
2959 	      return err;
2960 	    }
2961 	}
2962       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2963       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2964 	{
2965 	  re_node_set_free (&next_nodes);
2966 	  return err;
2967 	}
2968       mctx->state_log[str_idx] = cur_state;
2969     }
2970 
2971   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2972     {
2973       re_node_set_empty (&next_nodes);
2974       if (mctx->state_log[str_idx + 1])
2975 	{
2976 	  err = re_node_set_merge (&next_nodes,
2977 				   &mctx->state_log[str_idx + 1]->nodes);
2978 	  if (BE (err != REG_NOERROR, 0))
2979 	    {
2980 	      re_node_set_free (&next_nodes);
2981 	      return err;
2982 	    }
2983 	}
2984       if (cur_state)
2985 	{
2986 	  err = check_arrival_add_next_nodes (mctx, str_idx,
2987 					      &cur_state->non_eps_nodes,
2988 					      &next_nodes);
2989 	  if (BE (err != REG_NOERROR, 0))
2990 	    {
2991 	      re_node_set_free (&next_nodes);
2992 	      return err;
2993 	    }
2994 	}
2995       ++str_idx;
2996       if (next_nodes.nelem)
2997 	{
2998 	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2999 	  if (BE (err != REG_NOERROR, 0))
3000 	    {
3001 	      re_node_set_free (&next_nodes);
3002 	      return err;
3003 	    }
3004 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3005 				    subexp_num, type);
3006 	  if (BE (err != REG_NOERROR, 0))
3007 	    {
3008 	      re_node_set_free (&next_nodes);
3009 	      return err;
3010 	    }
3011 	}
3012       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3013       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3014       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3015 	{
3016 	  re_node_set_free (&next_nodes);
3017 	  return err;
3018 	}
3019       mctx->state_log[str_idx] = cur_state;
3020       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3021     }
3022   re_node_set_free (&next_nodes);
3023   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3024 	       : &mctx->state_log[last_str]->nodes);
3025   path->next_idx = str_idx;
3026 
3027   /* Fix MCTX.  */
3028   mctx->state_log = backup_state_log;
3029   mctx->input.cur_idx = backup_cur_idx;
3030 
3031   /* Then check the current node set has the node LAST_NODE.  */
3032   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3033     return REG_NOERROR;
3034 
3035   return REG_NOMATCH;
3036 }
3037 
3038 /* Helper functions for check_arrival.  */
3039 
3040 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3041    to NEXT_NODES.
3042    TODO: This function is similar to the functions transit_state*(),
3043 	 however this function has many additional works.
3044 	 Can't we unify them?  */
3045 
3046 static reg_errcode_t
3047 internal_function
check_arrival_add_next_nodes(re_match_context_t * mctx,int str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)3048 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3049 			      re_node_set *cur_nodes, re_node_set *next_nodes)
3050 {
3051   const re_dfa_t *const dfa = mctx->dfa;
3052   int result;
3053   int cur_idx;
3054 #ifdef RE_ENABLE_I18N
3055   reg_errcode_t err = REG_NOERROR;
3056 #endif
3057   re_node_set union_set;
3058   re_node_set_init_empty (&union_set);
3059   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3060     {
3061       int naccepted = 0;
3062       int cur_node = cur_nodes->elems[cur_idx];
3063 #ifdef DEBUG
3064       re_token_type_t type = dfa->nodes[cur_node].type;
3065       assert (!IS_EPSILON_NODE (type));
3066 #endif
3067 #ifdef RE_ENABLE_I18N
3068       /* If the node may accept `multi byte'.  */
3069       if (dfa->nodes[cur_node].accept_mb)
3070 	{
3071 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3072 					       str_idx);
3073 	  if (naccepted > 1)
3074 	    {
3075 	      re_dfastate_t *dest_state;
3076 	      int next_node = dfa->nexts[cur_node];
3077 	      int next_idx = str_idx + naccepted;
3078 	      dest_state = mctx->state_log[next_idx];
3079 	      re_node_set_empty (&union_set);
3080 	      if (dest_state)
3081 		{
3082 		  err = re_node_set_merge (&union_set, &dest_state->nodes);
3083 		  if (BE (err != REG_NOERROR, 0))
3084 		    {
3085 		      re_node_set_free (&union_set);
3086 		      return err;
3087 		    }
3088 		}
3089 	      result = re_node_set_insert (&union_set, next_node);
3090 	      if (BE (result < 0, 0))
3091 		{
3092 		  re_node_set_free (&union_set);
3093 		  return REG_ESPACE;
3094 		}
3095 	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3096 							    &union_set);
3097 	      if (BE (mctx->state_log[next_idx] == NULL
3098 		      && err != REG_NOERROR, 0))
3099 		{
3100 		  re_node_set_free (&union_set);
3101 		  return err;
3102 		}
3103 	    }
3104 	}
3105 #endif /* RE_ENABLE_I18N */
3106       if (naccepted
3107 	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3108 	{
3109 	  result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3110 	  if (BE (result < 0, 0))
3111 	    {
3112 	      re_node_set_free (&union_set);
3113 	      return REG_ESPACE;
3114 	    }
3115 	}
3116     }
3117   re_node_set_free (&union_set);
3118   return REG_NOERROR;
3119 }
3120 
3121 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3122    CUR_NODES, however exclude the nodes which are:
3123     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3124     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3125 */
3126 
3127 static reg_errcode_t
3128 internal_function
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,int ex_subexp,int type)3129 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3130 			  int ex_subexp, int type)
3131 {
3132   reg_errcode_t err;
3133   int idx, outside_node;
3134   re_node_set new_nodes;
3135 #ifdef DEBUG
3136   assert (cur_nodes->nelem);
3137 #endif
3138   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3139   if (BE (err != REG_NOERROR, 0))
3140     return err;
3141   /* Create a new node set NEW_NODES with the nodes which are epsilon
3142      closures of the node in CUR_NODES.  */
3143 
3144   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3145     {
3146       int cur_node = cur_nodes->elems[idx];
3147       const re_node_set *eclosure = dfa->eclosures + cur_node;
3148       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3149       if (outside_node == -1)
3150 	{
3151 	  /* There are no problematic nodes, just merge them.  */
3152 	  err = re_node_set_merge (&new_nodes, eclosure);
3153 	  if (BE (err != REG_NOERROR, 0))
3154 	    {
3155 	      re_node_set_free (&new_nodes);
3156 	      return err;
3157 	    }
3158 	}
3159       else
3160 	{
3161 	  /* There are problematic nodes, re-calculate incrementally.  */
3162 	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3163 					      ex_subexp, type);
3164 	  if (BE (err != REG_NOERROR, 0))
3165 	    {
3166 	      re_node_set_free (&new_nodes);
3167 	      return err;
3168 	    }
3169 	}
3170     }
3171   re_node_set_free (cur_nodes);
3172   *cur_nodes = new_nodes;
3173   return REG_NOERROR;
3174 }
3175 
3176 /* Helper function for check_arrival_expand_ecl.
3177    Check incrementally the epsilon closure of TARGET, and if it isn't
3178    problematic append it to DST_NODES.  */
3179 
3180 static reg_errcode_t
3181 internal_function
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,int target,int ex_subexp,int type)3182 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3183 			      int target, int ex_subexp, int type)
3184 {
3185   int cur_node;
3186   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3187     {
3188       int err;
3189 
3190       if (dfa->nodes[cur_node].type == type
3191 	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
3192 	{
3193 	  if (type == OP_CLOSE_SUBEXP)
3194 	    {
3195 	      err = re_node_set_insert (dst_nodes, cur_node);
3196 	      if (BE (err == -1, 0))
3197 		return REG_ESPACE;
3198 	    }
3199 	  break;
3200 	}
3201       err = re_node_set_insert (dst_nodes, cur_node);
3202       if (BE (err == -1, 0))
3203 	return REG_ESPACE;
3204       if (dfa->edests[cur_node].nelem == 0)
3205 	break;
3206       if (dfa->edests[cur_node].nelem == 2)
3207 	{
3208 	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3209 					      dfa->edests[cur_node].elems[1],
3210 					      ex_subexp, type);
3211 	  if (BE (err != REG_NOERROR, 0))
3212 	    return err;
3213 	}
3214       cur_node = dfa->edests[cur_node].elems[0];
3215     }
3216   return REG_NOERROR;
3217 }
3218 
3219 
3220 /* For all the back references in the current state, calculate the
3221    destination of the back references by the appropriate entry
3222    in MCTX->BKREF_ENTS.  */
3223 
3224 static reg_errcode_t
3225 internal_function
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,int cur_str,int subexp_num,int type)3226 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3227 		    int cur_str, int subexp_num, int type)
3228 {
3229   const re_dfa_t *const dfa = mctx->dfa;
3230   reg_errcode_t err;
3231   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3232   struct re_backref_cache_entry *ent;
3233 
3234   if (cache_idx_start == -1)
3235     return REG_NOERROR;
3236 
3237  restart:
3238   ent = mctx->bkref_ents + cache_idx_start;
3239   do
3240     {
3241       int to_idx, next_node;
3242 
3243       /* Is this entry ENT is appropriate?  */
3244       if (!re_node_set_contains (cur_nodes, ent->node))
3245 	continue; /* No.  */
3246 
3247       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3248       /* Calculate the destination of the back reference, and append it
3249 	 to MCTX->STATE_LOG.  */
3250       if (to_idx == cur_str)
3251 	{
3252 	  /* The backreference did epsilon transit, we must re-check all the
3253 	     node in the current state.  */
3254 	  re_node_set new_dests;
3255 	  reg_errcode_t err2, err3;
3256 	  next_node = dfa->edests[ent->node].elems[0];
3257 	  if (re_node_set_contains (cur_nodes, next_node))
3258 	    continue;
3259 	  err = re_node_set_init_1 (&new_dests, next_node);
3260 	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3261 	  err3 = re_node_set_merge (cur_nodes, &new_dests);
3262 	  re_node_set_free (&new_dests);
3263 	  if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3264 		  || err3 != REG_NOERROR, 0))
3265 	    {
3266 	      err = (err != REG_NOERROR ? err
3267 		     : (err2 != REG_NOERROR ? err2 : err3));
3268 	      return err;
3269 	    }
3270 	  /* TODO: It is still inefficient...  */
3271 	  goto restart;
3272 	}
3273       else
3274 	{
3275 	  re_node_set union_set;
3276 	  next_node = dfa->nexts[ent->node];
3277 	  if (mctx->state_log[to_idx])
3278 	    {
3279 	      int ret;
3280 	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3281 					next_node))
3282 		continue;
3283 	      err = re_node_set_init_copy (&union_set,
3284 					   &mctx->state_log[to_idx]->nodes);
3285 	      ret = re_node_set_insert (&union_set, next_node);
3286 	      if (BE (err != REG_NOERROR || ret < 0, 0))
3287 		{
3288 		  re_node_set_free (&union_set);
3289 		  err = err != REG_NOERROR ? err : REG_ESPACE;
3290 		  return err;
3291 		}
3292 	    }
3293 	  else
3294 	    {
3295 	      err = re_node_set_init_1 (&union_set, next_node);
3296 	      if (BE (err != REG_NOERROR, 0))
3297 		return err;
3298 	    }
3299 	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3300 	  re_node_set_free (&union_set);
3301 	  if (BE (mctx->state_log[to_idx] == NULL
3302 		  && err != REG_NOERROR, 0))
3303 	    return err;
3304 	}
3305     }
3306   while (ent++->more);
3307   return REG_NOERROR;
3308 }
3309 
3310 /* Build transition table for the state.
3311    Return 1 if succeeded, otherwise return NULL.  */
3312 
3313 static int
3314 internal_function
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)3315 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3316 {
3317   reg_errcode_t err;
3318   int i, j, ch, need_word_trtable = 0;
3319   bitset_word_t elem, mask;
3320   bool dests_node_malloced = false;
3321   bool dest_states_malloced = false;
3322   int ndests; /* Number of the destination states from `state'.  */
3323   re_dfastate_t **trtable;
3324   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3325   re_node_set follows, *dests_node;
3326   bitset_t *dests_ch;
3327   bitset_t acceptable;
3328 
3329   struct dests_alloc
3330   {
3331     re_node_set dests_node[SBC_MAX];
3332     bitset_t dests_ch[SBC_MAX];
3333   } *dests_alloc;
3334 
3335   /* We build DFA states which corresponds to the destination nodes
3336      from `state'.  `dests_node[i]' represents the nodes which i-th
3337      destination state contains, and `dests_ch[i]' represents the
3338      characters which i-th destination state accepts.  */
3339   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3340     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3341   else
3342     {
3343       dests_alloc = re_malloc (struct dests_alloc, 1);
3344       if (BE (dests_alloc == NULL, 0))
3345 	return 0;
3346       dests_node_malloced = true;
3347     }
3348   dests_node = dests_alloc->dests_node;
3349   dests_ch = dests_alloc->dests_ch;
3350 
3351   /* Initialize transiton table.  */
3352   state->word_trtable = state->trtable = NULL;
3353 
3354   /* At first, group all nodes belonging to `state' into several
3355      destinations.  */
3356   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3357   if (BE (ndests <= 0, 0))
3358     {
3359       if (dests_node_malloced)
3360 	free (dests_alloc);
3361       /* Return 0 in case of an error, 1 otherwise.  */
3362       if (ndests == 0)
3363 	{
3364 	  state->trtable = (re_dfastate_t **)
3365 	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
3366 	  return 1;
3367 	}
3368       return 0;
3369     }
3370 
3371   err = re_node_set_alloc (&follows, ndests + 1);
3372   if (BE (err != REG_NOERROR, 0))
3373     goto out_free;
3374 
3375   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3376 			 + ndests * 3 * sizeof (re_dfastate_t *)))
3377     dest_states = (re_dfastate_t **)
3378       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3379   else
3380     {
3381       dest_states = (re_dfastate_t **)
3382 	malloc (ndests * 3 * sizeof (re_dfastate_t *));
3383       if (BE (dest_states == NULL, 0))
3384 	{
3385 out_free:
3386 	  if (dest_states_malloced)
3387 	    free (dest_states);
3388 	  re_node_set_free (&follows);
3389 	  for (i = 0; i < ndests; ++i)
3390 	    re_node_set_free (dests_node + i);
3391 	  if (dests_node_malloced)
3392 	    free (dests_alloc);
3393 	  return 0;
3394 	}
3395       dest_states_malloced = true;
3396     }
3397   dest_states_word = dest_states + ndests;
3398   dest_states_nl = dest_states_word + ndests;
3399   bitset_empty (acceptable);
3400 
3401   /* Then build the states for all destinations.  */
3402   for (i = 0; i < ndests; ++i)
3403     {
3404       int next_node;
3405       re_node_set_empty (&follows);
3406       /* Merge the follows of this destination states.  */
3407       for (j = 0; j < dests_node[i].nelem; ++j)
3408 	{
3409 	  next_node = dfa->nexts[dests_node[i].elems[j]];
3410 	  if (next_node != -1)
3411 	    {
3412 	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3413 	      if (BE (err != REG_NOERROR, 0))
3414 		goto out_free;
3415 	    }
3416 	}
3417       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3418       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3419 	goto out_free;
3420       /* If the new state has context constraint,
3421 	 build appropriate states for these contexts.  */
3422       if (dest_states[i]->has_constraint)
3423 	{
3424 	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3425 							  CONTEXT_WORD);
3426 	  if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3427 	    goto out_free;
3428 
3429 	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3430 	    need_word_trtable = 1;
3431 
3432 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3433 							CONTEXT_NEWLINE);
3434 	  if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3435 	    goto out_free;
3436  	}
3437       else
3438 	{
3439 	  dest_states_word[i] = dest_states[i];
3440 	  dest_states_nl[i] = dest_states[i];
3441 	}
3442       bitset_merge (acceptable, dests_ch[i]);
3443     }
3444 
3445   if (!BE (need_word_trtable, 0))
3446     {
3447       /* We don't care about whether the following character is a word
3448 	 character, or we are in a single-byte character set so we can
3449 	 discern by looking at the character code: allocate a
3450 	 256-entry transition table.  */
3451       trtable = state->trtable =
3452 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3453       if (BE (trtable == NULL, 0))
3454 	goto out_free;
3455 
3456       /* For all characters ch...:  */
3457       for (i = 0; i < BITSET_WORDS; ++i)
3458 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3459 	     elem;
3460 	     mask <<= 1, elem >>= 1, ++ch)
3461 	  if (BE (elem & 1, 0))
3462 	    {
3463 	      /* There must be exactly one destination which accepts
3464 		 character ch.  See group_nodes_into_DFAstates.  */
3465 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3466 		;
3467 
3468 	      /* j-th destination accepts the word character ch.  */
3469 	      if (dfa->word_char[i] & mask)
3470 		trtable[ch] = dest_states_word[j];
3471 	      else
3472 		trtable[ch] = dest_states[j];
3473 	    }
3474     }
3475   else
3476     {
3477       /* We care about whether the following character is a word
3478 	 character, and we are in a multi-byte character set: discern
3479 	 by looking at the character code: build two 256-entry
3480 	 transition tables, one starting at trtable[0] and one
3481 	 starting at trtable[SBC_MAX].  */
3482       trtable = state->word_trtable =
3483 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3484       if (BE (trtable == NULL, 0))
3485 	goto out_free;
3486 
3487       /* For all characters ch...:  */
3488       for (i = 0; i < BITSET_WORDS; ++i)
3489 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3490 	     elem;
3491 	     mask <<= 1, elem >>= 1, ++ch)
3492 	  if (BE (elem & 1, 0))
3493 	    {
3494 	      /* There must be exactly one destination which accepts
3495 		 character ch.  See group_nodes_into_DFAstates.  */
3496 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3497 		;
3498 
3499 	      /* j-th destination accepts the word character ch.  */
3500 	      trtable[ch] = dest_states[j];
3501 	      trtable[ch + SBC_MAX] = dest_states_word[j];
3502 	    }
3503     }
3504 
3505   /* new line */
3506   if (bitset_contain (acceptable, NEWLINE_CHAR))
3507     {
3508       /* The current state accepts newline character.  */
3509       for (j = 0; j < ndests; ++j)
3510 	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3511 	  {
3512 	    /* k-th destination accepts newline character.  */
3513 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
3514 	    if (need_word_trtable)
3515 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3516 	    /* There must be only one destination which accepts
3517 	       newline.  See group_nodes_into_DFAstates.  */
3518 	    break;
3519 	  }
3520     }
3521 
3522   if (dest_states_malloced)
3523     free (dest_states);
3524 
3525   re_node_set_free (&follows);
3526   for (i = 0; i < ndests; ++i)
3527     re_node_set_free (dests_node + i);
3528 
3529   if (dests_node_malloced)
3530     free (dests_alloc);
3531 
3532   return 1;
3533 }
3534 
3535 /* Group all nodes belonging to STATE into several destinations.
3536    Then for all destinations, set the nodes belonging to the destination
3537    to DESTS_NODE[i] and set the characters accepted by the destination
3538    to DEST_CH[i].  This function return the number of destinations.  */
3539 
3540 static int
3541 internal_function
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)3542 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3543 			    re_node_set *dests_node, bitset_t *dests_ch)
3544 {
3545   reg_errcode_t err;
3546   int result;
3547   int i, j, k;
3548   int ndests; /* Number of the destinations from `state'.  */
3549   bitset_t accepts; /* Characters a node can accept.  */
3550   const re_node_set *cur_nodes = &state->nodes;
3551   bitset_empty (accepts);
3552   ndests = 0;
3553 
3554   /* For all the nodes belonging to `state',  */
3555   for (i = 0; i < cur_nodes->nelem; ++i)
3556     {
3557       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3558       re_token_type_t type = node->type;
3559       unsigned int constraint = node->constraint;
3560 
3561       /* Enumerate all single byte character this node can accept.  */
3562       if (type == CHARACTER)
3563 	bitset_set (accepts, node->opr.c);
3564       else if (type == SIMPLE_BRACKET)
3565 	{
3566 	  bitset_merge (accepts, node->opr.sbcset);
3567 	}
3568       else if (type == OP_PERIOD)
3569 	{
3570 #ifdef RE_ENABLE_I18N
3571 	  if (dfa->mb_cur_max > 1)
3572 	    bitset_merge (accepts, dfa->sb_char);
3573 	  else
3574 #endif
3575 	    bitset_set_all (accepts);
3576 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3577 	    bitset_clear (accepts, '\n');
3578 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3579 	    bitset_clear (accepts, '\0');
3580 	}
3581 #ifdef RE_ENABLE_I18N
3582       else if (type == OP_UTF8_PERIOD)
3583         {
3584 	  memset (accepts, '\xff', sizeof (bitset_t) / 2);
3585 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3586 	    bitset_clear (accepts, '\n');
3587 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3588 	    bitset_clear (accepts, '\0');
3589         }
3590 #endif
3591       else
3592 	continue;
3593 
3594       /* Check the `accepts' and sift the characters which are not
3595 	 match it the context.  */
3596       if (constraint)
3597 	{
3598 	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
3599 	    {
3600 	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3601 	      bitset_empty (accepts);
3602 	      if (accepts_newline)
3603 		bitset_set (accepts, NEWLINE_CHAR);
3604 	      else
3605 		continue;
3606 	    }
3607 	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
3608 	    {
3609 	      bitset_empty (accepts);
3610 	      continue;
3611 	    }
3612 
3613 	  if (constraint & NEXT_WORD_CONSTRAINT)
3614 	    {
3615 	      bitset_word_t any_set = 0;
3616 	      if (type == CHARACTER && !node->word_char)
3617 		{
3618 		  bitset_empty (accepts);
3619 		  continue;
3620 		}
3621 #ifdef RE_ENABLE_I18N
3622 	      if (dfa->mb_cur_max > 1)
3623 		for (j = 0; j < BITSET_WORDS; ++j)
3624 		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3625 	      else
3626 #endif
3627 		for (j = 0; j < BITSET_WORDS; ++j)
3628 		  any_set |= (accepts[j] &= dfa->word_char[j]);
3629 	      if (!any_set)
3630 		continue;
3631 	    }
3632 	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
3633 	    {
3634 	      bitset_word_t any_set = 0;
3635 	      if (type == CHARACTER && node->word_char)
3636 		{
3637 		  bitset_empty (accepts);
3638 		  continue;
3639 		}
3640 #ifdef RE_ENABLE_I18N
3641 	      if (dfa->mb_cur_max > 1)
3642 		for (j = 0; j < BITSET_WORDS; ++j)
3643 		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3644 	      else
3645 #endif
3646 		for (j = 0; j < BITSET_WORDS; ++j)
3647 		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3648 	      if (!any_set)
3649 		continue;
3650 	    }
3651 	}
3652 
3653       /* Then divide `accepts' into DFA states, or create a new
3654 	 state.  Above, we make sure that accepts is not empty.  */
3655       for (j = 0; j < ndests; ++j)
3656 	{
3657 	  bitset_t intersec; /* Intersection sets, see below.  */
3658 	  bitset_t remains;
3659 	  /* Flags, see below.  */
3660 	  bitset_word_t has_intersec, not_subset, not_consumed;
3661 
3662 	  /* Optimization, skip if this state doesn't accept the character.  */
3663 	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3664 	    continue;
3665 
3666 	  /* Enumerate the intersection set of this state and `accepts'.  */
3667 	  has_intersec = 0;
3668 	  for (k = 0; k < BITSET_WORDS; ++k)
3669 	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3670 	  /* And skip if the intersection set is empty.  */
3671 	  if (!has_intersec)
3672 	    continue;
3673 
3674 	  /* Then check if this state is a subset of `accepts'.  */
3675 	  not_subset = not_consumed = 0;
3676 	  for (k = 0; k < BITSET_WORDS; ++k)
3677 	    {
3678 	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3679 	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3680 	    }
3681 
3682 	  /* If this state isn't a subset of `accepts', create a
3683 	     new group state, which has the `remains'. */
3684 	  if (not_subset)
3685 	    {
3686 	      bitset_copy (dests_ch[ndests], remains);
3687 	      bitset_copy (dests_ch[j], intersec);
3688 	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3689 	      if (BE (err != REG_NOERROR, 0))
3690 		goto error_return;
3691 	      ++ndests;
3692 	    }
3693 
3694 	  /* Put the position in the current group. */
3695 	  result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3696 	  if (BE (result < 0, 0))
3697 	    goto error_return;
3698 
3699 	  /* If all characters are consumed, go to next node. */
3700 	  if (!not_consumed)
3701 	    break;
3702 	}
3703       /* Some characters remain, create a new group. */
3704       if (j == ndests)
3705 	{
3706 	  bitset_copy (dests_ch[ndests], accepts);
3707 	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3708 	  if (BE (err != REG_NOERROR, 0))
3709 	    goto error_return;
3710 	  ++ndests;
3711 	  bitset_empty (accepts);
3712 	}
3713     }
3714   return ndests;
3715  error_return:
3716   for (j = 0; j < ndests; ++j)
3717     re_node_set_free (dests_node + j);
3718   return -1;
3719 }
3720 
3721 #ifdef RE_ENABLE_I18N
3722 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3723    Return the number of the bytes the node accepts.
3724    STR_IDX is the current index of the input string.
3725 
3726    This function handles the nodes which can accept one character, or
3727    one collating element like '.', '[a-z]', opposite to the other nodes
3728    can only accept one byte.  */
3729 
3730 static int
3731 internal_function
check_node_accept_bytes(const re_dfa_t * dfa,int node_idx,const re_string_t * input,int str_idx)3732 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3733 			 const re_string_t *input, int str_idx)
3734 {
3735   const re_token_t *node = dfa->nodes + node_idx;
3736   int char_len, elem_len;
3737   int i;
3738 
3739   if (BE (node->type == OP_UTF8_PERIOD, 0))
3740     {
3741       unsigned char c = re_string_byte_at (input, str_idx), d;
3742       if (BE (c < 0xc2, 1))
3743 	return 0;
3744 
3745       if (str_idx + 2 > input->len)
3746 	return 0;
3747 
3748       d = re_string_byte_at (input, str_idx + 1);
3749       if (c < 0xe0)
3750 	return (d < 0x80 || d > 0xbf) ? 0 : 2;
3751       else if (c < 0xf0)
3752 	{
3753 	  char_len = 3;
3754 	  if (c == 0xe0 && d < 0xa0)
3755 	    return 0;
3756 	}
3757       else if (c < 0xf8)
3758 	{
3759 	  char_len = 4;
3760 	  if (c == 0xf0 && d < 0x90)
3761 	    return 0;
3762 	}
3763       else if (c < 0xfc)
3764 	{
3765 	  char_len = 5;
3766 	  if (c == 0xf8 && d < 0x88)
3767 	    return 0;
3768 	}
3769       else if (c < 0xfe)
3770 	{
3771 	  char_len = 6;
3772 	  if (c == 0xfc && d < 0x84)
3773 	    return 0;
3774 	}
3775       else
3776 	return 0;
3777 
3778       if (str_idx + char_len > input->len)
3779 	return 0;
3780 
3781       for (i = 1; i < char_len; ++i)
3782 	{
3783 	  d = re_string_byte_at (input, str_idx + i);
3784 	  if (d < 0x80 || d > 0xbf)
3785 	    return 0;
3786 	}
3787       return char_len;
3788     }
3789 
3790   char_len = re_string_char_size_at (input, str_idx);
3791   if (node->type == OP_PERIOD)
3792     {
3793       if (char_len <= 1)
3794         return 0;
3795       /* FIXME: I don't think this if is needed, as both '\n'
3796 	 and '\0' are char_len == 1.  */
3797       /* '.' accepts any one character except the following two cases.  */
3798       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3799 	   re_string_byte_at (input, str_idx) == '\n') ||
3800 	  ((dfa->syntax & RE_DOT_NOT_NULL) &&
3801 	   re_string_byte_at (input, str_idx) == '\0'))
3802 	return 0;
3803       return char_len;
3804     }
3805 
3806   elem_len = re_string_elem_size_at (input, str_idx);
3807   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3808     return 0;
3809 
3810   if (node->type == COMPLEX_BRACKET)
3811     {
3812       const re_charset_t *cset = node->opr.mbcset;
3813 # ifdef _LIBC
3814       const unsigned char *pin
3815 	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3816       int j;
3817       uint32_t nrules;
3818 # endif /* _LIBC */
3819       int match_len = 0;
3820       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3821 		    ? re_string_wchar_at (input, str_idx) : 0);
3822 
3823       /* match with multibyte character?  */
3824       for (i = 0; i < cset->nmbchars; ++i)
3825 	if (wc == cset->mbchars[i])
3826 	  {
3827 	    match_len = char_len;
3828 	    goto check_node_accept_bytes_match;
3829 	  }
3830       /* match with character_class?  */
3831       for (i = 0; i < cset->nchar_classes; ++i)
3832 	{
3833 	  wctype_t wt = cset->char_classes[i];
3834 	  if (__iswctype (wc, wt))
3835 	    {
3836 	      match_len = char_len;
3837 	      goto check_node_accept_bytes_match;
3838 	    }
3839 	}
3840 
3841 # ifdef _LIBC
3842       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3843       if (nrules != 0)
3844 	{
3845 	  unsigned int in_collseq = 0;
3846 	  const int32_t *table, *indirect;
3847 	  const unsigned char *weights, *extra;
3848 	  const char *collseqwc;
3849 	  /* This #include defines a local function!  */
3850 #  include <locale/weight.h>
3851 
3852 	  /* match with collating_symbol?  */
3853 	  if (cset->ncoll_syms)
3854 	    extra = (const unsigned char *)
3855 	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3856 	  for (i = 0; i < cset->ncoll_syms; ++i)
3857 	    {
3858 	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
3859 	      /* Compare the length of input collating element and
3860 		 the length of current collating element.  */
3861 	      if (*coll_sym != elem_len)
3862 		continue;
3863 	      /* Compare each bytes.  */
3864 	      for (j = 0; j < *coll_sym; j++)
3865 		if (pin[j] != coll_sym[1 + j])
3866 		  break;
3867 	      if (j == *coll_sym)
3868 		{
3869 		  /* Match if every bytes is equal.  */
3870 		  match_len = j;
3871 		  goto check_node_accept_bytes_match;
3872 		}
3873 	    }
3874 
3875 	  if (cset->nranges)
3876 	    {
3877 	      if (elem_len <= char_len)
3878 		{
3879 		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3880 		  in_collseq = __collseq_table_lookup (collseqwc, wc);
3881 		}
3882 	      else
3883 		in_collseq = find_collation_sequence_value (pin, elem_len);
3884 	    }
3885 	  /* match with range expression?  */
3886 	  for (i = 0; i < cset->nranges; ++i)
3887 	    if (cset->range_starts[i] <= in_collseq
3888 		&& in_collseq <= cset->range_ends[i])
3889 	      {
3890 		match_len = elem_len;
3891 		goto check_node_accept_bytes_match;
3892 	      }
3893 
3894 	  /* match with equivalence_class?  */
3895 	  if (cset->nequiv_classes)
3896 	    {
3897 	      const unsigned char *cp = pin;
3898 	      table = (const int32_t *)
3899 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3900 	      weights = (const unsigned char *)
3901 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3902 	      extra = (const unsigned char *)
3903 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3904 	      indirect = (const int32_t *)
3905 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3906 	      int32_t idx = findidx (&cp);
3907 	      if (idx > 0)
3908 		for (i = 0; i < cset->nequiv_classes; ++i)
3909 		  {
3910 		    int32_t equiv_class_idx = cset->equiv_classes[i];
3911 		    size_t weight_len = weights[idx & 0xffffff];
3912 		    if (weight_len == weights[equiv_class_idx & 0xffffff]
3913 			&& (idx >> 24) == (equiv_class_idx >> 24))
3914 		      {
3915 			int cnt = 0;
3916 
3917 			idx &= 0xffffff;
3918 			equiv_class_idx &= 0xffffff;
3919 
3920 			while (cnt <= weight_len
3921 			       && (weights[equiv_class_idx + 1 + cnt]
3922 				   == weights[idx + 1 + cnt]))
3923 			  ++cnt;
3924 			if (cnt > weight_len)
3925 			  {
3926 			    match_len = elem_len;
3927 			    goto check_node_accept_bytes_match;
3928 			  }
3929 		      }
3930 		  }
3931 	    }
3932 	}
3933       else
3934 # endif /* _LIBC */
3935 	{
3936 	  /* match with range expression?  */
3937 #if __GNUC__ >= 2
3938 	  wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3939 #else
3940 	  wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3941 	  cmp_buf[2] = wc;
3942 #endif
3943 	  for (i = 0; i < cset->nranges; ++i)
3944 	    {
3945 	      cmp_buf[0] = cset->range_starts[i];
3946 	      cmp_buf[4] = cset->range_ends[i];
3947 	      if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3948 		  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3949 		{
3950 		  match_len = char_len;
3951 		  goto check_node_accept_bytes_match;
3952 		}
3953 	    }
3954 	}
3955     check_node_accept_bytes_match:
3956       if (!cset->non_match)
3957 	return match_len;
3958       else
3959 	{
3960 	  if (match_len > 0)
3961 	    return 0;
3962 	  else
3963 	    return (elem_len > char_len) ? elem_len : char_len;
3964 	}
3965     }
3966   return 0;
3967 }
3968 
3969 # ifdef _LIBC
3970 static unsigned int
3971 internal_function
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)3972 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3973 {
3974   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3975   if (nrules == 0)
3976     {
3977       if (mbs_len == 1)
3978 	{
3979 	  /* No valid character.  Match it as a single byte character.  */
3980 	  const unsigned char *collseq = (const unsigned char *)
3981 	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3982 	  return collseq[mbs[0]];
3983 	}
3984       return UINT_MAX;
3985     }
3986   else
3987     {
3988       int32_t idx;
3989       const unsigned char *extra = (const unsigned char *)
3990 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3991       int32_t extrasize = (const unsigned char *)
3992 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3993 
3994       for (idx = 0; idx < extrasize;)
3995 	{
3996 	  int mbs_cnt, found = 0;
3997 	  int32_t elem_mbs_len;
3998 	  /* Skip the name of collating element name.  */
3999 	  idx = idx + extra[idx] + 1;
4000 	  elem_mbs_len = extra[idx++];
4001 	  if (mbs_len == elem_mbs_len)
4002 	    {
4003 	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4004 		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4005 		  break;
4006 	      if (mbs_cnt == elem_mbs_len)
4007 		/* Found the entry.  */
4008 		found = 1;
4009 	    }
4010 	  /* Skip the byte sequence of the collating element.  */
4011 	  idx += elem_mbs_len;
4012 	  /* Adjust for the alignment.  */
4013 	  idx = (idx + 3) & ~3;
4014 	  /* Skip the collation sequence value.  */
4015 	  idx += sizeof (uint32_t);
4016 	  /* Skip the wide char sequence of the collating element.  */
4017 	  idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4018 	  /* If we found the entry, return the sequence value.  */
4019 	  if (found)
4020 	    return *(uint32_t *) (extra + idx);
4021 	  /* Skip the collation sequence value.  */
4022 	  idx += sizeof (uint32_t);
4023 	}
4024       return UINT_MAX;
4025     }
4026 }
4027 # endif /* _LIBC */
4028 #endif /* RE_ENABLE_I18N */
4029 
4030 /* Check whether the node accepts the byte which is IDX-th
4031    byte of the INPUT.  */
4032 
4033 static int
4034 internal_function
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,int idx)4035 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4036 		   int idx)
4037 {
4038   unsigned char ch;
4039   ch = re_string_byte_at (&mctx->input, idx);
4040   switch (node->type)
4041     {
4042     case CHARACTER:
4043       if (node->opr.c != ch)
4044         return 0;
4045       break;
4046 
4047     case SIMPLE_BRACKET:
4048       if (!bitset_contain (node->opr.sbcset, ch))
4049         return 0;
4050       break;
4051 
4052 #ifdef RE_ENABLE_I18N
4053     case OP_UTF8_PERIOD:
4054       if (ch >= 0x80)
4055         return 0;
4056       /* FALLTHROUGH */
4057 #endif
4058     case OP_PERIOD:
4059       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4060 	  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4061 	return 0;
4062       break;
4063 
4064     default:
4065       return 0;
4066     }
4067 
4068   if (node->constraint)
4069     {
4070       /* The node has constraints.  Check whether the current context
4071 	 satisfies the constraints.  */
4072       unsigned int context = re_string_context_at (&mctx->input, idx,
4073 						   mctx->eflags);
4074       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4075 	return 0;
4076     }
4077 
4078   return 1;
4079 }
4080 
4081 /* Extend the buffers, if the buffers have run out.  */
4082 
4083 static reg_errcode_t
4084 internal_function
extend_buffers(re_match_context_t * mctx)4085 extend_buffers (re_match_context_t *mctx)
4086 {
4087   reg_errcode_t ret;
4088   re_string_t *pstr = &mctx->input;
4089 
4090   /* Double the lengthes of the buffers.  */
4091   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4092   if (BE (ret != REG_NOERROR, 0))
4093     return ret;
4094 
4095   if (mctx->state_log != NULL)
4096     {
4097       /* And double the length of state_log.  */
4098       /* XXX We have no indication of the size of this buffer.  If this
4099 	 allocation fail we have no indication that the state_log array
4100 	 does not have the right size.  */
4101       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4102 					      pstr->bufs_len + 1);
4103       if (BE (new_array == NULL, 0))
4104 	return REG_ESPACE;
4105       mctx->state_log = new_array;
4106     }
4107 
4108   /* Then reconstruct the buffers.  */
4109   if (pstr->icase)
4110     {
4111 #ifdef RE_ENABLE_I18N
4112       if (pstr->mb_cur_max > 1)
4113 	{
4114 	  ret = build_wcs_upper_buffer (pstr);
4115 	  if (BE (ret != REG_NOERROR, 0))
4116 	    return ret;
4117 	}
4118       else
4119 #endif /* RE_ENABLE_I18N  */
4120 	build_upper_buffer (pstr);
4121     }
4122   else
4123     {
4124 #ifdef RE_ENABLE_I18N
4125       if (pstr->mb_cur_max > 1)
4126 	build_wcs_buffer (pstr);
4127       else
4128 #endif /* RE_ENABLE_I18N  */
4129 	{
4130 	  if (pstr->trans != NULL)
4131 	    re_string_translate_buffer (pstr);
4132 	}
4133     }
4134   return REG_NOERROR;
4135 }
4136 
4137 
4138 /* Functions for matching context.  */
4139 
4140 /* Initialize MCTX.  */
4141 
4142 static reg_errcode_t
4143 internal_function
match_ctx_init(re_match_context_t * mctx,int eflags,int n)4144 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4145 {
4146   mctx->eflags = eflags;
4147   mctx->match_last = -1;
4148   if (n > 0)
4149     {
4150       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4151       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4152       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4153 	return REG_ESPACE;
4154     }
4155   /* Already zero-ed by the caller.
4156      else
4157        mctx->bkref_ents = NULL;
4158      mctx->nbkref_ents = 0;
4159      mctx->nsub_tops = 0;  */
4160   mctx->abkref_ents = n;
4161   mctx->max_mb_elem_len = 1;
4162   mctx->asub_tops = n;
4163   return REG_NOERROR;
4164 }
4165 
4166 /* Clean the entries which depend on the current input in MCTX.
4167    This function must be invoked when the matcher changes the start index
4168    of the input, or changes the input string.  */
4169 
4170 static void
4171 internal_function
match_ctx_clean(re_match_context_t * mctx)4172 match_ctx_clean (re_match_context_t *mctx)
4173 {
4174   int st_idx;
4175   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4176     {
4177       int sl_idx;
4178       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4179       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4180 	{
4181 	  re_sub_match_last_t *last = top->lasts[sl_idx];
4182 	  re_free (last->path.array);
4183 	  re_free (last);
4184 	}
4185       re_free (top->lasts);
4186       if (top->path)
4187 	{
4188 	  re_free (top->path->array);
4189 	  re_free (top->path);
4190 	}
4191       free (top);
4192     }
4193 
4194   mctx->nsub_tops = 0;
4195   mctx->nbkref_ents = 0;
4196 }
4197 
4198 /* Free all the memory associated with MCTX.  */
4199 
4200 static void
4201 internal_function
match_ctx_free(re_match_context_t * mctx)4202 match_ctx_free (re_match_context_t *mctx)
4203 {
4204   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4205   match_ctx_clean (mctx);
4206   re_free (mctx->sub_tops);
4207   re_free (mctx->bkref_ents);
4208 }
4209 
4210 /* Add a new backreference entry to MCTX.
4211    Note that we assume that caller never call this function with duplicate
4212    entry, and call with STR_IDX which isn't smaller than any existing entry.
4213 */
4214 
4215 static reg_errcode_t
4216 internal_function
match_ctx_add_entry(re_match_context_t * mctx,int node,int str_idx,int from,int to)4217 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4218 		     int to)
4219 {
4220   if (mctx->nbkref_ents >= mctx->abkref_ents)
4221     {
4222       struct re_backref_cache_entry* new_entry;
4223       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4224 			      mctx->abkref_ents * 2);
4225       if (BE (new_entry == NULL, 0))
4226 	{
4227 	  re_free (mctx->bkref_ents);
4228 	  return REG_ESPACE;
4229 	}
4230       mctx->bkref_ents = new_entry;
4231       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4232 	      sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4233       mctx->abkref_ents *= 2;
4234     }
4235   if (mctx->nbkref_ents > 0
4236       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4237     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4238 
4239   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4240   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4241   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4242   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4243 
4244   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4245      If bit N is clear, means that this entry won't epsilon-transition to
4246      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4247      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4248      such node.
4249 
4250      A backreference does not epsilon-transition unless it is empty, so set
4251      to all zeros if FROM != TO.  */
4252   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4253     = (from == to ? ~0 : 0);
4254 
4255   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4256   if (mctx->max_mb_elem_len < to - from)
4257     mctx->max_mb_elem_len = to - from;
4258   return REG_NOERROR;
4259 }
4260 
4261 /* Search for the first entry which has the same str_idx, or -1 if none is
4262    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4263 
4264 static int
4265 internal_function
search_cur_bkref_entry(const re_match_context_t * mctx,int str_idx)4266 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4267 {
4268   int left, right, mid, last;
4269   last = right = mctx->nbkref_ents;
4270   for (left = 0; left < right;)
4271     {
4272       mid = (left + right) / 2;
4273       if (mctx->bkref_ents[mid].str_idx < str_idx)
4274 	left = mid + 1;
4275       else
4276 	right = mid;
4277     }
4278   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4279     return left;
4280   else
4281     return -1;
4282 }
4283 
4284 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4285    at STR_IDX.  */
4286 
4287 static reg_errcode_t
4288 internal_function
match_ctx_add_subtop(re_match_context_t * mctx,int node,int str_idx)4289 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4290 {
4291 #ifdef DEBUG
4292   assert (mctx->sub_tops != NULL);
4293   assert (mctx->asub_tops > 0);
4294 #endif
4295   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4296     {
4297       int new_asub_tops = mctx->asub_tops * 2;
4298       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4299 						   re_sub_match_top_t *,
4300 						   new_asub_tops);
4301       if (BE (new_array == NULL, 0))
4302 	return REG_ESPACE;
4303       mctx->sub_tops = new_array;
4304       mctx->asub_tops = new_asub_tops;
4305     }
4306   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4307   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4308     return REG_ESPACE;
4309   mctx->sub_tops[mctx->nsub_tops]->node = node;
4310   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4311   return REG_NOERROR;
4312 }
4313 
4314 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4315    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4316 
4317 static re_sub_match_last_t *
4318 internal_function
match_ctx_add_sublast(re_sub_match_top_t * subtop,int node,int str_idx)4319 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4320 {
4321   re_sub_match_last_t *new_entry;
4322   if (BE (subtop->nlasts == subtop->alasts, 0))
4323     {
4324       int new_alasts = 2 * subtop->alasts + 1;
4325       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4326 						    re_sub_match_last_t *,
4327 						    new_alasts);
4328       if (BE (new_array == NULL, 0))
4329 	return NULL;
4330       subtop->lasts = new_array;
4331       subtop->alasts = new_alasts;
4332     }
4333   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4334   if (BE (new_entry != NULL, 1))
4335     {
4336       subtop->lasts[subtop->nlasts] = new_entry;
4337       new_entry->node = node;
4338       new_entry->str_idx = str_idx;
4339       ++subtop->nlasts;
4340     }
4341   return new_entry;
4342 }
4343 
4344 static void
4345 internal_function
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,int last_node,int last_str_idx)4346 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4347 	       re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4348 {
4349   sctx->sifted_states = sifted_sts;
4350   sctx->limited_states = limited_sts;
4351   sctx->last_node = last_node;
4352   sctx->last_str_idx = last_str_idx;
4353   re_node_set_init_empty (&sctx->limits);
4354 }
4355