1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 				     int n) internal_function;
22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23 static void match_ctx_free (re_match_context_t *cache) internal_function;
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
25 					  int str_idx, int from, int to)
26      internal_function;
27 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
28      internal_function;
29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
30 					   int str_idx) internal_function;
31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 						   int node, int str_idx)
33      internal_function;
34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 			   re_dfastate_t **limited_sts, int last_node,
36 			   int last_str_idx)
37      internal_function;
38 static reg_errcode_t re_search_internal (const regex_t *preg,
39 					 const char *string, int length,
40 					 int start, int range, int stop,
41 					 size_t nmatch, regmatch_t pmatch[],
42 					 int eflags);
43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
44 			     const char *string1, int length1,
45 			     const char *string2, int length2,
46 			     int start, int range, struct re_registers *regs,
47 			     int stop, int ret_len);
48 static int re_search_stub (struct re_pattern_buffer *bufp,
49 			   const char *string, int length, int start,
50 			   int range, int stop, struct re_registers *regs,
51 			   int ret_len);
52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
53 			      int nregs, int regs_allocated);
54 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
55 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
56 			   int *p_match_first) internal_function;
57 static int check_halt_state_context (const re_match_context_t *mctx,
58 				     const re_dfastate_t *state, int idx)
59      internal_function;
60 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
61 			 regmatch_t *prev_idx_match, int cur_node,
62 			 int cur_idx, int nmatch) internal_function;
63 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
64 				      int str_idx, int dest_node, int nregs,
65 				      regmatch_t *regs,
66 				      re_node_set *eps_via_nodes)
67      internal_function;
68 static reg_errcode_t set_regs (const regex_t *preg,
69 			       const re_match_context_t *mctx,
70 			       size_t nmatch, regmatch_t *pmatch,
71 			       int fl_backtrack) internal_function;
72 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
73      internal_function;
74 
75 #ifdef RE_ENABLE_I18N
76 static int sift_states_iter_mb (const re_match_context_t *mctx,
77 				re_sift_context_t *sctx,
78 				int node_idx, int str_idx, int max_str_idx)
79      internal_function;
80 #endif /* RE_ENABLE_I18N */
81 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
82 					   re_sift_context_t *sctx)
83      internal_function;
84 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
85 					  re_sift_context_t *sctx, int str_idx,
86 					  re_node_set *cur_dest)
87      internal_function;
88 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
89 					      re_sift_context_t *sctx,
90 					      int str_idx,
91 					      re_node_set *dest_nodes)
92      internal_function;
93 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
94 					    re_node_set *dest_nodes,
95 					    const re_node_set *candidates)
96      internal_function;
97 static int check_dst_limits (const re_match_context_t *mctx,
98 			     re_node_set *limits,
99 			     int dst_node, int dst_idx, int src_node,
100 			     int src_idx) internal_function;
101 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
102 					int boundaries, int subexp_idx,
103 					int from_node, int bkref_idx)
104      internal_function;
105 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
106 				      int limit, int subexp_idx,
107 				      int node, int str_idx,
108 				      int bkref_idx) internal_function;
109 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
110 					  re_node_set *dest_nodes,
111 					  const re_node_set *candidates,
112 					  re_node_set *limits,
113 					  struct re_backref_cache_entry *bkref_ents,
114 					  int str_idx) internal_function;
115 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
116 					re_sift_context_t *sctx,
117 					int str_idx, const re_node_set *candidates)
118      internal_function;
119 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
120 					re_dfastate_t **dst,
121 					re_dfastate_t **src, int num)
122      internal_function;
123 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
124 					 re_match_context_t *mctx) internal_function;
125 static re_dfastate_t *transit_state (reg_errcode_t *err,
126 				     re_match_context_t *mctx,
127 				     re_dfastate_t *state) internal_function;
128 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
129 					    re_match_context_t *mctx,
130 					    re_dfastate_t *next_state)
131      internal_function;
132 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
133 						re_node_set *cur_nodes,
134 						int str_idx) internal_function;
135 #if 0
136 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
137 					re_match_context_t *mctx,
138 					re_dfastate_t *pstate)
139      internal_function;
140 #endif
141 #ifdef RE_ENABLE_I18N
142 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
143 				       re_dfastate_t *pstate)
144      internal_function;
145 #endif /* RE_ENABLE_I18N */
146 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
147 					  const re_node_set *nodes)
148      internal_function;
149 static reg_errcode_t get_subexp (re_match_context_t *mctx,
150 				 int bkref_node, int bkref_str_idx)
151      internal_function;
152 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
153 				     const re_sub_match_top_t *sub_top,
154 				     re_sub_match_last_t *sub_last,
155 				     int bkref_node, int bkref_str)
156      internal_function;
157 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
158 			     int subexp_idx, int type) internal_function;
159 static reg_errcode_t check_arrival (re_match_context_t *mctx,
160 				    state_array_t *path, int top_node,
161 				    int top_str, int last_node, int last_str,
162 				    int type) internal_function;
163 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
164 						   int str_idx,
165 						   re_node_set *cur_nodes,
166 						   re_node_set *next_nodes)
167      internal_function;
168 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
169 					       re_node_set *cur_nodes,
170 					       int ex_subexp, int type)
171      internal_function;
172 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
173 						   re_node_set *dst_nodes,
174 						   int target, int ex_subexp,
175 						   int type) internal_function;
176 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
177 					 re_node_set *cur_nodes, int cur_str,
178 					 int subexp_num, int type)
179      internal_function;
180 static int build_trtable (const re_dfa_t *dfa,
181 			  re_dfastate_t *state) internal_function;
182 #ifdef RE_ENABLE_I18N
183 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
184 				    const re_string_t *input, int idx)
185      internal_function;
186 # ifdef _LIBC
187 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
188 						   size_t name_len)
189      internal_function;
190 # endif /* _LIBC */
191 #endif /* RE_ENABLE_I18N */
192 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
193 				       const re_dfastate_t *state,
194 				       re_node_set *states_node,
195 				       bitset_t *states_ch) internal_function;
196 static int check_node_accept (const re_match_context_t *mctx,
197 			      const re_token_t *node, int idx)
198      internal_function;
199 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
200      internal_function;
201 
202 /* Entry point for POSIX code.  */
203 
204 /* regexec searches for a given pattern, specified by PREG, in the
205    string STRING.
206 
207    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
208    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
209    least NMATCH elements, and we set them to the offsets of the
210    corresponding matched substrings.
211 
212    EFLAGS specifies `execution flags' which affect matching: if
213    REG_NOTBOL is set, then ^ does not match at the beginning of the
214    string; if REG_NOTEOL is set, then $ does not match at the end.
215 
216    We return 0 if we find a match and REG_NOMATCH if not.  */
217 
218 int
regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)219 regexec (
220 	const regex_t *__restrict preg,
221 	const char *__restrict string,
222 	size_t nmatch,
223 	regmatch_t pmatch[],
224 	int eflags)
225 {
226   reg_errcode_t err;
227   int start, length;
228 
229   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
230     return REG_BADPAT;
231 
232   if (eflags & REG_STARTEND)
233     {
234       start = pmatch[0].rm_so;
235       length = pmatch[0].rm_eo;
236     }
237   else
238     {
239       start = 0;
240       length = strlen (string);
241     }
242 
243   __libc_lock_lock (dfa->lock);
244   if (preg->no_sub)
245     err = re_search_internal (preg, string, length, start, length - start,
246 			      length, 0, NULL, eflags);
247   else
248     err = re_search_internal (preg, string, length, start, length - start,
249 			      length, nmatch, pmatch, eflags);
250   __libc_lock_unlock (dfa->lock);
251   return err != REG_NOERROR;
252 }
253 
254 #ifdef _LIBC
255 # include <shlib-compat.h>
256 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
257 
258 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
259 __typeof__ (__regexec) __compat_regexec;
260 
261 int
262 attribute_compat_text_section
__compat_regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)263 __compat_regexec (const regex_t *__restrict preg,
264 		  const char *__restrict string, size_t nmatch,
265 		  regmatch_t pmatch[], int eflags)
266 {
267   return regexec (preg, string, nmatch, pmatch,
268 		  eflags & (REG_NOTBOL | REG_NOTEOL));
269 }
270 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
271 # endif
272 #endif
273 
274 /* Entry points for GNU code.  */
275 
276 /* re_match, re_search, re_match_2, re_search_2
277 
278    The former two functions operate on STRING with length LENGTH,
279    while the later two operate on concatenation of STRING1 and STRING2
280    with lengths LENGTH1 and LENGTH2, respectively.
281 
282    re_match() matches the compiled pattern in BUFP against the string,
283    starting at index START.
284 
285    re_search() first tries matching at index START, then it tries to match
286    starting from index START + 1, and so on.  The last start position tried
287    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
288    way as re_match().)
289 
290    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
291    the first STOP characters of the concatenation of the strings should be
292    concerned.
293 
294    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
295    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
296    computed relative to the concatenation, not relative to the individual
297    strings.)
298 
299    On success, re_match* functions return the length of the match, re_search*
300    return the position of the start of the match.  Return value -1 means no
301    match was found and -2 indicates an internal error.  */
302 
303 int
re_match(struct re_pattern_buffer * bufp,const char * string,int length,int start,struct re_registers * regs)304 re_match (struct re_pattern_buffer *bufp,
305 	  const char *string,
306 	  int length,
307 	  int start,
308 	  struct re_registers *regs)
309 {
310   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
311 }
312 #ifdef _LIBC
weak_alias(__re_match,re_match)313 weak_alias (__re_match, re_match)
314 #endif
315 
316 int
317 re_search (struct re_pattern_buffer *bufp,
318 	   const char *string,
319 	   int length, int start, int range,
320 	   struct re_registers *regs)
321 {
322   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
323 }
324 #ifdef _LIBC
weak_alias(__re_search,re_search)325 weak_alias (__re_search, re_search)
326 #endif
327 
328 int
329 re_match_2 (struct re_pattern_buffer *bufp,
330 	    const char *string1, int length1,
331 	    const char *string2, int length2, int start,
332 	    struct re_registers *regs, int stop)
333 {
334   return re_search_2_stub (bufp, string1, length1, string2, length2,
335 			   start, 0, regs, stop, 1);
336 }
337 #ifdef _LIBC
weak_alias(__re_match_2,re_match_2)338 weak_alias (__re_match_2, re_match_2)
339 #endif
340 
341 int
342 re_search_2 (struct re_pattern_buffer *bufp,
343 	     const char *string1, int length1,
344 	     const char *string2, int length2, int start,
345 	     int range, struct re_registers *regs,  int stop)
346 {
347   return re_search_2_stub (bufp, string1, length1, string2, length2,
348 			   start, range, regs, stop, 0);
349 }
350 #ifdef _LIBC
weak_alias(__re_search_2,re_search_2)351 weak_alias (__re_search_2, re_search_2)
352 #endif
353 
354 static int
355 re_search_2_stub (struct re_pattern_buffer *bufp,
356 		  const char *string1, int length1,
357 		  const char *string2, int length2, int start,
358 		  int range, struct re_registers *regs,
359 		  int stop, int ret_len)
360 {
361   const char *str;
362   int rval;
363   int len = length1 + length2;
364   int free_str = 0;
365 
366   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
367     return -2;
368 
369   /* Concatenate the strings.  */
370   if (length2 > 0)
371     if (length1 > 0)
372       {
373 	char *s = re_malloc (char, len);
374 
375 	if (BE (s == NULL, 0))
376 	  return -2;
377 	memcpy (s, string1, length1);
378 	memcpy (s + length1, string2, length2);
379 	str = s;
380 	free_str = 1;
381       }
382     else
383       str = string2;
384   else
385     str = string1;
386 
387   rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
388   if (free_str)
389     re_free ((char *) str);
390   return rval;
391 }
392 
393 /* The parameters have the same meaning as those of re_search.
394    Additional parameters:
395    If RET_LEN is nonzero the length of the match is returned (re_match style);
396    otherwise the position of the match is returned.  */
397 
398 static int
re_search_stub(struct re_pattern_buffer * bufp,const char * string,int length,int start,int range,int stop,struct re_registers * regs,int ret_len)399 re_search_stub (struct re_pattern_buffer *bufp,
400 		const char *string, int length, int start,
401 		int range, int stop,
402 		struct re_registers *regs, int ret_len)
403 {
404   reg_errcode_t result;
405   regmatch_t *pmatch;
406   int nregs, rval;
407   int eflags = 0;
408 
409   /* Check for out-of-range.  */
410   if (BE (start < 0 || start > length, 0))
411     return -1;
412   if (BE (start + range > length, 0))
413     range = length - start;
414   else if (BE (start + range < 0, 0))
415     range = -start;
416 
417   __libc_lock_lock (dfa->lock);
418 
419   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
420   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
421 
422   /* Compile fastmap if we haven't yet.  */
423   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
424     re_compile_fastmap (bufp);
425 
426   if (BE (bufp->no_sub, 0))
427     regs = NULL;
428 
429   /* We need at least 1 register.  */
430   if (regs == NULL)
431     nregs = 1;
432   else if (BE (bufp->regs_allocated == REGS_FIXED &&
433 	       regs->num_regs < bufp->re_nsub + 1, 0))
434     {
435       nregs = regs->num_regs;
436       if (BE (nregs < 1, 0))
437 	{
438 	  /* Nothing can be copied to regs.  */
439 	  regs = NULL;
440 	  nregs = 1;
441 	}
442     }
443   else
444     nregs = bufp->re_nsub + 1;
445   pmatch = re_malloc (regmatch_t, nregs);
446   if (BE (pmatch == NULL, 0))
447     {
448       rval = -2;
449       goto out;
450     }
451 
452   result = re_search_internal (bufp, string, length, start, range, stop,
453 			       nregs, pmatch, eflags);
454 
455   rval = 0;
456 
457   /* I hope we needn't fill their regs with -1's when no match was found.  */
458   if (result != REG_NOERROR)
459     rval = -1;
460   else if (regs != NULL)
461     {
462       /* If caller wants register contents data back, copy them.  */
463       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
464 					   bufp->regs_allocated);
465       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
466 	rval = -2;
467     }
468 
469   if (BE (rval == 0, 1))
470     {
471       if (ret_len)
472 	{
473 	  assert (pmatch[0].rm_so == start);
474 	  rval = pmatch[0].rm_eo - start;
475 	}
476       else
477 	rval = pmatch[0].rm_so;
478     }
479   re_free (pmatch);
480  out:
481   __libc_lock_unlock (dfa->lock);
482   return rval;
483 }
484 
485 static unsigned
re_copy_regs(struct re_registers * regs,regmatch_t * pmatch,int nregs,int regs_allocated)486 re_copy_regs (struct re_registers *regs,
487 	      regmatch_t *pmatch,
488 	      int nregs, int regs_allocated)
489 {
490   int rval = REGS_REALLOCATE;
491   int i;
492   int need_regs = nregs + 1;
493   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
494      uses.  */
495 
496   /* Have the register data arrays been allocated?  */
497   if (regs_allocated == REGS_UNALLOCATED)
498     { /* No.  So allocate them with malloc.  */
499       regs->start = re_malloc (regoff_t, need_regs);
500       if (BE (regs->start == NULL, 0))
501 	return REGS_UNALLOCATED;
502       regs->end = re_malloc (regoff_t, need_regs);
503       if (BE (regs->end == NULL, 0))
504 	{
505 	  re_free (regs->start);
506 	  return REGS_UNALLOCATED;
507 	}
508       regs->num_regs = need_regs;
509     }
510   else if (regs_allocated == REGS_REALLOCATE)
511     { /* Yes.  If we need more elements than were already
512 	 allocated, reallocate them.  If we need fewer, just
513 	 leave it alone.  */
514       if (BE (need_regs > regs->num_regs, 0))
515 	{
516 	  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
517 	  regoff_t *new_end;
518 	  if (BE (new_start == NULL, 0))
519 	    return REGS_UNALLOCATED;
520 	  new_end = re_realloc (regs->end, regoff_t, need_regs);
521 	  if (BE (new_end == NULL, 0))
522 	    {
523 	      re_free (new_start);
524 	      return REGS_UNALLOCATED;
525 	    }
526 	  regs->start = new_start;
527 	  regs->end = new_end;
528 	  regs->num_regs = need_regs;
529 	}
530     }
531   else
532     {
533       assert (regs_allocated == REGS_FIXED);
534       /* This function may not be called with REGS_FIXED and nregs too big.  */
535       assert (regs->num_regs >= nregs);
536       rval = REGS_FIXED;
537     }
538 
539   /* Copy the regs.  */
540   for (i = 0; i < nregs; ++i)
541     {
542       regs->start[i] = pmatch[i].rm_so;
543       regs->end[i] = pmatch[i].rm_eo;
544     }
545   for ( ; i < regs->num_regs; ++i)
546     regs->start[i] = regs->end[i] = -1;
547 
548   return rval;
549 }
550 
551 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
552    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
553    this memory for recording register information.  STARTS and ENDS
554    must be allocated using the malloc library routine, and must each
555    be at least NUM_REGS * sizeof (regoff_t) bytes long.
556 
557    If NUM_REGS == 0, then subsequent matches should allocate their own
558    register data.
559 
560    Unless this function is called, the first search or match using
561    PATTERN_BUFFER will allocate its own register data, without
562    freeing the old data.  */
563 
564 void
re_set_registers(struct re_pattern_buffer * bufp,struct re_registers * regs,unsigned num_regs,regoff_t * starts,regoff_t * ends)565 re_set_registers (struct re_pattern_buffer *bufp,
566 		  struct re_registers *regs,
567 		  unsigned num_regs,
568 		  regoff_t *starts,
569 		  regoff_t *ends)
570 {
571   if (num_regs)
572     {
573       bufp->regs_allocated = REGS_REALLOCATE;
574       regs->num_regs = num_regs;
575       regs->start = starts;
576       regs->end = ends;
577     }
578   else
579     {
580       bufp->regs_allocated = REGS_UNALLOCATED;
581       regs->num_regs = 0;
582       regs->start = regs->end = (regoff_t *) 0;
583     }
584 }
585 #ifdef _LIBC
586 weak_alias (__re_set_registers, re_set_registers)
587 #endif
588 
589 /* Entry points compatible with 4.2 BSD regex library.  We don't define
590    them unless specifically requested.  */
591 
592 #if defined _REGEX_RE_COMP || defined _LIBC
593 int
594 # ifdef _LIBC
595 weak_function
596 # endif
597 re_exec (s)
598      const char *s;
599 {
600   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
601 }
602 #endif /* _REGEX_RE_COMP */
603 
604 /* Internal entry point.  */
605 
606 /* Searches for a compiled pattern PREG in the string STRING, whose
607    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
608    mingings with regexec.  START, and RANGE have the same meanings
609    with re_search.
610    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
611    otherwise return the error code.
612    Note: We assume front end functions already check ranges.
613    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
614 
615 static reg_errcode_t
re_search_internal(const regex_t * preg,const char * string,int length,int start,int range,int stop,size_t nmatch,regmatch_t pmatch[],int eflags)616 re_search_internal (const regex_t *preg,
617 		    const char *string,
618 		    int length, int start, int range, int stop,
619 		    size_t nmatch, regmatch_t pmatch[],
620 		    int eflags)
621 {
622   reg_errcode_t err;
623   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
624   int left_lim, right_lim, incr;
625   int fl_longest_match, match_first, match_kind, match_last = -1;
626   int extra_nmatch;
627   int sb, ch;
628 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
629   re_match_context_t mctx = { .dfa = dfa };
630 #else
631   re_match_context_t mctx;
632 #endif
633   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
634 		   && range && !preg->can_be_null) ? preg->fastmap : NULL;
635   RE_TRANSLATE_TYPE t = preg->translate;
636 
637 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
638   memset (&mctx, '\0', sizeof (re_match_context_t));
639   mctx.dfa = dfa;
640 #endif
641 
642   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
643   nmatch -= extra_nmatch;
644 
645   /* Check if the DFA haven't been compiled.  */
646   if (BE (preg->used == 0 || dfa->init_state == NULL
647 	  || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
648 	  || dfa->init_state_begbuf == NULL, 0))
649     return REG_NOMATCH;
650 
651 #ifdef DEBUG
652   /* We assume front-end functions already check them.  */
653   assert (start + range >= 0 && start + range <= length);
654 #endif
655 
656   /* If initial states with non-begbuf contexts have no elements,
657      the regex must be anchored.  If preg->newline_anchor is set,
658      we'll never use init_state_nl, so do not check it.  */
659   if (dfa->init_state->nodes.nelem == 0
660       && dfa->init_state_word->nodes.nelem == 0
661       && (dfa->init_state_nl->nodes.nelem == 0
662 	  || !preg->newline_anchor))
663     {
664       if (start != 0 && start + range != 0)
665 	return REG_NOMATCH;
666       start = range = 0;
667     }
668 
669   /* We must check the longest matching, if nmatch > 0.  */
670   fl_longest_match = (nmatch != 0 || dfa->nbackref);
671 
672   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
673 			    preg->translate, preg->syntax & RE_ICASE, dfa);
674   if (BE (err != REG_NOERROR, 0))
675     goto free_return;
676   mctx.input.stop = stop;
677   mctx.input.raw_stop = stop;
678   mctx.input.newline_anchor = preg->newline_anchor;
679 
680   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
681   if (BE (err != REG_NOERROR, 0))
682     goto free_return;
683 
684   /* We will log all the DFA states through which the dfa pass,
685      if nmatch > 1, or this dfa has "multibyte node", which is a
686      back-reference or a node which can accept multibyte character or
687      multi character collating element.  */
688   if (nmatch > 1 || dfa->has_mb_node)
689     {
690       /* Avoid overflow.  */
691       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
692 	{
693 	  err = REG_ESPACE;
694 	  goto free_return;
695 	}
696 
697       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
698       if (BE (mctx.state_log == NULL, 0))
699 	{
700 	  err = REG_ESPACE;
701 	  goto free_return;
702 	}
703     }
704   else
705     mctx.state_log = NULL;
706 
707   match_first = start;
708   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
709 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
710 
711   /* Check incrementally whether of not the input string match.  */
712   incr = (range < 0) ? -1 : 1;
713   left_lim = (range < 0) ? start + range : start;
714   right_lim = (range < 0) ? start : start + range;
715   sb = dfa->mb_cur_max == 1;
716   match_kind =
717     (fastmap
718      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
719 	| (range >= 0 ? 2 : 0)
720 	| (t != NULL ? 1 : 0))
721      : 8);
722 
723   for (;; match_first += incr)
724     {
725       err = REG_NOMATCH;
726       if (match_first < left_lim || right_lim < match_first)
727 	goto free_return;
728 
729       /* Advance as rapidly as possible through the string, until we
730 	 find a plausible place to start matching.  This may be done
731 	 with varying efficiency, so there are various possibilities:
732 	 only the most common of them are specialized, in order to
733 	 save on code size.  We use a switch statement for speed.  */
734       switch (match_kind)
735 	{
736 	case 8:
737 	  /* No fastmap.  */
738 	  break;
739 
740 	case 7:
741 	  /* Fastmap with single-byte translation, match forward.  */
742 	  while (BE (match_first < right_lim, 1)
743 		 && !fastmap[t[(unsigned char) string[match_first]]])
744 	    ++match_first;
745 	  goto forward_match_found_start_or_reached_end;
746 
747 	case 6:
748 	  /* Fastmap without translation, match forward.  */
749 	  while (BE (match_first < right_lim, 1)
750 		 && !fastmap[(unsigned char) string[match_first]])
751 	    ++match_first;
752 
753 	forward_match_found_start_or_reached_end:
754 	  if (BE (match_first == right_lim, 0))
755 	    {
756 	      ch = match_first >= length
757 		       ? 0 : (unsigned char) string[match_first];
758 	      if (!fastmap[t ? t[ch] : ch])
759 		goto free_return;
760 	    }
761 	  break;
762 
763 	case 4:
764 	case 5:
765 	  /* Fastmap without multi-byte translation, match backwards.  */
766 	  while (match_first >= left_lim)
767 	    {
768 	      ch = match_first >= length
769 		       ? 0 : (unsigned char) string[match_first];
770 	      if (fastmap[t ? t[ch] : ch])
771 		break;
772 	      --match_first;
773 	    }
774 	  if (match_first < left_lim)
775 	    goto free_return;
776 	  break;
777 
778 	default:
779 	  /* In this case, we can't determine easily the current byte,
780 	     since it might be a component byte of a multibyte
781 	     character.  Then we use the constructed buffer instead.  */
782 	  for (;;)
783 	    {
784 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
785 		 buffers.  */
786 	      unsigned int offset = match_first - mctx.input.raw_mbs_idx;
787 	      if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
788 		{
789 		  err = re_string_reconstruct (&mctx.input, match_first,
790 					       eflags);
791 		  if (BE (err != REG_NOERROR, 0))
792 		    goto free_return;
793 
794 		  offset = match_first - mctx.input.raw_mbs_idx;
795 		}
796 	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
797 		 Note that MATCH_FIRST must not be smaller than 0.  */
798 	      ch = (match_first >= length
799 		    ? 0 : re_string_byte_at (&mctx.input, offset));
800 	      if (fastmap[ch])
801 		break;
802 	      match_first += incr;
803 	      if (match_first < left_lim || match_first > right_lim)
804 		{
805 		  err = REG_NOMATCH;
806 		  goto free_return;
807 		}
808 	    }
809 	  break;
810 	}
811 
812       /* Reconstruct the buffers so that the matcher can assume that
813 	 the matching starts from the beginning of the buffer.  */
814       err = re_string_reconstruct (&mctx.input, match_first, eflags);
815       if (BE (err != REG_NOERROR, 0))
816 	goto free_return;
817 
818 #ifdef RE_ENABLE_I18N
819      /* Don't consider this char as a possible match start if it part,
820 	yet isn't the head, of a multibyte character.  */
821       if (!sb && !re_string_first_byte (&mctx.input, 0))
822 	continue;
823 #endif
824 
825       /* It seems to be appropriate one, then use the matcher.  */
826       /* We assume that the matching starts from 0.  */
827       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
828       match_last = check_matching (&mctx, fl_longest_match,
829 				   range >= 0 ? &match_first : NULL);
830       if (match_last != -1)
831 	{
832 	  if (BE (match_last == -2, 0))
833 	    {
834 	      err = REG_ESPACE;
835 	      goto free_return;
836 	    }
837 	  else
838 	    {
839 	      mctx.match_last = match_last;
840 	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
841 		{
842 		  re_dfastate_t *pstate = mctx.state_log[match_last];
843 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
844 							     match_last);
845 		}
846 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
847 		  || dfa->nbackref)
848 		{
849 		  err = prune_impossible_nodes (&mctx);
850 		  if (err == REG_NOERROR)
851 		    break;
852 		  if (BE (err != REG_NOMATCH, 0))
853 		    goto free_return;
854 		  match_last = -1;
855 		}
856 	      else
857 		break; /* We found a match.  */
858 	    }
859 	}
860 
861       match_ctx_clean (&mctx);
862     }
863 
864 #ifdef DEBUG
865   assert (match_last != -1);
866   assert (err == REG_NOERROR);
867 #endif
868 
869   /* Set pmatch[] if we need.  */
870   if (nmatch > 0)
871     {
872       int reg_idx;
873 
874       /* Initialize registers.  */
875       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
876 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
877 
878       /* Set the points where matching start/end.  */
879       pmatch[0].rm_so = 0;
880       pmatch[0].rm_eo = mctx.match_last;
881 
882       if (!preg->no_sub && nmatch > 1)
883 	{
884 	  err = set_regs (preg, &mctx, nmatch, pmatch,
885 			  dfa->has_plural_match && dfa->nbackref > 0);
886 	  if (BE (err != REG_NOERROR, 0))
887 	    goto free_return;
888 	}
889 
890       /* At last, add the offset to the each registers, since we slided
891 	 the buffers so that we could assume that the matching starts
892 	 from 0.  */
893       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
894 	if (pmatch[reg_idx].rm_so != -1)
895 	  {
896 #ifdef RE_ENABLE_I18N
897 	    if (BE (mctx.input.offsets_needed != 0, 0))
898 	      {
899 		pmatch[reg_idx].rm_so =
900 		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
901 		   ? mctx.input.valid_raw_len
902 		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
903 		pmatch[reg_idx].rm_eo =
904 		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
905 		   ? mctx.input.valid_raw_len
906 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
907 	      }
908 #else
909 	    assert (mctx.input.offsets_needed == 0);
910 #endif
911 	    pmatch[reg_idx].rm_so += match_first;
912 	    pmatch[reg_idx].rm_eo += match_first;
913 	  }
914       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
915 	{
916 	  pmatch[nmatch + reg_idx].rm_so = -1;
917 	  pmatch[nmatch + reg_idx].rm_eo = -1;
918 	}
919 
920       if (dfa->subexp_map)
921 	for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
922 	  if (dfa->subexp_map[reg_idx] != reg_idx)
923 	    {
924 	      pmatch[reg_idx + 1].rm_so
925 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
926 	      pmatch[reg_idx + 1].rm_eo
927 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
928 	    }
929     }
930 
931  free_return:
932   re_free (mctx.state_log);
933   if (dfa->nbackref)
934     match_ctx_free (&mctx);
935   re_string_destruct (&mctx.input);
936   return err;
937 }
938 
939 static reg_errcode_t
prune_impossible_nodes(re_match_context_t * mctx)940 prune_impossible_nodes (re_match_context_t *mctx)
941 {
942   const re_dfa_t *const dfa = mctx->dfa;
943   int halt_node, match_last;
944   reg_errcode_t ret;
945   re_dfastate_t **sifted_states;
946   re_dfastate_t **lim_states = NULL;
947   re_sift_context_t sctx;
948 #ifdef DEBUG
949   assert (mctx->state_log != NULL);
950 #endif
951   match_last = mctx->match_last;
952   halt_node = mctx->last_node;
953 
954   /* Avoid overflow.  */
955   if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
956     return REG_ESPACE;
957 
958   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
959   if (BE (sifted_states == NULL, 0))
960     {
961       ret = REG_ESPACE;
962       goto free_return;
963     }
964   if (dfa->nbackref)
965     {
966       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
967       if (BE (lim_states == NULL, 0))
968 	{
969 	  ret = REG_ESPACE;
970 	  goto free_return;
971 	}
972       while (1)
973 	{
974 	  memset (lim_states, '\0',
975 		  sizeof (re_dfastate_t *) * (match_last + 1));
976 	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
977 			 match_last);
978 	  ret = sift_states_backward (mctx, &sctx);
979 	  re_node_set_free (&sctx.limits);
980 	  if (BE (ret != REG_NOERROR, 0))
981 	      goto free_return;
982 	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
983 	    break;
984 	  do
985 	    {
986 	      --match_last;
987 	      if (match_last < 0)
988 		{
989 		  ret = REG_NOMATCH;
990 		  goto free_return;
991 		}
992 	    } while (mctx->state_log[match_last] == NULL
993 		     || !mctx->state_log[match_last]->halt);
994 	  halt_node = check_halt_state_context (mctx,
995 						mctx->state_log[match_last],
996 						match_last);
997 	}
998       ret = merge_state_array (dfa, sifted_states, lim_states,
999 			       match_last + 1);
1000       re_free (lim_states);
1001       lim_states = NULL;
1002       if (BE (ret != REG_NOERROR, 0))
1003 	goto free_return;
1004     }
1005   else
1006     {
1007       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1008       ret = sift_states_backward (mctx, &sctx);
1009       re_node_set_free (&sctx.limits);
1010       if (BE (ret != REG_NOERROR, 0))
1011 	goto free_return;
1012       if (sifted_states[0] == NULL)
1013 	{
1014 	  ret = REG_NOMATCH;
1015 	  goto free_return;
1016 	}
1017     }
1018   re_free (mctx->state_log);
1019   mctx->state_log = sifted_states;
1020   sifted_states = NULL;
1021   mctx->last_node = halt_node;
1022   mctx->match_last = match_last;
1023   ret = REG_NOERROR;
1024  free_return:
1025   re_free (sifted_states);
1026   re_free (lim_states);
1027   return ret;
1028 }
1029 
1030 /* Acquire an initial state and return it.
1031    We must select appropriate initial state depending on the context,
1032    since initial states may have constraints like "\<", "^", etc..  */
1033 
1034 static inline re_dfastate_t *
1035 __attribute ((always_inline)) internal_function
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,int idx)1036 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1037 			    int idx)
1038 {
1039   const re_dfa_t *const dfa = mctx->dfa;
1040   if (dfa->init_state->has_constraint)
1041     {
1042       unsigned int context;
1043       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1044       if (IS_WORD_CONTEXT (context))
1045 	return dfa->init_state_word;
1046       else if (IS_ORDINARY_CONTEXT (context))
1047 	return dfa->init_state;
1048       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1049 	return dfa->init_state_begbuf;
1050       else if (IS_NEWLINE_CONTEXT (context))
1051 	return dfa->init_state_nl;
1052       else if (IS_BEGBUF_CONTEXT (context))
1053 	{
1054 	  /* It is relatively rare case, then calculate on demand.  */
1055 	  return re_acquire_state_context (err, dfa,
1056 					   dfa->init_state->entrance_nodes,
1057 					   context);
1058 	}
1059       else
1060 	/* Must not happen?  */
1061 	return dfa->init_state;
1062     }
1063   else
1064     return dfa->init_state;
1065 }
1066 
1067 /* Check whether the regular expression match input string INPUT or not,
1068    and return the index where the matching end, return -1 if not match,
1069    or return -2 in case of an error.
1070    FL_LONGEST_MATCH means we want the POSIX longest matching.
1071    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1072    next place where we may want to try matching.
1073    Note that the matcher assume that the matching starts from the current
1074    index of the buffer.  */
1075 
1076 static int
1077 internal_function
check_matching(re_match_context_t * mctx,int fl_longest_match,int * p_match_first)1078 check_matching (re_match_context_t *mctx, int fl_longest_match,
1079 		int *p_match_first)
1080 {
1081   const re_dfa_t *const dfa = mctx->dfa;
1082   reg_errcode_t err;
1083   int match = 0;
1084   int match_last = -1;
1085   int cur_str_idx = re_string_cur_idx (&mctx->input);
1086   re_dfastate_t *cur_state;
1087   int at_init_state = p_match_first != NULL;
1088   int next_start_idx = cur_str_idx;
1089 
1090   err = REG_NOERROR;
1091   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1092   /* An initial state must not be NULL (invalid).  */
1093   if (BE (cur_state == NULL, 0))
1094     {
1095       assert (err == REG_ESPACE);
1096       return -2;
1097     }
1098 
1099   if (mctx->state_log != NULL)
1100     {
1101       mctx->state_log[cur_str_idx] = cur_state;
1102 
1103       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1104 	 later.  E.g. Processing back references.  */
1105       if (BE (dfa->nbackref, 0))
1106 	{
1107 	  at_init_state = 0;
1108 	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1109 	  if (BE (err != REG_NOERROR, 0))
1110 	    return err;
1111 
1112 	  if (cur_state->has_backref)
1113 	    {
1114 	      err = transit_state_bkref (mctx, &cur_state->nodes);
1115 	      if (BE (err != REG_NOERROR, 0))
1116 		return err;
1117 	    }
1118 	}
1119     }
1120 
1121   /* If the RE accepts NULL string.  */
1122   if (BE (cur_state->halt, 0))
1123     {
1124       if (!cur_state->has_constraint
1125 	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
1126 	{
1127 	  if (!fl_longest_match)
1128 	    return cur_str_idx;
1129 	  else
1130 	    {
1131 	      match_last = cur_str_idx;
1132 	      match = 1;
1133 	    }
1134 	}
1135     }
1136 
1137   while (!re_string_eoi (&mctx->input))
1138     {
1139       re_dfastate_t *old_state = cur_state;
1140       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1141 
1142       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1143 	  || (BE (next_char_idx >= mctx->input.valid_len, 0)
1144 	      && mctx->input.valid_len < mctx->input.len))
1145 	{
1146 	  err = extend_buffers (mctx);
1147 	  if (BE (err != REG_NOERROR, 0))
1148 	    {
1149 	      assert (err == REG_ESPACE);
1150 	      return -2;
1151 	    }
1152 	}
1153 
1154       cur_state = transit_state (&err, mctx, cur_state);
1155       if (mctx->state_log != NULL)
1156 	cur_state = merge_state_with_log (&err, mctx, cur_state);
1157 
1158       if (cur_state == NULL)
1159 	{
1160 	  /* Reached the invalid state or an error.  Try to recover a valid
1161 	     state using the state log, if available and if we have not
1162 	     already found a valid (even if not the longest) match.  */
1163 	  if (BE (err != REG_NOERROR, 0))
1164 	    return -2;
1165 
1166 	  if (mctx->state_log == NULL
1167 	      || (match && !fl_longest_match)
1168 	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
1169 	    break;
1170 	}
1171 
1172       if (BE (at_init_state, 0))
1173 	{
1174 	  if (old_state == cur_state)
1175 	    next_start_idx = next_char_idx;
1176 	  else
1177 	    at_init_state = 0;
1178 	}
1179 
1180       if (cur_state->halt)
1181 	{
1182 	  /* Reached a halt state.
1183 	     Check the halt state can satisfy the current context.  */
1184 	  if (!cur_state->has_constraint
1185 	      || check_halt_state_context (mctx, cur_state,
1186 					   re_string_cur_idx (&mctx->input)))
1187 	    {
1188 	      /* We found an appropriate halt state.  */
1189 	      match_last = re_string_cur_idx (&mctx->input);
1190 	      match = 1;
1191 
1192 	      /* We found a match, do not modify match_first below.  */
1193 	      p_match_first = NULL;
1194 	      if (!fl_longest_match)
1195 		break;
1196 	    }
1197 	}
1198     }
1199 
1200   if (p_match_first)
1201     *p_match_first += next_start_idx;
1202 
1203   return match_last;
1204 }
1205 
1206 /* Check NODE match the current context.  */
1207 
1208 static int
1209 internal_function
check_halt_node_context(const re_dfa_t * dfa,int node,unsigned int context)1210 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1211 {
1212   re_token_type_t type = dfa->nodes[node].type;
1213   unsigned int constraint = dfa->nodes[node].constraint;
1214   if (type != END_OF_RE)
1215     return 0;
1216   if (!constraint)
1217     return 1;
1218   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1219     return 0;
1220   return 1;
1221 }
1222 
1223 /* Check the halt state STATE match the current context.
1224    Return 0 if not match, if the node, STATE has, is a halt node and
1225    match the context, return the node.  */
1226 
1227 static int
1228 internal_function
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,int idx)1229 check_halt_state_context (const re_match_context_t *mctx,
1230 			  const re_dfastate_t *state, int idx)
1231 {
1232   int i;
1233   unsigned int context;
1234 #ifdef DEBUG
1235   assert (state->halt);
1236 #endif
1237   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1238   for (i = 0; i < state->nodes.nelem; ++i)
1239     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1240       return state->nodes.elems[i];
1241   return 0;
1242 }
1243 
1244 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1245    corresponding to the DFA).
1246    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1247    of errors.  */
1248 
1249 static int
1250 internal_function
proceed_next_node(const re_match_context_t * mctx,int nregs,regmatch_t * regs,int * pidx,int node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)1251 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1252 		   int *pidx, int node, re_node_set *eps_via_nodes,
1253 		   struct re_fail_stack_t *fs)
1254 {
1255   const re_dfa_t *const dfa = mctx->dfa;
1256   int i, err;
1257   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1258     {
1259       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1260       re_node_set *edests = &dfa->edests[node];
1261       int dest_node;
1262       err = re_node_set_insert (eps_via_nodes, node);
1263       if (BE (err < 0, 0))
1264 	return -2;
1265       /* Pick up a valid destination, or return -1 if none is found.  */
1266       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1267 	{
1268 	  int candidate = edests->elems[i];
1269 	  if (!re_node_set_contains (cur_nodes, candidate))
1270 	    continue;
1271 	  if (dest_node == -1)
1272 	    dest_node = candidate;
1273 
1274 	  else
1275 	    {
1276 	      /* In order to avoid infinite loop like "(a*)*", return the second
1277 		 epsilon-transition if the first was already considered.  */
1278 	      if (re_node_set_contains (eps_via_nodes, dest_node))
1279 		return candidate;
1280 
1281 	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
1282 	      else if (fs != NULL
1283 		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1284 					   eps_via_nodes))
1285 		return -2;
1286 
1287 	      /* We know we are going to exit.  */
1288 	      break;
1289 	    }
1290 	}
1291       return dest_node;
1292     }
1293   else
1294     {
1295       int naccepted = 0;
1296       re_token_type_t type = dfa->nodes[node].type;
1297 
1298 #ifdef RE_ENABLE_I18N
1299       if (dfa->nodes[node].accept_mb)
1300 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1301       else
1302 #endif /* RE_ENABLE_I18N */
1303       if (type == OP_BACK_REF)
1304 	{
1305 	  int subexp_idx = dfa->nodes[node].opr.idx + 1;
1306 	  naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1307 	  if (fs != NULL)
1308 	    {
1309 	      if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1310 		return -1;
1311 	      else if (naccepted)
1312 		{
1313 		  char *buf = (char *) re_string_get_buffer (&mctx->input);
1314 		  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1315 			      naccepted) != 0)
1316 		    return -1;
1317 		}
1318 	    }
1319 
1320 	  if (naccepted == 0)
1321 	    {
1322 	      int dest_node;
1323 	      err = re_node_set_insert (eps_via_nodes, node);
1324 	      if (BE (err < 0, 0))
1325 		return -2;
1326 	      dest_node = dfa->edests[node].elems[0];
1327 	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1328 					dest_node))
1329 		return dest_node;
1330 	    }
1331 	}
1332 
1333       if (naccepted != 0
1334 	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
1335 	{
1336 	  int dest_node = dfa->nexts[node];
1337 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1338 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1339 		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1340 					       dest_node)))
1341 	    return -1;
1342 	  re_node_set_empty (eps_via_nodes);
1343 	  return dest_node;
1344 	}
1345     }
1346   return -1;
1347 }
1348 
1349 static reg_errcode_t
1350 internal_function
push_fail_stack(struct re_fail_stack_t * fs,int str_idx,int dest_node,int nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1351 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1352 		 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1353 {
1354   reg_errcode_t err;
1355   int num = fs->num++;
1356   if (fs->num == fs->alloc)
1357     {
1358       struct re_fail_stack_ent_t *new_array;
1359       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1360 				       * fs->alloc * 2));
1361       if (new_array == NULL)
1362 	return REG_ESPACE;
1363       fs->alloc *= 2;
1364       fs->stack = new_array;
1365     }
1366   fs->stack[num].idx = str_idx;
1367   fs->stack[num].node = dest_node;
1368   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1369   if (fs->stack[num].regs == NULL)
1370     return REG_ESPACE;
1371   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1372   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1373   return err;
1374 }
1375 
1376 static int
1377 internal_function
pop_fail_stack(struct re_fail_stack_t * fs,int * pidx,int nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1378 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1379 		regmatch_t *regs, re_node_set *eps_via_nodes)
1380 {
1381   int num = --fs->num;
1382   assert (num >= 0);
1383   *pidx = fs->stack[num].idx;
1384   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1385   re_node_set_free (eps_via_nodes);
1386   re_free (fs->stack[num].regs);
1387   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1388   return fs->stack[num].node;
1389 }
1390 
1391 /* Set the positions where the subexpressions are starts/ends to registers
1392    PMATCH.
1393    Note: We assume that pmatch[0] is already set, and
1394    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1395 
1396 static reg_errcode_t
1397 internal_function
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,int fl_backtrack)1398 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1399 	  regmatch_t *pmatch, int fl_backtrack)
1400 {
1401   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1402   int idx, cur_node;
1403   re_node_set eps_via_nodes;
1404   struct re_fail_stack_t *fs;
1405   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1406   regmatch_t *prev_idx_match;
1407   int prev_idx_match_malloced = 0;
1408 
1409 #ifdef DEBUG
1410   assert (nmatch > 1);
1411   assert (mctx->state_log != NULL);
1412 #endif
1413   if (fl_backtrack)
1414     {
1415       fs = &fs_body;
1416       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1417       if (fs->stack == NULL)
1418 	return REG_ESPACE;
1419     }
1420   else
1421     fs = NULL;
1422 
1423   cur_node = dfa->init_node;
1424   re_node_set_init_empty (&eps_via_nodes);
1425 
1426 #ifdef HAVE_ALLOCA
1427   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1428     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1429   else
1430 #endif
1431     {
1432       prev_idx_match = re_malloc (regmatch_t, nmatch);
1433       if (prev_idx_match == NULL)
1434 	{
1435 	  free_fail_stack_return (fs);
1436 	  return REG_ESPACE;
1437 	}
1438       prev_idx_match_malloced = 1;
1439     }
1440   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1441 
1442   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1443     {
1444       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1445 
1446       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1447 	{
1448 	  int reg_idx;
1449 	  if (fs)
1450 	    {
1451 	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1452 		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1453 		  break;
1454 	      if (reg_idx == nmatch)
1455 		{
1456 		  re_node_set_free (&eps_via_nodes);
1457 		  if (prev_idx_match_malloced)
1458 		    re_free (prev_idx_match);
1459 		  return free_fail_stack_return (fs);
1460 		}
1461 	      cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1462 					 &eps_via_nodes);
1463 	    }
1464 	  else
1465 	    {
1466 	      re_node_set_free (&eps_via_nodes);
1467 	      if (prev_idx_match_malloced)
1468 		re_free (prev_idx_match);
1469 	      return REG_NOERROR;
1470 	    }
1471 	}
1472 
1473       /* Proceed to next node.  */
1474       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1475 				    &eps_via_nodes, fs);
1476 
1477       if (BE (cur_node < 0, 0))
1478 	{
1479 	  if (BE (cur_node == -2, 0))
1480 	    {
1481 	      re_node_set_free (&eps_via_nodes);
1482 	      if (prev_idx_match_malloced)
1483 		re_free (prev_idx_match);
1484 	      free_fail_stack_return (fs);
1485 	      return REG_ESPACE;
1486 	    }
1487 	  if (fs)
1488 	    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1489 				       &eps_via_nodes);
1490 	  else
1491 	    {
1492 	      re_node_set_free (&eps_via_nodes);
1493 	      if (prev_idx_match_malloced)
1494 		re_free (prev_idx_match);
1495 	      return REG_NOMATCH;
1496 	    }
1497 	}
1498     }
1499   re_node_set_free (&eps_via_nodes);
1500   if (prev_idx_match_malloced)
1501     re_free (prev_idx_match);
1502   return free_fail_stack_return (fs);
1503 }
1504 
1505 static reg_errcode_t
1506 internal_function
free_fail_stack_return(struct re_fail_stack_t * fs)1507 free_fail_stack_return (struct re_fail_stack_t *fs)
1508 {
1509   if (fs)
1510     {
1511       int fs_idx;
1512       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1513 	{
1514 	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1515 	  re_free (fs->stack[fs_idx].regs);
1516 	}
1517       re_free (fs->stack);
1518     }
1519   return REG_NOERROR;
1520 }
1521 
1522 static void
1523 internal_function
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,int cur_node,int cur_idx,int nmatch)1524 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1525 	     regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1526 {
1527   int type = dfa->nodes[cur_node].type;
1528   if (type == OP_OPEN_SUBEXP)
1529     {
1530       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1531 
1532       /* We are at the first node of this sub expression.  */
1533       if (reg_num < nmatch)
1534 	{
1535 	  pmatch[reg_num].rm_so = cur_idx;
1536 	  pmatch[reg_num].rm_eo = -1;
1537 	}
1538     }
1539   else if (type == OP_CLOSE_SUBEXP)
1540     {
1541       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1542       if (reg_num < nmatch)
1543 	{
1544 	  /* We are at the last node of this sub expression.  */
1545 	  if (pmatch[reg_num].rm_so < cur_idx)
1546 	    {
1547 	      pmatch[reg_num].rm_eo = cur_idx;
1548 	      /* This is a non-empty match or we are not inside an optional
1549 		 subexpression.  Accept this right away.  */
1550 	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1551 	    }
1552 	  else
1553 	    {
1554 	      if (dfa->nodes[cur_node].opt_subexp
1555 		  && prev_idx_match[reg_num].rm_so != -1)
1556 		/* We transited through an empty match for an optional
1557 		   subexpression, like (a?)*, and this is not the subexp's
1558 		   first match.  Copy back the old content of the registers
1559 		   so that matches of an inner subexpression are undone as
1560 		   well, like in ((a?))*.  */
1561 		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1562 	      else
1563 		/* We completed a subexpression, but it may be part of
1564 		   an optional one, so do not update PREV_IDX_MATCH.  */
1565 		pmatch[reg_num].rm_eo = cur_idx;
1566 	    }
1567 	}
1568     }
1569 }
1570 
1571 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1572    and sift the nodes in each states according to the following rules.
1573    Updated state_log will be wrote to STATE_LOG.
1574 
1575    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1576      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1577 	If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1578 	the LAST_NODE, we throw away the node `a'.
1579      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1580 	string `s' and transit to `b':
1581 	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1582 	   away the node `a'.
1583 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1584 	    thrown away, we throw away the node `a'.
1585      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1586 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1587 	   node `a'.
1588 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1589 	    we throw away the node `a'.  */
1590 
1591 #define STATE_NODE_CONTAINS(state,node) \
1592   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1593 
1594 static reg_errcode_t
1595 internal_function
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)1596 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1597 {
1598   reg_errcode_t err;
1599   int null_cnt = 0;
1600   int str_idx = sctx->last_str_idx;
1601   re_node_set cur_dest;
1602 
1603 #ifdef DEBUG
1604   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1605 #endif
1606 
1607   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1608      transit to the last_node and the last_node itself.  */
1609   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1610   if (BE (err != REG_NOERROR, 0))
1611     return err;
1612   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1613   if (BE (err != REG_NOERROR, 0))
1614     goto free_return;
1615 
1616   /* Then check each states in the state_log.  */
1617   while (str_idx > 0)
1618     {
1619       /* Update counters.  */
1620       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1621       if (null_cnt > mctx->max_mb_elem_len)
1622 	{
1623 	  memset (sctx->sifted_states, '\0',
1624 		  sizeof (re_dfastate_t *) * str_idx);
1625 	  re_node_set_free (&cur_dest);
1626 	  return REG_NOERROR;
1627 	}
1628       re_node_set_empty (&cur_dest);
1629       --str_idx;
1630 
1631       if (mctx->state_log[str_idx])
1632 	{
1633 	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1634 	  if (BE (err != REG_NOERROR, 0))
1635 	    goto free_return;
1636 	}
1637 
1638       /* Add all the nodes which satisfy the following conditions:
1639 	 - It can epsilon transit to a node in CUR_DEST.
1640 	 - It is in CUR_SRC.
1641 	 And update state_log.  */
1642       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1643       if (BE (err != REG_NOERROR, 0))
1644 	goto free_return;
1645     }
1646   err = REG_NOERROR;
1647  free_return:
1648   re_node_set_free (&cur_dest);
1649   return err;
1650 }
1651 
1652 static reg_errcode_t
1653 internal_function
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,re_node_set * cur_dest)1654 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1655 		     int str_idx, re_node_set *cur_dest)
1656 {
1657   const re_dfa_t *const dfa = mctx->dfa;
1658   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1659   int i;
1660 
1661   /* Then build the next sifted state.
1662      We build the next sifted state on `cur_dest', and update
1663      `sifted_states[str_idx]' with `cur_dest'.
1664      Note:
1665      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1666      `cur_src' points the node_set of the old `state_log[str_idx]'
1667      (with the epsilon nodes pre-filtered out).  */
1668   for (i = 0; i < cur_src->nelem; i++)
1669     {
1670       int prev_node = cur_src->elems[i];
1671       int naccepted = 0;
1672       int ret;
1673 
1674 #ifdef DEBUG
1675       re_token_type_t type = dfa->nodes[prev_node].type;
1676       assert (!IS_EPSILON_NODE (type));
1677 #endif
1678 #ifdef RE_ENABLE_I18N
1679       /* If the node may accept `multi byte'.  */
1680       if (dfa->nodes[prev_node].accept_mb)
1681 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1682 					 str_idx, sctx->last_str_idx);
1683 #endif /* RE_ENABLE_I18N */
1684 
1685       /* We don't check backreferences here.
1686 	 See update_cur_sifted_state().  */
1687       if (!naccepted
1688 	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1689 	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1690 				  dfa->nexts[prev_node]))
1691 	naccepted = 1;
1692 
1693       if (naccepted == 0)
1694 	continue;
1695 
1696       if (sctx->limits.nelem)
1697 	{
1698 	  int to_idx = str_idx + naccepted;
1699 	  if (check_dst_limits (mctx, &sctx->limits,
1700 				dfa->nexts[prev_node], to_idx,
1701 				prev_node, str_idx))
1702 	    continue;
1703 	}
1704       ret = re_node_set_insert (cur_dest, prev_node);
1705       if (BE (ret == -1, 0))
1706 	return REG_ESPACE;
1707     }
1708 
1709   return REG_NOERROR;
1710 }
1711 
1712 /* Helper functions.  */
1713 
1714 static reg_errcode_t
1715 internal_function
clean_state_log_if_needed(re_match_context_t * mctx,int next_state_log_idx)1716 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1717 {
1718   int top = mctx->state_log_top;
1719 
1720   if (next_state_log_idx >= mctx->input.bufs_len
1721       || (next_state_log_idx >= mctx->input.valid_len
1722 	  && mctx->input.valid_len < mctx->input.len))
1723     {
1724       reg_errcode_t err;
1725       err = extend_buffers (mctx);
1726       if (BE (err != REG_NOERROR, 0))
1727 	return err;
1728     }
1729 
1730   if (top < next_state_log_idx)
1731     {
1732       memset (mctx->state_log + top + 1, '\0',
1733 	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1734       mctx->state_log_top = next_state_log_idx;
1735     }
1736   return REG_NOERROR;
1737 }
1738 
1739 static reg_errcode_t
1740 internal_function
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,int num)1741 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1742 		   re_dfastate_t **src, int num)
1743 {
1744   int st_idx;
1745   reg_errcode_t err;
1746   for (st_idx = 0; st_idx < num; ++st_idx)
1747     {
1748       if (dst[st_idx] == NULL)
1749 	dst[st_idx] = src[st_idx];
1750       else if (src[st_idx] != NULL)
1751 	{
1752 	  re_node_set merged_set;
1753 	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1754 					&src[st_idx]->nodes);
1755 	  if (BE (err != REG_NOERROR, 0))
1756 	    return err;
1757 	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1758 	  re_node_set_free (&merged_set);
1759 	  if (BE (err != REG_NOERROR, 0))
1760 	    return err;
1761 	}
1762     }
1763   return REG_NOERROR;
1764 }
1765 
1766 static reg_errcode_t
1767 internal_function
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,re_node_set * dest_nodes)1768 update_cur_sifted_state (const re_match_context_t *mctx,
1769 			 re_sift_context_t *sctx, int str_idx,
1770 			 re_node_set *dest_nodes)
1771 {
1772   const re_dfa_t *const dfa = mctx->dfa;
1773   reg_errcode_t err = REG_NOERROR;
1774   const re_node_set *candidates;
1775   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1776 		: &mctx->state_log[str_idx]->nodes);
1777 
1778   if (dest_nodes->nelem == 0)
1779     sctx->sifted_states[str_idx] = NULL;
1780   else
1781     {
1782       if (candidates)
1783 	{
1784 	  /* At first, add the nodes which can epsilon transit to a node in
1785 	     DEST_NODE.  */
1786 	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1787 	  if (BE (err != REG_NOERROR, 0))
1788 	    return err;
1789 
1790 	  /* Then, check the limitations in the current sift_context.  */
1791 	  if (sctx->limits.nelem)
1792 	    {
1793 	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1794 					 mctx->bkref_ents, str_idx);
1795 	      if (BE (err != REG_NOERROR, 0))
1796 		return err;
1797 	    }
1798 	}
1799 
1800       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1801       if (BE (err != REG_NOERROR, 0))
1802 	return err;
1803     }
1804 
1805   if (candidates && mctx->state_log[str_idx]->has_backref)
1806     {
1807       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1808       if (BE (err != REG_NOERROR, 0))
1809 	return err;
1810     }
1811   return REG_NOERROR;
1812 }
1813 
1814 static reg_errcode_t
1815 internal_function
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)1816 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1817 		       const re_node_set *candidates)
1818 {
1819   reg_errcode_t err = REG_NOERROR;
1820   int i;
1821 
1822   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1823   if (BE (err != REG_NOERROR, 0))
1824     return err;
1825 
1826   if (!state->inveclosure.alloc)
1827     {
1828       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1829       if (BE (err != REG_NOERROR, 0))
1830 	return REG_ESPACE;
1831       for (i = 0; i < dest_nodes->nelem; i++)
1832 	{
1833 	  err = re_node_set_merge (&state->inveclosure,
1834 				   dfa->inveclosures + dest_nodes->elems[i]);
1835 	  if (BE (err != REG_NOERROR, 0))
1836 	    return REG_ESPACE;
1837 	}
1838     }
1839   return re_node_set_add_intersect (dest_nodes, candidates,
1840 				    &state->inveclosure);
1841 }
1842 
1843 static reg_errcode_t
1844 internal_function
sub_epsilon_src_nodes(const re_dfa_t * dfa,int node,re_node_set * dest_nodes,const re_node_set * candidates)1845 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1846 		       const re_node_set *candidates)
1847 {
1848     int ecl_idx;
1849     reg_errcode_t err;
1850     re_node_set *inv_eclosure = dfa->inveclosures + node;
1851     re_node_set except_nodes;
1852     re_node_set_init_empty (&except_nodes);
1853     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1854       {
1855 	int cur_node = inv_eclosure->elems[ecl_idx];
1856 	if (cur_node == node)
1857 	  continue;
1858 	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1859 	  {
1860 	    int edst1 = dfa->edests[cur_node].elems[0];
1861 	    int edst2 = ((dfa->edests[cur_node].nelem > 1)
1862 			 ? dfa->edests[cur_node].elems[1] : -1);
1863 	    if ((!re_node_set_contains (inv_eclosure, edst1)
1864 		 && re_node_set_contains (dest_nodes, edst1))
1865 		|| (edst2 > 0
1866 		    && !re_node_set_contains (inv_eclosure, edst2)
1867 		    && re_node_set_contains (dest_nodes, edst2)))
1868 	      {
1869 		err = re_node_set_add_intersect (&except_nodes, candidates,
1870 						 dfa->inveclosures + cur_node);
1871 		if (BE (err != REG_NOERROR, 0))
1872 		  {
1873 		    re_node_set_free (&except_nodes);
1874 		    return err;
1875 		  }
1876 	      }
1877 	  }
1878       }
1879     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1880       {
1881 	int cur_node = inv_eclosure->elems[ecl_idx];
1882 	if (!re_node_set_contains (&except_nodes, cur_node))
1883 	  {
1884 	    int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1885 	    re_node_set_remove_at (dest_nodes, idx);
1886 	  }
1887       }
1888     re_node_set_free (&except_nodes);
1889     return REG_NOERROR;
1890 }
1891 
1892 static int
1893 internal_function
check_dst_limits(const re_match_context_t * mctx,re_node_set * limits,int dst_node,int dst_idx,int src_node,int src_idx)1894 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1895 		  int dst_node, int dst_idx, int src_node, int src_idx)
1896 {
1897   const re_dfa_t *const dfa = mctx->dfa;
1898   int lim_idx, src_pos, dst_pos;
1899 
1900   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1901   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1902   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1903     {
1904       int subexp_idx;
1905       struct re_backref_cache_entry *ent;
1906       ent = mctx->bkref_ents + limits->elems[lim_idx];
1907       subexp_idx = dfa->nodes[ent->node].opr.idx;
1908 
1909       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1910 					   subexp_idx, dst_node, dst_idx,
1911 					   dst_bkref_idx);
1912       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1913 					   subexp_idx, src_node, src_idx,
1914 					   src_bkref_idx);
1915 
1916       /* In case of:
1917 	 <src> <dst> ( <subexp> )
1918 	 ( <subexp> ) <src> <dst>
1919 	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1920       if (src_pos == dst_pos)
1921 	continue; /* This is unrelated limitation.  */
1922       else
1923 	return 1;
1924     }
1925   return 0;
1926 }
1927 
1928 static int
1929 internal_function
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,int subexp_idx,int from_node,int bkref_idx)1930 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1931 			     int subexp_idx, int from_node, int bkref_idx)
1932 {
1933   const re_dfa_t *const dfa = mctx->dfa;
1934   const re_node_set *eclosures = dfa->eclosures + from_node;
1935   int node_idx;
1936 
1937   /* Else, we are on the boundary: examine the nodes on the epsilon
1938      closure.  */
1939   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1940     {
1941       int node = eclosures->elems[node_idx];
1942       switch (dfa->nodes[node].type)
1943 	{
1944 	case OP_BACK_REF:
1945 	  if (bkref_idx != -1)
1946 	    {
1947 	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1948 	      do
1949 		{
1950 		  int dst, cpos;
1951 
1952 		  if (ent->node != node)
1953 		    continue;
1954 
1955 		  if (subexp_idx < BITSET_WORD_BITS
1956 		      && !(ent->eps_reachable_subexps_map
1957 			   & ((bitset_word_t) 1 << subexp_idx)))
1958 		    continue;
1959 
1960 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1961 		     OP_CLOSE_SUBEXP cases below.  But, if the
1962 		     destination node is the same node as the source
1963 		     node, don't recurse because it would cause an
1964 		     infinite loop: a regex that exhibits this behavior
1965 		     is ()\1*\1*  */
1966 		  dst = dfa->edests[node].elems[0];
1967 		  if (dst == from_node)
1968 		    {
1969 		      if (boundaries & 1)
1970 			return -1;
1971 		      else /* if (boundaries & 2) */
1972 			return 0;
1973 		    }
1974 
1975 		  cpos =
1976 		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1977 						 dst, bkref_idx);
1978 		  if (cpos == -1 /* && (boundaries & 1) */)
1979 		    return -1;
1980 		  if (cpos == 0 && (boundaries & 2))
1981 		    return 0;
1982 
1983 		  if (subexp_idx < BITSET_WORD_BITS)
1984 		    ent->eps_reachable_subexps_map
1985 		      &= ~((bitset_word_t) 1 << subexp_idx);
1986 		}
1987 	      while (ent++->more);
1988 	    }
1989 	  break;
1990 
1991 	case OP_OPEN_SUBEXP:
1992 	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1993 	    return -1;
1994 	  break;
1995 
1996 	case OP_CLOSE_SUBEXP:
1997 	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1998 	    return 0;
1999 	  break;
2000 
2001 	default:
2002 	    break;
2003 	}
2004     }
2005 
2006   return (boundaries & 2) ? 1 : 0;
2007 }
2008 
2009 static int
2010 internal_function
check_dst_limits_calc_pos(const re_match_context_t * mctx,int limit,int subexp_idx,int from_node,int str_idx,int bkref_idx)2011 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2012 			   int subexp_idx, int from_node, int str_idx,
2013 			   int bkref_idx)
2014 {
2015   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2016   int boundaries;
2017 
2018   /* If we are outside the range of the subexpression, return -1 or 1.  */
2019   if (str_idx < lim->subexp_from)
2020     return -1;
2021 
2022   if (lim->subexp_to < str_idx)
2023     return 1;
2024 
2025   /* If we are within the subexpression, return 0.  */
2026   boundaries = (str_idx == lim->subexp_from);
2027   boundaries |= (str_idx == lim->subexp_to) << 1;
2028   if (boundaries == 0)
2029     return 0;
2030 
2031   /* Else, examine epsilon closure.  */
2032   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2033 				      from_node, bkref_idx);
2034 }
2035 
2036 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2037    which are against limitations from DEST_NODES. */
2038 
2039 static reg_errcode_t
2040 internal_function
check_subexp_limits(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,int str_idx)2041 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2042 		     const re_node_set *candidates, re_node_set *limits,
2043 		     struct re_backref_cache_entry *bkref_ents, int str_idx)
2044 {
2045   reg_errcode_t err;
2046   int node_idx, lim_idx;
2047 
2048   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2049     {
2050       int subexp_idx;
2051       struct re_backref_cache_entry *ent;
2052       ent = bkref_ents + limits->elems[lim_idx];
2053 
2054       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2055 	continue; /* This is unrelated limitation.  */
2056 
2057       subexp_idx = dfa->nodes[ent->node].opr.idx;
2058       if (ent->subexp_to == str_idx)
2059 	{
2060 	  int ops_node = -1;
2061 	  int cls_node = -1;
2062 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2063 	    {
2064 	      int node = dest_nodes->elems[node_idx];
2065 	      re_token_type_t type = dfa->nodes[node].type;
2066 	      if (type == OP_OPEN_SUBEXP
2067 		  && subexp_idx == dfa->nodes[node].opr.idx)
2068 		ops_node = node;
2069 	      else if (type == OP_CLOSE_SUBEXP
2070 		       && subexp_idx == dfa->nodes[node].opr.idx)
2071 		cls_node = node;
2072 	    }
2073 
2074 	  /* Check the limitation of the open subexpression.  */
2075 	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2076 	  if (ops_node >= 0)
2077 	    {
2078 	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2079 					   candidates);
2080 	      if (BE (err != REG_NOERROR, 0))
2081 		return err;
2082 	    }
2083 
2084 	  /* Check the limitation of the close subexpression.  */
2085 	  if (cls_node >= 0)
2086 	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2087 	      {
2088 		int node = dest_nodes->elems[node_idx];
2089 		if (!re_node_set_contains (dfa->inveclosures + node,
2090 					   cls_node)
2091 		    && !re_node_set_contains (dfa->eclosures + node,
2092 					      cls_node))
2093 		  {
2094 		    /* It is against this limitation.
2095 		       Remove it form the current sifted state.  */
2096 		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2097 						 candidates);
2098 		    if (BE (err != REG_NOERROR, 0))
2099 		      return err;
2100 		    --node_idx;
2101 		  }
2102 	      }
2103 	}
2104       else /* (ent->subexp_to != str_idx)  */
2105 	{
2106 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2107 	    {
2108 	      int node = dest_nodes->elems[node_idx];
2109 	      re_token_type_t type = dfa->nodes[node].type;
2110 	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2111 		{
2112 		  if (subexp_idx != dfa->nodes[node].opr.idx)
2113 		    continue;
2114 		  /* It is against this limitation.
2115 		     Remove it form the current sifted state.  */
2116 		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2117 					       candidates);
2118 		  if (BE (err != REG_NOERROR, 0))
2119 		    return err;
2120 		}
2121 	    }
2122 	}
2123     }
2124   return REG_NOERROR;
2125 }
2126 
2127 static reg_errcode_t
2128 internal_function
sift_states_bkref(const re_match_context_t * mctx,re_sift_context_t * sctx,int str_idx,const re_node_set * candidates)2129 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2130 		   int str_idx, const re_node_set *candidates)
2131 {
2132   const re_dfa_t *const dfa = mctx->dfa;
2133   reg_errcode_t err;
2134   int node_idx, node;
2135   re_sift_context_t local_sctx;
2136   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2137 
2138   if (first_idx == -1)
2139     return REG_NOERROR;
2140 
2141   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2142 
2143   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2144     {
2145       int enabled_idx;
2146       re_token_type_t type;
2147       struct re_backref_cache_entry *entry;
2148       node = candidates->elems[node_idx];
2149       type = dfa->nodes[node].type;
2150       /* Avoid infinite loop for the REs like "()\1+".  */
2151       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2152 	continue;
2153       if (type != OP_BACK_REF)
2154 	continue;
2155 
2156       entry = mctx->bkref_ents + first_idx;
2157       enabled_idx = first_idx;
2158       do
2159 	{
2160 	  int subexp_len;
2161 	  int to_idx;
2162 	  int dst_node;
2163 	  int ret;
2164 	  re_dfastate_t *cur_state;
2165 
2166 	  if (entry->node != node)
2167 	    continue;
2168 	  subexp_len = entry->subexp_to - entry->subexp_from;
2169 	  to_idx = str_idx + subexp_len;
2170 	  dst_node = (subexp_len ? dfa->nexts[node]
2171 		      : dfa->edests[node].elems[0]);
2172 
2173 	  if (to_idx > sctx->last_str_idx
2174 	      || sctx->sifted_states[to_idx] == NULL
2175 	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2176 	      || check_dst_limits (mctx, &sctx->limits, node,
2177 				   str_idx, dst_node, to_idx))
2178 	    continue;
2179 
2180 	  if (local_sctx.sifted_states == NULL)
2181 	    {
2182 	      local_sctx = *sctx;
2183 	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2184 	      if (BE (err != REG_NOERROR, 0))
2185 		goto free_return;
2186 	    }
2187 	  local_sctx.last_node = node;
2188 	  local_sctx.last_str_idx = str_idx;
2189 	  ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2190 	  if (BE (ret < 0, 0))
2191 	    {
2192 	      err = REG_ESPACE;
2193 	      goto free_return;
2194 	    }
2195 	  cur_state = local_sctx.sifted_states[str_idx];
2196 	  err = sift_states_backward (mctx, &local_sctx);
2197 	  if (BE (err != REG_NOERROR, 0))
2198 	    goto free_return;
2199 	  if (sctx->limited_states != NULL)
2200 	    {
2201 	      err = merge_state_array (dfa, sctx->limited_states,
2202 				       local_sctx.sifted_states,
2203 				       str_idx + 1);
2204 	      if (BE (err != REG_NOERROR, 0))
2205 		goto free_return;
2206 	    }
2207 	  local_sctx.sifted_states[str_idx] = cur_state;
2208 	  re_node_set_remove (&local_sctx.limits, enabled_idx);
2209 
2210 	  /* mctx->bkref_ents may have changed, reload the pointer.  */
2211 	  entry = mctx->bkref_ents + enabled_idx;
2212 	}
2213       while (enabled_idx++, entry++->more);
2214     }
2215   err = REG_NOERROR;
2216  free_return:
2217   if (local_sctx.sifted_states != NULL)
2218     {
2219       re_node_set_free (&local_sctx.limits);
2220     }
2221 
2222   return err;
2223 }
2224 
2225 
2226 #ifdef RE_ENABLE_I18N
2227 static int
2228 internal_function
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,int node_idx,int str_idx,int max_str_idx)2229 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2230 		     int node_idx, int str_idx, int max_str_idx)
2231 {
2232   const re_dfa_t *const dfa = mctx->dfa;
2233   int naccepted;
2234   /* Check the node can accept `multi byte'.  */
2235   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2236   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2237       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2238 			    dfa->nexts[node_idx]))
2239     /* The node can't accept the `multi byte', or the
2240        destination was already thrown away, then the node
2241        couldn't accept the current input `multi byte'.   */
2242     naccepted = 0;
2243   /* Otherwise, it is sure that the node could accept
2244      `naccepted' bytes input.  */
2245   return naccepted;
2246 }
2247 #endif /* RE_ENABLE_I18N */
2248 
2249 
2250 /* Functions for state transition.  */
2251 
2252 /* Return the next state to which the current state STATE will transit by
2253    accepting the current input byte, and update STATE_LOG if necessary.
2254    If STATE can accept a multibyte char/collating element/back reference
2255    update the destination of STATE_LOG.  */
2256 
2257 static re_dfastate_t *
2258 internal_function
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)2259 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2260 	       re_dfastate_t *state)
2261 {
2262   re_dfastate_t **trtable;
2263   unsigned char ch;
2264 
2265 #ifdef RE_ENABLE_I18N
2266   /* If the current state can accept multibyte.  */
2267   if (BE (state->accept_mb, 0))
2268     {
2269       *err = transit_state_mb (mctx, state);
2270       if (BE (*err != REG_NOERROR, 0))
2271 	return NULL;
2272     }
2273 #endif /* RE_ENABLE_I18N */
2274 
2275   /* Then decide the next state with the single byte.  */
2276 #if 0
2277   if (0)
2278     /* don't use transition table  */
2279     return transit_state_sb (err, mctx, state);
2280 #endif
2281 
2282   /* Use transition table  */
2283   ch = re_string_fetch_byte (&mctx->input);
2284   for (;;)
2285     {
2286       trtable = state->trtable;
2287       if (BE (trtable != NULL, 1))
2288 	return trtable[ch];
2289 
2290       trtable = state->word_trtable;
2291       if (BE (trtable != NULL, 1))
2292 	{
2293 	  unsigned int context;
2294 	  context
2295 	    = re_string_context_at (&mctx->input,
2296 				    re_string_cur_idx (&mctx->input) - 1,
2297 				    mctx->eflags);
2298 	  if (IS_WORD_CONTEXT (context))
2299 	    return trtable[ch + SBC_MAX];
2300 	  else
2301 	    return trtable[ch];
2302 	}
2303 
2304       if (!build_trtable (mctx->dfa, state))
2305 	{
2306 	  *err = REG_ESPACE;
2307 	  return NULL;
2308 	}
2309 
2310       /* Retry, we now have a transition table.  */
2311     }
2312 }
2313 
2314 /* Update the state_log if we need */
2315 static re_dfastate_t *
2316 internal_function
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)2317 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2318 		      re_dfastate_t *next_state)
2319 {
2320   const re_dfa_t *const dfa = mctx->dfa;
2321   int cur_idx = re_string_cur_idx (&mctx->input);
2322 
2323   if (cur_idx > mctx->state_log_top)
2324     {
2325       mctx->state_log[cur_idx] = next_state;
2326       mctx->state_log_top = cur_idx;
2327     }
2328   else if (mctx->state_log[cur_idx] == NULL)
2329     {
2330       mctx->state_log[cur_idx] = next_state;
2331     }
2332   else
2333     {
2334       re_dfastate_t *pstate;
2335       unsigned int context;
2336       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2337       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2338 	 the destination of a multibyte char/collating element/
2339 	 back reference.  Then the next state is the union set of
2340 	 these destinations and the results of the transition table.  */
2341       pstate = mctx->state_log[cur_idx];
2342       log_nodes = pstate->entrance_nodes;
2343       if (next_state != NULL)
2344 	{
2345 	  table_nodes = next_state->entrance_nodes;
2346 	  *err = re_node_set_init_union (&next_nodes, table_nodes,
2347 					     log_nodes);
2348 	  if (BE (*err != REG_NOERROR, 0))
2349 	    return NULL;
2350 	}
2351       else
2352 	next_nodes = *log_nodes;
2353       /* Note: We already add the nodes of the initial state,
2354 	 then we don't need to add them here.  */
2355 
2356       context = re_string_context_at (&mctx->input,
2357 				      re_string_cur_idx (&mctx->input) - 1,
2358 				      mctx->eflags);
2359       next_state = mctx->state_log[cur_idx]
2360 	= re_acquire_state_context (err, dfa, &next_nodes, context);
2361       /* We don't need to check errors here, since the return value of
2362 	 this function is next_state and ERR is already set.  */
2363 
2364       if (table_nodes != NULL)
2365 	re_node_set_free (&next_nodes);
2366     }
2367 
2368   if (BE (dfa->nbackref, 0) && next_state != NULL)
2369     {
2370       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2371 	 later.  We must check them here, since the back references in the
2372 	 next state might use them.  */
2373       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2374 					cur_idx);
2375       if (BE (*err != REG_NOERROR, 0))
2376 	return NULL;
2377 
2378       /* If the next state has back references.  */
2379       if (next_state->has_backref)
2380 	{
2381 	  *err = transit_state_bkref (mctx, &next_state->nodes);
2382 	  if (BE (*err != REG_NOERROR, 0))
2383 	    return NULL;
2384 	  next_state = mctx->state_log[cur_idx];
2385 	}
2386     }
2387 
2388   return next_state;
2389 }
2390 
2391 /* Skip bytes in the input that correspond to part of a
2392    multi-byte match, then look in the log for a state
2393    from which to restart matching.  */
2394 static re_dfastate_t *
2395 internal_function
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2396 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2397 {
2398   re_dfastate_t *cur_state;
2399   do
2400     {
2401       int max = mctx->state_log_top;
2402       int cur_str_idx = re_string_cur_idx (&mctx->input);
2403 
2404       do
2405 	{
2406 	  if (++cur_str_idx > max)
2407 	    return NULL;
2408 	  re_string_skip_bytes (&mctx->input, 1);
2409 	}
2410       while (mctx->state_log[cur_str_idx] == NULL);
2411 
2412       cur_state = merge_state_with_log (err, mctx, NULL);
2413     }
2414   while (*err == REG_NOERROR && cur_state == NULL);
2415   return cur_state;
2416 }
2417 
2418 /* Helper functions for transit_state.  */
2419 
2420 /* From the node set CUR_NODES, pick up the nodes whose types are
2421    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2422    expression. And register them to use them later for evaluating the
2423    corresponding back references.  */
2424 
2425 static reg_errcode_t
2426 internal_function
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,int str_idx)2427 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2428 			   int str_idx)
2429 {
2430   const re_dfa_t *const dfa = mctx->dfa;
2431   int node_idx;
2432   reg_errcode_t err;
2433 
2434   /* TODO: This isn't efficient.
2435 	   Because there might be more than one nodes whose types are
2436 	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2437 	   nodes.
2438 	   E.g. RE: (a){2}  */
2439   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2440     {
2441       int node = cur_nodes->elems[node_idx];
2442       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2443 	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2444 	  && (dfa->used_bkref_map
2445 	      & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2446 	{
2447 	  err = match_ctx_add_subtop (mctx, node, str_idx);
2448 	  if (BE (err != REG_NOERROR, 0))
2449 	    return err;
2450 	}
2451     }
2452   return REG_NOERROR;
2453 }
2454 
2455 #if 0
2456 /* Return the next state to which the current state STATE will transit by
2457    accepting the current input byte.  */
2458 
2459 static re_dfastate_t *
2460 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2461 		  re_dfastate_t *state)
2462 {
2463   const re_dfa_t *const dfa = mctx->dfa;
2464   re_node_set next_nodes;
2465   re_dfastate_t *next_state;
2466   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2467   unsigned int context;
2468 
2469   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2470   if (BE (*err != REG_NOERROR, 0))
2471     return NULL;
2472   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2473     {
2474       int cur_node = state->nodes.elems[node_cnt];
2475       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2476 	{
2477 	  *err = re_node_set_merge (&next_nodes,
2478 				    dfa->eclosures + dfa->nexts[cur_node]);
2479 	  if (BE (*err != REG_NOERROR, 0))
2480 	    {
2481 	      re_node_set_free (&next_nodes);
2482 	      return NULL;
2483 	    }
2484 	}
2485     }
2486   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2487   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2488   /* We don't need to check errors here, since the return value of
2489      this function is next_state and ERR is already set.  */
2490 
2491   re_node_set_free (&next_nodes);
2492   re_string_skip_bytes (&mctx->input, 1);
2493   return next_state;
2494 }
2495 #endif
2496 
2497 #ifdef RE_ENABLE_I18N
2498 static reg_errcode_t
2499 internal_function
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)2500 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2501 {
2502   const re_dfa_t *const dfa = mctx->dfa;
2503   reg_errcode_t err;
2504   int i;
2505 
2506   for (i = 0; i < pstate->nodes.nelem; ++i)
2507     {
2508       re_node_set dest_nodes, *new_nodes;
2509       int cur_node_idx = pstate->nodes.elems[i];
2510       int naccepted, dest_idx;
2511       unsigned int context;
2512       re_dfastate_t *dest_state;
2513 
2514       if (!dfa->nodes[cur_node_idx].accept_mb)
2515 	continue;
2516 
2517       if (dfa->nodes[cur_node_idx].constraint)
2518 	{
2519 	  context = re_string_context_at (&mctx->input,
2520 					  re_string_cur_idx (&mctx->input),
2521 					  mctx->eflags);
2522 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2523 					   context))
2524 	    continue;
2525 	}
2526 
2527       /* How many bytes the node can accept?  */
2528       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2529 					   re_string_cur_idx (&mctx->input));
2530       if (naccepted == 0)
2531 	continue;
2532 
2533       /* The node can accepts `naccepted' bytes.  */
2534       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2535       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2536 			       : mctx->max_mb_elem_len);
2537       err = clean_state_log_if_needed (mctx, dest_idx);
2538       if (BE (err != REG_NOERROR, 0))
2539 	return err;
2540 #ifdef DEBUG
2541       assert (dfa->nexts[cur_node_idx] != -1);
2542 #endif
2543       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2544 
2545       dest_state = mctx->state_log[dest_idx];
2546       if (dest_state == NULL)
2547 	dest_nodes = *new_nodes;
2548       else
2549 	{
2550 	  err = re_node_set_init_union (&dest_nodes,
2551 					dest_state->entrance_nodes, new_nodes);
2552 	  if (BE (err != REG_NOERROR, 0))
2553 	    return err;
2554 	}
2555       context = re_string_context_at (&mctx->input, dest_idx - 1,
2556 				      mctx->eflags);
2557       mctx->state_log[dest_idx]
2558 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2559       if (dest_state != NULL)
2560 	re_node_set_free (&dest_nodes);
2561       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2562 	return err;
2563     }
2564   return REG_NOERROR;
2565 }
2566 #endif /* RE_ENABLE_I18N */
2567 
2568 static reg_errcode_t
2569 internal_function
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)2570 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2571 {
2572   const re_dfa_t *const dfa = mctx->dfa;
2573   reg_errcode_t err;
2574   int i;
2575   int cur_str_idx = re_string_cur_idx (&mctx->input);
2576 
2577   for (i = 0; i < nodes->nelem; ++i)
2578     {
2579       int dest_str_idx, prev_nelem, bkc_idx;
2580       int node_idx = nodes->elems[i];
2581       unsigned int context;
2582       const re_token_t *node = dfa->nodes + node_idx;
2583       re_node_set *new_dest_nodes;
2584 
2585       /* Check whether `node' is a backreference or not.  */
2586       if (node->type != OP_BACK_REF)
2587 	continue;
2588 
2589       if (node->constraint)
2590 	{
2591 	  context = re_string_context_at (&mctx->input, cur_str_idx,
2592 					  mctx->eflags);
2593 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2594 	    continue;
2595 	}
2596 
2597       /* `node' is a backreference.
2598 	 Check the substring which the substring matched.  */
2599       bkc_idx = mctx->nbkref_ents;
2600       err = get_subexp (mctx, node_idx, cur_str_idx);
2601       if (BE (err != REG_NOERROR, 0))
2602 	goto free_return;
2603 
2604       /* And add the epsilon closures (which is `new_dest_nodes') of
2605 	 the backreference to appropriate state_log.  */
2606 #ifdef DEBUG
2607       assert (dfa->nexts[node_idx] != -1);
2608 #endif
2609       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2610 	{
2611 	  int subexp_len;
2612 	  re_dfastate_t *dest_state;
2613 	  struct re_backref_cache_entry *bkref_ent;
2614 	  bkref_ent = mctx->bkref_ents + bkc_idx;
2615 	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2616 	    continue;
2617 	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2618 	  new_dest_nodes = (subexp_len == 0
2619 			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2620 			    : dfa->eclosures + dfa->nexts[node_idx]);
2621 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2622 			  - bkref_ent->subexp_from);
2623 	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2624 					  mctx->eflags);
2625 	  dest_state = mctx->state_log[dest_str_idx];
2626 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2627 			: mctx->state_log[cur_str_idx]->nodes.nelem);
2628 	  /* Add `new_dest_node' to state_log.  */
2629 	  if (dest_state == NULL)
2630 	    {
2631 	      mctx->state_log[dest_str_idx]
2632 		= re_acquire_state_context (&err, dfa, new_dest_nodes,
2633 					    context);
2634 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2635 		      && err != REG_NOERROR, 0))
2636 		goto free_return;
2637 	    }
2638 	  else
2639 	    {
2640 	      re_node_set dest_nodes;
2641 	      err = re_node_set_init_union (&dest_nodes,
2642 					    dest_state->entrance_nodes,
2643 					    new_dest_nodes);
2644 	      if (BE (err != REG_NOERROR, 0))
2645 		{
2646 		  re_node_set_free (&dest_nodes);
2647 		  goto free_return;
2648 		}
2649 	      mctx->state_log[dest_str_idx]
2650 		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2651 	      re_node_set_free (&dest_nodes);
2652 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2653 		      && err != REG_NOERROR, 0))
2654 		goto free_return;
2655 	    }
2656 	  /* We need to check recursively if the backreference can epsilon
2657 	     transit.  */
2658 	  if (subexp_len == 0
2659 	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2660 	    {
2661 	      err = check_subexp_matching_top (mctx, new_dest_nodes,
2662 					       cur_str_idx);
2663 	      if (BE (err != REG_NOERROR, 0))
2664 		goto free_return;
2665 	      err = transit_state_bkref (mctx, new_dest_nodes);
2666 	      if (BE (err != REG_NOERROR, 0))
2667 		goto free_return;
2668 	    }
2669 	}
2670     }
2671   err = REG_NOERROR;
2672  free_return:
2673   return err;
2674 }
2675 
2676 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2677    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2678    Note that we might collect inappropriate candidates here.
2679    However, the cost of checking them strictly here is too high, then we
2680    delay these checking for prune_impossible_nodes().  */
2681 
2682 static reg_errcode_t
2683 internal_function
get_subexp(re_match_context_t * mctx,int bkref_node,int bkref_str_idx)2684 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2685 {
2686   const re_dfa_t *const dfa = mctx->dfa;
2687   int subexp_num, sub_top_idx;
2688   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2689   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2690   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2691   if (cache_idx != -1)
2692     {
2693       const struct re_backref_cache_entry *entry
2694 	= mctx->bkref_ents + cache_idx;
2695       do
2696 	if (entry->node == bkref_node)
2697 	  return REG_NOERROR; /* We already checked it.  */
2698       while (entry++->more);
2699     }
2700 
2701   subexp_num = dfa->nodes[bkref_node].opr.idx;
2702 
2703   /* For each sub expression  */
2704   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2705     {
2706       reg_errcode_t err;
2707       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2708       re_sub_match_last_t *sub_last;
2709       int sub_last_idx, sl_str, bkref_str_off;
2710 
2711       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2712 	continue; /* It isn't related.  */
2713 
2714       sl_str = sub_top->str_idx;
2715       bkref_str_off = bkref_str_idx;
2716       /* At first, check the last node of sub expressions we already
2717 	 evaluated.  */
2718       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2719 	{
2720 	  int sl_str_diff;
2721 	  sub_last = sub_top->lasts[sub_last_idx];
2722 	  sl_str_diff = sub_last->str_idx - sl_str;
2723 	  /* The matched string by the sub expression match with the substring
2724 	     at the back reference?  */
2725 	  if (sl_str_diff > 0)
2726 	    {
2727 	      if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2728 		{
2729 		  /* Not enough chars for a successful match.  */
2730 		  if (bkref_str_off + sl_str_diff > mctx->input.len)
2731 		    break;
2732 
2733 		  err = clean_state_log_if_needed (mctx,
2734 						   bkref_str_off
2735 						   + sl_str_diff);
2736 		  if (BE (err != REG_NOERROR, 0))
2737 		    return err;
2738 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2739 		}
2740 	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2741 		/* We don't need to search this sub expression any more.  */
2742 		break;
2743 	    }
2744 	  bkref_str_off += sl_str_diff;
2745 	  sl_str += sl_str_diff;
2746 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2747 				bkref_str_idx);
2748 
2749 	  /* Reload buf, since the preceding call might have reallocated
2750 	     the buffer.  */
2751 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2752 
2753 	  if (err == REG_NOMATCH)
2754 	    continue;
2755 	  if (BE (err != REG_NOERROR, 0))
2756 	    return err;
2757 	}
2758 
2759       if (sub_last_idx < sub_top->nlasts)
2760 	continue;
2761       if (sub_last_idx > 0)
2762 	++sl_str;
2763       /* Then, search for the other last nodes of the sub expression.  */
2764       for (; sl_str <= bkref_str_idx; ++sl_str)
2765 	{
2766 	  int cls_node, sl_str_off;
2767 	  const re_node_set *nodes;
2768 	  sl_str_off = sl_str - sub_top->str_idx;
2769 	  /* The matched string by the sub expression match with the substring
2770 	     at the back reference?  */
2771 	  if (sl_str_off > 0)
2772 	    {
2773 	      if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2774 		{
2775 		  /* If we are at the end of the input, we cannot match.  */
2776 		  if (bkref_str_off >= mctx->input.len)
2777 		    break;
2778 
2779 		  err = extend_buffers (mctx);
2780 		  if (BE (err != REG_NOERROR, 0))
2781 		    return err;
2782 
2783 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2784 		}
2785 	      if (buf [bkref_str_off++] != buf[sl_str - 1])
2786 		break; /* We don't need to search this sub expression
2787 			  any more.  */
2788 	    }
2789 	  if (mctx->state_log[sl_str] == NULL)
2790 	    continue;
2791 	  /* Does this state have a ')' of the sub expression?  */
2792 	  nodes = &mctx->state_log[sl_str]->nodes;
2793 	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
2794 				       OP_CLOSE_SUBEXP);
2795 	  if (cls_node == -1)
2796 	    continue; /* No.  */
2797 	  if (sub_top->path == NULL)
2798 	    {
2799 	      sub_top->path = calloc (sizeof (state_array_t),
2800 				      sl_str - sub_top->str_idx + 1);
2801 	      if (sub_top->path == NULL)
2802 		return REG_ESPACE;
2803 	    }
2804 	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2805 	     in the current context?  */
2806 	  err = check_arrival (mctx, sub_top->path, sub_top->node,
2807 			       sub_top->str_idx, cls_node, sl_str,
2808 			       OP_CLOSE_SUBEXP);
2809 	  if (err == REG_NOMATCH)
2810 	      continue;
2811 	  if (BE (err != REG_NOERROR, 0))
2812 	      return err;
2813 	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2814 	  if (BE (sub_last == NULL, 0))
2815 	    return REG_ESPACE;
2816 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2817 				bkref_str_idx);
2818 	  if (err == REG_NOMATCH)
2819 	    continue;
2820 	}
2821     }
2822   return REG_NOERROR;
2823 }
2824 
2825 /* Helper functions for get_subexp().  */
2826 
2827 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2828    If it can arrive, register the sub expression expressed with SUB_TOP
2829    and SUB_LAST.  */
2830 
2831 static reg_errcode_t
2832 internal_function
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,int bkref_node,int bkref_str)2833 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2834 		re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2835 {
2836   reg_errcode_t err;
2837   int to_idx;
2838   /* Can the subexpression arrive the back reference?  */
2839   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2840 		       sub_last->str_idx, bkref_node, bkref_str,
2841 		       OP_OPEN_SUBEXP);
2842   if (err != REG_NOERROR)
2843     return err;
2844   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2845 			     sub_last->str_idx);
2846   if (BE (err != REG_NOERROR, 0))
2847     return err;
2848   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2849   return clean_state_log_if_needed (mctx, to_idx);
2850 }
2851 
2852 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2853    Search '(' if FL_OPEN, or search ')' otherwise.
2854    TODO: This function isn't efficient...
2855 	 Because there might be more than one nodes whose types are
2856 	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2857 	 nodes.
2858 	 E.g. RE: (a){2}  */
2859 
2860 static int
2861 internal_function
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,int subexp_idx,int type)2862 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2863 		  int subexp_idx, int type)
2864 {
2865   int cls_idx;
2866   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2867     {
2868       int cls_node = nodes->elems[cls_idx];
2869       const re_token_t *node = dfa->nodes + cls_node;
2870       if (node->type == type
2871 	  && node->opr.idx == subexp_idx)
2872 	return cls_node;
2873     }
2874   return -1;
2875 }
2876 
2877 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2878    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2879    heavily reused.
2880    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2881 
2882 static reg_errcode_t
2883 internal_function
check_arrival(re_match_context_t * mctx,state_array_t * path,int top_node,int top_str,int last_node,int last_str,int type)2884 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2885 	       int top_str, int last_node, int last_str, int type)
2886 {
2887   const re_dfa_t *const dfa = mctx->dfa;
2888   reg_errcode_t err = REG_NOERROR;
2889   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2890   re_dfastate_t *cur_state = NULL;
2891   re_node_set *cur_nodes, next_nodes;
2892   re_dfastate_t **backup_state_log;
2893   unsigned int context;
2894 
2895   subexp_num = dfa->nodes[top_node].opr.idx;
2896   /* Extend the buffer if we need.  */
2897   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2898     {
2899       re_dfastate_t **new_array;
2900       int old_alloc = path->alloc;
2901       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2902       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2903       if (BE (new_array == NULL, 0))
2904 	{
2905 	  path->alloc = old_alloc;
2906 	  return REG_ESPACE;
2907 	}
2908       path->array = new_array;
2909       memset (new_array + old_alloc, '\0',
2910 	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2911     }
2912 
2913   str_idx = path->next_idx ? path->next_idx : top_str;
2914 
2915   /* Temporary modify MCTX.  */
2916   backup_state_log = mctx->state_log;
2917   backup_cur_idx = mctx->input.cur_idx;
2918   mctx->state_log = path->array;
2919   mctx->input.cur_idx = str_idx;
2920 
2921   /* Setup initial node set.  */
2922   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2923   if (str_idx == top_str)
2924     {
2925       err = re_node_set_init_1 (&next_nodes, top_node);
2926       if (BE (err != REG_NOERROR, 0))
2927 	return err;
2928       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2929       if (BE (err != REG_NOERROR, 0))
2930 	{
2931 	  re_node_set_free (&next_nodes);
2932 	  return err;
2933 	}
2934     }
2935   else
2936     {
2937       cur_state = mctx->state_log[str_idx];
2938       if (cur_state && cur_state->has_backref)
2939 	{
2940 	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2941 	  if (BE (err != REG_NOERROR, 0))
2942 	    return err;
2943 	}
2944       else
2945 	re_node_set_init_empty (&next_nodes);
2946     }
2947   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2948     {
2949       if (next_nodes.nelem)
2950 	{
2951 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2952 				    subexp_num, type);
2953 	  if (BE (err != REG_NOERROR, 0))
2954 	    {
2955 	      re_node_set_free (&next_nodes);
2956 	      return err;
2957 	    }
2958 	}
2959       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2960       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2961 	{
2962 	  re_node_set_free (&next_nodes);
2963 	  return err;
2964 	}
2965       mctx->state_log[str_idx] = cur_state;
2966     }
2967 
2968   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2969     {
2970       re_node_set_empty (&next_nodes);
2971       if (mctx->state_log[str_idx + 1])
2972 	{
2973 	  err = re_node_set_merge (&next_nodes,
2974 				   &mctx->state_log[str_idx + 1]->nodes);
2975 	  if (BE (err != REG_NOERROR, 0))
2976 	    {
2977 	      re_node_set_free (&next_nodes);
2978 	      return err;
2979 	    }
2980 	}
2981       if (cur_state)
2982 	{
2983 	  err = check_arrival_add_next_nodes (mctx, str_idx,
2984 					      &cur_state->non_eps_nodes,
2985 					      &next_nodes);
2986 	  if (BE (err != REG_NOERROR, 0))
2987 	    {
2988 	      re_node_set_free (&next_nodes);
2989 	      return err;
2990 	    }
2991 	}
2992       ++str_idx;
2993       if (next_nodes.nelem)
2994 	{
2995 	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2996 	  if (BE (err != REG_NOERROR, 0))
2997 	    {
2998 	      re_node_set_free (&next_nodes);
2999 	      return err;
3000 	    }
3001 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3002 				    subexp_num, type);
3003 	  if (BE (err != REG_NOERROR, 0))
3004 	    {
3005 	      re_node_set_free (&next_nodes);
3006 	      return err;
3007 	    }
3008 	}
3009       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3010       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3011       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3012 	{
3013 	  re_node_set_free (&next_nodes);
3014 	  return err;
3015 	}
3016       mctx->state_log[str_idx] = cur_state;
3017       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3018     }
3019   re_node_set_free (&next_nodes);
3020   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3021 	       : &mctx->state_log[last_str]->nodes);
3022   path->next_idx = str_idx;
3023 
3024   /* Fix MCTX.  */
3025   mctx->state_log = backup_state_log;
3026   mctx->input.cur_idx = backup_cur_idx;
3027 
3028   /* Then check the current node set has the node LAST_NODE.  */
3029   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3030     return REG_NOERROR;
3031 
3032   return REG_NOMATCH;
3033 }
3034 
3035 /* Helper functions for check_arrival.  */
3036 
3037 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3038    to NEXT_NODES.
3039    TODO: This function is similar to the functions transit_state*(),
3040 	 however this function has many additional works.
3041 	 Can't we unify them?  */
3042 
3043 static reg_errcode_t
3044 internal_function
check_arrival_add_next_nodes(re_match_context_t * mctx,int str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)3045 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3046 			      re_node_set *cur_nodes, re_node_set *next_nodes)
3047 {
3048   const re_dfa_t *const dfa = mctx->dfa;
3049   int result;
3050   int cur_idx;
3051 #ifdef RE_ENABLE_I18N
3052   reg_errcode_t err = REG_NOERROR;
3053 #endif
3054   re_node_set union_set;
3055   re_node_set_init_empty (&union_set);
3056   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3057     {
3058       int naccepted = 0;
3059       int cur_node = cur_nodes->elems[cur_idx];
3060 #ifdef DEBUG
3061       re_token_type_t type = dfa->nodes[cur_node].type;
3062       assert (!IS_EPSILON_NODE (type));
3063 #endif
3064 #ifdef RE_ENABLE_I18N
3065       /* If the node may accept `multi byte'.  */
3066       if (dfa->nodes[cur_node].accept_mb)
3067 	{
3068 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3069 					       str_idx);
3070 	  if (naccepted > 1)
3071 	    {
3072 	      re_dfastate_t *dest_state;
3073 	      int next_node = dfa->nexts[cur_node];
3074 	      int next_idx = str_idx + naccepted;
3075 	      dest_state = mctx->state_log[next_idx];
3076 	      re_node_set_empty (&union_set);
3077 	      if (dest_state)
3078 		{
3079 		  err = re_node_set_merge (&union_set, &dest_state->nodes);
3080 		  if (BE (err != REG_NOERROR, 0))
3081 		    {
3082 		      re_node_set_free (&union_set);
3083 		      return err;
3084 		    }
3085 		}
3086 	      result = re_node_set_insert (&union_set, next_node);
3087 	      if (BE (result < 0, 0))
3088 		{
3089 		  re_node_set_free (&union_set);
3090 		  return REG_ESPACE;
3091 		}
3092 	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3093 							    &union_set);
3094 	      if (BE (mctx->state_log[next_idx] == NULL
3095 		      && err != REG_NOERROR, 0))
3096 		{
3097 		  re_node_set_free (&union_set);
3098 		  return err;
3099 		}
3100 	    }
3101 	}
3102 #endif /* RE_ENABLE_I18N */
3103       if (naccepted
3104 	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3105 	{
3106 	  result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3107 	  if (BE (result < 0, 0))
3108 	    {
3109 	      re_node_set_free (&union_set);
3110 	      return REG_ESPACE;
3111 	    }
3112 	}
3113     }
3114   re_node_set_free (&union_set);
3115   return REG_NOERROR;
3116 }
3117 
3118 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3119    CUR_NODES, however exclude the nodes which are:
3120     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3121     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3122 */
3123 
3124 static reg_errcode_t
3125 internal_function
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,int ex_subexp,int type)3126 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3127 			  int ex_subexp, int type)
3128 {
3129   reg_errcode_t err;
3130   int idx, outside_node;
3131   re_node_set new_nodes;
3132 #ifdef DEBUG
3133   assert (cur_nodes->nelem);
3134 #endif
3135   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3136   if (BE (err != REG_NOERROR, 0))
3137     return err;
3138   /* Create a new node set NEW_NODES with the nodes which are epsilon
3139      closures of the node in CUR_NODES.  */
3140 
3141   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3142     {
3143       int cur_node = cur_nodes->elems[idx];
3144       const re_node_set *eclosure = dfa->eclosures + cur_node;
3145       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3146       if (outside_node == -1)
3147 	{
3148 	  /* There are no problematic nodes, just merge them.  */
3149 	  err = re_node_set_merge (&new_nodes, eclosure);
3150 	  if (BE (err != REG_NOERROR, 0))
3151 	    {
3152 	      re_node_set_free (&new_nodes);
3153 	      return err;
3154 	    }
3155 	}
3156       else
3157 	{
3158 	  /* There are problematic nodes, re-calculate incrementally.  */
3159 	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3160 					      ex_subexp, type);
3161 	  if (BE (err != REG_NOERROR, 0))
3162 	    {
3163 	      re_node_set_free (&new_nodes);
3164 	      return err;
3165 	    }
3166 	}
3167     }
3168   re_node_set_free (cur_nodes);
3169   *cur_nodes = new_nodes;
3170   return REG_NOERROR;
3171 }
3172 
3173 /* Helper function for check_arrival_expand_ecl.
3174    Check incrementally the epsilon closure of TARGET, and if it isn't
3175    problematic append it to DST_NODES.  */
3176 
3177 static reg_errcode_t
3178 internal_function
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,int target,int ex_subexp,int type)3179 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3180 			      int target, int ex_subexp, int type)
3181 {
3182   int cur_node;
3183   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3184     {
3185       int err;
3186 
3187       if (dfa->nodes[cur_node].type == type
3188 	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
3189 	{
3190 	  if (type == OP_CLOSE_SUBEXP)
3191 	    {
3192 	      err = re_node_set_insert (dst_nodes, cur_node);
3193 	      if (BE (err == -1, 0))
3194 		return REG_ESPACE;
3195 	    }
3196 	  break;
3197 	}
3198       err = re_node_set_insert (dst_nodes, cur_node);
3199       if (BE (err == -1, 0))
3200 	return REG_ESPACE;
3201       if (dfa->edests[cur_node].nelem == 0)
3202 	break;
3203       if (dfa->edests[cur_node].nelem == 2)
3204 	{
3205 	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3206 					      dfa->edests[cur_node].elems[1],
3207 					      ex_subexp, type);
3208 	  if (BE (err != REG_NOERROR, 0))
3209 	    return err;
3210 	}
3211       cur_node = dfa->edests[cur_node].elems[0];
3212     }
3213   return REG_NOERROR;
3214 }
3215 
3216 
3217 /* For all the back references in the current state, calculate the
3218    destination of the back references by the appropriate entry
3219    in MCTX->BKREF_ENTS.  */
3220 
3221 static reg_errcode_t
3222 internal_function
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,int cur_str,int subexp_num,int type)3223 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3224 		    int cur_str, int subexp_num, int type)
3225 {
3226   const re_dfa_t *const dfa = mctx->dfa;
3227   reg_errcode_t err;
3228   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3229   struct re_backref_cache_entry *ent;
3230 
3231   if (cache_idx_start == -1)
3232     return REG_NOERROR;
3233 
3234  restart:
3235   ent = mctx->bkref_ents + cache_idx_start;
3236   do
3237     {
3238       int to_idx, next_node;
3239 
3240       /* Is this entry ENT is appropriate?  */
3241       if (!re_node_set_contains (cur_nodes, ent->node))
3242 	continue; /* No.  */
3243 
3244       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3245       /* Calculate the destination of the back reference, and append it
3246 	 to MCTX->STATE_LOG.  */
3247       if (to_idx == cur_str)
3248 	{
3249 	  /* The backreference did epsilon transit, we must re-check all the
3250 	     node in the current state.  */
3251 	  re_node_set new_dests;
3252 	  reg_errcode_t err2, err3;
3253 	  next_node = dfa->edests[ent->node].elems[0];
3254 	  if (re_node_set_contains (cur_nodes, next_node))
3255 	    continue;
3256 	  err = re_node_set_init_1 (&new_dests, next_node);
3257 	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3258 	  err3 = re_node_set_merge (cur_nodes, &new_dests);
3259 	  re_node_set_free (&new_dests);
3260 	  if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3261 		  || err3 != REG_NOERROR, 0))
3262 	    {
3263 	      err = (err != REG_NOERROR ? err
3264 		     : (err2 != REG_NOERROR ? err2 : err3));
3265 	      return err;
3266 	    }
3267 	  /* TODO: It is still inefficient...  */
3268 	  goto restart;
3269 	}
3270       else
3271 	{
3272 	  re_node_set union_set;
3273 	  next_node = dfa->nexts[ent->node];
3274 	  if (mctx->state_log[to_idx])
3275 	    {
3276 	      int ret;
3277 	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3278 					next_node))
3279 		continue;
3280 	      err = re_node_set_init_copy (&union_set,
3281 					   &mctx->state_log[to_idx]->nodes);
3282 	      ret = re_node_set_insert (&union_set, next_node);
3283 	      if (BE (err != REG_NOERROR || ret < 0, 0))
3284 		{
3285 		  re_node_set_free (&union_set);
3286 		  err = err != REG_NOERROR ? err : REG_ESPACE;
3287 		  return err;
3288 		}
3289 	    }
3290 	  else
3291 	    {
3292 	      err = re_node_set_init_1 (&union_set, next_node);
3293 	      if (BE (err != REG_NOERROR, 0))
3294 		return err;
3295 	    }
3296 	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3297 	  re_node_set_free (&union_set);
3298 	  if (BE (mctx->state_log[to_idx] == NULL
3299 		  && err != REG_NOERROR, 0))
3300 	    return err;
3301 	}
3302     }
3303   while (ent++->more);
3304   return REG_NOERROR;
3305 }
3306 
3307 /* Build transition table for the state.
3308    Return 1 if succeeded, otherwise return NULL.  */
3309 
3310 static int
3311 internal_function
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)3312 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3313 {
3314   reg_errcode_t err;
3315   int i, j, ch, need_word_trtable = 0;
3316   bitset_word_t elem, mask;
3317   bool dests_node_malloced = false;
3318   bool dest_states_malloced = false;
3319   int ndests; /* Number of the destination states from `state'.  */
3320   re_dfastate_t **trtable;
3321   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3322   re_node_set follows, *dests_node;
3323   bitset_t *dests_ch;
3324   bitset_t acceptable;
3325 
3326   struct dests_alloc
3327   {
3328     re_node_set dests_node[SBC_MAX];
3329     bitset_t dests_ch[SBC_MAX];
3330   } *dests_alloc;
3331 
3332   /* We build DFA states which corresponds to the destination nodes
3333      from `state'.  `dests_node[i]' represents the nodes which i-th
3334      destination state contains, and `dests_ch[i]' represents the
3335      characters which i-th destination state accepts.  */
3336 #ifdef HAVE_ALLOCA
3337   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3338     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3339   else
3340 #endif
3341     {
3342       dests_alloc = re_malloc (struct dests_alloc, 1);
3343       if (BE (dests_alloc == NULL, 0))
3344 	return 0;
3345       dests_node_malloced = true;
3346     }
3347   dests_node = dests_alloc->dests_node;
3348   dests_ch = dests_alloc->dests_ch;
3349 
3350   /* Initialize transition table.  */
3351   state->word_trtable = state->trtable = NULL;
3352 
3353   /* At first, group all nodes belonging to `state' into several
3354      destinations.  */
3355   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3356   if (BE (ndests <= 0, 0))
3357     {
3358       if (dests_node_malloced)
3359 	free (dests_alloc);
3360       /* Return 0 in case of an error, 1 otherwise.  */
3361       if (ndests == 0)
3362 	{
3363 	  state->trtable = (re_dfastate_t **)
3364 	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
3365 	  return 1;
3366 	}
3367       return 0;
3368     }
3369 
3370   err = re_node_set_alloc (&follows, ndests + 1);
3371   if (BE (err != REG_NOERROR, 0))
3372     goto out_free;
3373 
3374   /* Avoid arithmetic overflow in size calculation.  */
3375   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3376 	    / (3 * sizeof (re_dfastate_t *)))
3377 	   < ndests),
3378 	  0))
3379     goto out_free;
3380 
3381 #ifdef HAVE_ALLOCA
3382   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3383 			 + ndests * 3 * sizeof (re_dfastate_t *)))
3384     dest_states = (re_dfastate_t **)
3385       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3386   else
3387 #endif
3388     {
3389       dest_states = (re_dfastate_t **)
3390 	malloc (ndests * 3 * sizeof (re_dfastate_t *));
3391       if (BE (dest_states == NULL, 0))
3392 	{
3393 out_free:
3394 	  if (dest_states_malloced)
3395 	    free (dest_states);
3396 	  re_node_set_free (&follows);
3397 	  for (i = 0; i < ndests; ++i)
3398 	    re_node_set_free (dests_node + i);
3399 	  if (dests_node_malloced)
3400 	    free (dests_alloc);
3401 	  return 0;
3402 	}
3403       dest_states_malloced = true;
3404     }
3405   dest_states_word = dest_states + ndests;
3406   dest_states_nl = dest_states_word + ndests;
3407   bitset_empty (acceptable);
3408 
3409   /* Then build the states for all destinations.  */
3410   for (i = 0; i < ndests; ++i)
3411     {
3412       int next_node;
3413       re_node_set_empty (&follows);
3414       /* Merge the follows of this destination states.  */
3415       for (j = 0; j < dests_node[i].nelem; ++j)
3416 	{
3417 	  next_node = dfa->nexts[dests_node[i].elems[j]];
3418 	  if (next_node != -1)
3419 	    {
3420 	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3421 	      if (BE (err != REG_NOERROR, 0))
3422 		goto out_free;
3423 	    }
3424 	}
3425       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3426       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3427 	goto out_free;
3428       /* If the new state has context constraint,
3429 	 build appropriate states for these contexts.  */
3430       if (dest_states[i]->has_constraint)
3431 	{
3432 	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3433 							  CONTEXT_WORD);
3434 	  if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3435 	    goto out_free;
3436 
3437 	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3438 	    need_word_trtable = 1;
3439 
3440 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3441 							CONTEXT_NEWLINE);
3442 	  if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3443 	    goto out_free;
3444  	}
3445       else
3446 	{
3447 	  dest_states_word[i] = dest_states[i];
3448 	  dest_states_nl[i] = dest_states[i];
3449 	}
3450       bitset_merge (acceptable, dests_ch[i]);
3451     }
3452 
3453   if (!BE (need_word_trtable, 0))
3454     {
3455       /* We don't care about whether the following character is a word
3456 	 character, or we are in a single-byte character set so we can
3457 	 discern by looking at the character code: allocate a
3458 	 256-entry transition table.  */
3459       trtable = state->trtable =
3460 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3461       if (BE (trtable == NULL, 0))
3462 	goto out_free;
3463 
3464       /* For all characters ch...:  */
3465       for (i = 0; i < BITSET_WORDS; ++i)
3466 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3467 	     elem;
3468 	     mask <<= 1, elem >>= 1, ++ch)
3469 	  if (BE (elem & 1, 0))
3470 	    {
3471 	      /* There must be exactly one destination which accepts
3472 		 character ch.  See group_nodes_into_DFAstates.  */
3473 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3474 		;
3475 
3476 	      /* j-th destination accepts the word character ch.  */
3477 	      if (dfa->word_char[i] & mask)
3478 		trtable[ch] = dest_states_word[j];
3479 	      else
3480 		trtable[ch] = dest_states[j];
3481 	    }
3482     }
3483   else
3484     {
3485       /* We care about whether the following character is a word
3486 	 character, and we are in a multi-byte character set: discern
3487 	 by looking at the character code: build two 256-entry
3488 	 transition tables, one starting at trtable[0] and one
3489 	 starting at trtable[SBC_MAX].  */
3490       trtable = state->word_trtable =
3491 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3492       if (BE (trtable == NULL, 0))
3493 	goto out_free;
3494 
3495       /* For all characters ch...:  */
3496       for (i = 0; i < BITSET_WORDS; ++i)
3497 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3498 	     elem;
3499 	     mask <<= 1, elem >>= 1, ++ch)
3500 	  if (BE (elem & 1, 0))
3501 	    {
3502 	      /* There must be exactly one destination which accepts
3503 		 character ch.  See group_nodes_into_DFAstates.  */
3504 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3505 		;
3506 
3507 	      /* j-th destination accepts the word character ch.  */
3508 	      trtable[ch] = dest_states[j];
3509 	      trtable[ch + SBC_MAX] = dest_states_word[j];
3510 	    }
3511     }
3512 
3513   /* new line */
3514   if (bitset_contain (acceptable, NEWLINE_CHAR))
3515     {
3516       /* The current state accepts newline character.  */
3517       for (j = 0; j < ndests; ++j)
3518 	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3519 	  {
3520 	    /* k-th destination accepts newline character.  */
3521 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
3522 	    if (need_word_trtable)
3523 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3524 	    /* There must be only one destination which accepts
3525 	       newline.  See group_nodes_into_DFAstates.  */
3526 	    break;
3527 	  }
3528     }
3529 
3530   if (dest_states_malloced)
3531     free (dest_states);
3532 
3533   re_node_set_free (&follows);
3534   for (i = 0; i < ndests; ++i)
3535     re_node_set_free (dests_node + i);
3536 
3537   if (dests_node_malloced)
3538     free (dests_alloc);
3539 
3540   return 1;
3541 }
3542 
3543 /* Group all nodes belonging to STATE into several destinations.
3544    Then for all destinations, set the nodes belonging to the destination
3545    to DESTS_NODE[i] and set the characters accepted by the destination
3546    to DEST_CH[i].  This function return the number of destinations.  */
3547 
3548 static int
3549 internal_function
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)3550 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3551 			    re_node_set *dests_node, bitset_t *dests_ch)
3552 {
3553   reg_errcode_t err;
3554   int result;
3555   int i, j, k;
3556   int ndests; /* Number of the destinations from `state'.  */
3557   bitset_t accepts; /* Characters a node can accept.  */
3558   const re_node_set *cur_nodes = &state->nodes;
3559   bitset_empty (accepts);
3560   ndests = 0;
3561 
3562   /* For all the nodes belonging to `state',  */
3563   for (i = 0; i < cur_nodes->nelem; ++i)
3564     {
3565       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3566       re_token_type_t type = node->type;
3567       unsigned int constraint = node->constraint;
3568 
3569       /* Enumerate all single byte character this node can accept.  */
3570       if (type == CHARACTER)
3571 	bitset_set (accepts, node->opr.c);
3572       else if (type == SIMPLE_BRACKET)
3573 	{
3574 	  bitset_merge (accepts, node->opr.sbcset);
3575 	}
3576       else if (type == OP_PERIOD)
3577 	{
3578 #ifdef RE_ENABLE_I18N
3579 	  if (dfa->mb_cur_max > 1)
3580 	    bitset_merge (accepts, dfa->sb_char);
3581 	  else
3582 #endif
3583 	    bitset_set_all (accepts);
3584 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3585 	    bitset_clear (accepts, '\n');
3586 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3587 	    bitset_clear (accepts, '\0');
3588 	}
3589 #ifdef RE_ENABLE_I18N
3590       else if (type == OP_UTF8_PERIOD)
3591 	{
3592 	  memset (accepts, '\xff', sizeof (bitset_t) / 2);
3593 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3594 	    bitset_clear (accepts, '\n');
3595 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3596 	    bitset_clear (accepts, '\0');
3597 	}
3598 #endif
3599       else
3600 	continue;
3601 
3602       /* Check the `accepts' and sift the characters which are not
3603 	 match it the context.  */
3604       if (constraint)
3605 	{
3606 	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
3607 	    {
3608 	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3609 	      bitset_empty (accepts);
3610 	      if (accepts_newline)
3611 		bitset_set (accepts, NEWLINE_CHAR);
3612 	      else
3613 		continue;
3614 	    }
3615 	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
3616 	    {
3617 	      bitset_empty (accepts);
3618 	      continue;
3619 	    }
3620 
3621 	  if (constraint & NEXT_WORD_CONSTRAINT)
3622 	    {
3623 	      bitset_word_t any_set = 0;
3624 	      if (type == CHARACTER && !node->word_char)
3625 		{
3626 		  bitset_empty (accepts);
3627 		  continue;
3628 		}
3629 #ifdef RE_ENABLE_I18N
3630 	      if (dfa->mb_cur_max > 1)
3631 		for (j = 0; j < BITSET_WORDS; ++j)
3632 		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3633 	      else
3634 #endif
3635 		for (j = 0; j < BITSET_WORDS; ++j)
3636 		  any_set |= (accepts[j] &= dfa->word_char[j]);
3637 	      if (!any_set)
3638 		continue;
3639 	    }
3640 	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
3641 	    {
3642 	      bitset_word_t any_set = 0;
3643 	      if (type == CHARACTER && node->word_char)
3644 		{
3645 		  bitset_empty (accepts);
3646 		  continue;
3647 		}
3648 #ifdef RE_ENABLE_I18N
3649 	      if (dfa->mb_cur_max > 1)
3650 		for (j = 0; j < BITSET_WORDS; ++j)
3651 		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3652 	      else
3653 #endif
3654 		for (j = 0; j < BITSET_WORDS; ++j)
3655 		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3656 	      if (!any_set)
3657 		continue;
3658 	    }
3659 	}
3660 
3661       /* Then divide `accepts' into DFA states, or create a new
3662 	 state.  Above, we make sure that accepts is not empty.  */
3663       for (j = 0; j < ndests; ++j)
3664 	{
3665 	  bitset_t intersec; /* Intersection sets, see below.  */
3666 	  bitset_t remains;
3667 	  /* Flags, see below.  */
3668 	  bitset_word_t has_intersec, not_subset, not_consumed;
3669 
3670 	  /* Optimization, skip if this state doesn't accept the character.  */
3671 	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3672 	    continue;
3673 
3674 	  /* Enumerate the intersection set of this state and `accepts'.  */
3675 	  has_intersec = 0;
3676 	  for (k = 0; k < BITSET_WORDS; ++k)
3677 	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3678 	  /* And skip if the intersection set is empty.  */
3679 	  if (!has_intersec)
3680 	    continue;
3681 
3682 	  /* Then check if this state is a subset of `accepts'.  */
3683 	  not_subset = not_consumed = 0;
3684 	  for (k = 0; k < BITSET_WORDS; ++k)
3685 	    {
3686 	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3687 	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3688 	    }
3689 
3690 	  /* If this state isn't a subset of `accepts', create a
3691 	     new group state, which has the `remains'. */
3692 	  if (not_subset)
3693 	    {
3694 	      bitset_copy (dests_ch[ndests], remains);
3695 	      bitset_copy (dests_ch[j], intersec);
3696 	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3697 	      if (BE (err != REG_NOERROR, 0))
3698 		goto error_return;
3699 	      ++ndests;
3700 	    }
3701 
3702 	  /* Put the position in the current group. */
3703 	  result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3704 	  if (BE (result < 0, 0))
3705 	    goto error_return;
3706 
3707 	  /* If all characters are consumed, go to next node. */
3708 	  if (!not_consumed)
3709 	    break;
3710 	}
3711       /* Some characters remain, create a new group. */
3712       if (j == ndests)
3713 	{
3714 	  bitset_copy (dests_ch[ndests], accepts);
3715 	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3716 	  if (BE (err != REG_NOERROR, 0))
3717 	    goto error_return;
3718 	  ++ndests;
3719 	  bitset_empty (accepts);
3720 	}
3721     }
3722   return ndests;
3723  error_return:
3724   for (j = 0; j < ndests; ++j)
3725     re_node_set_free (dests_node + j);
3726   return -1;
3727 }
3728 
3729 #ifdef RE_ENABLE_I18N
3730 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3731    Return the number of the bytes the node accepts.
3732    STR_IDX is the current index of the input string.
3733 
3734    This function handles the nodes which can accept one character, or
3735    one collating element like '.', '[a-z]', opposite to the other nodes
3736    can only accept one byte.  */
3737 
3738 static int
3739 internal_function
check_node_accept_bytes(const re_dfa_t * dfa,int node_idx,const re_string_t * input,int str_idx)3740 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3741 			 const re_string_t *input, int str_idx)
3742 {
3743   const re_token_t *node = dfa->nodes + node_idx;
3744   int char_len, elem_len;
3745   int i;
3746   wint_t wc;
3747 
3748   if (BE (node->type == OP_UTF8_PERIOD, 0))
3749     {
3750       unsigned char c = re_string_byte_at (input, str_idx), d;
3751       if (BE (c < 0xc2, 1))
3752 	return 0;
3753 
3754       if (str_idx + 2 > input->len)
3755 	return 0;
3756 
3757       d = re_string_byte_at (input, str_idx + 1);
3758       if (c < 0xe0)
3759 	return (d < 0x80 || d > 0xbf) ? 0 : 2;
3760       else if (c < 0xf0)
3761 	{
3762 	  char_len = 3;
3763 	  if (c == 0xe0 && d < 0xa0)
3764 	    return 0;
3765 	}
3766       else if (c < 0xf8)
3767 	{
3768 	  char_len = 4;
3769 	  if (c == 0xf0 && d < 0x90)
3770 	    return 0;
3771 	}
3772       else if (c < 0xfc)
3773 	{
3774 	  char_len = 5;
3775 	  if (c == 0xf8 && d < 0x88)
3776 	    return 0;
3777 	}
3778       else if (c < 0xfe)
3779 	{
3780 	  char_len = 6;
3781 	  if (c == 0xfc && d < 0x84)
3782 	    return 0;
3783 	}
3784       else
3785 	return 0;
3786 
3787       if (str_idx + char_len > input->len)
3788 	return 0;
3789 
3790       for (i = 1; i < char_len; ++i)
3791 	{
3792 	  d = re_string_byte_at (input, str_idx + i);
3793 	  if (d < 0x80 || d > 0xbf)
3794 	    return 0;
3795 	}
3796       return char_len;
3797     }
3798 
3799   char_len = re_string_char_size_at (input, str_idx);
3800   if (node->type == OP_PERIOD)
3801     {
3802       if (char_len <= 1)
3803 	return 0;
3804       /* FIXME: I don't think this if is needed, as both '\n'
3805 	 and '\0' are char_len == 1.  */
3806       /* '.' accepts any one character except the following two cases.  */
3807       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3808 	   re_string_byte_at (input, str_idx) == '\n') ||
3809 	  ((dfa->syntax & RE_DOT_NOT_NULL) &&
3810 	   re_string_byte_at (input, str_idx) == '\0'))
3811 	return 0;
3812       return char_len;
3813     }
3814 
3815   elem_len = re_string_elem_size_at (input, str_idx);
3816   wc = __btowc(*(input->mbs+str_idx));
3817   if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3818     return 0;
3819 
3820   if (node->type == COMPLEX_BRACKET)
3821     {
3822       const re_charset_t *cset = node->opr.mbcset;
3823 # ifdef _LIBC
3824       const unsigned char *pin
3825 	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3826       int j;
3827       uint32_t nrules;
3828 # endif /* _LIBC */
3829       int match_len = 0;
3830       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3831 		    ? re_string_wchar_at (input, str_idx) : 0);
3832 
3833       /* match with multibyte character?  */
3834       for (i = 0; i < cset->nmbchars; ++i)
3835 	if (wc == cset->mbchars[i])
3836 	  {
3837 	    match_len = char_len;
3838 	    goto check_node_accept_bytes_match;
3839 	  }
3840       /* match with character_class?  */
3841       for (i = 0; i < cset->nchar_classes; ++i)
3842 	{
3843 	  wctype_t wt = cset->char_classes[i];
3844 	  if (__iswctype (wc, wt))
3845 	    {
3846 	      match_len = char_len;
3847 	      goto check_node_accept_bytes_match;
3848 	    }
3849 	}
3850 
3851 # ifdef _LIBC
3852       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3853       if (nrules != 0)
3854 	{
3855 	  unsigned int in_collseq = 0;
3856 	  const int32_t *table, *indirect;
3857 	  const unsigned char *weights, *extra;
3858 	  const char *collseqwc;
3859 	  /* This #include defines a local function!  */
3860 #  include <locale/weight.h>
3861 
3862 	  /* match with collating_symbol?  */
3863 	  if (cset->ncoll_syms)
3864 	    extra = (const unsigned char *)
3865 	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3866 	  for (i = 0; i < cset->ncoll_syms; ++i)
3867 	    {
3868 	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
3869 	      /* Compare the length of input collating element and
3870 		 the length of current collating element.  */
3871 	      if (*coll_sym != elem_len)
3872 		continue;
3873 	      /* Compare each bytes.  */
3874 	      for (j = 0; j < *coll_sym; j++)
3875 		if (pin[j] != coll_sym[1 + j])
3876 		  break;
3877 	      if (j == *coll_sym)
3878 		{
3879 		  /* Match if every bytes is equal.  */
3880 		  match_len = j;
3881 		  goto check_node_accept_bytes_match;
3882 		}
3883 	    }
3884 
3885 	  if (cset->nranges)
3886 	    {
3887 	      if (elem_len <= char_len)
3888 		{
3889 		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3890 		  in_collseq = __collseq_table_lookup (collseqwc, wc);
3891 		}
3892 	      else
3893 		in_collseq = find_collation_sequence_value (pin, elem_len);
3894 	    }
3895 	  /* match with range expression?  */
3896 	  for (i = 0; i < cset->nranges; ++i)
3897 	    if (cset->range_starts[i] <= in_collseq
3898 		&& in_collseq <= cset->range_ends[i])
3899 	      {
3900 		match_len = elem_len;
3901 		goto check_node_accept_bytes_match;
3902 	      }
3903 
3904 	  /* match with equivalence_class?  */
3905 	  if (cset->nequiv_classes)
3906 	    {
3907 	      const unsigned char *cp = pin;
3908 	      table = (const int32_t *)
3909 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3910 	      weights = (const unsigned char *)
3911 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3912 	      extra = (const unsigned char *)
3913 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3914 	      indirect = (const int32_t *)
3915 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3916 	      int32_t idx = findidx (&cp);
3917 	      if (idx > 0)
3918 		for (i = 0; i < cset->nequiv_classes; ++i)
3919 		  {
3920 		    int32_t equiv_class_idx = cset->equiv_classes[i];
3921 		    size_t weight_len = weights[idx & 0xffffff];
3922 		    if (weight_len == weights[equiv_class_idx & 0xffffff]
3923 			&& (idx >> 24) == (equiv_class_idx >> 24))
3924 		      {
3925 			int cnt = 0;
3926 
3927 			idx &= 0xffffff;
3928 			equiv_class_idx &= 0xffffff;
3929 
3930 			while (cnt <= weight_len
3931 			       && (weights[equiv_class_idx + 1 + cnt]
3932 				   == weights[idx + 1 + cnt]))
3933 			  ++cnt;
3934 			if (cnt > weight_len)
3935 			  {
3936 			    match_len = elem_len;
3937 			    goto check_node_accept_bytes_match;
3938 			  }
3939 		      }
3940 		  }
3941 	    }
3942 	}
3943       else
3944 # endif /* _LIBC */
3945 	{
3946 	  /* match with range expression?  */
3947 #if __GNUC__ >= 2
3948 	  wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3949 #else
3950 	  wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3951 	  cmp_buf[2] = wc;
3952 #endif
3953 	  for (i = 0; i < cset->nranges; ++i)
3954 	    {
3955 	      cmp_buf[0] = cset->range_starts[i];
3956 	      cmp_buf[4] = cset->range_ends[i];
3957 	      if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3958 		  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3959 		{
3960 		  match_len = char_len;
3961 		  goto check_node_accept_bytes_match;
3962 		}
3963 	    }
3964 	}
3965     check_node_accept_bytes_match:
3966       if (!cset->non_match)
3967 	return match_len;
3968       else
3969 	{
3970 	  if (match_len > 0)
3971 	    return 0;
3972 	  else
3973 	    return (elem_len > char_len) ? elem_len : char_len;
3974 	}
3975     }
3976   return 0;
3977 }
3978 
3979 # ifdef _LIBC
3980 static unsigned int
3981 internal_function
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)3982 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3983 {
3984   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3985   if (nrules == 0)
3986     {
3987       if (mbs_len == 1)
3988 	{
3989 	  /* No valid character.  Match it as a single byte character.  */
3990 	  const unsigned char *collseq = (const unsigned char *)
3991 	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3992 	  return collseq[mbs[0]];
3993 	}
3994       return UINT_MAX;
3995     }
3996   else
3997     {
3998       int32_t idx;
3999       const unsigned char *extra = (const unsigned char *)
4000 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4001       int32_t extrasize = (const unsigned char *)
4002 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4003 
4004       for (idx = 0; idx < extrasize;)
4005 	{
4006 	  int mbs_cnt, found = 0;
4007 	  int32_t elem_mbs_len;
4008 	  /* Skip the name of collating element name.  */
4009 	  idx = idx + extra[idx] + 1;
4010 	  elem_mbs_len = extra[idx++];
4011 	  if (mbs_len == elem_mbs_len)
4012 	    {
4013 	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4014 		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4015 		  break;
4016 	      if (mbs_cnt == elem_mbs_len)
4017 		/* Found the entry.  */
4018 		found = 1;
4019 	    }
4020 	  /* Skip the byte sequence of the collating element.  */
4021 	  idx += elem_mbs_len;
4022 	  /* Adjust for the alignment.  */
4023 	  idx = (idx + 3) & ~3;
4024 	  /* Skip the collation sequence value.  */
4025 	  idx += sizeof (uint32_t);
4026 	  /* Skip the wide char sequence of the collating element.  */
4027 	  idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4028 	  /* If we found the entry, return the sequence value.  */
4029 	  if (found)
4030 	    return *(uint32_t *) (extra + idx);
4031 	  /* Skip the collation sequence value.  */
4032 	  idx += sizeof (uint32_t);
4033 	}
4034       return UINT_MAX;
4035     }
4036 }
4037 # endif /* _LIBC */
4038 #endif /* RE_ENABLE_I18N */
4039 
4040 /* Check whether the node accepts the byte which is IDX-th
4041    byte of the INPUT.  */
4042 
4043 static int
4044 internal_function
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,int idx)4045 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4046 		   int idx)
4047 {
4048   unsigned char ch;
4049   ch = re_string_byte_at (&mctx->input, idx);
4050   switch (node->type)
4051     {
4052     case CHARACTER:
4053       if (node->opr.c != ch)
4054 	return 0;
4055       break;
4056 
4057     case SIMPLE_BRACKET:
4058       if (!bitset_contain (node->opr.sbcset, ch))
4059 	return 0;
4060       break;
4061 
4062 #ifdef RE_ENABLE_I18N
4063     case OP_UTF8_PERIOD:
4064       if (ch >= 0x80)
4065 	return 0;
4066       /* FALLTHROUGH */
4067 #endif
4068     case OP_PERIOD:
4069       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4070 	  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4071 	return 0;
4072       break;
4073 
4074     default:
4075       return 0;
4076     }
4077 
4078   if (node->constraint)
4079     {
4080       /* The node has constraints.  Check whether the current context
4081 	 satisfies the constraints.  */
4082       unsigned int context = re_string_context_at (&mctx->input, idx,
4083 						   mctx->eflags);
4084       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4085 	return 0;
4086     }
4087 
4088   return 1;
4089 }
4090 
4091 /* Extend the buffers, if the buffers have run out.  */
4092 
4093 static reg_errcode_t
4094 internal_function
extend_buffers(re_match_context_t * mctx)4095 extend_buffers (re_match_context_t *mctx)
4096 {
4097   reg_errcode_t ret;
4098   re_string_t *pstr = &mctx->input;
4099 
4100   /* Avoid overflow.  */
4101   if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4102     return REG_ESPACE;
4103 
4104   /* Double the lengths of the buffers.  */
4105   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4106   if (BE (ret != REG_NOERROR, 0))
4107     return ret;
4108 
4109   if (mctx->state_log != NULL)
4110     {
4111       /* And double the length of state_log.  */
4112       /* XXX We have no indication of the size of this buffer.  If this
4113 	 allocation fail we have no indication that the state_log array
4114 	 does not have the right size.  */
4115       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4116 					      pstr->bufs_len + 1);
4117       if (BE (new_array == NULL, 0))
4118 	return REG_ESPACE;
4119       mctx->state_log = new_array;
4120     }
4121 
4122   /* Then reconstruct the buffers.  */
4123   if (pstr->icase)
4124     {
4125 #ifdef RE_ENABLE_I18N
4126       if (pstr->mb_cur_max > 1)
4127 	{
4128 	  ret = build_wcs_upper_buffer (pstr);
4129 	  if (BE (ret != REG_NOERROR, 0))
4130 	    return ret;
4131 	}
4132       else
4133 #endif /* RE_ENABLE_I18N  */
4134 	build_upper_buffer (pstr);
4135     }
4136   else
4137     {
4138 #ifdef RE_ENABLE_I18N
4139       if (pstr->mb_cur_max > 1)
4140 	build_wcs_buffer (pstr);
4141       else
4142 #endif /* RE_ENABLE_I18N  */
4143 	{
4144 	  if (pstr->trans != NULL)
4145 	    re_string_translate_buffer (pstr);
4146 	}
4147     }
4148   return REG_NOERROR;
4149 }
4150 
4151 
4152 /* Functions for matching context.  */
4153 
4154 /* Initialize MCTX.  */
4155 
4156 static reg_errcode_t
4157 internal_function
match_ctx_init(re_match_context_t * mctx,int eflags,int n)4158 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4159 {
4160   mctx->eflags = eflags;
4161   mctx->match_last = -1;
4162   if (n > 0)
4163     {
4164       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4165       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4166       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4167 	return REG_ESPACE;
4168     }
4169   /* Already zero-ed by the caller.
4170      else
4171        mctx->bkref_ents = NULL;
4172      mctx->nbkref_ents = 0;
4173      mctx->nsub_tops = 0;  */
4174   mctx->abkref_ents = n;
4175   mctx->max_mb_elem_len = 1;
4176   mctx->asub_tops = n;
4177   return REG_NOERROR;
4178 }
4179 
4180 /* Clean the entries which depend on the current input in MCTX.
4181    This function must be invoked when the matcher changes the start index
4182    of the input, or changes the input string.  */
4183 
4184 static void
4185 internal_function
match_ctx_clean(re_match_context_t * mctx)4186 match_ctx_clean (re_match_context_t *mctx)
4187 {
4188   int st_idx;
4189   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4190     {
4191       int sl_idx;
4192       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4193       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4194 	{
4195 	  re_sub_match_last_t *last = top->lasts[sl_idx];
4196 	  re_free (last->path.array);
4197 	  re_free (last);
4198 	}
4199       re_free (top->lasts);
4200       if (top->path)
4201 	{
4202 	  re_free (top->path->array);
4203 	  re_free (top->path);
4204 	}
4205       free (top);
4206     }
4207 
4208   mctx->nsub_tops = 0;
4209   mctx->nbkref_ents = 0;
4210 }
4211 
4212 /* Free all the memory associated with MCTX.  */
4213 
4214 static void
4215 internal_function
match_ctx_free(re_match_context_t * mctx)4216 match_ctx_free (re_match_context_t *mctx)
4217 {
4218   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4219   match_ctx_clean (mctx);
4220   re_free (mctx->sub_tops);
4221   re_free (mctx->bkref_ents);
4222 }
4223 
4224 /* Add a new backreference entry to MCTX.
4225    Note that we assume that caller never call this function with duplicate
4226    entry, and call with STR_IDX which isn't smaller than any existing entry.
4227 */
4228 
4229 static reg_errcode_t
4230 internal_function
match_ctx_add_entry(re_match_context_t * mctx,int node,int str_idx,int from,int to)4231 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4232 		     int to)
4233 {
4234   if (mctx->nbkref_ents >= mctx->abkref_ents)
4235     {
4236       struct re_backref_cache_entry* new_entry;
4237       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4238 			      mctx->abkref_ents * 2);
4239       if (BE (new_entry == NULL, 0))
4240 	{
4241 	  re_free (mctx->bkref_ents);
4242 	  return REG_ESPACE;
4243 	}
4244       mctx->bkref_ents = new_entry;
4245       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4246 	      sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4247       mctx->abkref_ents *= 2;
4248     }
4249   if (mctx->nbkref_ents > 0
4250       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4251     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4252 
4253   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4254   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4255   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4256   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4257 
4258   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4259      If bit N is clear, means that this entry won't epsilon-transition to
4260      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4261      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4262      such node.
4263 
4264      A backreference does not epsilon-transition unless it is empty, so set
4265      to all zeros if FROM != TO.  */
4266   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4267     = (from == to ? ~0 : 0);
4268 
4269   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4270   if (mctx->max_mb_elem_len < to - from)
4271     mctx->max_mb_elem_len = to - from;
4272   return REG_NOERROR;
4273 }
4274 
4275 /* Search for the first entry which has the same str_idx, or -1 if none is
4276    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4277 
4278 static int
4279 internal_function
search_cur_bkref_entry(const re_match_context_t * mctx,int str_idx)4280 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4281 {
4282   int left, right, mid, last;
4283   last = right = mctx->nbkref_ents;
4284   for (left = 0; left < right;)
4285     {
4286       mid = left + (right - left) / 2;
4287       if (mctx->bkref_ents[mid].str_idx < str_idx)
4288 	left = mid + 1;
4289       else
4290 	right = mid;
4291     }
4292   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4293     return left;
4294   else
4295     return -1;
4296 }
4297 
4298 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4299    at STR_IDX.  */
4300 
4301 static reg_errcode_t
4302 internal_function
match_ctx_add_subtop(re_match_context_t * mctx,int node,int str_idx)4303 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4304 {
4305 #ifdef DEBUG
4306   assert (mctx->sub_tops != NULL);
4307   assert (mctx->asub_tops > 0);
4308 #endif
4309   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4310     {
4311       int new_asub_tops = mctx->asub_tops * 2;
4312       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4313 						   re_sub_match_top_t *,
4314 						   new_asub_tops);
4315       if (BE (new_array == NULL, 0))
4316 	return REG_ESPACE;
4317       mctx->sub_tops = new_array;
4318       mctx->asub_tops = new_asub_tops;
4319     }
4320   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4321   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4322     return REG_ESPACE;
4323   mctx->sub_tops[mctx->nsub_tops]->node = node;
4324   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4325   return REG_NOERROR;
4326 }
4327 
4328 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4329    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4330 
4331 static re_sub_match_last_t *
4332 internal_function
match_ctx_add_sublast(re_sub_match_top_t * subtop,int node,int str_idx)4333 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4334 {
4335   re_sub_match_last_t *new_entry;
4336   if (BE (subtop->nlasts == subtop->alasts, 0))
4337     {
4338       int new_alasts = 2 * subtop->alasts + 1;
4339       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4340 						    re_sub_match_last_t *,
4341 						    new_alasts);
4342       if (BE (new_array == NULL, 0))
4343 	return NULL;
4344       subtop->lasts = new_array;
4345       subtop->alasts = new_alasts;
4346     }
4347   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4348   if (BE (new_entry != NULL, 1))
4349     {
4350       subtop->lasts[subtop->nlasts] = new_entry;
4351       new_entry->node = node;
4352       new_entry->str_idx = str_idx;
4353       ++subtop->nlasts;
4354     }
4355   return new_entry;
4356 }
4357 
4358 static void
4359 internal_function
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,int last_node,int last_str_idx)4360 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4361 	       re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4362 {
4363   sctx->sifted_states = sifted_sts;
4364   sctx->limited_states = limited_sts;
4365   sctx->last_node = last_node;
4366   sctx->last_str_idx = last_str_idx;
4367   re_node_set_init_empty (&sctx->limits);
4368 }
4369