1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2012 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License along
17    with this program; if not, see <http://www.gnu.org/licenses/>.  */
18 
19 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
20 				     Idx n) internal_function;
21 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
22 static void match_ctx_free (re_match_context_t *cache) internal_function;
23 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
24 					  Idx str_idx, Idx from, Idx to)
25      internal_function;
26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
27      internal_function;
28 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
29 					   Idx str_idx) internal_function;
30 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
31 						    Idx node, Idx str_idx)
32      internal_function;
33 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
34 			   re_dfastate_t **limited_sts, Idx last_node,
35 			   Idx last_str_idx)
36      internal_function;
37 static reg_errcode_t re_search_internal (const regex_t *preg,
38 					 const char *string, Idx length,
39 					 Idx start, Idx last_start, Idx stop,
40 					 size_t nmatch, regmatch_t pmatch[],
41 					 int eflags) internal_function;
42 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
43 				  const char *string1, Idx length1,
44 				  const char *string2, Idx length2,
45 				  Idx start, regoff_t range,
46 				  struct re_registers *regs,
47 				  Idx stop, bool ret_len) internal_function;
48 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
49 				const char *string, Idx length, Idx start,
50 				regoff_t range, Idx stop,
51 				struct re_registers *regs,
52 				bool ret_len) internal_function;
53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54                               Idx nregs, int regs_allocated) internal_function;
55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
56      internal_function;
57 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
58 			   Idx *p_match_first) internal_function;
59 static Idx check_halt_state_context (const re_match_context_t *mctx,
60 				     const re_dfastate_t *state, Idx idx)
61      internal_function;
62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
63 			 regmatch_t *prev_idx_match, Idx cur_node,
64 			 Idx cur_idx, Idx nmatch) internal_function;
65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
66 				      Idx str_idx, Idx dest_node, Idx nregs,
67 				      regmatch_t *regs,
68 				      re_node_set *eps_via_nodes)
69      internal_function;
70 static reg_errcode_t set_regs (const regex_t *preg,
71 			       const re_match_context_t *mctx,
72 			       size_t nmatch, regmatch_t *pmatch,
73 			       bool fl_backtrack) internal_function;
74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
75      internal_function;
76 
77 #ifdef RE_ENABLE_I18N
78 static int sift_states_iter_mb (const re_match_context_t *mctx,
79 				re_sift_context_t *sctx,
80 				Idx node_idx, Idx str_idx, Idx max_str_idx)
81      internal_function;
82 #endif /* RE_ENABLE_I18N */
83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
84 					   re_sift_context_t *sctx)
85      internal_function;
86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
87 					  re_sift_context_t *sctx, Idx str_idx,
88 					  re_node_set *cur_dest)
89      internal_function;
90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
91 					      re_sift_context_t *sctx,
92 					      Idx str_idx,
93 					      re_node_set *dest_nodes)
94      internal_function;
95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
96 					    re_node_set *dest_nodes,
97 					    const re_node_set *candidates)
98      internal_function;
99 static bool check_dst_limits (const re_match_context_t *mctx,
100 			      const re_node_set *limits,
101 			      Idx dst_node, Idx dst_idx, Idx src_node,
102 			      Idx src_idx) internal_function;
103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
104 					int boundaries, Idx subexp_idx,
105 					Idx from_node, Idx bkref_idx)
106      internal_function;
107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
108 				      Idx limit, Idx subexp_idx,
109 				      Idx node, Idx str_idx,
110 				      Idx bkref_idx) internal_function;
111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
112 					  re_node_set *dest_nodes,
113 					  const re_node_set *candidates,
114 					  re_node_set *limits,
115 					  struct re_backref_cache_entry *bkref_ents,
116 					  Idx str_idx) internal_function;
117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
118 					re_sift_context_t *sctx,
119 					Idx str_idx, const re_node_set *candidates)
120      internal_function;
121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
122 					re_dfastate_t **dst,
123 					re_dfastate_t **src, Idx num)
124      internal_function;
125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
126 					 re_match_context_t *mctx) internal_function;
127 static re_dfastate_t *transit_state (reg_errcode_t *err,
128 				     re_match_context_t *mctx,
129 				     re_dfastate_t *state) internal_function;
130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
131 					    re_match_context_t *mctx,
132 					    re_dfastate_t *next_state)
133      internal_function;
134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
135 						re_node_set *cur_nodes,
136 						Idx str_idx) internal_function;
137 #if 0
138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 					re_match_context_t *mctx,
140 					re_dfastate_t *pstate)
141      internal_function;
142 #endif
143 #ifdef RE_ENABLE_I18N
144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
145 				       re_dfastate_t *pstate)
146      internal_function;
147 #endif /* RE_ENABLE_I18N */
148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
149 					  const re_node_set *nodes)
150      internal_function;
151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
152 				 Idx bkref_node, Idx bkref_str_idx)
153      internal_function;
154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 				     const re_sub_match_top_t *sub_top,
156 				     re_sub_match_last_t *sub_last,
157 				     Idx bkref_node, Idx bkref_str)
158      internal_function;
159 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
160 			     Idx subexp_idx, int type) internal_function;
161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
162 				    state_array_t *path, Idx top_node,
163 				    Idx top_str, Idx last_node, Idx last_str,
164 				    int type) internal_function;
165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
166 						   Idx str_idx,
167 						   re_node_set *cur_nodes,
168 						   re_node_set *next_nodes)
169      internal_function;
170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
171 					       re_node_set *cur_nodes,
172 					       Idx ex_subexp, int type)
173      internal_function;
174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
175 						   re_node_set *dst_nodes,
176 						   Idx target, Idx ex_subexp,
177 						   int type) internal_function;
178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
179 					 re_node_set *cur_nodes, Idx cur_str,
180 					 Idx subexp_num, int type)
181      internal_function;
182 static bool build_trtable (const re_dfa_t *dfa,
183 			   re_dfastate_t *state) internal_function;
184 #ifdef RE_ENABLE_I18N
185 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
186 				    const re_string_t *input, Idx idx)
187      internal_function;
188 # ifdef _LIBC
189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
190 						   size_t name_len)
191      internal_function;
192 # endif /* _LIBC */
193 #endif /* RE_ENABLE_I18N */
194 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
195 				       const re_dfastate_t *state,
196 				       re_node_set *states_node,
197 				       bitset_t *states_ch) internal_function;
198 static bool check_node_accept (const re_match_context_t *mctx,
199 			       const re_token_t *node, Idx idx)
200      internal_function;
201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
202      internal_function;
203 
204 /* Entry point for POSIX code.  */
205 
206 /* regexec searches for a given pattern, specified by PREG, in the
207    string STRING.
208 
209    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
210    'regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
211    least NMATCH elements, and we set them to the offsets of the
212    corresponding matched substrings.
213 
214    EFLAGS specifies "execution flags" which affect matching: if
215    REG_NOTBOL is set, then ^ does not match at the beginning of the
216    string; if REG_NOTEOL is set, then $ does not match at the end.
217 
218    We return 0 if we find a match and REG_NOMATCH if not.  */
219 
220 int
regexec(preg,string,nmatch,pmatch,eflags)221 regexec (preg, string, nmatch, pmatch, eflags)
222     const regex_t *_Restrict_ preg;
223     const char *_Restrict_ string;
224     size_t nmatch;
225     regmatch_t pmatch[_Restrict_arr_];
226     int eflags;
227 {
228   reg_errcode_t err;
229   Idx start, length;
230 #ifdef _LIBC
231   re_dfa_t *dfa = preg->buffer;
232 #endif
233 
234   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
235     return REG_BADPAT;
236 
237   if (eflags & REG_STARTEND)
238     {
239       start = pmatch[0].rm_so;
240       length = pmatch[0].rm_eo;
241     }
242   else
243     {
244       start = 0;
245       length = strlen (string);
246     }
247 
248   __libc_lock_lock (dfa->lock);
249   if (preg->no_sub)
250     err = re_search_internal (preg, string, length, start, length,
251 			      length, 0, NULL, eflags);
252   else
253     err = re_search_internal (preg, string, length, start, length,
254 			      length, nmatch, pmatch, eflags);
255   __libc_lock_unlock (dfa->lock);
256   return err != REG_NOERROR;
257 }
258 
259 #ifdef _LIBC
260 # include <shlib-compat.h>
261 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
262 
263 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
264 __typeof__ (__regexec) __compat_regexec;
265 
266 int
267 attribute_compat_text_section
__compat_regexec(const regex_t * _Restrict_ preg,const char * _Restrict_ string,size_t nmatch,regmatch_t pmatch[],int eflags)268 __compat_regexec (const regex_t *_Restrict_ preg,
269 		  const char *_Restrict_ string, size_t nmatch,
270 		  regmatch_t pmatch[], int eflags)
271 {
272   return regexec (preg, string, nmatch, pmatch,
273 		  eflags & (REG_NOTBOL | REG_NOTEOL));
274 }
275 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
276 # endif
277 #endif
278 
279 /* Entry points for GNU code.  */
280 
281 /* re_match, re_search, re_match_2, re_search_2
282 
283    The former two functions operate on STRING with length LENGTH,
284    while the later two operate on concatenation of STRING1 and STRING2
285    with lengths LENGTH1 and LENGTH2, respectively.
286 
287    re_match() matches the compiled pattern in BUFP against the string,
288    starting at index START.
289 
290    re_search() first tries matching at index START, then it tries to match
291    starting from index START + 1, and so on.  The last start position tried
292    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
293    way as re_match().)
294 
295    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
296    the first STOP characters of the concatenation of the strings should be
297    concerned.
298 
299    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
300    and all groups is stored in REGS.  (For the "_2" variants, the offsets are
301    computed relative to the concatenation, not relative to the individual
302    strings.)
303 
304    On success, re_match* functions return the length of the match, re_search*
305    return the position of the start of the match.  Return value -1 means no
306    match was found and -2 indicates an internal error.  */
307 
308 regoff_t
re_match(bufp,string,length,start,regs)309 re_match (bufp, string, length, start, regs)
310     struct re_pattern_buffer *bufp;
311     const char *string;
312     Idx length, start;
313     struct re_registers *regs;
314 {
315   return re_search_stub (bufp, string, length, start, 0, length, regs, true);
316 }
317 #ifdef _LIBC
318 weak_alias (__re_match, re_match)
319 #endif
320 
321 regoff_t
322 re_search (bufp, string, length, start, range, regs)
323     struct re_pattern_buffer *bufp;
324     const char *string;
325     Idx length, start;
326     regoff_t range;
327     struct re_registers *regs;
328 {
329   return re_search_stub (bufp, string, length, start, range, length, regs,
330 			 false);
331 }
332 #ifdef _LIBC
333 weak_alias (__re_search, re_search)
334 #endif
335 
336 regoff_t
337 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
338     struct re_pattern_buffer *bufp;
339     const char *string1, *string2;
340     Idx length1, length2, start, stop;
341     struct re_registers *regs;
342 {
343   return re_search_2_stub (bufp, string1, length1, string2, length2,
344 			   start, 0, regs, stop, true);
345 }
346 #ifdef _LIBC
347 weak_alias (__re_match_2, re_match_2)
348 #endif
349 
350 regoff_t
351 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
352     struct re_pattern_buffer *bufp;
353     const char *string1, *string2;
354     Idx length1, length2, start, stop;
355     regoff_t range;
356     struct re_registers *regs;
357 {
358   return re_search_2_stub (bufp, string1, length1, string2, length2,
359 			   start, range, regs, stop, false);
360 }
361 #ifdef _LIBC
weak_alias(__re_search_2,re_search_2)362 weak_alias (__re_search_2, re_search_2)
363 #endif
364 
365 static regoff_t
366 re_search_2_stub (struct re_pattern_buffer *bufp,
367 		  const char *string1, Idx length1,
368 		  const char *string2, Idx length2,
369 		  Idx start, regoff_t range, struct re_registers *regs,
370 		  Idx stop, bool ret_len)
371 {
372   const char *str;
373   regoff_t rval;
374   Idx len = length1 + length2;
375   char *s = NULL;
376 
377   if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
378     return -2;
379 
380   /* Concatenate the strings.  */
381   if (length2 > 0)
382     if (length1 > 0)
383       {
384 	s = re_malloc (char, len);
385 
386 	if (BE (s == NULL, 0))
387 	  return -2;
388 #ifdef _LIBC
389 	memcpy (__mempcpy (s, string1, length1), string2, length2);
390 #else
391 	memcpy (s, string1, length1);
392 	memcpy (s + length1, string2, length2);
393 #endif
394 	str = s;
395       }
396     else
397       str = string2;
398   else
399     str = string1;
400 
401   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
402 			 ret_len);
403   re_free (s);
404   return rval;
405 }
406 
407 /* The parameters have the same meaning as those of re_search.
408    Additional parameters:
409    If RET_LEN is true the length of the match is returned (re_match style);
410    otherwise the position of the match is returned.  */
411 
412 static regoff_t
re_search_stub(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,regoff_t range,Idx stop,struct re_registers * regs,bool ret_len)413 re_search_stub (struct re_pattern_buffer *bufp,
414 		const char *string, Idx length,
415 		Idx start, regoff_t range, Idx stop, struct re_registers *regs,
416 		bool ret_len)
417 {
418   reg_errcode_t result;
419   regmatch_t *pmatch;
420   Idx nregs;
421   regoff_t rval;
422   int eflags = 0;
423 #ifdef _LIBC
424   re_dfa_t *dfa = bufp->buffer;
425 #endif
426   Idx last_start = start + range;
427 
428   /* Check for out-of-range.  */
429   if (BE (start < 0 || start > length, 0))
430     return -1;
431   if (BE (length < last_start || (0 <= range && last_start < start), 0))
432     last_start = length;
433   else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
434     last_start = 0;
435 
436   __libc_lock_lock (dfa->lock);
437 
438   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
439   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
440 
441   /* Compile fastmap if we haven't yet.  */
442   if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
443     re_compile_fastmap (bufp);
444 
445   if (BE (bufp->no_sub, 0))
446     regs = NULL;
447 
448   /* We need at least 1 register.  */
449   if (regs == NULL)
450     nregs = 1;
451   else if (BE (bufp->regs_allocated == REGS_FIXED
452 	       && regs->num_regs <= bufp->re_nsub, 0))
453     {
454       nregs = regs->num_regs;
455       if (BE (nregs < 1, 0))
456 	{
457 	  /* Nothing can be copied to regs.  */
458 	  regs = NULL;
459 	  nregs = 1;
460 	}
461     }
462   else
463     nregs = bufp->re_nsub + 1;
464   pmatch = re_malloc (regmatch_t, nregs);
465   if (BE (pmatch == NULL, 0))
466     {
467       rval = -2;
468       goto out;
469     }
470 
471   result = re_search_internal (bufp, string, length, start, last_start, stop,
472 			       nregs, pmatch, eflags);
473 
474   rval = 0;
475 
476   /* I hope we needn't fill ther regs with -1's when no match was found.  */
477   if (result != REG_NOERROR)
478     rval = result == REG_NOMATCH ? -1 : -2;
479   else if (regs != NULL)
480     {
481       /* If caller wants register contents data back, copy them.  */
482       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
483 					   bufp->regs_allocated);
484       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
485 	rval = -2;
486     }
487 
488   if (BE (rval == 0, 1))
489     {
490       if (ret_len)
491 	{
492 	  assert (pmatch[0].rm_so == start);
493 	  rval = pmatch[0].rm_eo - start;
494 	}
495       else
496 	rval = pmatch[0].rm_so;
497     }
498   re_free (pmatch);
499  out:
500   __libc_lock_unlock (dfa->lock);
501   return rval;
502 }
503 
504 static unsigned
re_copy_regs(struct re_registers * regs,regmatch_t * pmatch,Idx nregs,int regs_allocated)505 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
506 	      int regs_allocated)
507 {
508   int rval = REGS_REALLOCATE;
509   Idx i;
510   Idx need_regs = nregs + 1;
511   /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
512      uses.  */
513 
514   /* Have the register data arrays been allocated?  */
515   if (regs_allocated == REGS_UNALLOCATED)
516     { /* No.  So allocate them with malloc.  */
517       regs->start = re_malloc (regoff_t, need_regs);
518       if (BE (regs->start == NULL, 0))
519 	return REGS_UNALLOCATED;
520       regs->end = re_malloc (regoff_t, need_regs);
521       if (BE (regs->end == NULL, 0))
522 	{
523 	  re_free (regs->start);
524 	  return REGS_UNALLOCATED;
525 	}
526       regs->num_regs = need_regs;
527     }
528   else if (regs_allocated == REGS_REALLOCATE)
529     { /* Yes.  If we need more elements than were already
530 	 allocated, reallocate them.  If we need fewer, just
531 	 leave it alone.  */
532       if (BE (need_regs > regs->num_regs, 0))
533 	{
534 	  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
535 	  regoff_t *new_end;
536 	  if (BE (new_start == NULL, 0))
537 	    return REGS_UNALLOCATED;
538 	  new_end = re_realloc (regs->end, regoff_t, need_regs);
539 	  if (BE (new_end == NULL, 0))
540 	    {
541 	      re_free (new_start);
542 	      return REGS_UNALLOCATED;
543 	    }
544 	  regs->start = new_start;
545 	  regs->end = new_end;
546 	  regs->num_regs = need_regs;
547 	}
548     }
549   else
550     {
551       assert (regs_allocated == REGS_FIXED);
552       /* This function may not be called with REGS_FIXED and nregs too big.  */
553       assert (regs->num_regs >= nregs);
554       rval = REGS_FIXED;
555     }
556 
557   /* Copy the regs.  */
558   for (i = 0; i < nregs; ++i)
559     {
560       regs->start[i] = pmatch[i].rm_so;
561       regs->end[i] = pmatch[i].rm_eo;
562     }
563   for ( ; i < regs->num_regs; ++i)
564     regs->start[i] = regs->end[i] = -1;
565 
566   return rval;
567 }
568 
569 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
570    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
571    this memory for recording register information.  STARTS and ENDS
572    must be allocated using the malloc library routine, and must each
573    be at least NUM_REGS * sizeof (regoff_t) bytes long.
574 
575    If NUM_REGS == 0, then subsequent matches should allocate their own
576    register data.
577 
578    Unless this function is called, the first search or match using
579    PATTERN_BUFFER will allocate its own register data, without
580    freeing the old data.  */
581 
582 void
re_set_registers(bufp,regs,num_regs,starts,ends)583 re_set_registers (bufp, regs, num_regs, starts, ends)
584     struct re_pattern_buffer *bufp;
585     struct re_registers *regs;
586     __re_size_t num_regs;
587     regoff_t *starts, *ends;
588 {
589   if (num_regs)
590     {
591       bufp->regs_allocated = REGS_REALLOCATE;
592       regs->num_regs = num_regs;
593       regs->start = starts;
594       regs->end = ends;
595     }
596   else
597     {
598       bufp->regs_allocated = REGS_UNALLOCATED;
599       regs->num_regs = 0;
600       regs->start = regs->end = NULL;
601     }
602 }
603 #ifdef _LIBC
604 weak_alias (__re_set_registers, re_set_registers)
605 #endif
606 
607 /* Entry points compatible with 4.2 BSD regex library.  We don't define
608    them unless specifically requested.  */
609 
610 #if defined _REGEX_RE_COMP || defined _LIBC
611 int
612 # ifdef _LIBC
613 weak_function
614 # endif
615 re_exec (s)
616      const char *s;
617 {
618   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
619 }
620 #endif /* _REGEX_RE_COMP */
621 
622 /* Internal entry point.  */
623 
624 /* Searches for a compiled pattern PREG in the string STRING, whose
625    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
626    meaning as with regexec.  LAST_START is START + RANGE, where
627    START and RANGE have the same meaning as with re_search.
628    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
629    otherwise return the error code.
630    Note: We assume front end functions already check ranges.
631    (0 <= LAST_START && LAST_START <= LENGTH)  */
632 
633 static reg_errcode_t
634 __attribute_warn_unused_result__
re_search_internal(const regex_t * preg,const char * string,Idx length,Idx start,Idx last_start,Idx stop,size_t nmatch,regmatch_t pmatch[],int eflags)635 re_search_internal (const regex_t *preg,
636 		    const char *string, Idx length,
637 		    Idx start, Idx last_start, Idx stop,
638 		    size_t nmatch, regmatch_t pmatch[],
639 		    int eflags)
640 {
641   reg_errcode_t err;
642   const re_dfa_t *dfa = preg->buffer;
643   Idx left_lim, right_lim;
644   int incr;
645   bool fl_longest_match;
646   int match_kind;
647   Idx match_first;
648   Idx match_last = REG_MISSING;
649   Idx extra_nmatch;
650   bool sb;
651   int ch;
652 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
653   re_match_context_t mctx = { .dfa = dfa };
654 #else
655   re_match_context_t mctx;
656 #endif
657   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
658 		    && start != last_start && !preg->can_be_null)
659 		   ? preg->fastmap : NULL);
660   RE_TRANSLATE_TYPE t = preg->translate;
661 
662 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
663   memset (&mctx, '\0', sizeof (re_match_context_t));
664   mctx.dfa = dfa;
665 #endif
666 
667   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
668   nmatch -= extra_nmatch;
669 
670   /* Check if the DFA haven't been compiled.  */
671   if (BE (preg->used == 0 || dfa->init_state == NULL
672 	  || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
673 	  || dfa->init_state_begbuf == NULL, 0))
674     return REG_NOMATCH;
675 
676 #ifdef DEBUG
677   /* We assume front-end functions already check them.  */
678   assert (0 <= last_start && last_start <= length);
679 #endif
680 
681   /* If initial states with non-begbuf contexts have no elements,
682      the regex must be anchored.  If preg->newline_anchor is set,
683      we'll never use init_state_nl, so do not check it.  */
684   if (dfa->init_state->nodes.nelem == 0
685       && dfa->init_state_word->nodes.nelem == 0
686       && (dfa->init_state_nl->nodes.nelem == 0
687 	  || !preg->newline_anchor))
688     {
689       if (start != 0 && last_start != 0)
690         return REG_NOMATCH;
691       start = last_start = 0;
692     }
693 
694   /* We must check the longest matching, if nmatch > 0.  */
695   fl_longest_match = (nmatch != 0 || dfa->nbackref);
696 
697   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
698 			    preg->translate, (preg->syntax & RE_ICASE) != 0,
699 			    dfa);
700   if (BE (err != REG_NOERROR, 0))
701     goto free_return;
702   mctx.input.stop = stop;
703   mctx.input.raw_stop = stop;
704   mctx.input.newline_anchor = preg->newline_anchor;
705 
706   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
707   if (BE (err != REG_NOERROR, 0))
708     goto free_return;
709 
710   /* We will log all the DFA states through which the dfa pass,
711      if nmatch > 1, or this dfa has "multibyte node", which is a
712      back-reference or a node which can accept multibyte character or
713      multi character collating element.  */
714   if (nmatch > 1 || dfa->has_mb_node)
715     {
716       /* Avoid overflow.  */
717       if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
718                <= mctx.input.bufs_len), 0))
719 	{
720 	  err = REG_ESPACE;
721 	  goto free_return;
722 	}
723 
724       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
725       if (BE (mctx.state_log == NULL, 0))
726 	{
727 	  err = REG_ESPACE;
728 	  goto free_return;
729 	}
730     }
731   else
732     mctx.state_log = NULL;
733 
734   match_first = start;
735   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
736 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
737 
738   /* Check incrementally whether of not the input string match.  */
739   incr = (last_start < start) ? -1 : 1;
740   left_lim = (last_start < start) ? last_start : start;
741   right_lim = (last_start < start) ? start : last_start;
742   sb = dfa->mb_cur_max == 1;
743   match_kind =
744     (fastmap
745      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
746 	| (start <= last_start ? 2 : 0)
747 	| (t != NULL ? 1 : 0))
748      : 8);
749 
750   for (;; match_first += incr)
751     {
752       err = REG_NOMATCH;
753       if (match_first < left_lim || right_lim < match_first)
754 	goto free_return;
755 
756       /* Advance as rapidly as possible through the string, until we
757 	 find a plausible place to start matching.  This may be done
758 	 with varying efficiency, so there are various possibilities:
759 	 only the most common of them are specialized, in order to
760 	 save on code size.  We use a switch statement for speed.  */
761       switch (match_kind)
762 	{
763 	case 8:
764 	  /* No fastmap.  */
765 	  break;
766 
767 	case 7:
768 	  /* Fastmap with single-byte translation, match forward.  */
769 	  while (BE (match_first < right_lim, 1)
770 		 && !fastmap[t[(unsigned char) string[match_first]]])
771 	    ++match_first;
772 	  goto forward_match_found_start_or_reached_end;
773 
774 	case 6:
775 	  /* Fastmap without translation, match forward.  */
776 	  while (BE (match_first < right_lim, 1)
777 		 && !fastmap[(unsigned char) string[match_first]])
778 	    ++match_first;
779 
780 	forward_match_found_start_or_reached_end:
781 	  if (BE (match_first == right_lim, 0))
782 	    {
783 	      ch = match_first >= length
784 		       ? 0 : (unsigned char) string[match_first];
785 	      if (!fastmap[t ? t[ch] : ch])
786 		goto free_return;
787 	    }
788 	  break;
789 
790 	case 4:
791 	case 5:
792 	  /* Fastmap without multi-byte translation, match backwards.  */
793 	  while (match_first >= left_lim)
794 	    {
795 	      ch = match_first >= length
796 		       ? 0 : (unsigned char) string[match_first];
797 	      if (fastmap[t ? t[ch] : ch])
798 		break;
799 	      --match_first;
800 	    }
801 	  if (match_first < left_lim)
802 	    goto free_return;
803 	  break;
804 
805 	default:
806 	  /* In this case, we can't determine easily the current byte,
807 	     since it might be a component byte of a multibyte
808 	     character.  Then we use the constructed buffer instead.  */
809 	  for (;;)
810 	    {
811 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
812 		 buffers.  */
813 	      __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
814 	      if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
815 		{
816 		  err = re_string_reconstruct (&mctx.input, match_first,
817 					       eflags);
818 		  if (BE (err != REG_NOERROR, 0))
819 		    goto free_return;
820 
821 		  offset = match_first - mctx.input.raw_mbs_idx;
822 		}
823 	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
824 		 Note that MATCH_FIRST must not be smaller than 0.  */
825 	      ch = (match_first >= length
826 		    ? 0 : re_string_byte_at (&mctx.input, offset));
827 	      if (fastmap[ch])
828 		break;
829 	      match_first += incr;
830 	      if (match_first < left_lim || match_first > right_lim)
831 		{
832 		  err = REG_NOMATCH;
833 		  goto free_return;
834 		}
835 	    }
836 	  break;
837 	}
838 
839       /* Reconstruct the buffers so that the matcher can assume that
840 	 the matching starts from the beginning of the buffer.  */
841       err = re_string_reconstruct (&mctx.input, match_first, eflags);
842       if (BE (err != REG_NOERROR, 0))
843 	goto free_return;
844 
845 #ifdef RE_ENABLE_I18N
846      /* Don't consider this char as a possible match start if it part,
847 	yet isn't the head, of a multibyte character.  */
848       if (!sb && !re_string_first_byte (&mctx.input, 0))
849 	continue;
850 #endif
851 
852       /* It seems to be appropriate one, then use the matcher.  */
853       /* We assume that the matching starts from 0.  */
854       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
855       match_last = check_matching (&mctx, fl_longest_match,
856 				   start <= last_start ? &match_first : NULL);
857       if (match_last != REG_MISSING)
858 	{
859 	  if (BE (match_last == REG_ERROR, 0))
860 	    {
861 	      err = REG_ESPACE;
862 	      goto free_return;
863 	    }
864 	  else
865 	    {
866 	      mctx.match_last = match_last;
867 	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
868 		{
869 		  re_dfastate_t *pstate = mctx.state_log[match_last];
870 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
871 							     match_last);
872 		}
873 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
874 		  || dfa->nbackref)
875 		{
876 		  err = prune_impossible_nodes (&mctx);
877 		  if (err == REG_NOERROR)
878 		    break;
879 		  if (BE (err != REG_NOMATCH, 0))
880 		    goto free_return;
881 		  match_last = REG_MISSING;
882 		}
883 	      else
884 		break; /* We found a match.  */
885 	    }
886 	}
887 
888       match_ctx_clean (&mctx);
889     }
890 
891 #ifdef DEBUG
892   assert (match_last != REG_MISSING);
893   assert (err == REG_NOERROR);
894 #endif
895 
896   /* Set pmatch[] if we need.  */
897   if (nmatch > 0)
898     {
899       Idx reg_idx;
900 
901       /* Initialize registers.  */
902       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
903 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
904 
905       /* Set the points where matching start/end.  */
906       pmatch[0].rm_so = 0;
907       pmatch[0].rm_eo = mctx.match_last;
908       /* FIXME: This function should fail if mctx.match_last exceeds
909 	 the maximum possible regoff_t value.  We need a new error
910 	 code REG_OVERFLOW.  */
911 
912       if (!preg->no_sub && nmatch > 1)
913 	{
914 	  err = set_regs (preg, &mctx, nmatch, pmatch,
915 			  dfa->has_plural_match && dfa->nbackref > 0);
916 	  if (BE (err != REG_NOERROR, 0))
917 	    goto free_return;
918 	}
919 
920       /* At last, add the offset to each register, since we slid
921 	 the buffers so that we could assume that the matching starts
922 	 from 0.  */
923       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
924 	if (pmatch[reg_idx].rm_so != -1)
925 	  {
926 #ifdef RE_ENABLE_I18N
927 	    if (BE (mctx.input.offsets_needed != 0, 0))
928 	      {
929 		pmatch[reg_idx].rm_so =
930 		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
931 		   ? mctx.input.valid_raw_len
932 		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
933 		pmatch[reg_idx].rm_eo =
934 		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
935 		   ? mctx.input.valid_raw_len
936 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
937 	      }
938 #else
939 	    assert (mctx.input.offsets_needed == 0);
940 #endif
941 	    pmatch[reg_idx].rm_so += match_first;
942 	    pmatch[reg_idx].rm_eo += match_first;
943 	  }
944       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
945 	{
946 	  pmatch[nmatch + reg_idx].rm_so = -1;
947 	  pmatch[nmatch + reg_idx].rm_eo = -1;
948 	}
949 
950       if (dfa->subexp_map)
951 	for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
952 	  if (dfa->subexp_map[reg_idx] != reg_idx)
953 	    {
954 	      pmatch[reg_idx + 1].rm_so
955 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
956 	      pmatch[reg_idx + 1].rm_eo
957 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
958 	    }
959     }
960 
961  free_return:
962   re_free (mctx.state_log);
963   if (dfa->nbackref)
964     match_ctx_free (&mctx);
965   re_string_destruct (&mctx.input);
966   return err;
967 }
968 
969 static reg_errcode_t
970 __attribute_warn_unused_result__
prune_impossible_nodes(re_match_context_t * mctx)971 prune_impossible_nodes (re_match_context_t *mctx)
972 {
973   const re_dfa_t *const dfa = mctx->dfa;
974   Idx halt_node, match_last;
975   reg_errcode_t ret;
976   re_dfastate_t **sifted_states;
977   re_dfastate_t **lim_states = NULL;
978   re_sift_context_t sctx;
979 #ifdef DEBUG
980   assert (mctx->state_log != NULL);
981 #endif
982   match_last = mctx->match_last;
983   halt_node = mctx->last_node;
984 
985   /* Avoid overflow.  */
986   if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
987     return REG_ESPACE;
988 
989   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
990   if (BE (sifted_states == NULL, 0))
991     {
992       ret = REG_ESPACE;
993       goto free_return;
994     }
995   if (dfa->nbackref)
996     {
997       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
998       if (BE (lim_states == NULL, 0))
999 	{
1000 	  ret = REG_ESPACE;
1001 	  goto free_return;
1002 	}
1003       while (1)
1004 	{
1005 	  memset (lim_states, '\0',
1006 		  sizeof (re_dfastate_t *) * (match_last + 1));
1007 	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
1008 			 match_last);
1009 	  ret = sift_states_backward (mctx, &sctx);
1010 	  re_node_set_free (&sctx.limits);
1011 	  if (BE (ret != REG_NOERROR, 0))
1012 	      goto free_return;
1013 	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
1014 	    break;
1015 	  do
1016 	    {
1017 	      --match_last;
1018 	      if (! REG_VALID_INDEX (match_last))
1019 		{
1020 		  ret = REG_NOMATCH;
1021 		  goto free_return;
1022 		}
1023 	    } while (mctx->state_log[match_last] == NULL
1024 		     || !mctx->state_log[match_last]->halt);
1025 	  halt_node = check_halt_state_context (mctx,
1026 						mctx->state_log[match_last],
1027 						match_last);
1028 	}
1029       ret = merge_state_array (dfa, sifted_states, lim_states,
1030 			       match_last + 1);
1031       re_free (lim_states);
1032       lim_states = NULL;
1033       if (BE (ret != REG_NOERROR, 0))
1034 	goto free_return;
1035     }
1036   else
1037     {
1038       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1039       ret = sift_states_backward (mctx, &sctx);
1040       re_node_set_free (&sctx.limits);
1041       if (BE (ret != REG_NOERROR, 0))
1042 	goto free_return;
1043       if (sifted_states[0] == NULL)
1044 	{
1045 	  ret = REG_NOMATCH;
1046 	  goto free_return;
1047 	}
1048     }
1049   re_free (mctx->state_log);
1050   mctx->state_log = sifted_states;
1051   sifted_states = NULL;
1052   mctx->last_node = halt_node;
1053   mctx->match_last = match_last;
1054   ret = REG_NOERROR;
1055  free_return:
1056   re_free (sifted_states);
1057   re_free (lim_states);
1058   return ret;
1059 }
1060 
1061 /* Acquire an initial state and return it.
1062    We must select appropriate initial state depending on the context,
1063    since initial states may have constraints like "\<", "^", etc..  */
1064 
1065 static inline re_dfastate_t *
1066 __attribute ((always_inline)) internal_function
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,Idx idx)1067 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1068 			    Idx idx)
1069 {
1070   const re_dfa_t *const dfa = mctx->dfa;
1071   if (dfa->init_state->has_constraint)
1072     {
1073       unsigned int context;
1074       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1075       if (IS_WORD_CONTEXT (context))
1076 	return dfa->init_state_word;
1077       else if (IS_ORDINARY_CONTEXT (context))
1078 	return dfa->init_state;
1079       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1080 	return dfa->init_state_begbuf;
1081       else if (IS_NEWLINE_CONTEXT (context))
1082 	return dfa->init_state_nl;
1083       else if (IS_BEGBUF_CONTEXT (context))
1084 	{
1085 	  /* It is relatively rare case, then calculate on demand.  */
1086 	  return re_acquire_state_context (err, dfa,
1087 					   dfa->init_state->entrance_nodes,
1088 					   context);
1089 	}
1090       else
1091 	/* Must not happen?  */
1092 	return dfa->init_state;
1093     }
1094   else
1095     return dfa->init_state;
1096 }
1097 
1098 /* Check whether the regular expression match input string INPUT or not,
1099    and return the index where the matching end.  Return REG_MISSING if
1100    there is no match, and return REG_ERROR in case of an error.
1101    FL_LONGEST_MATCH means we want the POSIX longest matching.
1102    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1103    next place where we may want to try matching.
1104    Note that the matcher assumes that the matching starts from the current
1105    index of the buffer.  */
1106 
1107 static Idx
1108 internal_function __attribute_warn_unused_result__
check_matching(re_match_context_t * mctx,bool fl_longest_match,Idx * p_match_first)1109 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1110 		Idx *p_match_first)
1111 {
1112   const re_dfa_t *const dfa = mctx->dfa;
1113   reg_errcode_t err;
1114   Idx match = 0;
1115   Idx match_last = REG_MISSING;
1116   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1117   re_dfastate_t *cur_state;
1118   bool at_init_state = p_match_first != NULL;
1119   Idx next_start_idx = cur_str_idx;
1120 
1121   err = REG_NOERROR;
1122   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1123   /* An initial state must not be NULL (invalid).  */
1124   if (BE (cur_state == NULL, 0))
1125     {
1126       assert (err == REG_ESPACE);
1127       return REG_ERROR;
1128     }
1129 
1130   if (mctx->state_log != NULL)
1131     {
1132       mctx->state_log[cur_str_idx] = cur_state;
1133 
1134       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1135 	 later.  E.g. Processing back references.  */
1136       if (BE (dfa->nbackref, 0))
1137 	{
1138 	  at_init_state = false;
1139 	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1140 	  if (BE (err != REG_NOERROR, 0))
1141 	    return err;
1142 
1143 	  if (cur_state->has_backref)
1144 	    {
1145 	      err = transit_state_bkref (mctx, &cur_state->nodes);
1146 	      if (BE (err != REG_NOERROR, 0))
1147 		return err;
1148 	    }
1149 	}
1150     }
1151 
1152   /* If the RE accepts NULL string.  */
1153   if (BE (cur_state->halt, 0))
1154     {
1155       if (!cur_state->has_constraint
1156 	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
1157 	{
1158 	  if (!fl_longest_match)
1159 	    return cur_str_idx;
1160 	  else
1161 	    {
1162 	      match_last = cur_str_idx;
1163 	      match = 1;
1164 	    }
1165 	}
1166     }
1167 
1168   while (!re_string_eoi (&mctx->input))
1169     {
1170       re_dfastate_t *old_state = cur_state;
1171       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1172 
1173       if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1174 	   && mctx->input.bufs_len < mctx->input.len)
1175 	  || (BE (next_char_idx >= mctx->input.valid_len, 0)
1176 	      && mctx->input.valid_len < mctx->input.len))
1177 	{
1178 	  err = extend_buffers (mctx);
1179 	  if (BE (err != REG_NOERROR, 0))
1180 	    {
1181 	      assert (err == REG_ESPACE);
1182 	      return REG_ERROR;
1183 	    }
1184 	}
1185 
1186       cur_state = transit_state (&err, mctx, cur_state);
1187       if (mctx->state_log != NULL)
1188 	cur_state = merge_state_with_log (&err, mctx, cur_state);
1189 
1190       if (cur_state == NULL)
1191 	{
1192 	  /* Reached the invalid state or an error.  Try to recover a valid
1193 	     state using the state log, if available and if we have not
1194 	     already found a valid (even if not the longest) match.  */
1195 	  if (BE (err != REG_NOERROR, 0))
1196 	    return REG_ERROR;
1197 
1198 	  if (mctx->state_log == NULL
1199 	      || (match && !fl_longest_match)
1200 	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
1201 	    break;
1202 	}
1203 
1204       if (BE (at_init_state, 0))
1205 	{
1206 	  if (old_state == cur_state)
1207 	    next_start_idx = next_char_idx;
1208 	  else
1209 	    at_init_state = false;
1210 	}
1211 
1212       if (cur_state->halt)
1213 	{
1214 	  /* Reached a halt state.
1215 	     Check the halt state can satisfy the current context.  */
1216 	  if (!cur_state->has_constraint
1217 	      || check_halt_state_context (mctx, cur_state,
1218 					   re_string_cur_idx (&mctx->input)))
1219 	    {
1220 	      /* We found an appropriate halt state.  */
1221 	      match_last = re_string_cur_idx (&mctx->input);
1222 	      match = 1;
1223 
1224 	      /* We found a match, do not modify match_first below.  */
1225 	      p_match_first = NULL;
1226 	      if (!fl_longest_match)
1227 		break;
1228 	    }
1229 	}
1230     }
1231 
1232   if (p_match_first)
1233     *p_match_first += next_start_idx;
1234 
1235   return match_last;
1236 }
1237 
1238 /* Check NODE match the current context.  */
1239 
1240 static bool
1241 internal_function
check_halt_node_context(const re_dfa_t * dfa,Idx node,unsigned int context)1242 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1243 {
1244   re_token_type_t type = dfa->nodes[node].type;
1245   unsigned int constraint = dfa->nodes[node].constraint;
1246   if (type != END_OF_RE)
1247     return false;
1248   if (!constraint)
1249     return true;
1250   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1251     return false;
1252   return true;
1253 }
1254 
1255 /* Check the halt state STATE match the current context.
1256    Return 0 if not match, if the node, STATE has, is a halt node and
1257    match the context, return the node.  */
1258 
1259 static Idx
1260 internal_function
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,Idx idx)1261 check_halt_state_context (const re_match_context_t *mctx,
1262 			  const re_dfastate_t *state, Idx idx)
1263 {
1264   Idx i;
1265   unsigned int context;
1266 #ifdef DEBUG
1267   assert (state->halt);
1268 #endif
1269   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1270   for (i = 0; i < state->nodes.nelem; ++i)
1271     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1272       return state->nodes.elems[i];
1273   return 0;
1274 }
1275 
1276 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1277    corresponding to the DFA).
1278    Return the destination node, and update EPS_VIA_NODES;
1279    return REG_MISSING in case of errors.  */
1280 
1281 static Idx
1282 internal_function
proceed_next_node(const re_match_context_t * mctx,Idx nregs,regmatch_t * regs,Idx * pidx,Idx node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)1283 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1284 		   Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1285 		   struct re_fail_stack_t *fs)
1286 {
1287   const re_dfa_t *const dfa = mctx->dfa;
1288   Idx i;
1289   bool ok;
1290   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1291     {
1292       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1293       re_node_set *edests = &dfa->edests[node];
1294       Idx dest_node;
1295       ok = re_node_set_insert (eps_via_nodes, node);
1296       if (BE (! ok, 0))
1297 	return REG_ERROR;
1298       /* Pick up a valid destination, or return REG_MISSING if none
1299 	 is found.  */
1300       for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1301 	{
1302 	  Idx candidate = edests->elems[i];
1303 	  if (!re_node_set_contains (cur_nodes, candidate))
1304 	    continue;
1305           if (dest_node == REG_MISSING)
1306 	    dest_node = candidate;
1307 
1308 	  else
1309 	    {
1310 	      /* In order to avoid infinite loop like "(a*)*", return the second
1311 		 epsilon-transition if the first was already considered.  */
1312 	      if (re_node_set_contains (eps_via_nodes, dest_node))
1313 		return candidate;
1314 
1315 	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
1316 	      else if (fs != NULL
1317 		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1318 					   eps_via_nodes))
1319 		return REG_ERROR;
1320 
1321 	      /* We know we are going to exit.  */
1322 	      break;
1323 	    }
1324 	}
1325       return dest_node;
1326     }
1327   else
1328     {
1329       Idx naccepted = 0;
1330       re_token_type_t type = dfa->nodes[node].type;
1331 
1332 #ifdef RE_ENABLE_I18N
1333       if (dfa->nodes[node].accept_mb)
1334 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1335       else
1336 #endif /* RE_ENABLE_I18N */
1337       if (type == OP_BACK_REF)
1338 	{
1339 	  Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1340 	  naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1341 	  if (fs != NULL)
1342 	    {
1343 	      if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1344 		return REG_MISSING;
1345 	      else if (naccepted)
1346 		{
1347 		  char *buf = (char *) re_string_get_buffer (&mctx->input);
1348 		  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1349 			      naccepted) != 0)
1350 		    return REG_MISSING;
1351 		}
1352 	    }
1353 
1354 	  if (naccepted == 0)
1355 	    {
1356 	      Idx dest_node;
1357 	      ok = re_node_set_insert (eps_via_nodes, node);
1358 	      if (BE (! ok, 0))
1359 		return REG_ERROR;
1360 	      dest_node = dfa->edests[node].elems[0];
1361 	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1362 					dest_node))
1363 		return dest_node;
1364 	    }
1365 	}
1366 
1367       if (naccepted != 0
1368 	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
1369 	{
1370 	  Idx dest_node = dfa->nexts[node];
1371 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1372 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1373 		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1374 					       dest_node)))
1375 	    return REG_MISSING;
1376 	  re_node_set_empty (eps_via_nodes);
1377 	  return dest_node;
1378 	}
1379     }
1380   return REG_MISSING;
1381 }
1382 
1383 static reg_errcode_t
1384 internal_function __attribute_warn_unused_result__
push_fail_stack(struct re_fail_stack_t * fs,Idx str_idx,Idx dest_node,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1385 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1386 		 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1387 {
1388   reg_errcode_t err;
1389   Idx num = fs->num++;
1390   if (fs->num == fs->alloc)
1391     {
1392       struct re_fail_stack_ent_t *new_array;
1393       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1394 				       * fs->alloc * 2));
1395       if (new_array == NULL)
1396 	return REG_ESPACE;
1397       fs->alloc *= 2;
1398       fs->stack = new_array;
1399     }
1400   fs->stack[num].idx = str_idx;
1401   fs->stack[num].node = dest_node;
1402   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1403   if (fs->stack[num].regs == NULL)
1404     return REG_ESPACE;
1405   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1406   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1407   return err;
1408 }
1409 
1410 static Idx
1411 internal_function
pop_fail_stack(struct re_fail_stack_t * fs,Idx * pidx,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1412 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1413 		regmatch_t *regs, re_node_set *eps_via_nodes)
1414 {
1415   Idx num = --fs->num;
1416   assert (REG_VALID_INDEX (num));
1417   *pidx = fs->stack[num].idx;
1418   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1419   re_node_set_free (eps_via_nodes);
1420   re_free (fs->stack[num].regs);
1421   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1422   return fs->stack[num].node;
1423 }
1424 
1425 /* Set the positions where the subexpressions are starts/ends to registers
1426    PMATCH.
1427    Note: We assume that pmatch[0] is already set, and
1428    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1429 
1430 static reg_errcode_t
1431 internal_function __attribute_warn_unused_result__
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,bool fl_backtrack)1432 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1433 	  regmatch_t *pmatch, bool fl_backtrack)
1434 {
1435   const re_dfa_t *dfa = preg->buffer;
1436   Idx idx, cur_node;
1437   re_node_set eps_via_nodes;
1438   struct re_fail_stack_t *fs;
1439   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1440   regmatch_t *prev_idx_match;
1441   bool prev_idx_match_malloced = false;
1442 
1443 #ifdef DEBUG
1444   assert (nmatch > 1);
1445   assert (mctx->state_log != NULL);
1446 #endif
1447   if (fl_backtrack)
1448     {
1449       fs = &fs_body;
1450       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1451       if (fs->stack == NULL)
1452 	return REG_ESPACE;
1453     }
1454   else
1455     fs = NULL;
1456 
1457   cur_node = dfa->init_node;
1458   re_node_set_init_empty (&eps_via_nodes);
1459 
1460   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1461     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1462   else
1463     {
1464       prev_idx_match = re_malloc (regmatch_t, nmatch);
1465       if (prev_idx_match == NULL)
1466 	{
1467 	  free_fail_stack_return (fs);
1468 	  return REG_ESPACE;
1469 	}
1470       prev_idx_match_malloced = true;
1471     }
1472   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1473 
1474   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1475     {
1476       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1477 
1478       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1479 	{
1480 	  Idx reg_idx;
1481 	  if (fs)
1482 	    {
1483 	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1484 		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1485 		  break;
1486 	      if (reg_idx == nmatch)
1487 		{
1488 		  re_node_set_free (&eps_via_nodes);
1489 		  if (prev_idx_match_malloced)
1490 		    re_free (prev_idx_match);
1491 		  return free_fail_stack_return (fs);
1492 		}
1493 	      cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1494 					 &eps_via_nodes);
1495 	    }
1496 	  else
1497 	    {
1498 	      re_node_set_free (&eps_via_nodes);
1499 	      if (prev_idx_match_malloced)
1500 		re_free (prev_idx_match);
1501 	      return REG_NOERROR;
1502 	    }
1503 	}
1504 
1505       /* Proceed to next node.  */
1506       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1507 				    &eps_via_nodes, fs);
1508 
1509       if (BE (! REG_VALID_INDEX (cur_node), 0))
1510 	{
1511 	  if (BE (cur_node == REG_ERROR, 0))
1512 	    {
1513 	      re_node_set_free (&eps_via_nodes);
1514 	      if (prev_idx_match_malloced)
1515 		re_free (prev_idx_match);
1516 	      free_fail_stack_return (fs);
1517 	      return REG_ESPACE;
1518 	    }
1519 	  if (fs)
1520 	    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1521 				       &eps_via_nodes);
1522 	  else
1523 	    {
1524 	      re_node_set_free (&eps_via_nodes);
1525 	      if (prev_idx_match_malloced)
1526 		re_free (prev_idx_match);
1527 	      return REG_NOMATCH;
1528 	    }
1529 	}
1530     }
1531   re_node_set_free (&eps_via_nodes);
1532   if (prev_idx_match_malloced)
1533     re_free (prev_idx_match);
1534   return free_fail_stack_return (fs);
1535 }
1536 
1537 static reg_errcode_t
1538 internal_function
free_fail_stack_return(struct re_fail_stack_t * fs)1539 free_fail_stack_return (struct re_fail_stack_t *fs)
1540 {
1541   if (fs)
1542     {
1543       Idx fs_idx;
1544       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1545 	{
1546 	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1547 	  re_free (fs->stack[fs_idx].regs);
1548 	}
1549       re_free (fs->stack);
1550     }
1551   return REG_NOERROR;
1552 }
1553 
1554 static void
1555 internal_function
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,Idx cur_node,Idx cur_idx,Idx nmatch)1556 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1557 	     regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1558 {
1559   int type = dfa->nodes[cur_node].type;
1560   if (type == OP_OPEN_SUBEXP)
1561     {
1562       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1563 
1564       /* We are at the first node of this sub expression.  */
1565       if (reg_num < nmatch)
1566 	{
1567 	  pmatch[reg_num].rm_so = cur_idx;
1568 	  pmatch[reg_num].rm_eo = -1;
1569 	}
1570     }
1571   else if (type == OP_CLOSE_SUBEXP)
1572     {
1573       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1574       if (reg_num < nmatch)
1575 	{
1576 	  /* We are at the last node of this sub expression.  */
1577 	  if (pmatch[reg_num].rm_so < cur_idx)
1578 	    {
1579 	      pmatch[reg_num].rm_eo = cur_idx;
1580 	      /* This is a non-empty match or we are not inside an optional
1581 		 subexpression.  Accept this right away.  */
1582 	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1583 	    }
1584 	  else
1585 	    {
1586 	      if (dfa->nodes[cur_node].opt_subexp
1587 		  && prev_idx_match[reg_num].rm_so != -1)
1588 		/* We transited through an empty match for an optional
1589 		   subexpression, like (a?)*, and this is not the subexp's
1590 		   first match.  Copy back the old content of the registers
1591 		   so that matches of an inner subexpression are undone as
1592 		   well, like in ((a?))*.  */
1593 		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1594 	      else
1595 		/* We completed a subexpression, but it may be part of
1596 		   an optional one, so do not update PREV_IDX_MATCH.  */
1597 		pmatch[reg_num].rm_eo = cur_idx;
1598 	    }
1599 	}
1600     }
1601 }
1602 
1603 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1604    and sift the nodes in each states according to the following rules.
1605    Updated state_log will be wrote to STATE_LOG.
1606 
1607    Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1608      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1609 	If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1610 	the LAST_NODE, we throw away the node 'a'.
1611      2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1612 	string 's' and transit to 'b':
1613 	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1614 	   away the node 'a'.
1615 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1616 	    thrown away, we throw away the node 'a'.
1617      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1618 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1619 	   node 'a'.
1620 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1621 	    we throw away the node 'a'.  */
1622 
1623 #define STATE_NODE_CONTAINS(state,node) \
1624   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1625 
1626 static reg_errcode_t
1627 internal_function
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)1628 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1629 {
1630   reg_errcode_t err;
1631   int null_cnt = 0;
1632   Idx str_idx = sctx->last_str_idx;
1633   re_node_set cur_dest;
1634 
1635 #ifdef DEBUG
1636   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1637 #endif
1638 
1639   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1640      transit to the last_node and the last_node itself.  */
1641   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1642   if (BE (err != REG_NOERROR, 0))
1643     return err;
1644   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1645   if (BE (err != REG_NOERROR, 0))
1646     goto free_return;
1647 
1648   /* Then check each states in the state_log.  */
1649   while (str_idx > 0)
1650     {
1651       /* Update counters.  */
1652       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1653       if (null_cnt > mctx->max_mb_elem_len)
1654 	{
1655 	  memset (sctx->sifted_states, '\0',
1656 		  sizeof (re_dfastate_t *) * str_idx);
1657 	  re_node_set_free (&cur_dest);
1658 	  return REG_NOERROR;
1659 	}
1660       re_node_set_empty (&cur_dest);
1661       --str_idx;
1662 
1663       if (mctx->state_log[str_idx])
1664 	{
1665 	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1666 	  if (BE (err != REG_NOERROR, 0))
1667 	    goto free_return;
1668 	}
1669 
1670       /* Add all the nodes which satisfy the following conditions:
1671 	 - It can epsilon transit to a node in CUR_DEST.
1672 	 - It is in CUR_SRC.
1673 	 And update state_log.  */
1674       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1675       if (BE (err != REG_NOERROR, 0))
1676 	goto free_return;
1677     }
1678   err = REG_NOERROR;
1679  free_return:
1680   re_node_set_free (&cur_dest);
1681   return err;
1682 }
1683 
1684 static reg_errcode_t
1685 internal_function __attribute_warn_unused_result__
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * cur_dest)1686 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1687 		     Idx str_idx, re_node_set *cur_dest)
1688 {
1689   const re_dfa_t *const dfa = mctx->dfa;
1690   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1691   Idx i;
1692 
1693   /* Then build the next sifted state.
1694      We build the next sifted state on 'cur_dest', and update
1695      'sifted_states[str_idx]' with 'cur_dest'.
1696      Note:
1697      'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1698      'cur_src' points the node_set of the old 'state_log[str_idx]'
1699      (with the epsilon nodes pre-filtered out).  */
1700   for (i = 0; i < cur_src->nelem; i++)
1701     {
1702       Idx prev_node = cur_src->elems[i];
1703       int naccepted = 0;
1704       bool ok;
1705 
1706 #ifdef DEBUG
1707       re_token_type_t type = dfa->nodes[prev_node].type;
1708       assert (!IS_EPSILON_NODE (type));
1709 #endif
1710 #ifdef RE_ENABLE_I18N
1711       /* If the node may accept "multi byte".  */
1712       if (dfa->nodes[prev_node].accept_mb)
1713 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1714 					 str_idx, sctx->last_str_idx);
1715 #endif /* RE_ENABLE_I18N */
1716 
1717       /* We don't check backreferences here.
1718 	 See update_cur_sifted_state().  */
1719       if (!naccepted
1720 	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1721 	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1722 				  dfa->nexts[prev_node]))
1723 	naccepted = 1;
1724 
1725       if (naccepted == 0)
1726 	continue;
1727 
1728       if (sctx->limits.nelem)
1729 	{
1730 	  Idx to_idx = str_idx + naccepted;
1731 	  if (check_dst_limits (mctx, &sctx->limits,
1732 				dfa->nexts[prev_node], to_idx,
1733 				prev_node, str_idx))
1734 	    continue;
1735 	}
1736       ok = re_node_set_insert (cur_dest, prev_node);
1737       if (BE (! ok, 0))
1738 	return REG_ESPACE;
1739     }
1740 
1741   return REG_NOERROR;
1742 }
1743 
1744 /* Helper functions.  */
1745 
1746 static reg_errcode_t
1747 internal_function
clean_state_log_if_needed(re_match_context_t * mctx,Idx next_state_log_idx)1748 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1749 {
1750   Idx top = mctx->state_log_top;
1751 
1752   if ((next_state_log_idx >= mctx->input.bufs_len
1753        && mctx->input.bufs_len < mctx->input.len)
1754       || (next_state_log_idx >= mctx->input.valid_len
1755 	  && mctx->input.valid_len < mctx->input.len))
1756     {
1757       reg_errcode_t err;
1758       err = extend_buffers (mctx);
1759       if (BE (err != REG_NOERROR, 0))
1760 	return err;
1761     }
1762 
1763   if (top < next_state_log_idx)
1764     {
1765       memset (mctx->state_log + top + 1, '\0',
1766 	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1767       mctx->state_log_top = next_state_log_idx;
1768     }
1769   return REG_NOERROR;
1770 }
1771 
1772 static reg_errcode_t
1773 internal_function
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,Idx num)1774 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1775 		   re_dfastate_t **src, Idx num)
1776 {
1777   Idx st_idx;
1778   reg_errcode_t err;
1779   for (st_idx = 0; st_idx < num; ++st_idx)
1780     {
1781       if (dst[st_idx] == NULL)
1782 	dst[st_idx] = src[st_idx];
1783       else if (src[st_idx] != NULL)
1784 	{
1785 	  re_node_set merged_set;
1786 	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1787 					&src[st_idx]->nodes);
1788 	  if (BE (err != REG_NOERROR, 0))
1789 	    return err;
1790 	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1791 	  re_node_set_free (&merged_set);
1792 	  if (BE (err != REG_NOERROR, 0))
1793 	    return err;
1794 	}
1795     }
1796   return REG_NOERROR;
1797 }
1798 
1799 static reg_errcode_t
1800 internal_function
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * dest_nodes)1801 update_cur_sifted_state (const re_match_context_t *mctx,
1802 			 re_sift_context_t *sctx, Idx str_idx,
1803 			 re_node_set *dest_nodes)
1804 {
1805   const re_dfa_t *const dfa = mctx->dfa;
1806   reg_errcode_t err = REG_NOERROR;
1807   const re_node_set *candidates;
1808   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1809 		: &mctx->state_log[str_idx]->nodes);
1810 
1811   if (dest_nodes->nelem == 0)
1812     sctx->sifted_states[str_idx] = NULL;
1813   else
1814     {
1815       if (candidates)
1816 	{
1817 	  /* At first, add the nodes which can epsilon transit to a node in
1818 	     DEST_NODE.  */
1819 	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1820 	  if (BE (err != REG_NOERROR, 0))
1821 	    return err;
1822 
1823 	  /* Then, check the limitations in the current sift_context.  */
1824 	  if (sctx->limits.nelem)
1825 	    {
1826 	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1827 					 mctx->bkref_ents, str_idx);
1828 	      if (BE (err != REG_NOERROR, 0))
1829 		return err;
1830 	    }
1831 	}
1832 
1833       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1834       if (BE (err != REG_NOERROR, 0))
1835 	return err;
1836     }
1837 
1838   if (candidates && mctx->state_log[str_idx]->has_backref)
1839     {
1840       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1841       if (BE (err != REG_NOERROR, 0))
1842 	return err;
1843     }
1844   return REG_NOERROR;
1845 }
1846 
1847 static reg_errcode_t
1848 internal_function __attribute_warn_unused_result__
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)1849 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1850 		       const re_node_set *candidates)
1851 {
1852   reg_errcode_t err = REG_NOERROR;
1853   Idx i;
1854 
1855   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1856   if (BE (err != REG_NOERROR, 0))
1857     return err;
1858 
1859   if (!state->inveclosure.alloc)
1860     {
1861       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1862       if (BE (err != REG_NOERROR, 0))
1863 	return REG_ESPACE;
1864       for (i = 0; i < dest_nodes->nelem; i++)
1865 	{
1866 	  err = re_node_set_merge (&state->inveclosure,
1867 				   dfa->inveclosures + dest_nodes->elems[i]);
1868 	  if (BE (err != REG_NOERROR, 0))
1869 	    return REG_ESPACE;
1870 	}
1871     }
1872   return re_node_set_add_intersect (dest_nodes, candidates,
1873 				    &state->inveclosure);
1874 }
1875 
1876 static reg_errcode_t
1877 internal_function
sub_epsilon_src_nodes(const re_dfa_t * dfa,Idx node,re_node_set * dest_nodes,const re_node_set * candidates)1878 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1879 		       const re_node_set *candidates)
1880 {
1881     Idx ecl_idx;
1882     reg_errcode_t err;
1883     re_node_set *inv_eclosure = dfa->inveclosures + node;
1884     re_node_set except_nodes;
1885     re_node_set_init_empty (&except_nodes);
1886     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1887       {
1888 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1889 	if (cur_node == node)
1890 	  continue;
1891 	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1892 	  {
1893 	    Idx edst1 = dfa->edests[cur_node].elems[0];
1894 	    Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1895 			 ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1896 	    if ((!re_node_set_contains (inv_eclosure, edst1)
1897 		 && re_node_set_contains (dest_nodes, edst1))
1898 		|| (REG_VALID_NONZERO_INDEX (edst2)
1899 		    && !re_node_set_contains (inv_eclosure, edst2)
1900 		    && re_node_set_contains (dest_nodes, edst2)))
1901 	      {
1902 		err = re_node_set_add_intersect (&except_nodes, candidates,
1903 						 dfa->inveclosures + cur_node);
1904 		if (BE (err != REG_NOERROR, 0))
1905 		  {
1906 		    re_node_set_free (&except_nodes);
1907 		    return err;
1908 		  }
1909 	      }
1910 	  }
1911       }
1912     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1913       {
1914 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1915 	if (!re_node_set_contains (&except_nodes, cur_node))
1916 	  {
1917 	    Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1918 	    re_node_set_remove_at (dest_nodes, idx);
1919 	  }
1920       }
1921     re_node_set_free (&except_nodes);
1922     return REG_NOERROR;
1923 }
1924 
1925 static bool
1926 internal_function
check_dst_limits(const re_match_context_t * mctx,const re_node_set * limits,Idx dst_node,Idx dst_idx,Idx src_node,Idx src_idx)1927 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1928 		  Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1929 {
1930   const re_dfa_t *const dfa = mctx->dfa;
1931   Idx lim_idx, src_pos, dst_pos;
1932 
1933   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1934   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1935   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1936     {
1937       Idx subexp_idx;
1938       struct re_backref_cache_entry *ent;
1939       ent = mctx->bkref_ents + limits->elems[lim_idx];
1940       subexp_idx = dfa->nodes[ent->node].opr.idx;
1941 
1942       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1943 					   subexp_idx, dst_node, dst_idx,
1944 					   dst_bkref_idx);
1945       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1946 					   subexp_idx, src_node, src_idx,
1947 					   src_bkref_idx);
1948 
1949       /* In case of:
1950 	 <src> <dst> ( <subexp> )
1951 	 ( <subexp> ) <src> <dst>
1952 	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1953       if (src_pos == dst_pos)
1954 	continue; /* This is unrelated limitation.  */
1955       else
1956 	return true;
1957     }
1958   return false;
1959 }
1960 
1961 static int
1962 internal_function
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,Idx subexp_idx,Idx from_node,Idx bkref_idx)1963 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1964 			     Idx subexp_idx, Idx from_node, Idx bkref_idx)
1965 {
1966   const re_dfa_t *const dfa = mctx->dfa;
1967   const re_node_set *eclosures = dfa->eclosures + from_node;
1968   Idx node_idx;
1969 
1970   /* Else, we are on the boundary: examine the nodes on the epsilon
1971      closure.  */
1972   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1973     {
1974       Idx node = eclosures->elems[node_idx];
1975       switch (dfa->nodes[node].type)
1976 	{
1977 	case OP_BACK_REF:
1978 	  if (bkref_idx != REG_MISSING)
1979 	    {
1980 	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1981 	      do
1982 		{
1983 		  Idx dst;
1984 		  int cpos;
1985 
1986 		  if (ent->node != node)
1987 		    continue;
1988 
1989 		  if (subexp_idx < BITSET_WORD_BITS
1990 		      && !(ent->eps_reachable_subexps_map
1991 			   & ((bitset_word_t) 1 << subexp_idx)))
1992 		    continue;
1993 
1994 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1995 		     OP_CLOSE_SUBEXP cases below.  But, if the
1996 		     destination node is the same node as the source
1997 		     node, don't recurse because it would cause an
1998 		     infinite loop: a regex that exhibits this behavior
1999 		     is ()\1*\1*  */
2000 		  dst = dfa->edests[node].elems[0];
2001 		  if (dst == from_node)
2002 		    {
2003 		      if (boundaries & 1)
2004 			return -1;
2005 		      else /* if (boundaries & 2) */
2006 			return 0;
2007 		    }
2008 
2009 		  cpos =
2010 		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2011 						 dst, bkref_idx);
2012 		  if (cpos == -1 /* && (boundaries & 1) */)
2013 		    return -1;
2014 		  if (cpos == 0 && (boundaries & 2))
2015 		    return 0;
2016 
2017 		  if (subexp_idx < BITSET_WORD_BITS)
2018 		    ent->eps_reachable_subexps_map
2019 		      &= ~((bitset_word_t) 1 << subexp_idx);
2020 		}
2021 	      while (ent++->more);
2022 	    }
2023 	  break;
2024 
2025 	case OP_OPEN_SUBEXP:
2026 	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2027 	    return -1;
2028 	  break;
2029 
2030 	case OP_CLOSE_SUBEXP:
2031 	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2032 	    return 0;
2033 	  break;
2034 
2035 	default:
2036 	    break;
2037 	}
2038     }
2039 
2040   return (boundaries & 2) ? 1 : 0;
2041 }
2042 
2043 static int
2044 internal_function
check_dst_limits_calc_pos(const re_match_context_t * mctx,Idx limit,Idx subexp_idx,Idx from_node,Idx str_idx,Idx bkref_idx)2045 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
2046 			   Idx subexp_idx, Idx from_node, Idx str_idx,
2047 			   Idx bkref_idx)
2048 {
2049   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2050   int boundaries;
2051 
2052   /* If we are outside the range of the subexpression, return -1 or 1.  */
2053   if (str_idx < lim->subexp_from)
2054     return -1;
2055 
2056   if (lim->subexp_to < str_idx)
2057     return 1;
2058 
2059   /* If we are within the subexpression, return 0.  */
2060   boundaries = (str_idx == lim->subexp_from);
2061   boundaries |= (str_idx == lim->subexp_to) << 1;
2062   if (boundaries == 0)
2063     return 0;
2064 
2065   /* Else, examine epsilon closure.  */
2066   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2067 				      from_node, bkref_idx);
2068 }
2069 
2070 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2071    which are against limitations from DEST_NODES. */
2072 
2073 static reg_errcode_t
2074 internal_function
check_subexp_limits(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,Idx str_idx)2075 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2076 		     const re_node_set *candidates, re_node_set *limits,
2077 		     struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2078 {
2079   reg_errcode_t err;
2080   Idx node_idx, lim_idx;
2081 
2082   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2083     {
2084       Idx subexp_idx;
2085       struct re_backref_cache_entry *ent;
2086       ent = bkref_ents + limits->elems[lim_idx];
2087 
2088       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2089 	continue; /* This is unrelated limitation.  */
2090 
2091       subexp_idx = dfa->nodes[ent->node].opr.idx;
2092       if (ent->subexp_to == str_idx)
2093 	{
2094 	  Idx ops_node = REG_MISSING;
2095 	  Idx cls_node = REG_MISSING;
2096 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2097 	    {
2098 	      Idx node = dest_nodes->elems[node_idx];
2099 	      re_token_type_t type = dfa->nodes[node].type;
2100 	      if (type == OP_OPEN_SUBEXP
2101 		  && subexp_idx == dfa->nodes[node].opr.idx)
2102 		ops_node = node;
2103 	      else if (type == OP_CLOSE_SUBEXP
2104 		       && subexp_idx == dfa->nodes[node].opr.idx)
2105 		cls_node = node;
2106 	    }
2107 
2108 	  /* Check the limitation of the open subexpression.  */
2109 	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2110 	  if (REG_VALID_INDEX (ops_node))
2111 	    {
2112 	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2113 					   candidates);
2114 	      if (BE (err != REG_NOERROR, 0))
2115 		return err;
2116 	    }
2117 
2118 	  /* Check the limitation of the close subexpression.  */
2119 	  if (REG_VALID_INDEX (cls_node))
2120 	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2121 	      {
2122 		Idx node = dest_nodes->elems[node_idx];
2123 		if (!re_node_set_contains (dfa->inveclosures + node,
2124 					   cls_node)
2125 		    && !re_node_set_contains (dfa->eclosures + node,
2126 					      cls_node))
2127 		  {
2128 		    /* It is against this limitation.
2129 		       Remove it form the current sifted state.  */
2130 		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2131 						 candidates);
2132 		    if (BE (err != REG_NOERROR, 0))
2133 		      return err;
2134 		    --node_idx;
2135 		  }
2136 	      }
2137 	}
2138       else /* (ent->subexp_to != str_idx)  */
2139 	{
2140 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2141 	    {
2142 	      Idx node = dest_nodes->elems[node_idx];
2143 	      re_token_type_t type = dfa->nodes[node].type;
2144 	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2145 		{
2146 		  if (subexp_idx != dfa->nodes[node].opr.idx)
2147 		    continue;
2148 		  /* It is against this limitation.
2149 		     Remove it form the current sifted state.  */
2150 		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2151 					       candidates);
2152 		  if (BE (err != REG_NOERROR, 0))
2153 		    return err;
2154 		}
2155 	    }
2156 	}
2157     }
2158   return REG_NOERROR;
2159 }
2160 
2161 static reg_errcode_t
2162 internal_function __attribute_warn_unused_result__
sift_states_bkref(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,const re_node_set * candidates)2163 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2164 		   Idx str_idx, const re_node_set *candidates)
2165 {
2166   const re_dfa_t *const dfa = mctx->dfa;
2167   reg_errcode_t err;
2168   Idx node_idx, node;
2169   re_sift_context_t local_sctx;
2170   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2171 
2172   if (first_idx == REG_MISSING)
2173     return REG_NOERROR;
2174 
2175   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2176 
2177   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2178     {
2179       Idx enabled_idx;
2180       re_token_type_t type;
2181       struct re_backref_cache_entry *entry;
2182       node = candidates->elems[node_idx];
2183       type = dfa->nodes[node].type;
2184       /* Avoid infinite loop for the REs like "()\1+".  */
2185       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2186 	continue;
2187       if (type != OP_BACK_REF)
2188 	continue;
2189 
2190       entry = mctx->bkref_ents + first_idx;
2191       enabled_idx = first_idx;
2192       do
2193 	{
2194 	  Idx subexp_len;
2195 	  Idx to_idx;
2196 	  Idx dst_node;
2197 	  bool ok;
2198 	  re_dfastate_t *cur_state;
2199 
2200 	  if (entry->node != node)
2201 	    continue;
2202 	  subexp_len = entry->subexp_to - entry->subexp_from;
2203 	  to_idx = str_idx + subexp_len;
2204 	  dst_node = (subexp_len ? dfa->nexts[node]
2205 		      : dfa->edests[node].elems[0]);
2206 
2207 	  if (to_idx > sctx->last_str_idx
2208 	      || sctx->sifted_states[to_idx] == NULL
2209 	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2210 	      || check_dst_limits (mctx, &sctx->limits, node,
2211 				   str_idx, dst_node, to_idx))
2212 	    continue;
2213 
2214 	  if (local_sctx.sifted_states == NULL)
2215 	    {
2216 	      local_sctx = *sctx;
2217 	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2218 	      if (BE (err != REG_NOERROR, 0))
2219 		goto free_return;
2220 	    }
2221 	  local_sctx.last_node = node;
2222 	  local_sctx.last_str_idx = str_idx;
2223 	  ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2224 	  if (BE (! ok, 0))
2225 	    {
2226 	      err = REG_ESPACE;
2227 	      goto free_return;
2228 	    }
2229 	  cur_state = local_sctx.sifted_states[str_idx];
2230 	  err = sift_states_backward (mctx, &local_sctx);
2231 	  if (BE (err != REG_NOERROR, 0))
2232 	    goto free_return;
2233 	  if (sctx->limited_states != NULL)
2234 	    {
2235 	      err = merge_state_array (dfa, sctx->limited_states,
2236 				       local_sctx.sifted_states,
2237 				       str_idx + 1);
2238 	      if (BE (err != REG_NOERROR, 0))
2239 		goto free_return;
2240 	    }
2241 	  local_sctx.sifted_states[str_idx] = cur_state;
2242 	  re_node_set_remove (&local_sctx.limits, enabled_idx);
2243 
2244 	  /* mctx->bkref_ents may have changed, reload the pointer.  */
2245 	  entry = mctx->bkref_ents + enabled_idx;
2246 	}
2247       while (enabled_idx++, entry++->more);
2248     }
2249   err = REG_NOERROR;
2250  free_return:
2251   if (local_sctx.sifted_states != NULL)
2252     {
2253       re_node_set_free (&local_sctx.limits);
2254     }
2255 
2256   return err;
2257 }
2258 
2259 
2260 #ifdef RE_ENABLE_I18N
2261 static int
2262 internal_function
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx node_idx,Idx str_idx,Idx max_str_idx)2263 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2264 		     Idx node_idx, Idx str_idx, Idx max_str_idx)
2265 {
2266   const re_dfa_t *const dfa = mctx->dfa;
2267   int naccepted;
2268   /* Check the node can accept "multi byte".  */
2269   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2270   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2271       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2272 			    dfa->nexts[node_idx]))
2273     /* The node can't accept the "multi byte", or the
2274        destination was already thrown away, then the node
2275        could't accept the current input "multi byte".   */
2276     naccepted = 0;
2277   /* Otherwise, it is sure that the node could accept
2278      'naccepted' bytes input.  */
2279   return naccepted;
2280 }
2281 #endif /* RE_ENABLE_I18N */
2282 
2283 
2284 /* Functions for state transition.  */
2285 
2286 /* Return the next state to which the current state STATE will transit by
2287    accepting the current input byte, and update STATE_LOG if necessary.
2288    If STATE can accept a multibyte char/collating element/back reference
2289    update the destination of STATE_LOG.  */
2290 
2291 static re_dfastate_t *
2292 internal_function __attribute_warn_unused_result__
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)2293 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2294 	       re_dfastate_t *state)
2295 {
2296   re_dfastate_t **trtable;
2297   unsigned char ch;
2298 
2299 #ifdef RE_ENABLE_I18N
2300   /* If the current state can accept multibyte.  */
2301   if (BE (state->accept_mb, 0))
2302     {
2303       *err = transit_state_mb (mctx, state);
2304       if (BE (*err != REG_NOERROR, 0))
2305 	return NULL;
2306     }
2307 #endif /* RE_ENABLE_I18N */
2308 
2309   /* Then decide the next state with the single byte.  */
2310 #if 0
2311   if (0)
2312     /* don't use transition table  */
2313     return transit_state_sb (err, mctx, state);
2314 #endif
2315 
2316   /* Use transition table  */
2317   ch = re_string_fetch_byte (&mctx->input);
2318   for (;;)
2319     {
2320       trtable = state->trtable;
2321       if (BE (trtable != NULL, 1))
2322 	return trtable[ch];
2323 
2324       trtable = state->word_trtable;
2325       if (BE (trtable != NULL, 1))
2326 	{
2327 	  unsigned int context;
2328 	  context
2329 	    = re_string_context_at (&mctx->input,
2330 				    re_string_cur_idx (&mctx->input) - 1,
2331 				    mctx->eflags);
2332 	  if (IS_WORD_CONTEXT (context))
2333 	    return trtable[ch + SBC_MAX];
2334 	  else
2335 	    return trtable[ch];
2336 	}
2337 
2338       if (!build_trtable (mctx->dfa, state))
2339 	{
2340 	  *err = REG_ESPACE;
2341 	  return NULL;
2342 	}
2343 
2344       /* Retry, we now have a transition table.  */
2345     }
2346 }
2347 
2348 /* Update the state_log if we need */
2349 static re_dfastate_t *
2350 internal_function
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)2351 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2352 		      re_dfastate_t *next_state)
2353 {
2354   const re_dfa_t *const dfa = mctx->dfa;
2355   Idx cur_idx = re_string_cur_idx (&mctx->input);
2356 
2357   if (cur_idx > mctx->state_log_top)
2358     {
2359       mctx->state_log[cur_idx] = next_state;
2360       mctx->state_log_top = cur_idx;
2361     }
2362   else if (mctx->state_log[cur_idx] == 0)
2363     {
2364       mctx->state_log[cur_idx] = next_state;
2365     }
2366   else
2367     {
2368       re_dfastate_t *pstate;
2369       unsigned int context;
2370       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2371       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2372 	 the destination of a multibyte char/collating element/
2373 	 back reference.  Then the next state is the union set of
2374 	 these destinations and the results of the transition table.  */
2375       pstate = mctx->state_log[cur_idx];
2376       log_nodes = pstate->entrance_nodes;
2377       if (next_state != NULL)
2378 	{
2379 	  table_nodes = next_state->entrance_nodes;
2380 	  *err = re_node_set_init_union (&next_nodes, table_nodes,
2381 					     log_nodes);
2382 	  if (BE (*err != REG_NOERROR, 0))
2383 	    return NULL;
2384 	}
2385       else
2386 	next_nodes = *log_nodes;
2387       /* Note: We already add the nodes of the initial state,
2388 	 then we don't need to add them here.  */
2389 
2390       context = re_string_context_at (&mctx->input,
2391 				      re_string_cur_idx (&mctx->input) - 1,
2392 				      mctx->eflags);
2393       next_state = mctx->state_log[cur_idx]
2394 	= re_acquire_state_context (err, dfa, &next_nodes, context);
2395       /* We don't need to check errors here, since the return value of
2396 	 this function is next_state and ERR is already set.  */
2397 
2398       if (table_nodes != NULL)
2399 	re_node_set_free (&next_nodes);
2400     }
2401 
2402   if (BE (dfa->nbackref, 0) && next_state != NULL)
2403     {
2404       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2405 	 later.  We must check them here, since the back references in the
2406 	 next state might use them.  */
2407       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2408 					cur_idx);
2409       if (BE (*err != REG_NOERROR, 0))
2410 	return NULL;
2411 
2412       /* If the next state has back references.  */
2413       if (next_state->has_backref)
2414 	{
2415 	  *err = transit_state_bkref (mctx, &next_state->nodes);
2416 	  if (BE (*err != REG_NOERROR, 0))
2417 	    return NULL;
2418 	  next_state = mctx->state_log[cur_idx];
2419 	}
2420     }
2421 
2422   return next_state;
2423 }
2424 
2425 /* Skip bytes in the input that correspond to part of a
2426    multi-byte match, then look in the log for a state
2427    from which to restart matching.  */
2428 static re_dfastate_t *
2429 internal_function
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2430 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2431 {
2432   re_dfastate_t *cur_state;
2433   do
2434     {
2435       Idx max = mctx->state_log_top;
2436       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2437 
2438       do
2439 	{
2440 	  if (++cur_str_idx > max)
2441 	    return NULL;
2442 	  re_string_skip_bytes (&mctx->input, 1);
2443 	}
2444       while (mctx->state_log[cur_str_idx] == NULL);
2445 
2446       cur_state = merge_state_with_log (err, mctx, NULL);
2447     }
2448   while (*err == REG_NOERROR && cur_state == NULL);
2449   return cur_state;
2450 }
2451 
2452 /* Helper functions for transit_state.  */
2453 
2454 /* From the node set CUR_NODES, pick up the nodes whose types are
2455    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2456    expression. And register them to use them later for evaluating the
2457    corresponding back references.  */
2458 
2459 static reg_errcode_t
2460 internal_function
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,Idx str_idx)2461 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2462 			   Idx str_idx)
2463 {
2464   const re_dfa_t *const dfa = mctx->dfa;
2465   Idx node_idx;
2466   reg_errcode_t err;
2467 
2468   /* TODO: This isn't efficient.
2469 	   Because there might be more than one nodes whose types are
2470 	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2471 	   nodes.
2472 	   E.g. RE: (a){2}  */
2473   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2474     {
2475       Idx node = cur_nodes->elems[node_idx];
2476       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2477 	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2478 	  && (dfa->used_bkref_map
2479 	      & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2480 	{
2481 	  err = match_ctx_add_subtop (mctx, node, str_idx);
2482 	  if (BE (err != REG_NOERROR, 0))
2483 	    return err;
2484 	}
2485     }
2486   return REG_NOERROR;
2487 }
2488 
2489 #if 0
2490 /* Return the next state to which the current state STATE will transit by
2491    accepting the current input byte.  */
2492 
2493 static re_dfastate_t *
2494 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2495 		  re_dfastate_t *state)
2496 {
2497   const re_dfa_t *const dfa = mctx->dfa;
2498   re_node_set next_nodes;
2499   re_dfastate_t *next_state;
2500   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2501   unsigned int context;
2502 
2503   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2504   if (BE (*err != REG_NOERROR, 0))
2505     return NULL;
2506   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2507     {
2508       Idx cur_node = state->nodes.elems[node_cnt];
2509       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2510 	{
2511 	  *err = re_node_set_merge (&next_nodes,
2512 				    dfa->eclosures + dfa->nexts[cur_node]);
2513 	  if (BE (*err != REG_NOERROR, 0))
2514 	    {
2515 	      re_node_set_free (&next_nodes);
2516 	      return NULL;
2517 	    }
2518 	}
2519     }
2520   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2521   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2522   /* We don't need to check errors here, since the return value of
2523      this function is next_state and ERR is already set.  */
2524 
2525   re_node_set_free (&next_nodes);
2526   re_string_skip_bytes (&mctx->input, 1);
2527   return next_state;
2528 }
2529 #endif
2530 
2531 #ifdef RE_ENABLE_I18N
2532 static reg_errcode_t
2533 internal_function
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)2534 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2535 {
2536   const re_dfa_t *const dfa = mctx->dfa;
2537   reg_errcode_t err;
2538   Idx i;
2539 
2540   for (i = 0; i < pstate->nodes.nelem; ++i)
2541     {
2542       re_node_set dest_nodes, *new_nodes;
2543       Idx cur_node_idx = pstate->nodes.elems[i];
2544       int naccepted;
2545       Idx dest_idx;
2546       unsigned int context;
2547       re_dfastate_t *dest_state;
2548 
2549       if (!dfa->nodes[cur_node_idx].accept_mb)
2550 	continue;
2551 
2552       if (dfa->nodes[cur_node_idx].constraint)
2553 	{
2554 	  context = re_string_context_at (&mctx->input,
2555 					  re_string_cur_idx (&mctx->input),
2556 					  mctx->eflags);
2557 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2558 					   context))
2559 	    continue;
2560 	}
2561 
2562       /* How many bytes the node can accept?  */
2563       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2564 					   re_string_cur_idx (&mctx->input));
2565       if (naccepted == 0)
2566 	continue;
2567 
2568       /* The node can accepts 'naccepted' bytes.  */
2569       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2570       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2571 			       : mctx->max_mb_elem_len);
2572       err = clean_state_log_if_needed (mctx, dest_idx);
2573       if (BE (err != REG_NOERROR, 0))
2574 	return err;
2575 #ifdef DEBUG
2576       assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2577 #endif
2578       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2579 
2580       dest_state = mctx->state_log[dest_idx];
2581       if (dest_state == NULL)
2582 	dest_nodes = *new_nodes;
2583       else
2584 	{
2585 	  err = re_node_set_init_union (&dest_nodes,
2586 					dest_state->entrance_nodes, new_nodes);
2587 	  if (BE (err != REG_NOERROR, 0))
2588 	    return err;
2589 	}
2590       context = re_string_context_at (&mctx->input, dest_idx - 1,
2591 				      mctx->eflags);
2592       mctx->state_log[dest_idx]
2593 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2594       if (dest_state != NULL)
2595 	re_node_set_free (&dest_nodes);
2596       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2597 	return err;
2598     }
2599   return REG_NOERROR;
2600 }
2601 #endif /* RE_ENABLE_I18N */
2602 
2603 static reg_errcode_t
2604 internal_function
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)2605 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2606 {
2607   const re_dfa_t *const dfa = mctx->dfa;
2608   reg_errcode_t err;
2609   Idx i;
2610   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2611 
2612   for (i = 0; i < nodes->nelem; ++i)
2613     {
2614       Idx dest_str_idx, prev_nelem, bkc_idx;
2615       Idx node_idx = nodes->elems[i];
2616       unsigned int context;
2617       const re_token_t *node = dfa->nodes + node_idx;
2618       re_node_set *new_dest_nodes;
2619 
2620       /* Check whether 'node' is a backreference or not.  */
2621       if (node->type != OP_BACK_REF)
2622 	continue;
2623 
2624       if (node->constraint)
2625 	{
2626 	  context = re_string_context_at (&mctx->input, cur_str_idx,
2627 					  mctx->eflags);
2628 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2629 	    continue;
2630 	}
2631 
2632       /* 'node' is a backreference.
2633 	 Check the substring which the substring matched.  */
2634       bkc_idx = mctx->nbkref_ents;
2635       err = get_subexp (mctx, node_idx, cur_str_idx);
2636       if (BE (err != REG_NOERROR, 0))
2637 	goto free_return;
2638 
2639       /* And add the epsilon closures (which is 'new_dest_nodes') of
2640 	 the backreference to appropriate state_log.  */
2641 #ifdef DEBUG
2642       assert (dfa->nexts[node_idx] != REG_MISSING);
2643 #endif
2644       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2645 	{
2646 	  Idx subexp_len;
2647 	  re_dfastate_t *dest_state;
2648 	  struct re_backref_cache_entry *bkref_ent;
2649 	  bkref_ent = mctx->bkref_ents + bkc_idx;
2650 	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2651 	    continue;
2652 	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2653 	  new_dest_nodes = (subexp_len == 0
2654 			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2655 			    : dfa->eclosures + dfa->nexts[node_idx]);
2656 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2657 			  - bkref_ent->subexp_from);
2658 	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2659 					  mctx->eflags);
2660 	  dest_state = mctx->state_log[dest_str_idx];
2661 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2662 			: mctx->state_log[cur_str_idx]->nodes.nelem);
2663 	  /* Add 'new_dest_node' to state_log.  */
2664 	  if (dest_state == NULL)
2665 	    {
2666 	      mctx->state_log[dest_str_idx]
2667 		= re_acquire_state_context (&err, dfa, new_dest_nodes,
2668 					    context);
2669 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2670 		      && err != REG_NOERROR, 0))
2671 		goto free_return;
2672 	    }
2673 	  else
2674 	    {
2675 	      re_node_set dest_nodes;
2676 	      err = re_node_set_init_union (&dest_nodes,
2677 					    dest_state->entrance_nodes,
2678 					    new_dest_nodes);
2679 	      if (BE (err != REG_NOERROR, 0))
2680 		{
2681 		  re_node_set_free (&dest_nodes);
2682 		  goto free_return;
2683 		}
2684 	      mctx->state_log[dest_str_idx]
2685 		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2686 	      re_node_set_free (&dest_nodes);
2687 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2688 		      && err != REG_NOERROR, 0))
2689 		goto free_return;
2690 	    }
2691 	  /* We need to check recursively if the backreference can epsilon
2692 	     transit.  */
2693 	  if (subexp_len == 0
2694 	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2695 	    {
2696 	      err = check_subexp_matching_top (mctx, new_dest_nodes,
2697 					       cur_str_idx);
2698 	      if (BE (err != REG_NOERROR, 0))
2699 		goto free_return;
2700 	      err = transit_state_bkref (mctx, new_dest_nodes);
2701 	      if (BE (err != REG_NOERROR, 0))
2702 		goto free_return;
2703 	    }
2704 	}
2705     }
2706   err = REG_NOERROR;
2707  free_return:
2708   return err;
2709 }
2710 
2711 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2712    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2713    Note that we might collect inappropriate candidates here.
2714    However, the cost of checking them strictly here is too high, then we
2715    delay these checking for prune_impossible_nodes().  */
2716 
2717 static reg_errcode_t
2718 internal_function __attribute_warn_unused_result__
get_subexp(re_match_context_t * mctx,Idx bkref_node,Idx bkref_str_idx)2719 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2720 {
2721   const re_dfa_t *const dfa = mctx->dfa;
2722   Idx subexp_num, sub_top_idx;
2723   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2724   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2725   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2726   if (cache_idx != REG_MISSING)
2727     {
2728       const struct re_backref_cache_entry *entry
2729 	= mctx->bkref_ents + cache_idx;
2730       do
2731 	if (entry->node == bkref_node)
2732 	  return REG_NOERROR; /* We already checked it.  */
2733       while (entry++->more);
2734     }
2735 
2736   subexp_num = dfa->nodes[bkref_node].opr.idx;
2737 
2738   /* For each sub expression  */
2739   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2740     {
2741       reg_errcode_t err;
2742       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2743       re_sub_match_last_t *sub_last;
2744       Idx sub_last_idx, sl_str, bkref_str_off;
2745 
2746       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2747 	continue; /* It isn't related.  */
2748 
2749       sl_str = sub_top->str_idx;
2750       bkref_str_off = bkref_str_idx;
2751       /* At first, check the last node of sub expressions we already
2752 	 evaluated.  */
2753       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2754 	{
2755 	  regoff_t sl_str_diff;
2756 	  sub_last = sub_top->lasts[sub_last_idx];
2757 	  sl_str_diff = sub_last->str_idx - sl_str;
2758 	  /* The matched string by the sub expression match with the substring
2759 	     at the back reference?  */
2760 	  if (sl_str_diff > 0)
2761 	    {
2762 	      if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2763 		{
2764 		  /* Not enough chars for a successful match.  */
2765 		  if (bkref_str_off + sl_str_diff > mctx->input.len)
2766 		    break;
2767 
2768 		  err = clean_state_log_if_needed (mctx,
2769 						   bkref_str_off
2770 						   + sl_str_diff);
2771 		  if (BE (err != REG_NOERROR, 0))
2772 		    return err;
2773 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2774 		}
2775 	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2776 		/* We don't need to search this sub expression any more.  */
2777 		break;
2778 	    }
2779 	  bkref_str_off += sl_str_diff;
2780 	  sl_str += sl_str_diff;
2781 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2782 				bkref_str_idx);
2783 
2784 	  /* Reload buf, since the preceding call might have reallocated
2785 	     the buffer.  */
2786 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2787 
2788 	  if (err == REG_NOMATCH)
2789 	    continue;
2790 	  if (BE (err != REG_NOERROR, 0))
2791 	    return err;
2792 	}
2793 
2794       if (sub_last_idx < sub_top->nlasts)
2795 	continue;
2796       if (sub_last_idx > 0)
2797 	++sl_str;
2798       /* Then, search for the other last nodes of the sub expression.  */
2799       for (; sl_str <= bkref_str_idx; ++sl_str)
2800 	{
2801 	  Idx cls_node;
2802 	  regoff_t sl_str_off;
2803 	  const re_node_set *nodes;
2804 	  sl_str_off = sl_str - sub_top->str_idx;
2805 	  /* The matched string by the sub expression match with the substring
2806 	     at the back reference?  */
2807 	  if (sl_str_off > 0)
2808 	    {
2809 	      if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2810 		{
2811 		  /* If we are at the end of the input, we cannot match.  */
2812 		  if (bkref_str_off >= mctx->input.len)
2813 		    break;
2814 
2815 		  err = extend_buffers (mctx);
2816 		  if (BE (err != REG_NOERROR, 0))
2817 		    return err;
2818 
2819 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2820 		}
2821 	      if (buf [bkref_str_off++] != buf[sl_str - 1])
2822 		break; /* We don't need to search this sub expression
2823 			  any more.  */
2824 	    }
2825 	  if (mctx->state_log[sl_str] == NULL)
2826 	    continue;
2827 	  /* Does this state have a ')' of the sub expression?  */
2828 	  nodes = &mctx->state_log[sl_str]->nodes;
2829 	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
2830 				       OP_CLOSE_SUBEXP);
2831 	  if (cls_node == REG_MISSING)
2832 	    continue; /* No.  */
2833 	  if (sub_top->path == NULL)
2834 	    {
2835 	      sub_top->path = calloc (sizeof (state_array_t),
2836 				      sl_str - sub_top->str_idx + 1);
2837 	      if (sub_top->path == NULL)
2838 		return REG_ESPACE;
2839 	    }
2840 	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2841 	     in the current context?  */
2842 	  err = check_arrival (mctx, sub_top->path, sub_top->node,
2843 			       sub_top->str_idx, cls_node, sl_str,
2844 			       OP_CLOSE_SUBEXP);
2845 	  if (err == REG_NOMATCH)
2846 	      continue;
2847 	  if (BE (err != REG_NOERROR, 0))
2848 	      return err;
2849 	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2850 	  if (BE (sub_last == NULL, 0))
2851 	    return REG_ESPACE;
2852 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2853 				bkref_str_idx);
2854 	  if (err == REG_NOMATCH)
2855 	    continue;
2856 	}
2857     }
2858   return REG_NOERROR;
2859 }
2860 
2861 /* Helper functions for get_subexp().  */
2862 
2863 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2864    If it can arrive, register the sub expression expressed with SUB_TOP
2865    and SUB_LAST.  */
2866 
2867 static reg_errcode_t
2868 internal_function
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,Idx bkref_node,Idx bkref_str)2869 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2870 		re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2871 {
2872   reg_errcode_t err;
2873   Idx to_idx;
2874   /* Can the subexpression arrive the back reference?  */
2875   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2876 		       sub_last->str_idx, bkref_node, bkref_str,
2877 		       OP_OPEN_SUBEXP);
2878   if (err != REG_NOERROR)
2879     return err;
2880   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2881 			     sub_last->str_idx);
2882   if (BE (err != REG_NOERROR, 0))
2883     return err;
2884   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2885   return clean_state_log_if_needed (mctx, to_idx);
2886 }
2887 
2888 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2889    Search '(' if FL_OPEN, or search ')' otherwise.
2890    TODO: This function isn't efficient...
2891 	 Because there might be more than one nodes whose types are
2892 	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2893 	 nodes.
2894 	 E.g. RE: (a){2}  */
2895 
2896 static Idx
2897 internal_function
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,Idx subexp_idx,int type)2898 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2899 		  Idx subexp_idx, int type)
2900 {
2901   Idx cls_idx;
2902   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2903     {
2904       Idx cls_node = nodes->elems[cls_idx];
2905       const re_token_t *node = dfa->nodes + cls_node;
2906       if (node->type == type
2907 	  && node->opr.idx == subexp_idx)
2908 	return cls_node;
2909     }
2910   return REG_MISSING;
2911 }
2912 
2913 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2914    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2915    heavily reused.
2916    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2917 
2918 static reg_errcode_t
2919 internal_function __attribute_warn_unused_result__
check_arrival(re_match_context_t * mctx,state_array_t * path,Idx top_node,Idx top_str,Idx last_node,Idx last_str,int type)2920 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2921 	       Idx top_str, Idx last_node, Idx last_str, int type)
2922 {
2923   const re_dfa_t *const dfa = mctx->dfa;
2924   reg_errcode_t err = REG_NOERROR;
2925   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2926   re_dfastate_t *cur_state = NULL;
2927   re_node_set *cur_nodes, next_nodes;
2928   re_dfastate_t **backup_state_log;
2929   unsigned int context;
2930 
2931   subexp_num = dfa->nodes[top_node].opr.idx;
2932   /* Extend the buffer if we need.  */
2933   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2934     {
2935       re_dfastate_t **new_array;
2936       Idx old_alloc = path->alloc;
2937       Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2938       Idx new_alloc;
2939       if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
2940 	return REG_ESPACE;
2941       new_alloc = old_alloc + incr_alloc;
2942       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2943 	return REG_ESPACE;
2944       new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2945       if (BE (new_array == NULL, 0))
2946 	return REG_ESPACE;
2947       path->array = new_array;
2948       path->alloc = new_alloc;
2949       memset (new_array + old_alloc, '\0',
2950 	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2951     }
2952 
2953   str_idx = path->next_idx ? path->next_idx : top_str;
2954 
2955   /* Temporary modify MCTX.  */
2956   backup_state_log = mctx->state_log;
2957   backup_cur_idx = mctx->input.cur_idx;
2958   mctx->state_log = path->array;
2959   mctx->input.cur_idx = str_idx;
2960 
2961   /* Setup initial node set.  */
2962   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2963   if (str_idx == top_str)
2964     {
2965       err = re_node_set_init_1 (&next_nodes, top_node);
2966       if (BE (err != REG_NOERROR, 0))
2967 	return err;
2968       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2969       if (BE (err != REG_NOERROR, 0))
2970 	{
2971 	  re_node_set_free (&next_nodes);
2972 	  return err;
2973 	}
2974     }
2975   else
2976     {
2977       cur_state = mctx->state_log[str_idx];
2978       if (cur_state && cur_state->has_backref)
2979 	{
2980 	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2981 	  if (BE (err != REG_NOERROR, 0))
2982 	    return err;
2983 	}
2984       else
2985 	re_node_set_init_empty (&next_nodes);
2986     }
2987   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2988     {
2989       if (next_nodes.nelem)
2990 	{
2991 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2992 				    subexp_num, type);
2993 	  if (BE (err != REG_NOERROR, 0))
2994 	    {
2995 	      re_node_set_free (&next_nodes);
2996 	      return err;
2997 	    }
2998 	}
2999       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3000       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3001 	{
3002 	  re_node_set_free (&next_nodes);
3003 	  return err;
3004 	}
3005       mctx->state_log[str_idx] = cur_state;
3006     }
3007 
3008   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
3009     {
3010       re_node_set_empty (&next_nodes);
3011       if (mctx->state_log[str_idx + 1])
3012 	{
3013 	  err = re_node_set_merge (&next_nodes,
3014 				   &mctx->state_log[str_idx + 1]->nodes);
3015 	  if (BE (err != REG_NOERROR, 0))
3016 	    {
3017 	      re_node_set_free (&next_nodes);
3018 	      return err;
3019 	    }
3020 	}
3021       if (cur_state)
3022 	{
3023 	  err = check_arrival_add_next_nodes (mctx, str_idx,
3024 					      &cur_state->non_eps_nodes,
3025 					      &next_nodes);
3026 	  if (BE (err != REG_NOERROR, 0))
3027 	    {
3028 	      re_node_set_free (&next_nodes);
3029 	      return err;
3030 	    }
3031 	}
3032       ++str_idx;
3033       if (next_nodes.nelem)
3034 	{
3035 	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3036 	  if (BE (err != REG_NOERROR, 0))
3037 	    {
3038 	      re_node_set_free (&next_nodes);
3039 	      return err;
3040 	    }
3041 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3042 				    subexp_num, type);
3043 	  if (BE (err != REG_NOERROR, 0))
3044 	    {
3045 	      re_node_set_free (&next_nodes);
3046 	      return err;
3047 	    }
3048 	}
3049       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3050       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3051       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3052 	{
3053 	  re_node_set_free (&next_nodes);
3054 	  return err;
3055 	}
3056       mctx->state_log[str_idx] = cur_state;
3057       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3058     }
3059   re_node_set_free (&next_nodes);
3060   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3061 	       : &mctx->state_log[last_str]->nodes);
3062   path->next_idx = str_idx;
3063 
3064   /* Fix MCTX.  */
3065   mctx->state_log = backup_state_log;
3066   mctx->input.cur_idx = backup_cur_idx;
3067 
3068   /* Then check the current node set has the node LAST_NODE.  */
3069   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3070     return REG_NOERROR;
3071 
3072   return REG_NOMATCH;
3073 }
3074 
3075 /* Helper functions for check_arrival.  */
3076 
3077 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3078    to NEXT_NODES.
3079    TODO: This function is similar to the functions transit_state*(),
3080 	 however this function has many additional works.
3081 	 Can't we unify them?  */
3082 
3083 static reg_errcode_t
3084 internal_function __attribute_warn_unused_result__
check_arrival_add_next_nodes(re_match_context_t * mctx,Idx str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)3085 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3086 			      re_node_set *cur_nodes, re_node_set *next_nodes)
3087 {
3088   const re_dfa_t *const dfa = mctx->dfa;
3089   bool ok;
3090   Idx cur_idx;
3091 #ifdef RE_ENABLE_I18N
3092   reg_errcode_t err = REG_NOERROR;
3093 #endif
3094   re_node_set union_set;
3095   re_node_set_init_empty (&union_set);
3096   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3097     {
3098       int naccepted = 0;
3099       Idx cur_node = cur_nodes->elems[cur_idx];
3100 #ifdef DEBUG
3101       re_token_type_t type = dfa->nodes[cur_node].type;
3102       assert (!IS_EPSILON_NODE (type));
3103 #endif
3104 #ifdef RE_ENABLE_I18N
3105       /* If the node may accept "multi byte".  */
3106       if (dfa->nodes[cur_node].accept_mb)
3107 	{
3108 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3109 					       str_idx);
3110 	  if (naccepted > 1)
3111 	    {
3112 	      re_dfastate_t *dest_state;
3113 	      Idx next_node = dfa->nexts[cur_node];
3114 	      Idx next_idx = str_idx + naccepted;
3115 	      dest_state = mctx->state_log[next_idx];
3116 	      re_node_set_empty (&union_set);
3117 	      if (dest_state)
3118 		{
3119 		  err = re_node_set_merge (&union_set, &dest_state->nodes);
3120 		  if (BE (err != REG_NOERROR, 0))
3121 		    {
3122 		      re_node_set_free (&union_set);
3123 		      return err;
3124 		    }
3125 		}
3126 	      ok = re_node_set_insert (&union_set, next_node);
3127 	      if (BE (! ok, 0))
3128 		{
3129 		  re_node_set_free (&union_set);
3130 		  return REG_ESPACE;
3131 		}
3132 	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3133 							    &union_set);
3134 	      if (BE (mctx->state_log[next_idx] == NULL
3135 		      && err != REG_NOERROR, 0))
3136 		{
3137 		  re_node_set_free (&union_set);
3138 		  return err;
3139 		}
3140 	    }
3141 	}
3142 #endif /* RE_ENABLE_I18N */
3143       if (naccepted
3144 	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3145 	{
3146 	  ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3147 	  if (BE (! ok, 0))
3148 	    {
3149 	      re_node_set_free (&union_set);
3150 	      return REG_ESPACE;
3151 	    }
3152 	}
3153     }
3154   re_node_set_free (&union_set);
3155   return REG_NOERROR;
3156 }
3157 
3158 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3159    CUR_NODES, however exclude the nodes which are:
3160     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3161     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3162 */
3163 
3164 static reg_errcode_t
3165 internal_function
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,Idx ex_subexp,int type)3166 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3167 			  Idx ex_subexp, int type)
3168 {
3169   reg_errcode_t err;
3170   Idx idx, outside_node;
3171   re_node_set new_nodes;
3172 #ifdef DEBUG
3173   assert (cur_nodes->nelem);
3174 #endif
3175   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3176   if (BE (err != REG_NOERROR, 0))
3177     return err;
3178   /* Create a new node set NEW_NODES with the nodes which are epsilon
3179      closures of the node in CUR_NODES.  */
3180 
3181   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3182     {
3183       Idx cur_node = cur_nodes->elems[idx];
3184       const re_node_set *eclosure = dfa->eclosures + cur_node;
3185       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3186       if (outside_node == REG_MISSING)
3187 	{
3188 	  /* There are no problematic nodes, just merge them.  */
3189 	  err = re_node_set_merge (&new_nodes, eclosure);
3190 	  if (BE (err != REG_NOERROR, 0))
3191 	    {
3192 	      re_node_set_free (&new_nodes);
3193 	      return err;
3194 	    }
3195 	}
3196       else
3197 	{
3198 	  /* There are problematic nodes, re-calculate incrementally.  */
3199 	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3200 					      ex_subexp, type);
3201 	  if (BE (err != REG_NOERROR, 0))
3202 	    {
3203 	      re_node_set_free (&new_nodes);
3204 	      return err;
3205 	    }
3206 	}
3207     }
3208   re_node_set_free (cur_nodes);
3209   *cur_nodes = new_nodes;
3210   return REG_NOERROR;
3211 }
3212 
3213 /* Helper function for check_arrival_expand_ecl.
3214    Check incrementally the epsilon closure of TARGET, and if it isn't
3215    problematic append it to DST_NODES.  */
3216 
3217 static reg_errcode_t
3218 internal_function __attribute_warn_unused_result__
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,Idx target,Idx ex_subexp,int type)3219 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3220 			      Idx target, Idx ex_subexp, int type)
3221 {
3222   Idx cur_node;
3223   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3224     {
3225       bool ok;
3226 
3227       if (dfa->nodes[cur_node].type == type
3228 	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
3229 	{
3230 	  if (type == OP_CLOSE_SUBEXP)
3231 	    {
3232 	      ok = re_node_set_insert (dst_nodes, cur_node);
3233 	      if (BE (! ok, 0))
3234 		return REG_ESPACE;
3235 	    }
3236 	  break;
3237 	}
3238       ok = re_node_set_insert (dst_nodes, cur_node);
3239       if (BE (! ok, 0))
3240 	return REG_ESPACE;
3241       if (dfa->edests[cur_node].nelem == 0)
3242 	break;
3243       if (dfa->edests[cur_node].nelem == 2)
3244 	{
3245 	  reg_errcode_t err;
3246 	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3247 					      dfa->edests[cur_node].elems[1],
3248 					      ex_subexp, type);
3249 	  if (BE (err != REG_NOERROR, 0))
3250 	    return err;
3251 	}
3252       cur_node = dfa->edests[cur_node].elems[0];
3253     }
3254   return REG_NOERROR;
3255 }
3256 
3257 
3258 /* For all the back references in the current state, calculate the
3259    destination of the back references by the appropriate entry
3260    in MCTX->BKREF_ENTS.  */
3261 
3262 static reg_errcode_t
3263 internal_function __attribute_warn_unused_result__
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,Idx cur_str,Idx subexp_num,int type)3264 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3265 		    Idx cur_str, Idx subexp_num, int type)
3266 {
3267   const re_dfa_t *const dfa = mctx->dfa;
3268   reg_errcode_t err;
3269   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3270   struct re_backref_cache_entry *ent;
3271 
3272   if (cache_idx_start == REG_MISSING)
3273     return REG_NOERROR;
3274 
3275  restart:
3276   ent = mctx->bkref_ents + cache_idx_start;
3277   do
3278     {
3279       Idx to_idx, next_node;
3280 
3281       /* Is this entry ENT is appropriate?  */
3282       if (!re_node_set_contains (cur_nodes, ent->node))
3283 	continue; /* No.  */
3284 
3285       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3286       /* Calculate the destination of the back reference, and append it
3287 	 to MCTX->STATE_LOG.  */
3288       if (to_idx == cur_str)
3289 	{
3290 	  /* The backreference did epsilon transit, we must re-check all the
3291 	     node in the current state.  */
3292 	  re_node_set new_dests;
3293 	  reg_errcode_t err2, err3;
3294 	  next_node = dfa->edests[ent->node].elems[0];
3295 	  if (re_node_set_contains (cur_nodes, next_node))
3296 	    continue;
3297 	  err = re_node_set_init_1 (&new_dests, next_node);
3298 	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3299 	  err3 = re_node_set_merge (cur_nodes, &new_dests);
3300 	  re_node_set_free (&new_dests);
3301 	  if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3302 		  || err3 != REG_NOERROR, 0))
3303 	    {
3304 	      err = (err != REG_NOERROR ? err
3305 		     : (err2 != REG_NOERROR ? err2 : err3));
3306 	      return err;
3307 	    }
3308 	  /* TODO: It is still inefficient...  */
3309 	  goto restart;
3310 	}
3311       else
3312 	{
3313 	  re_node_set union_set;
3314 	  next_node = dfa->nexts[ent->node];
3315 	  if (mctx->state_log[to_idx])
3316 	    {
3317 	      bool ok;
3318 	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3319 					next_node))
3320 		continue;
3321 	      err = re_node_set_init_copy (&union_set,
3322 					   &mctx->state_log[to_idx]->nodes);
3323 	      ok = re_node_set_insert (&union_set, next_node);
3324 	      if (BE (err != REG_NOERROR || ! ok, 0))
3325 		{
3326 		  re_node_set_free (&union_set);
3327 		  err = err != REG_NOERROR ? err : REG_ESPACE;
3328 		  return err;
3329 		}
3330 	    }
3331 	  else
3332 	    {
3333 	      err = re_node_set_init_1 (&union_set, next_node);
3334 	      if (BE (err != REG_NOERROR, 0))
3335 		return err;
3336 	    }
3337 	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3338 	  re_node_set_free (&union_set);
3339 	  if (BE (mctx->state_log[to_idx] == NULL
3340 		  && err != REG_NOERROR, 0))
3341 	    return err;
3342 	}
3343     }
3344   while (ent++->more);
3345   return REG_NOERROR;
3346 }
3347 
3348 /* Build transition table for the state.
3349    Return true if successful.  */
3350 
3351 static bool
3352 internal_function
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)3353 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3354 {
3355   reg_errcode_t err;
3356   Idx i, j;
3357   int ch;
3358   bool need_word_trtable = false;
3359   bitset_word_t elem, mask;
3360   bool dests_node_malloced = false;
3361   bool dest_states_malloced = false;
3362   Idx ndests; /* Number of the destination states from 'state'.  */
3363   re_dfastate_t **trtable;
3364   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3365   re_node_set follows, *dests_node;
3366   bitset_t *dests_ch;
3367   bitset_t acceptable;
3368 
3369   struct dests_alloc
3370   {
3371     re_node_set dests_node[SBC_MAX];
3372     bitset_t dests_ch[SBC_MAX];
3373   } *dests_alloc;
3374 
3375   /* We build DFA states which corresponds to the destination nodes
3376      from 'state'.  'dests_node[i]' represents the nodes which i-th
3377      destination state contains, and 'dests_ch[i]' represents the
3378      characters which i-th destination state accepts.  */
3379   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3380     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3381   else
3382     {
3383       dests_alloc = re_malloc (struct dests_alloc, 1);
3384       if (BE (dests_alloc == NULL, 0))
3385 	return false;
3386       dests_node_malloced = true;
3387     }
3388   dests_node = dests_alloc->dests_node;
3389   dests_ch = dests_alloc->dests_ch;
3390 
3391   /* Initialize transition table.  */
3392   state->word_trtable = state->trtable = NULL;
3393 
3394   /* At first, group all nodes belonging to 'state' into several
3395      destinations.  */
3396   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3397   if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3398     {
3399       if (dests_node_malloced)
3400 	free (dests_alloc);
3401       /* Return false in case of an error, true otherwise.  */
3402       if (ndests == 0)
3403 	{
3404 	  state->trtable = (re_dfastate_t **)
3405 	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
3406           if (BE (state->trtable == NULL, 0))
3407             return false;
3408 	  return true;
3409 	}
3410       return false;
3411     }
3412 
3413   err = re_node_set_alloc (&follows, ndests + 1);
3414   if (BE (err != REG_NOERROR, 0))
3415     goto out_free;
3416 
3417   /* Avoid arithmetic overflow in size calculation.  */
3418   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3419 	    / (3 * sizeof (re_dfastate_t *)))
3420 	   < ndests),
3421 	  0))
3422     goto out_free;
3423 
3424   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3425 			 + ndests * 3 * sizeof (re_dfastate_t *)))
3426     dest_states = (re_dfastate_t **)
3427       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3428   else
3429     {
3430       dest_states = (re_dfastate_t **)
3431 	malloc (ndests * 3 * sizeof (re_dfastate_t *));
3432       if (BE (dest_states == NULL, 0))
3433 	{
3434 out_free:
3435 	  if (dest_states_malloced)
3436 	    free (dest_states);
3437 	  re_node_set_free (&follows);
3438 	  for (i = 0; i < ndests; ++i)
3439 	    re_node_set_free (dests_node + i);
3440 	  if (dests_node_malloced)
3441 	    free (dests_alloc);
3442 	  return false;
3443 	}
3444       dest_states_malloced = true;
3445     }
3446   dest_states_word = dest_states + ndests;
3447   dest_states_nl = dest_states_word + ndests;
3448   bitset_empty (acceptable);
3449 
3450   /* Then build the states for all destinations.  */
3451   for (i = 0; i < ndests; ++i)
3452     {
3453       Idx next_node;
3454       re_node_set_empty (&follows);
3455       /* Merge the follows of this destination states.  */
3456       for (j = 0; j < dests_node[i].nelem; ++j)
3457 	{
3458 	  next_node = dfa->nexts[dests_node[i].elems[j]];
3459 	  if (next_node != REG_MISSING)
3460 	    {
3461 	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3462 	      if (BE (err != REG_NOERROR, 0))
3463 		goto out_free;
3464 	    }
3465 	}
3466       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3467       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3468 	goto out_free;
3469       /* If the new state has context constraint,
3470 	 build appropriate states for these contexts.  */
3471       if (dest_states[i]->has_constraint)
3472 	{
3473 	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3474 							  CONTEXT_WORD);
3475 	  if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3476 	    goto out_free;
3477 
3478 	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3479 	    need_word_trtable = true;
3480 
3481 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3482 							CONTEXT_NEWLINE);
3483 	  if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3484 	    goto out_free;
3485 	}
3486       else
3487 	{
3488 	  dest_states_word[i] = dest_states[i];
3489 	  dest_states_nl[i] = dest_states[i];
3490 	}
3491       bitset_merge (acceptable, dests_ch[i]);
3492     }
3493 
3494   if (!BE (need_word_trtable, 0))
3495     {
3496       /* We don't care about whether the following character is a word
3497 	 character, or we are in a single-byte character set so we can
3498 	 discern by looking at the character code: allocate a
3499 	 256-entry transition table.  */
3500       trtable = state->trtable =
3501 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3502       if (BE (trtable == NULL, 0))
3503 	goto out_free;
3504 
3505       /* For all characters ch...:  */
3506       for (i = 0; i < BITSET_WORDS; ++i)
3507 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3508 	     elem;
3509 	     mask <<= 1, elem >>= 1, ++ch)
3510 	  if (BE (elem & 1, 0))
3511 	    {
3512 	      /* There must be exactly one destination which accepts
3513 		 character ch.  See group_nodes_into_DFAstates.  */
3514 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3515 		;
3516 
3517 	      /* j-th destination accepts the word character ch.  */
3518 	      if (dfa->word_char[i] & mask)
3519 		trtable[ch] = dest_states_word[j];
3520 	      else
3521 		trtable[ch] = dest_states[j];
3522 	    }
3523     }
3524   else
3525     {
3526       /* We care about whether the following character is a word
3527 	 character, and we are in a multi-byte character set: discern
3528 	 by looking at the character code: build two 256-entry
3529 	 transition tables, one starting at trtable[0] and one
3530 	 starting at trtable[SBC_MAX].  */
3531       trtable = state->word_trtable =
3532 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3533       if (BE (trtable == NULL, 0))
3534 	goto out_free;
3535 
3536       /* For all characters ch...:  */
3537       for (i = 0; i < BITSET_WORDS; ++i)
3538 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3539 	     elem;
3540 	     mask <<= 1, elem >>= 1, ++ch)
3541 	  if (BE (elem & 1, 0))
3542 	    {
3543 	      /* There must be exactly one destination which accepts
3544 		 character ch.  See group_nodes_into_DFAstates.  */
3545 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3546 		;
3547 
3548 	      /* j-th destination accepts the word character ch.  */
3549 	      trtable[ch] = dest_states[j];
3550 	      trtable[ch + SBC_MAX] = dest_states_word[j];
3551 	    }
3552     }
3553 
3554   /* new line */
3555   if (bitset_contain (acceptable, NEWLINE_CHAR))
3556     {
3557       /* The current state accepts newline character.  */
3558       for (j = 0; j < ndests; ++j)
3559 	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3560 	  {
3561 	    /* k-th destination accepts newline character.  */
3562 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
3563 	    if (need_word_trtable)
3564 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3565 	    /* There must be only one destination which accepts
3566 	       newline.  See group_nodes_into_DFAstates.  */
3567 	    break;
3568 	  }
3569     }
3570 
3571   if (dest_states_malloced)
3572     free (dest_states);
3573 
3574   re_node_set_free (&follows);
3575   for (i = 0; i < ndests; ++i)
3576     re_node_set_free (dests_node + i);
3577 
3578   if (dests_node_malloced)
3579     free (dests_alloc);
3580 
3581   return true;
3582 }
3583 
3584 /* Group all nodes belonging to STATE into several destinations.
3585    Then for all destinations, set the nodes belonging to the destination
3586    to DESTS_NODE[i] and set the characters accepted by the destination
3587    to DEST_CH[i].  This function return the number of destinations.  */
3588 
3589 static Idx
3590 internal_function
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)3591 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3592 			    re_node_set *dests_node, bitset_t *dests_ch)
3593 {
3594   reg_errcode_t err;
3595   bool ok;
3596   Idx i, j, k;
3597   Idx ndests; /* Number of the destinations from 'state'.  */
3598   bitset_t accepts; /* Characters a node can accept.  */
3599   const re_node_set *cur_nodes = &state->nodes;
3600   bitset_empty (accepts);
3601   ndests = 0;
3602 
3603   /* For all the nodes belonging to 'state',  */
3604   for (i = 0; i < cur_nodes->nelem; ++i)
3605     {
3606       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3607       re_token_type_t type = node->type;
3608       unsigned int constraint = node->constraint;
3609 
3610       /* Enumerate all single byte character this node can accept.  */
3611       if (type == CHARACTER)
3612 	bitset_set (accepts, node->opr.c);
3613       else if (type == SIMPLE_BRACKET)
3614 	{
3615 	  bitset_merge (accepts, node->opr.sbcset);
3616 	}
3617       else if (type == OP_PERIOD)
3618 	{
3619 #ifdef RE_ENABLE_I18N
3620 	  if (dfa->mb_cur_max > 1)
3621 	    bitset_merge (accepts, dfa->sb_char);
3622 	  else
3623 #endif
3624 	    bitset_set_all (accepts);
3625 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3626 	    bitset_clear (accepts, '\n');
3627 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3628 	    bitset_clear (accepts, '\0');
3629 	}
3630 #ifdef RE_ENABLE_I18N
3631       else if (type == OP_UTF8_PERIOD)
3632 	{
3633 	  if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3634 	    memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3635 	  else
3636 	    bitset_merge (accepts, utf8_sb_map);
3637 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3638 	    bitset_clear (accepts, '\n');
3639 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3640 	    bitset_clear (accepts, '\0');
3641 	}
3642 #endif
3643       else
3644 	continue;
3645 
3646       /* Check the 'accepts' and sift the characters which are not
3647 	 match it the context.  */
3648       if (constraint)
3649 	{
3650 	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
3651 	    {
3652 	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3653 	      bitset_empty (accepts);
3654 	      if (accepts_newline)
3655 		bitset_set (accepts, NEWLINE_CHAR);
3656 	      else
3657 		continue;
3658 	    }
3659 	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
3660 	    {
3661 	      bitset_empty (accepts);
3662 	      continue;
3663 	    }
3664 
3665 	  if (constraint & NEXT_WORD_CONSTRAINT)
3666 	    {
3667 	      bitset_word_t any_set = 0;
3668 	      if (type == CHARACTER && !node->word_char)
3669 		{
3670 		  bitset_empty (accepts);
3671 		  continue;
3672 		}
3673 #ifdef RE_ENABLE_I18N
3674 	      if (dfa->mb_cur_max > 1)
3675 		for (j = 0; j < BITSET_WORDS; ++j)
3676 		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3677 	      else
3678 #endif
3679 		for (j = 0; j < BITSET_WORDS; ++j)
3680 		  any_set |= (accepts[j] &= dfa->word_char[j]);
3681 	      if (!any_set)
3682 		continue;
3683 	    }
3684 	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
3685 	    {
3686 	      bitset_word_t any_set = 0;
3687 	      if (type == CHARACTER && node->word_char)
3688 		{
3689 		  bitset_empty (accepts);
3690 		  continue;
3691 		}
3692 #ifdef RE_ENABLE_I18N
3693 	      if (dfa->mb_cur_max > 1)
3694 		for (j = 0; j < BITSET_WORDS; ++j)
3695 		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3696 	      else
3697 #endif
3698 		for (j = 0; j < BITSET_WORDS; ++j)
3699 		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3700 	      if (!any_set)
3701 		continue;
3702 	    }
3703 	}
3704 
3705       /* Then divide 'accepts' into DFA states, or create a new
3706 	 state.  Above, we make sure that accepts is not empty.  */
3707       for (j = 0; j < ndests; ++j)
3708 	{
3709 	  bitset_t intersec; /* Intersection sets, see below.  */
3710 	  bitset_t remains;
3711 	  /* Flags, see below.  */
3712 	  bitset_word_t has_intersec, not_subset, not_consumed;
3713 
3714 	  /* Optimization, skip if this state doesn't accept the character.  */
3715 	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3716 	    continue;
3717 
3718 	  /* Enumerate the intersection set of this state and 'accepts'.  */
3719 	  has_intersec = 0;
3720 	  for (k = 0; k < BITSET_WORDS; ++k)
3721 	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3722 	  /* And skip if the intersection set is empty.  */
3723 	  if (!has_intersec)
3724 	    continue;
3725 
3726 	  /* Then check if this state is a subset of 'accepts'.  */
3727 	  not_subset = not_consumed = 0;
3728 	  for (k = 0; k < BITSET_WORDS; ++k)
3729 	    {
3730 	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3731 	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3732 	    }
3733 
3734 	  /* If this state isn't a subset of 'accepts', create a
3735 	     new group state, which has the 'remains'. */
3736 	  if (not_subset)
3737 	    {
3738 	      bitset_copy (dests_ch[ndests], remains);
3739 	      bitset_copy (dests_ch[j], intersec);
3740 	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3741 	      if (BE (err != REG_NOERROR, 0))
3742 		goto error_return;
3743 	      ++ndests;
3744 	    }
3745 
3746 	  /* Put the position in the current group. */
3747 	  ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3748 	  if (BE (! ok, 0))
3749 	    goto error_return;
3750 
3751 	  /* If all characters are consumed, go to next node. */
3752 	  if (!not_consumed)
3753 	    break;
3754 	}
3755       /* Some characters remain, create a new group. */
3756       if (j == ndests)
3757 	{
3758 	  bitset_copy (dests_ch[ndests], accepts);
3759 	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3760 	  if (BE (err != REG_NOERROR, 0))
3761 	    goto error_return;
3762 	  ++ndests;
3763 	  bitset_empty (accepts);
3764 	}
3765     }
3766   return ndests;
3767  error_return:
3768   for (j = 0; j < ndests; ++j)
3769     re_node_set_free (dests_node + j);
3770   return REG_MISSING;
3771 }
3772 
3773 #ifdef RE_ENABLE_I18N
3774 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3775    Return the number of the bytes the node accepts.
3776    STR_IDX is the current index of the input string.
3777 
3778    This function handles the nodes which can accept one character, or
3779    one collating element like '.', '[a-z]', opposite to the other nodes
3780    can only accept one byte.  */
3781 
3782 static int
3783 internal_function
check_node_accept_bytes(const re_dfa_t * dfa,Idx node_idx,const re_string_t * input,Idx str_idx)3784 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3785 			 const re_string_t *input, Idx str_idx)
3786 {
3787   const re_token_t *node = dfa->nodes + node_idx;
3788   int char_len, elem_len;
3789   Idx i;
3790 
3791   if (BE (node->type == OP_UTF8_PERIOD, 0))
3792     {
3793       unsigned char c = re_string_byte_at (input, str_idx), d;
3794       if (BE (c < 0xc2, 1))
3795 	return 0;
3796 
3797       if (str_idx + 2 > input->len)
3798 	return 0;
3799 
3800       d = re_string_byte_at (input, str_idx + 1);
3801       if (c < 0xe0)
3802 	return (d < 0x80 || d > 0xbf) ? 0 : 2;
3803       else if (c < 0xf0)
3804 	{
3805 	  char_len = 3;
3806 	  if (c == 0xe0 && d < 0xa0)
3807 	    return 0;
3808 	}
3809       else if (c < 0xf8)
3810 	{
3811 	  char_len = 4;
3812 	  if (c == 0xf0 && d < 0x90)
3813 	    return 0;
3814 	}
3815       else if (c < 0xfc)
3816 	{
3817 	  char_len = 5;
3818 	  if (c == 0xf8 && d < 0x88)
3819 	    return 0;
3820 	}
3821       else if (c < 0xfe)
3822 	{
3823 	  char_len = 6;
3824 	  if (c == 0xfc && d < 0x84)
3825 	    return 0;
3826 	}
3827       else
3828 	return 0;
3829 
3830       if (str_idx + char_len > input->len)
3831 	return 0;
3832 
3833       for (i = 1; i < char_len; ++i)
3834 	{
3835 	  d = re_string_byte_at (input, str_idx + i);
3836 	  if (d < 0x80 || d > 0xbf)
3837 	    return 0;
3838 	}
3839       return char_len;
3840     }
3841 
3842   char_len = re_string_char_size_at (input, str_idx);
3843   if (node->type == OP_PERIOD)
3844     {
3845       if (char_len <= 1)
3846 	return 0;
3847       /* FIXME: I don't think this if is needed, as both '\n'
3848 	 and '\0' are char_len == 1.  */
3849       /* '.' accepts any one character except the following two cases.  */
3850       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3851 	   re_string_byte_at (input, str_idx) == '\n') ||
3852 	  ((dfa->syntax & RE_DOT_NOT_NULL) &&
3853 	   re_string_byte_at (input, str_idx) == '\0'))
3854 	return 0;
3855       return char_len;
3856     }
3857 
3858   elem_len = re_string_elem_size_at (input, str_idx);
3859   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3860     return 0;
3861 
3862   if (node->type == COMPLEX_BRACKET)
3863     {
3864       const re_charset_t *cset = node->opr.mbcset;
3865 # ifdef _LIBC
3866       const unsigned char *pin
3867 	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3868       Idx j;
3869       uint32_t nrules;
3870 # endif /* _LIBC */
3871       int match_len = 0;
3872       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3873 		    ? re_string_wchar_at (input, str_idx) : 0);
3874 
3875       /* match with multibyte character?  */
3876       for (i = 0; i < cset->nmbchars; ++i)
3877 	if (wc == cset->mbchars[i])
3878 	  {
3879 	    match_len = char_len;
3880 	    goto check_node_accept_bytes_match;
3881 	  }
3882       /* match with character_class?  */
3883       for (i = 0; i < cset->nchar_classes; ++i)
3884 	{
3885 	  wctype_t wt = cset->char_classes[i];
3886 	  if (__iswctype (wc, wt))
3887 	    {
3888 	      match_len = char_len;
3889 	      goto check_node_accept_bytes_match;
3890 	    }
3891 	}
3892 
3893 # ifdef _LIBC
3894       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3895       if (nrules != 0)
3896 	{
3897 	  unsigned int in_collseq = 0;
3898 	  const int32_t *table, *indirect;
3899 	  const unsigned char *weights, *extra;
3900 	  const char *collseqwc;
3901 	  /* This #include defines a local function!  */
3902 #  include <locale/weight.h>
3903 
3904 	  /* match with collating_symbol?  */
3905 	  if (cset->ncoll_syms)
3906 	    extra = (const unsigned char *)
3907 	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3908 	  for (i = 0; i < cset->ncoll_syms; ++i)
3909 	    {
3910 	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
3911 	      /* Compare the length of input collating element and
3912 		 the length of current collating element.  */
3913 	      if (*coll_sym != elem_len)
3914 		continue;
3915 	      /* Compare each bytes.  */
3916 	      for (j = 0; j < *coll_sym; j++)
3917 		if (pin[j] != coll_sym[1 + j])
3918 		  break;
3919 	      if (j == *coll_sym)
3920 		{
3921 		  /* Match if every bytes is equal.  */
3922 		  match_len = j;
3923 		  goto check_node_accept_bytes_match;
3924 		}
3925 	    }
3926 
3927 	  if (cset->nranges)
3928 	    {
3929 	      if (elem_len <= char_len)
3930 		{
3931 		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3932 		  in_collseq = __collseq_table_lookup (collseqwc, wc);
3933 		}
3934 	      else
3935 		in_collseq = find_collation_sequence_value (pin, elem_len);
3936 	    }
3937 	  /* match with range expression?  */
3938 	  for (i = 0; i < cset->nranges; ++i)
3939 	    if (cset->range_starts[i] <= in_collseq
3940 		&& in_collseq <= cset->range_ends[i])
3941 	      {
3942 		match_len = elem_len;
3943 		goto check_node_accept_bytes_match;
3944 	      }
3945 
3946 	  /* match with equivalence_class?  */
3947 	  if (cset->nequiv_classes)
3948 	    {
3949 	      const unsigned char *cp = pin;
3950 	      table = (const int32_t *)
3951 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3952 	      weights = (const unsigned char *)
3953 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3954 	      extra = (const unsigned char *)
3955 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3956 	      indirect = (const int32_t *)
3957 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3958 	      int32_t idx = findidx (&cp, elem_len);
3959 	      if (idx > 0)
3960 		for (i = 0; i < cset->nequiv_classes; ++i)
3961 		  {
3962 		    int32_t equiv_class_idx = cset->equiv_classes[i];
3963 		    size_t weight_len = weights[idx & 0xffffff];
3964 		    if (weight_len == weights[equiv_class_idx & 0xffffff]
3965 			&& (idx >> 24) == (equiv_class_idx >> 24))
3966 		      {
3967 			Idx cnt = 0;
3968 
3969 			idx &= 0xffffff;
3970 			equiv_class_idx &= 0xffffff;
3971 
3972 			while (cnt <= weight_len
3973 			       && (weights[equiv_class_idx + 1 + cnt]
3974 				   == weights[idx + 1 + cnt]))
3975 			  ++cnt;
3976 			if (cnt > weight_len)
3977 			  {
3978 			    match_len = elem_len;
3979 			    goto check_node_accept_bytes_match;
3980 			  }
3981 		      }
3982 		  }
3983 	    }
3984 	}
3985       else
3986 # endif /* _LIBC */
3987 	{
3988 	  /* match with range expression?  */
3989 #if __GNUC__ >= 2 && ! (__STDC_VERSION__ < 199901L && defined __STRICT_ANSI__)
3990 	  wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3991 #else
3992 	  wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3993 	  cmp_buf[2] = wc;
3994 #endif
3995 	  for (i = 0; i < cset->nranges; ++i)
3996 	    {
3997 	      cmp_buf[0] = cset->range_starts[i];
3998 	      cmp_buf[4] = cset->range_ends[i];
3999 	      if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
4000 		  && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
4001 		{
4002 		  match_len = char_len;
4003 		  goto check_node_accept_bytes_match;
4004 		}
4005 	    }
4006 	}
4007     check_node_accept_bytes_match:
4008       if (!cset->non_match)
4009 	return match_len;
4010       else
4011 	{
4012 	  if (match_len > 0)
4013 	    return 0;
4014 	  else
4015 	    return (elem_len > char_len) ? elem_len : char_len;
4016 	}
4017     }
4018   return 0;
4019 }
4020 
4021 # ifdef _LIBC
4022 static unsigned int
4023 internal_function
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)4024 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
4025 {
4026   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4027   if (nrules == 0)
4028     {
4029       if (mbs_len == 1)
4030 	{
4031 	  /* No valid character.  Match it as a single byte character.  */
4032 	  const unsigned char *collseq = (const unsigned char *)
4033 	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4034 	  return collseq[mbs[0]];
4035 	}
4036       return UINT_MAX;
4037     }
4038   else
4039     {
4040       int32_t idx;
4041       const unsigned char *extra = (const unsigned char *)
4042 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4043       int32_t extrasize = (const unsigned char *)
4044 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4045 
4046       for (idx = 0; idx < extrasize;)
4047 	{
4048 	  int mbs_cnt;
4049 	  bool found = false;
4050 	  int32_t elem_mbs_len;
4051 	  /* Skip the name of collating element name.  */
4052 	  idx = idx + extra[idx] + 1;
4053 	  elem_mbs_len = extra[idx++];
4054 	  if (mbs_len == elem_mbs_len)
4055 	    {
4056 	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4057 		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4058 		  break;
4059 	      if (mbs_cnt == elem_mbs_len)
4060 		/* Found the entry.  */
4061 		found = true;
4062 	    }
4063 	  /* Skip the byte sequence of the collating element.  */
4064 	  idx += elem_mbs_len;
4065 	  /* Adjust for the alignment.  */
4066 	  idx = (idx + 3) & ~3;
4067 	  /* Skip the collation sequence value.  */
4068 	  idx += sizeof (uint32_t);
4069 	  /* Skip the wide char sequence of the collating element.  */
4070 	  idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4071 	  /* If we found the entry, return the sequence value.  */
4072 	  if (found)
4073 	    return *(uint32_t *) (extra + idx);
4074 	  /* Skip the collation sequence value.  */
4075 	  idx += sizeof (uint32_t);
4076 	}
4077       return UINT_MAX;
4078     }
4079 }
4080 # endif /* _LIBC */
4081 #endif /* RE_ENABLE_I18N */
4082 
4083 /* Check whether the node accepts the byte which is IDX-th
4084    byte of the INPUT.  */
4085 
4086 static bool
4087 internal_function
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,Idx idx)4088 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4089 		   Idx idx)
4090 {
4091   unsigned char ch;
4092   ch = re_string_byte_at (&mctx->input, idx);
4093   switch (node->type)
4094     {
4095     case CHARACTER:
4096       if (node->opr.c != ch)
4097         return false;
4098       break;
4099 
4100     case SIMPLE_BRACKET:
4101       if (!bitset_contain (node->opr.sbcset, ch))
4102         return false;
4103       break;
4104 
4105 #ifdef RE_ENABLE_I18N
4106     case OP_UTF8_PERIOD:
4107       if (ch >= ASCII_CHARS)
4108         return false;
4109       /* FALLTHROUGH */
4110 #endif
4111     case OP_PERIOD:
4112       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4113 	  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4114 	return false;
4115       break;
4116 
4117     default:
4118       return false;
4119     }
4120 
4121   if (node->constraint)
4122     {
4123       /* The node has constraints.  Check whether the current context
4124 	 satisfies the constraints.  */
4125       unsigned int context = re_string_context_at (&mctx->input, idx,
4126 						   mctx->eflags);
4127       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4128 	return false;
4129     }
4130 
4131   return true;
4132 }
4133 
4134 /* Extend the buffers, if the buffers have run out.  */
4135 
4136 static reg_errcode_t
4137 internal_function __attribute_warn_unused_result__
extend_buffers(re_match_context_t * mctx)4138 extend_buffers (re_match_context_t *mctx)
4139 {
4140   reg_errcode_t ret;
4141   re_string_t *pstr = &mctx->input;
4142 
4143   /* Avoid overflow.  */
4144   if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4145           <= pstr->bufs_len, 0))
4146     return REG_ESPACE;
4147 
4148   /* Double the lengths of the buffers.  */
4149   ret = re_string_realloc_buffers (pstr, MIN (pstr->len, pstr->bufs_len * 2));
4150   if (BE (ret != REG_NOERROR, 0))
4151     return ret;
4152 
4153   if (mctx->state_log != NULL)
4154     {
4155       /* And double the length of state_log.  */
4156       /* XXX We have no indication of the size of this buffer.  If this
4157 	 allocation fail we have no indication that the state_log array
4158 	 does not have the right size.  */
4159       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4160 					      pstr->bufs_len + 1);
4161       if (BE (new_array == NULL, 0))
4162 	return REG_ESPACE;
4163       mctx->state_log = new_array;
4164     }
4165 
4166   /* Then reconstruct the buffers.  */
4167   if (pstr->icase)
4168     {
4169 #ifdef RE_ENABLE_I18N
4170       if (pstr->mb_cur_max > 1)
4171 	{
4172 	  ret = build_wcs_upper_buffer (pstr);
4173 	  if (BE (ret != REG_NOERROR, 0))
4174 	    return ret;
4175 	}
4176       else
4177 #endif /* RE_ENABLE_I18N  */
4178 	build_upper_buffer (pstr);
4179     }
4180   else
4181     {
4182 #ifdef RE_ENABLE_I18N
4183       if (pstr->mb_cur_max > 1)
4184 	build_wcs_buffer (pstr);
4185       else
4186 #endif /* RE_ENABLE_I18N  */
4187 	{
4188 	  if (pstr->trans != NULL)
4189 	    re_string_translate_buffer (pstr);
4190 	}
4191     }
4192   return REG_NOERROR;
4193 }
4194 
4195 
4196 /* Functions for matching context.  */
4197 
4198 /* Initialize MCTX.  */
4199 
4200 static reg_errcode_t
4201 internal_function __attribute_warn_unused_result__
match_ctx_init(re_match_context_t * mctx,int eflags,Idx n)4202 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4203 {
4204   mctx->eflags = eflags;
4205   mctx->match_last = REG_MISSING;
4206   if (n > 0)
4207     {
4208       /* Avoid overflow.  */
4209       size_t max_object_size =
4210 	MAX (sizeof (struct re_backref_cache_entry),
4211 	     sizeof (re_sub_match_top_t *));
4212       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
4213 	return REG_ESPACE;
4214 
4215       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4216       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4217       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4218 	return REG_ESPACE;
4219     }
4220   /* Already zero-ed by the caller.
4221      else
4222        mctx->bkref_ents = NULL;
4223      mctx->nbkref_ents = 0;
4224      mctx->nsub_tops = 0;  */
4225   mctx->abkref_ents = n;
4226   mctx->max_mb_elem_len = 1;
4227   mctx->asub_tops = n;
4228   return REG_NOERROR;
4229 }
4230 
4231 /* Clean the entries which depend on the current input in MCTX.
4232    This function must be invoked when the matcher changes the start index
4233    of the input, or changes the input string.  */
4234 
4235 static void
4236 internal_function
match_ctx_clean(re_match_context_t * mctx)4237 match_ctx_clean (re_match_context_t *mctx)
4238 {
4239   Idx st_idx;
4240   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4241     {
4242       Idx sl_idx;
4243       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4244       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4245 	{
4246 	  re_sub_match_last_t *last = top->lasts[sl_idx];
4247 	  re_free (last->path.array);
4248 	  re_free (last);
4249 	}
4250       re_free (top->lasts);
4251       if (top->path)
4252 	{
4253 	  re_free (top->path->array);
4254 	  re_free (top->path);
4255 	}
4256       free (top);
4257     }
4258 
4259   mctx->nsub_tops = 0;
4260   mctx->nbkref_ents = 0;
4261 }
4262 
4263 /* Free all the memory associated with MCTX.  */
4264 
4265 static void
4266 internal_function
match_ctx_free(re_match_context_t * mctx)4267 match_ctx_free (re_match_context_t *mctx)
4268 {
4269   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4270   match_ctx_clean (mctx);
4271   re_free (mctx->sub_tops);
4272   re_free (mctx->bkref_ents);
4273 }
4274 
4275 /* Add a new backreference entry to MCTX.
4276    Note that we assume that caller never call this function with duplicate
4277    entry, and call with STR_IDX which isn't smaller than any existing entry.
4278 */
4279 
4280 static reg_errcode_t
4281 internal_function __attribute_warn_unused_result__
match_ctx_add_entry(re_match_context_t * mctx,Idx node,Idx str_idx,Idx from,Idx to)4282 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4283 		     Idx to)
4284 {
4285   if (mctx->nbkref_ents >= mctx->abkref_ents)
4286     {
4287       struct re_backref_cache_entry* new_entry;
4288       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4289 			      mctx->abkref_ents * 2);
4290       if (BE (new_entry == NULL, 0))
4291 	{
4292 	  re_free (mctx->bkref_ents);
4293 	  return REG_ESPACE;
4294 	}
4295       mctx->bkref_ents = new_entry;
4296       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4297 	      sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4298       mctx->abkref_ents *= 2;
4299     }
4300   if (mctx->nbkref_ents > 0
4301       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4302     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4303 
4304   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4305   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4306   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4307   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4308 
4309   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4310      If bit N is clear, means that this entry won't epsilon-transition to
4311      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4312      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4313      such node.
4314 
4315      A backreference does not epsilon-transition unless it is empty, so set
4316      to all zeros if FROM != TO.  */
4317   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4318     = (from == to ? -1 : 0);
4319 
4320   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4321   if (mctx->max_mb_elem_len < to - from)
4322     mctx->max_mb_elem_len = to - from;
4323   return REG_NOERROR;
4324 }
4325 
4326 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4327    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4328 
4329 static Idx
4330 internal_function
search_cur_bkref_entry(const re_match_context_t * mctx,Idx str_idx)4331 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4332 {
4333   Idx left, right, mid, last;
4334   last = right = mctx->nbkref_ents;
4335   for (left = 0; left < right;)
4336     {
4337       mid = (left + right) / 2;
4338       if (mctx->bkref_ents[mid].str_idx < str_idx)
4339 	left = mid + 1;
4340       else
4341 	right = mid;
4342     }
4343   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4344     return left;
4345   else
4346     return REG_MISSING;
4347 }
4348 
4349 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4350    at STR_IDX.  */
4351 
4352 static reg_errcode_t
4353 internal_function __attribute_warn_unused_result__
match_ctx_add_subtop(re_match_context_t * mctx,Idx node,Idx str_idx)4354 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4355 {
4356 #ifdef DEBUG
4357   assert (mctx->sub_tops != NULL);
4358   assert (mctx->asub_tops > 0);
4359 #endif
4360   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4361     {
4362       Idx new_asub_tops = mctx->asub_tops * 2;
4363       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4364 						   re_sub_match_top_t *,
4365 						   new_asub_tops);
4366       if (BE (new_array == NULL, 0))
4367 	return REG_ESPACE;
4368       mctx->sub_tops = new_array;
4369       mctx->asub_tops = new_asub_tops;
4370     }
4371   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4372   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4373     return REG_ESPACE;
4374   mctx->sub_tops[mctx->nsub_tops]->node = node;
4375   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4376   return REG_NOERROR;
4377 }
4378 
4379 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4380    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4381 
4382 static re_sub_match_last_t *
4383 internal_function
match_ctx_add_sublast(re_sub_match_top_t * subtop,Idx node,Idx str_idx)4384 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4385 {
4386   re_sub_match_last_t *new_entry;
4387   if (BE (subtop->nlasts == subtop->alasts, 0))
4388     {
4389       Idx new_alasts = 2 * subtop->alasts + 1;
4390       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4391 						    re_sub_match_last_t *,
4392 						    new_alasts);
4393       if (BE (new_array == NULL, 0))
4394 	return NULL;
4395       subtop->lasts = new_array;
4396       subtop->alasts = new_alasts;
4397     }
4398   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4399   if (BE (new_entry != NULL, 1))
4400     {
4401       subtop->lasts[subtop->nlasts] = new_entry;
4402       new_entry->node = node;
4403       new_entry->str_idx = str_idx;
4404       ++subtop->nlasts;
4405     }
4406   return new_entry;
4407 }
4408 
4409 static void
4410 internal_function
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,Idx last_node,Idx last_str_idx)4411 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4412 	       re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4413 {
4414   sctx->sifted_states = sifted_sts;
4415   sctx->limited_states = limited_sts;
4416   sctx->last_node = last_node;
4417   sctx->last_str_idx = last_str_idx;
4418   re_node_set_init_empty (&sctx->limits);
4419 }
4420