1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU General Public
8    License as published by the Free Software Foundation; either
9    version 3 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    General Public License for more details.
15 
16    You should have received a copy of the GNU General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 				     Idx n) internal_function;
22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23 static void match_ctx_free (re_match_context_t *cache) internal_function;
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 					  Idx str_idx, Idx from, Idx to)
26      internal_function;
27 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
28      internal_function;
29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
30 					   Idx str_idx) internal_function;
31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 						    Idx node, Idx str_idx)
33      internal_function;
34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 			   re_dfastate_t **limited_sts, Idx last_node,
36 			   Idx last_str_idx)
37      internal_function;
38 static reg_errcode_t re_search_internal (const regex_t *preg,
39 					 const char *string, Idx length,
40 					 Idx start, Idx last_start, Idx stop,
41 					 size_t nmatch, regmatch_t pmatch[],
42 					 int eflags) internal_function;
43 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
44 				  const char *string1, Idx length1,
45 				  const char *string2, Idx length2,
46 				  Idx start, regoff_t range,
47 				  struct re_registers *regs,
48 				  Idx stop, bool ret_len) internal_function;
49 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
50 				const char *string, Idx length, Idx start,
51 				regoff_t range, Idx stop,
52 				struct re_registers *regs,
53 				bool ret_len) internal_function;
54 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
55                               Idx nregs, int regs_allocated) internal_function;
56 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
57      internal_function;
58 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
59 			   Idx *p_match_first) internal_function;
60 static Idx check_halt_state_context (const re_match_context_t *mctx,
61 				     const re_dfastate_t *state, Idx idx)
62      internal_function;
63 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
64 			 regmatch_t *prev_idx_match, Idx cur_node,
65 			 Idx cur_idx, Idx nmatch) internal_function;
66 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
67 				      Idx str_idx, Idx dest_node, Idx nregs,
68 				      regmatch_t *regs,
69 				      re_node_set *eps_via_nodes)
70      internal_function;
71 static reg_errcode_t set_regs (const regex_t *preg,
72 			       const re_match_context_t *mctx,
73 			       size_t nmatch, regmatch_t *pmatch,
74 			       bool fl_backtrack) internal_function;
75 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
76      internal_function;
77 
78 #ifdef RE_ENABLE_I18N
79 static int sift_states_iter_mb (const re_match_context_t *mctx,
80 				re_sift_context_t *sctx,
81 				Idx node_idx, Idx str_idx, Idx max_str_idx)
82      internal_function;
83 #endif /* RE_ENABLE_I18N */
84 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
85 					   re_sift_context_t *sctx)
86      internal_function;
87 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
88 					  re_sift_context_t *sctx, Idx str_idx,
89 					  re_node_set *cur_dest)
90      internal_function;
91 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
92 					      re_sift_context_t *sctx,
93 					      Idx str_idx,
94 					      re_node_set *dest_nodes)
95      internal_function;
96 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
97 					    re_node_set *dest_nodes,
98 					    const re_node_set *candidates)
99      internal_function;
100 static bool check_dst_limits (const re_match_context_t *mctx,
101 			      const re_node_set *limits,
102 			      Idx dst_node, Idx dst_idx, Idx src_node,
103 			      Idx src_idx) internal_function;
104 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
105 					int boundaries, Idx subexp_idx,
106 					Idx from_node, Idx bkref_idx)
107      internal_function;
108 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
109 				      Idx limit, Idx subexp_idx,
110 				      Idx node, Idx str_idx,
111 				      Idx bkref_idx) internal_function;
112 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
113 					  re_node_set *dest_nodes,
114 					  const re_node_set *candidates,
115 					  re_node_set *limits,
116 					  struct re_backref_cache_entry *bkref_ents,
117 					  Idx str_idx) internal_function;
118 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
119 					re_sift_context_t *sctx,
120 					Idx str_idx, const re_node_set *candidates)
121      internal_function;
122 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
123 					re_dfastate_t **dst,
124 					re_dfastate_t **src, Idx num)
125      internal_function;
126 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
127 					 re_match_context_t *mctx) internal_function;
128 static re_dfastate_t *transit_state (reg_errcode_t *err,
129 				     re_match_context_t *mctx,
130 				     re_dfastate_t *state) internal_function;
131 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
132 					    re_match_context_t *mctx,
133 					    re_dfastate_t *next_state)
134      internal_function;
135 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
136 						re_node_set *cur_nodes,
137 						Idx str_idx) internal_function;
138 #if 0
139 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
140 					re_match_context_t *mctx,
141 					re_dfastate_t *pstate)
142      internal_function;
143 #endif
144 #ifdef RE_ENABLE_I18N
145 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
146 				       re_dfastate_t *pstate)
147      internal_function;
148 #endif /* RE_ENABLE_I18N */
149 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
150 					  const re_node_set *nodes)
151      internal_function;
152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 				 Idx bkref_node, Idx bkref_str_idx)
154      internal_function;
155 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
156 				     const re_sub_match_top_t *sub_top,
157 				     re_sub_match_last_t *sub_last,
158 				     Idx bkref_node, Idx bkref_str)
159      internal_function;
160 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
161 			     Idx subexp_idx, int type) internal_function;
162 static reg_errcode_t check_arrival (re_match_context_t *mctx,
163 				    state_array_t *path, Idx top_node,
164 				    Idx top_str, Idx last_node, Idx last_str,
165 				    int type) internal_function;
166 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
167 						   Idx str_idx,
168 						   re_node_set *cur_nodes,
169 						   re_node_set *next_nodes)
170      internal_function;
171 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
172 					       re_node_set *cur_nodes,
173 					       Idx ex_subexp, int type)
174      internal_function;
175 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
176 						   re_node_set *dst_nodes,
177 						   Idx target, Idx ex_subexp,
178 						   int type) internal_function;
179 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
180 					 re_node_set *cur_nodes, Idx cur_str,
181 					 Idx subexp_num, int type)
182      internal_function;
183 static bool build_trtable (const re_dfa_t *dfa,
184 			   re_dfastate_t *state) internal_function;
185 #ifdef RE_ENABLE_I18N
186 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
187 				    const re_string_t *input, Idx idx)
188      internal_function;
189 # ifdef _LIBC
190 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
191 						   size_t name_len)
192      internal_function;
193 # endif /* _LIBC */
194 #endif /* RE_ENABLE_I18N */
195 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
196 				       const re_dfastate_t *state,
197 				       re_node_set *states_node,
198 				       bitset_t *states_ch) internal_function;
199 static bool check_node_accept (const re_match_context_t *mctx,
200 			       const re_token_t *node, Idx idx)
201      internal_function;
202 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len)
203      internal_function;
204 
205 /* Entry point for POSIX code.  */
206 
207 /* regexec searches for a given pattern, specified by PREG, in the
208    string STRING.
209 
210    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
211    'regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
212    least NMATCH elements, and we set them to the offsets of the
213    corresponding matched substrings.
214 
215    EFLAGS specifies "execution flags" which affect matching: if
216    REG_NOTBOL is set, then ^ does not match at the beginning of the
217    string; if REG_NOTEOL is set, then $ does not match at the end.
218 
219    We return 0 if we find a match and REG_NOMATCH if not.  */
220 
221 int
regexec(preg,string,nmatch,pmatch,eflags)222 regexec (preg, string, nmatch, pmatch, eflags)
223     const regex_t *_Restrict_ preg;
224     const char *_Restrict_ string;
225     size_t nmatch;
226     regmatch_t pmatch[_Restrict_arr_];
227     int eflags;
228 {
229   reg_errcode_t err;
230   Idx start, length;
231 #ifdef _LIBC
232   re_dfa_t *dfa = preg->buffer;
233 #endif
234 
235   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
236     return REG_BADPAT;
237 
238   if (eflags & REG_STARTEND)
239     {
240       start = pmatch[0].rm_so;
241       length = pmatch[0].rm_eo;
242     }
243   else
244     {
245       start = 0;
246       length = strlen (string);
247     }
248 
249   __libc_lock_lock (dfa->lock);
250   if (preg->no_sub)
251     err = re_search_internal (preg, string, length, start, length,
252 			      length, 0, NULL, eflags);
253   else
254     err = re_search_internal (preg, string, length, start, length,
255 			      length, nmatch, pmatch, eflags);
256   __libc_lock_unlock (dfa->lock);
257   return err != REG_NOERROR;
258 }
259 
260 #ifdef _LIBC
261 # include <shlib-compat.h>
262 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
263 
264 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
265 __typeof__ (__regexec) __compat_regexec;
266 
267 int
268 attribute_compat_text_section
__compat_regexec(const regex_t * _Restrict_ preg,const char * _Restrict_ string,size_t nmatch,regmatch_t pmatch[],int eflags)269 __compat_regexec (const regex_t *_Restrict_ preg,
270 		  const char *_Restrict_ string, size_t nmatch,
271 		  regmatch_t pmatch[], int eflags)
272 {
273   return regexec (preg, string, nmatch, pmatch,
274 		  eflags & (REG_NOTBOL | REG_NOTEOL));
275 }
276 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
277 # endif
278 #endif
279 
280 /* Entry points for GNU code.  */
281 
282 /* re_match, re_search, re_match_2, re_search_2
283 
284    The former two functions operate on STRING with length LENGTH,
285    while the later two operate on concatenation of STRING1 and STRING2
286    with lengths LENGTH1 and LENGTH2, respectively.
287 
288    re_match() matches the compiled pattern in BUFP against the string,
289    starting at index START.
290 
291    re_search() first tries matching at index START, then it tries to match
292    starting from index START + 1, and so on.  The last start position tried
293    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
294    way as re_match().)
295 
296    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
297    the first STOP characters of the concatenation of the strings should be
298    concerned.
299 
300    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
301    and all groups is stored in REGS.  (For the "_2" variants, the offsets are
302    computed relative to the concatenation, not relative to the individual
303    strings.)
304 
305    On success, re_match* functions return the length of the match, re_search*
306    return the position of the start of the match.  Return value -1 means no
307    match was found and -2 indicates an internal error.  */
308 
309 regoff_t
re_match(bufp,string,length,start,regs)310 re_match (bufp, string, length, start, regs)
311     struct re_pattern_buffer *bufp;
312     const char *string;
313     Idx length, start;
314     struct re_registers *regs;
315 {
316   return re_search_stub (bufp, string, length, start, 0, length, regs, true);
317 }
318 #ifdef _LIBC
319 weak_alias (__re_match, re_match)
320 #endif
321 
322 regoff_t
323 re_search (bufp, string, length, start, range, regs)
324     struct re_pattern_buffer *bufp;
325     const char *string;
326     Idx length, start;
327     regoff_t range;
328     struct re_registers *regs;
329 {
330   return re_search_stub (bufp, string, length, start, range, length, regs,
331 			 false);
332 }
333 #ifdef _LIBC
334 weak_alias (__re_search, re_search)
335 #endif
336 
337 regoff_t
338 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
339     struct re_pattern_buffer *bufp;
340     const char *string1, *string2;
341     Idx length1, length2, start, stop;
342     struct re_registers *regs;
343 {
344   return re_search_2_stub (bufp, string1, length1, string2, length2,
345 			   start, 0, regs, stop, true);
346 }
347 #ifdef _LIBC
348 weak_alias (__re_match_2, re_match_2)
349 #endif
350 
351 regoff_t
352 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
353     struct re_pattern_buffer *bufp;
354     const char *string1, *string2;
355     Idx length1, length2, start, stop;
356     regoff_t range;
357     struct re_registers *regs;
358 {
359   return re_search_2_stub (bufp, string1, length1, string2, length2,
360 			   start, range, regs, stop, false);
361 }
362 #ifdef _LIBC
weak_alias(__re_search_2,re_search_2)363 weak_alias (__re_search_2, re_search_2)
364 #endif
365 
366 static regoff_t
367 re_search_2_stub (struct re_pattern_buffer *bufp,
368 		  const char *string1, Idx length1,
369 		  const char *string2, Idx length2,
370 		  Idx start, regoff_t range, struct re_registers *regs,
371 		  Idx stop, bool ret_len)
372 {
373   const char *str;
374   regoff_t rval;
375   Idx len = length1 + length2;
376   char *s = NULL;
377 
378   if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
379     return -2;
380 
381   /* Concatenate the strings.  */
382   if (length2 > 0)
383     if (length1 > 0)
384       {
385 	s = re_malloc (char, len);
386 
387 	if (BE (s == NULL, 0))
388 	  return -2;
389 #ifdef _LIBC
390 	memcpy (__mempcpy (s, string1, length1), string2, length2);
391 #else
392 	memcpy (s, string1, length1);
393 	memcpy (s + length1, string2, length2);
394 #endif
395 	str = s;
396       }
397     else
398       str = string2;
399   else
400     str = string1;
401 
402   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
403 			 ret_len);
404   re_free (s);
405   return rval;
406 }
407 
408 /* The parameters have the same meaning as those of re_search.
409    Additional parameters:
410    If RET_LEN is true the length of the match is returned (re_match style);
411    otherwise the position of the match is returned.  */
412 
413 static regoff_t
re_search_stub(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,regoff_t range,Idx stop,struct re_registers * regs,bool ret_len)414 re_search_stub (struct re_pattern_buffer *bufp,
415 		const char *string, Idx length,
416 		Idx start, regoff_t range, Idx stop, struct re_registers *regs,
417 		bool ret_len)
418 {
419   reg_errcode_t result;
420   regmatch_t *pmatch;
421   Idx nregs;
422   regoff_t rval;
423   int eflags = 0;
424 #ifdef _LIBC
425   re_dfa_t *dfa = bufp->buffer;
426 #endif
427   Idx last_start = start + range;
428 
429   /* Check for out-of-range.  */
430   if (BE (start < 0 || start > length, 0))
431     return -1;
432   if (BE (length < last_start || (0 <= range && last_start < start), 0))
433     last_start = length;
434   else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
435     last_start = 0;
436 
437   __libc_lock_lock (dfa->lock);
438 
439   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
440   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
441 
442   /* Compile fastmap if we haven't yet.  */
443   if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
444     re_compile_fastmap (bufp);
445 
446   if (BE (bufp->no_sub, 0))
447     regs = NULL;
448 
449   /* We need at least 1 register.  */
450   if (regs == NULL)
451     nregs = 1;
452   else if (BE (bufp->regs_allocated == REGS_FIXED
453 	       && regs->num_regs <= bufp->re_nsub, 0))
454     {
455       nregs = regs->num_regs;
456       if (BE (nregs < 1, 0))
457 	{
458 	  /* Nothing can be copied to regs.  */
459 	  regs = NULL;
460 	  nregs = 1;
461 	}
462     }
463   else
464     nregs = bufp->re_nsub + 1;
465   pmatch = re_malloc (regmatch_t, nregs);
466   if (BE (pmatch == NULL, 0))
467     {
468       rval = -2;
469       goto out;
470     }
471 
472   result = re_search_internal (bufp, string, length, start, last_start, stop,
473 			       nregs, pmatch, eflags);
474 
475   rval = 0;
476 
477   /* I hope we needn't fill their regs with -1's when no match was found.  */
478   if (result != REG_NOERROR)
479     rval = result == REG_NOMATCH ? -1 : -2;
480   else if (regs != NULL)
481     {
482       /* If caller wants register contents data back, copy them.  */
483       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
484 					   bufp->regs_allocated);
485       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
486 	rval = -2;
487     }
488 
489   if (BE (rval == 0, 1))
490     {
491       if (ret_len)
492 	{
493 	  assert (pmatch[0].rm_so == start);
494 	  rval = pmatch[0].rm_eo - start;
495 	}
496       else
497 	rval = pmatch[0].rm_so;
498     }
499   re_free (pmatch);
500  out:
501   __libc_lock_unlock (dfa->lock);
502   return rval;
503 }
504 
505 static unsigned
re_copy_regs(struct re_registers * regs,regmatch_t * pmatch,Idx nregs,int regs_allocated)506 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
507 	      int regs_allocated)
508 {
509   int rval = REGS_REALLOCATE;
510   Idx i;
511   Idx need_regs = nregs + 1;
512   /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
513      uses.  */
514 
515   /* Have the register data arrays been allocated?  */
516   if (regs_allocated == REGS_UNALLOCATED)
517     { /* No.  So allocate them with malloc.  */
518       regs->start = re_malloc (regoff_t, need_regs);
519       if (BE (regs->start == NULL, 0))
520 	return REGS_UNALLOCATED;
521       regs->end = re_malloc (regoff_t, need_regs);
522       if (BE (regs->end == NULL, 0))
523 	{
524 	  re_free (regs->start);
525 	  return REGS_UNALLOCATED;
526 	}
527       regs->num_regs = need_regs;
528     }
529   else if (regs_allocated == REGS_REALLOCATE)
530     { /* Yes.  If we need more elements than were already
531 	 allocated, reallocate them.  If we need fewer, just
532 	 leave it alone.  */
533       if (BE (need_regs > regs->num_regs, 0))
534 	{
535 	  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
536 	  regoff_t *new_end;
537 	  if (BE (new_start == NULL, 0))
538 	    return REGS_UNALLOCATED;
539 	  new_end = re_realloc (regs->end, regoff_t, need_regs);
540 	  if (BE (new_end == NULL, 0))
541 	    {
542 	      re_free (new_start);
543 	      return REGS_UNALLOCATED;
544 	    }
545 	  regs->start = new_start;
546 	  regs->end = new_end;
547 	  regs->num_regs = need_regs;
548 	}
549     }
550   else
551     {
552       assert (regs_allocated == REGS_FIXED);
553       /* This function may not be called with REGS_FIXED and nregs too big.  */
554       assert (regs->num_regs >= nregs);
555       rval = REGS_FIXED;
556     }
557 
558   /* Copy the regs.  */
559   for (i = 0; i < nregs; ++i)
560     {
561       regs->start[i] = pmatch[i].rm_so;
562       regs->end[i] = pmatch[i].rm_eo;
563     }
564   for ( ; i < regs->num_regs; ++i)
565     regs->start[i] = regs->end[i] = -1;
566 
567   return rval;
568 }
569 
570 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
571    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
572    this memory for recording register information.  STARTS and ENDS
573    must be allocated using the malloc library routine, and must each
574    be at least NUM_REGS * sizeof (regoff_t) bytes long.
575 
576    If NUM_REGS == 0, then subsequent matches should allocate their own
577    register data.
578 
579    Unless this function is called, the first search or match using
580    PATTERN_BUFFER will allocate its own register data, without
581    freeing the old data.  */
582 
583 void
re_set_registers(bufp,regs,num_regs,starts,ends)584 re_set_registers (bufp, regs, num_regs, starts, ends)
585     struct re_pattern_buffer *bufp;
586     struct re_registers *regs;
587     __re_size_t num_regs;
588     regoff_t *starts, *ends;
589 {
590   if (num_regs)
591     {
592       bufp->regs_allocated = REGS_REALLOCATE;
593       regs->num_regs = num_regs;
594       regs->start = starts;
595       regs->end = ends;
596     }
597   else
598     {
599       bufp->regs_allocated = REGS_UNALLOCATED;
600       regs->num_regs = 0;
601       regs->start = regs->end = NULL;
602     }
603 }
604 #ifdef _LIBC
605 weak_alias (__re_set_registers, re_set_registers)
606 #endif
607 
608 /* Entry points compatible with 4.2 BSD regex library.  We don't define
609    them unless specifically requested.  */
610 
611 #if defined _REGEX_RE_COMP || defined _LIBC
612 int
613 # ifdef _LIBC
614 weak_function
615 # endif
616 re_exec (s)
617      const char *s;
618 {
619   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
620 }
621 #endif /* _REGEX_RE_COMP */
622 
623 /* Internal entry point.  */
624 
625 /* Searches for a compiled pattern PREG in the string STRING, whose
626    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
627    meaning as with regexec.  LAST_START is START + RANGE, where
628    START and RANGE have the same meaning as with re_search.
629    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
630    otherwise return the error code.
631    Note: We assume front end functions already check ranges.
632    (0 <= LAST_START && LAST_START <= LENGTH)  */
633 
634 static reg_errcode_t
635 __attribute_warn_unused_result__
re_search_internal(const regex_t * preg,const char * string,Idx length,Idx start,Idx last_start,Idx stop,size_t nmatch,regmatch_t pmatch[],int eflags)636 re_search_internal (const regex_t *preg,
637 		    const char *string, Idx length,
638 		    Idx start, Idx last_start, Idx stop,
639 		    size_t nmatch, regmatch_t pmatch[],
640 		    int eflags)
641 {
642   reg_errcode_t err;
643   const re_dfa_t *dfa = preg->buffer;
644   Idx left_lim, right_lim;
645   int incr;
646   bool fl_longest_match;
647   int match_kind;
648   Idx match_first;
649   Idx match_last = REG_MISSING;
650   Idx extra_nmatch;
651   bool sb;
652   int ch;
653 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
654   re_match_context_t mctx = { .dfa = dfa };
655 #else
656   re_match_context_t mctx;
657 #endif
658   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
659 		    && start != last_start && !preg->can_be_null)
660 		   ? preg->fastmap : NULL);
661   RE_TRANSLATE_TYPE t = preg->translate;
662 
663 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
664   memset (&mctx, '\0', sizeof (re_match_context_t));
665   mctx.dfa = dfa;
666 #endif
667 
668   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
669   nmatch -= extra_nmatch;
670 
671   /* Check if the DFA haven't been compiled.  */
672   if (BE (preg->used == 0 || dfa->init_state == NULL
673 	  || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
674 	  || dfa->init_state_begbuf == NULL, 0))
675     return REG_NOMATCH;
676 
677 #ifdef DEBUG
678   /* We assume front-end functions already check them.  */
679   assert (0 <= last_start && last_start <= length);
680 #endif
681 
682   /* If initial states with non-begbuf contexts have no elements,
683      the regex must be anchored.  If preg->newline_anchor is set,
684      we'll never use init_state_nl, so do not check it.  */
685   if (dfa->init_state->nodes.nelem == 0
686       && dfa->init_state_word->nodes.nelem == 0
687       && (dfa->init_state_nl->nodes.nelem == 0
688 	  || !preg->newline_anchor))
689     {
690       if (start != 0 && last_start != 0)
691         return REG_NOMATCH;
692       start = last_start = 0;
693     }
694 
695   /* We must check the longest matching, if nmatch > 0.  */
696   fl_longest_match = (nmatch != 0 || dfa->nbackref);
697 
698   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
699 			    preg->translate, (preg->syntax & RE_ICASE) != 0,
700 			    dfa);
701   if (BE (err != REG_NOERROR, 0))
702     goto free_return;
703   mctx.input.stop = stop;
704   mctx.input.raw_stop = stop;
705   mctx.input.newline_anchor = preg->newline_anchor;
706 
707   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
708   if (BE (err != REG_NOERROR, 0))
709     goto free_return;
710 
711   /* We will log all the DFA states through which the dfa pass,
712      if nmatch > 1, or this dfa has "multibyte node", which is a
713      back-reference or a node which can accept multibyte character or
714      multi character collating element.  */
715   if (nmatch > 1 || dfa->has_mb_node)
716     {
717       /* Avoid overflow.  */
718       if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
719                <= mctx.input.bufs_len), 0))
720 	{
721 	  err = REG_ESPACE;
722 	  goto free_return;
723 	}
724 
725       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
726       if (BE (mctx.state_log == NULL, 0))
727 	{
728 	  err = REG_ESPACE;
729 	  goto free_return;
730 	}
731     }
732   else
733     mctx.state_log = NULL;
734 
735   match_first = start;
736   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
737 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
738 
739   /* Check incrementally whether the input string matches.  */
740   incr = (last_start < start) ? -1 : 1;
741   left_lim = (last_start < start) ? last_start : start;
742   right_lim = (last_start < start) ? start : last_start;
743   sb = dfa->mb_cur_max == 1;
744   match_kind =
745     (fastmap
746      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
747 	| (start <= last_start ? 2 : 0)
748 	| (t != NULL ? 1 : 0))
749      : 8);
750 
751   for (;; match_first += incr)
752     {
753       err = REG_NOMATCH;
754       if (match_first < left_lim || right_lim < match_first)
755 	goto free_return;
756 
757       /* Advance as rapidly as possible through the string, until we
758 	 find a plausible place to start matching.  This may be done
759 	 with varying efficiency, so there are various possibilities:
760 	 only the most common of them are specialized, in order to
761 	 save on code size.  We use a switch statement for speed.  */
762       switch (match_kind)
763 	{
764 	case 8:
765 	  /* No fastmap.  */
766 	  break;
767 
768 	case 7:
769 	  /* Fastmap with single-byte translation, match forward.  */
770 	  while (BE (match_first < right_lim, 1)
771 		 && !fastmap[t[(unsigned char) string[match_first]]])
772 	    ++match_first;
773 	  goto forward_match_found_start_or_reached_end;
774 
775 	case 6:
776 	  /* Fastmap without translation, match forward.  */
777 	  while (BE (match_first < right_lim, 1)
778 		 && !fastmap[(unsigned char) string[match_first]])
779 	    ++match_first;
780 
781 	forward_match_found_start_or_reached_end:
782 	  if (BE (match_first == right_lim, 0))
783 	    {
784 	      ch = match_first >= length
785 		       ? 0 : (unsigned char) string[match_first];
786 	      if (!fastmap[t ? t[ch] : ch])
787 		goto free_return;
788 	    }
789 	  break;
790 
791 	case 4:
792 	case 5:
793 	  /* Fastmap without multi-byte translation, match backwards.  */
794 	  while (match_first >= left_lim)
795 	    {
796 	      ch = match_first >= length
797 		       ? 0 : (unsigned char) string[match_first];
798 	      if (fastmap[t ? t[ch] : ch])
799 		break;
800 	      --match_first;
801 	    }
802 	  if (match_first < left_lim)
803 	    goto free_return;
804 	  break;
805 
806 	default:
807 	  /* In this case, we can't determine easily the current byte,
808 	     since it might be a component byte of a multibyte
809 	     character.  Then we use the constructed buffer instead.  */
810 	  for (;;)
811 	    {
812 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
813 		 buffers.  */
814 	      __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
815 	      if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
816 		{
817 		  err = re_string_reconstruct (&mctx.input, match_first,
818 					       eflags);
819 		  if (BE (err != REG_NOERROR, 0))
820 		    goto free_return;
821 
822 		  offset = match_first - mctx.input.raw_mbs_idx;
823 		}
824 	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
825 		 Note that MATCH_FIRST must not be smaller than 0.  */
826 	      ch = (match_first >= length
827 		    ? 0 : re_string_byte_at (&mctx.input, offset));
828 	      if (fastmap[ch])
829 		break;
830 	      match_first += incr;
831 	      if (match_first < left_lim || match_first > right_lim)
832 		{
833 		  err = REG_NOMATCH;
834 		  goto free_return;
835 		}
836 	    }
837 	  break;
838 	}
839 
840       /* Reconstruct the buffers so that the matcher can assume that
841 	 the matching starts from the beginning of the buffer.  */
842       err = re_string_reconstruct (&mctx.input, match_first, eflags);
843       if (BE (err != REG_NOERROR, 0))
844 	goto free_return;
845 
846 #ifdef RE_ENABLE_I18N
847      /* Don't consider this char as a possible match start if it part,
848 	yet isn't the head, of a multibyte character.  */
849       if (!sb && !re_string_first_byte (&mctx.input, 0))
850 	continue;
851 #endif
852 
853       /* It seems to be appropriate one, then use the matcher.  */
854       /* We assume that the matching starts from 0.  */
855       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
856       match_last = check_matching (&mctx, fl_longest_match,
857 				   start <= last_start ? &match_first : NULL);
858       if (match_last != REG_MISSING)
859 	{
860 	  if (BE (match_last == REG_ERROR, 0))
861 	    {
862 	      err = REG_ESPACE;
863 	      goto free_return;
864 	    }
865 	  else
866 	    {
867 	      mctx.match_last = match_last;
868 	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
869 		{
870 		  re_dfastate_t *pstate = mctx.state_log[match_last];
871 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
872 							     match_last);
873 		}
874 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
875 		  || dfa->nbackref)
876 		{
877 		  err = prune_impossible_nodes (&mctx);
878 		  if (err == REG_NOERROR)
879 		    break;
880 		  if (BE (err != REG_NOMATCH, 0))
881 		    goto free_return;
882 		  match_last = REG_MISSING;
883 		}
884 	      else
885 		break; /* We found a match.  */
886 	    }
887 	}
888 
889       match_ctx_clean (&mctx);
890     }
891 
892 #ifdef DEBUG
893   assert (match_last != REG_MISSING);
894   assert (err == REG_NOERROR);
895 #endif
896 
897   /* Set pmatch[] if we need.  */
898   if (nmatch > 0)
899     {
900       Idx reg_idx;
901 
902       /* Initialize registers.  */
903       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
904 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
905 
906       /* Set the points where matching start/end.  */
907       pmatch[0].rm_so = 0;
908       pmatch[0].rm_eo = mctx.match_last;
909       /* FIXME: This function should fail if mctx.match_last exceeds
910 	 the maximum possible regoff_t value.  We need a new error
911 	 code REG_OVERFLOW.  */
912 
913       if (!preg->no_sub && nmatch > 1)
914 	{
915 	  err = set_regs (preg, &mctx, nmatch, pmatch,
916 			  dfa->has_plural_match && dfa->nbackref > 0);
917 	  if (BE (err != REG_NOERROR, 0))
918 	    goto free_return;
919 	}
920 
921       /* At last, add the offset to each register, since we slid
922 	 the buffers so that we could assume that the matching starts
923 	 from 0.  */
924       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
925 	if (pmatch[reg_idx].rm_so != -1)
926 	  {
927 #ifdef RE_ENABLE_I18N
928 	    if (BE (mctx.input.offsets_needed != 0, 0))
929 	      {
930 		pmatch[reg_idx].rm_so =
931 		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
932 		   ? mctx.input.valid_raw_len
933 		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
934 		pmatch[reg_idx].rm_eo =
935 		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
936 		   ? mctx.input.valid_raw_len
937 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
938 	      }
939 #else
940 	    assert (mctx.input.offsets_needed == 0);
941 #endif
942 	    pmatch[reg_idx].rm_so += match_first;
943 	    pmatch[reg_idx].rm_eo += match_first;
944 	  }
945       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
946 	{
947 	  pmatch[nmatch + reg_idx].rm_so = -1;
948 	  pmatch[nmatch + reg_idx].rm_eo = -1;
949 	}
950 
951       if (dfa->subexp_map)
952 	for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
953 	  if (dfa->subexp_map[reg_idx] != reg_idx)
954 	    {
955 	      pmatch[reg_idx + 1].rm_so
956 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
957 	      pmatch[reg_idx + 1].rm_eo
958 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
959 	    }
960     }
961 
962  free_return:
963   re_free (mctx.state_log);
964   if (dfa->nbackref)
965     match_ctx_free (&mctx);
966   re_string_destruct (&mctx.input);
967   return err;
968 }
969 
970 static reg_errcode_t
971 __attribute_warn_unused_result__
prune_impossible_nodes(re_match_context_t * mctx)972 prune_impossible_nodes (re_match_context_t *mctx)
973 {
974   const re_dfa_t *const dfa = mctx->dfa;
975   Idx halt_node, match_last;
976   reg_errcode_t ret;
977   re_dfastate_t **sifted_states;
978   re_dfastate_t **lim_states = NULL;
979   re_sift_context_t sctx;
980 #ifdef DEBUG
981   assert (mctx->state_log != NULL);
982 #endif
983   match_last = mctx->match_last;
984   halt_node = mctx->last_node;
985 
986   /* Avoid overflow.  */
987   if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
988     return REG_ESPACE;
989 
990   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
991   if (BE (sifted_states == NULL, 0))
992     {
993       ret = REG_ESPACE;
994       goto free_return;
995     }
996   if (dfa->nbackref)
997     {
998       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
999       if (BE (lim_states == NULL, 0))
1000 	{
1001 	  ret = REG_ESPACE;
1002 	  goto free_return;
1003 	}
1004       while (1)
1005 	{
1006 	  memset (lim_states, '\0',
1007 		  sizeof (re_dfastate_t *) * (match_last + 1));
1008 	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
1009 			 match_last);
1010 	  ret = sift_states_backward (mctx, &sctx);
1011 	  re_node_set_free (&sctx.limits);
1012 	  if (BE (ret != REG_NOERROR, 0))
1013 	      goto free_return;
1014 	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
1015 	    break;
1016 	  do
1017 	    {
1018 	      --match_last;
1019 	      if (! REG_VALID_INDEX (match_last))
1020 		{
1021 		  ret = REG_NOMATCH;
1022 		  goto free_return;
1023 		}
1024 	    } while (mctx->state_log[match_last] == NULL
1025 		     || !mctx->state_log[match_last]->halt);
1026 	  halt_node = check_halt_state_context (mctx,
1027 						mctx->state_log[match_last],
1028 						match_last);
1029 	}
1030       ret = merge_state_array (dfa, sifted_states, lim_states,
1031 			       match_last + 1);
1032       re_free (lim_states);
1033       lim_states = NULL;
1034       if (BE (ret != REG_NOERROR, 0))
1035 	goto free_return;
1036     }
1037   else
1038     {
1039       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1040       ret = sift_states_backward (mctx, &sctx);
1041       re_node_set_free (&sctx.limits);
1042       if (BE (ret != REG_NOERROR, 0))
1043 	goto free_return;
1044       if (sifted_states[0] == NULL)
1045 	{
1046 	  ret = REG_NOMATCH;
1047 	  goto free_return;
1048 	}
1049     }
1050   re_free (mctx->state_log);
1051   mctx->state_log = sifted_states;
1052   sifted_states = NULL;
1053   mctx->last_node = halt_node;
1054   mctx->match_last = match_last;
1055   ret = REG_NOERROR;
1056  free_return:
1057   re_free (sifted_states);
1058   re_free (lim_states);
1059   return ret;
1060 }
1061 
1062 /* Acquire an initial state and return it.
1063    We must select appropriate initial state depending on the context,
1064    since initial states may have constraints like "\<", "^", etc..  */
1065 
1066 static inline re_dfastate_t *
1067 __attribute__ ((always_inline)) internal_function
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,Idx idx)1068 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1069 			    Idx idx)
1070 {
1071   const re_dfa_t *const dfa = mctx->dfa;
1072   if (dfa->init_state->has_constraint)
1073     {
1074       unsigned int context;
1075       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1076       if (IS_WORD_CONTEXT (context))
1077 	return dfa->init_state_word;
1078       else if (IS_ORDINARY_CONTEXT (context))
1079 	return dfa->init_state;
1080       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1081 	return dfa->init_state_begbuf;
1082       else if (IS_NEWLINE_CONTEXT (context))
1083 	return dfa->init_state_nl;
1084       else if (IS_BEGBUF_CONTEXT (context))
1085 	{
1086 	  /* It is relatively rare case, then calculate on demand.  */
1087 	  return re_acquire_state_context (err, dfa,
1088 					   dfa->init_state->entrance_nodes,
1089 					   context);
1090 	}
1091       else
1092 	/* Must not happen?  */
1093 	return dfa->init_state;
1094     }
1095   else
1096     return dfa->init_state;
1097 }
1098 
1099 /* Check whether the regular expression match input string INPUT or not,
1100    and return the index where the matching end.  Return REG_MISSING if
1101    there is no match, and return REG_ERROR in case of an error.
1102    FL_LONGEST_MATCH means we want the POSIX longest matching.
1103    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1104    next place where we may want to try matching.
1105    Note that the matcher assumes that the matching starts from the current
1106    index of the buffer.  */
1107 
1108 static Idx
1109 internal_function __attribute_warn_unused_result__
check_matching(re_match_context_t * mctx,bool fl_longest_match,Idx * p_match_first)1110 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1111 		Idx *p_match_first)
1112 {
1113   const re_dfa_t *const dfa = mctx->dfa;
1114   reg_errcode_t err;
1115   Idx match = 0;
1116   Idx match_last = REG_MISSING;
1117   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1118   re_dfastate_t *cur_state;
1119   bool at_init_state = p_match_first != NULL;
1120   Idx next_start_idx = cur_str_idx;
1121 
1122   err = REG_NOERROR;
1123   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1124   /* An initial state must not be NULL (invalid).  */
1125   if (BE (cur_state == NULL, 0))
1126     {
1127       assert (err == REG_ESPACE);
1128       return REG_ERROR;
1129     }
1130 
1131   if (mctx->state_log != NULL)
1132     {
1133       mctx->state_log[cur_str_idx] = cur_state;
1134 
1135       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1136 	 later.  E.g. Processing back references.  */
1137       if (BE (dfa->nbackref, 0))
1138 	{
1139 	  at_init_state = false;
1140 	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1141 	  if (BE (err != REG_NOERROR, 0))
1142 	    return err;
1143 
1144 	  if (cur_state->has_backref)
1145 	    {
1146 	      err = transit_state_bkref (mctx, &cur_state->nodes);
1147 	      if (BE (err != REG_NOERROR, 0))
1148 		return err;
1149 	    }
1150 	}
1151     }
1152 
1153   /* If the RE accepts NULL string.  */
1154   if (BE (cur_state->halt, 0))
1155     {
1156       if (!cur_state->has_constraint
1157 	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
1158 	{
1159 	  if (!fl_longest_match)
1160 	    return cur_str_idx;
1161 	  else
1162 	    {
1163 	      match_last = cur_str_idx;
1164 	      match = 1;
1165 	    }
1166 	}
1167     }
1168 
1169   while (!re_string_eoi (&mctx->input))
1170     {
1171       re_dfastate_t *old_state = cur_state;
1172       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1173 
1174       if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1175 	   && mctx->input.bufs_len < mctx->input.len)
1176 	  || (BE (next_char_idx >= mctx->input.valid_len, 0)
1177 	      && mctx->input.valid_len < mctx->input.len))
1178 	{
1179 	  err = extend_buffers (mctx, next_char_idx + 1);
1180 	  if (BE (err != REG_NOERROR, 0))
1181 	    {
1182 	      assert (err == REG_ESPACE);
1183 	      return REG_ERROR;
1184 	    }
1185 	}
1186 
1187       cur_state = transit_state (&err, mctx, cur_state);
1188       if (mctx->state_log != NULL)
1189 	cur_state = merge_state_with_log (&err, mctx, cur_state);
1190 
1191       if (cur_state == NULL)
1192 	{
1193 	  /* Reached the invalid state or an error.  Try to recover a valid
1194 	     state using the state log, if available and if we have not
1195 	     already found a valid (even if not the longest) match.  */
1196 	  if (BE (err != REG_NOERROR, 0))
1197 	    return REG_ERROR;
1198 
1199 	  if (mctx->state_log == NULL
1200 	      || (match && !fl_longest_match)
1201 	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
1202 	    break;
1203 	}
1204 
1205       if (BE (at_init_state, 0))
1206 	{
1207 	  if (old_state == cur_state)
1208 	    next_start_idx = next_char_idx;
1209 	  else
1210 	    at_init_state = false;
1211 	}
1212 
1213       if (cur_state->halt)
1214 	{
1215 	  /* Reached a halt state.
1216 	     Check the halt state can satisfy the current context.  */
1217 	  if (!cur_state->has_constraint
1218 	      || check_halt_state_context (mctx, cur_state,
1219 					   re_string_cur_idx (&mctx->input)))
1220 	    {
1221 	      /* We found an appropriate halt state.  */
1222 	      match_last = re_string_cur_idx (&mctx->input);
1223 	      match = 1;
1224 
1225 	      /* We found a match, do not modify match_first below.  */
1226 	      p_match_first = NULL;
1227 	      if (!fl_longest_match)
1228 		break;
1229 	    }
1230 	}
1231     }
1232 
1233   if (p_match_first)
1234     *p_match_first += next_start_idx;
1235 
1236   return match_last;
1237 }
1238 
1239 /* Check NODE match the current context.  */
1240 
1241 static bool
1242 internal_function
check_halt_node_context(const re_dfa_t * dfa,Idx node,unsigned int context)1243 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1244 {
1245   re_token_type_t type = dfa->nodes[node].type;
1246   unsigned int constraint = dfa->nodes[node].constraint;
1247   if (type != END_OF_RE)
1248     return false;
1249   if (!constraint)
1250     return true;
1251   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1252     return false;
1253   return true;
1254 }
1255 
1256 /* Check the halt state STATE match the current context.
1257    Return 0 if not match, if the node, STATE has, is a halt node and
1258    match the context, return the node.  */
1259 
1260 static Idx
1261 internal_function
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,Idx idx)1262 check_halt_state_context (const re_match_context_t *mctx,
1263 			  const re_dfastate_t *state, Idx idx)
1264 {
1265   Idx i;
1266   unsigned int context;
1267 #ifdef DEBUG
1268   assert (state->halt);
1269 #endif
1270   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1271   for (i = 0; i < state->nodes.nelem; ++i)
1272     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1273       return state->nodes.elems[i];
1274   return 0;
1275 }
1276 
1277 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1278    corresponding to the DFA).
1279    Return the destination node, and update EPS_VIA_NODES;
1280    return REG_MISSING in case of errors.  */
1281 
1282 static Idx
1283 internal_function
proceed_next_node(const re_match_context_t * mctx,Idx nregs,regmatch_t * regs,Idx * pidx,Idx node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)1284 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1285 		   Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1286 		   struct re_fail_stack_t *fs)
1287 {
1288   const re_dfa_t *const dfa = mctx->dfa;
1289   Idx i;
1290   bool ok;
1291   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1292     {
1293       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1294       re_node_set *edests = &dfa->edests[node];
1295       Idx dest_node;
1296       ok = re_node_set_insert (eps_via_nodes, node);
1297       if (BE (! ok, 0))
1298 	return REG_ERROR;
1299       /* Pick up a valid destination, or return REG_MISSING if none
1300 	 is found.  */
1301       for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i)
1302 	{
1303 	  Idx candidate = edests->elems[i];
1304 	  if (!re_node_set_contains (cur_nodes, candidate))
1305 	    continue;
1306           if (dest_node == REG_MISSING)
1307 	    dest_node = candidate;
1308 
1309 	  else
1310 	    {
1311 	      /* In order to avoid infinite loop like "(a*)*", return the second
1312 		 epsilon-transition if the first was already considered.  */
1313 	      if (re_node_set_contains (eps_via_nodes, dest_node))
1314 		return candidate;
1315 
1316 	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
1317 	      else if (fs != NULL
1318 		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1319 					   eps_via_nodes))
1320 		return REG_ERROR;
1321 
1322 	      /* We know we are going to exit.  */
1323 	      break;
1324 	    }
1325 	}
1326       return dest_node;
1327     }
1328   else
1329     {
1330       Idx naccepted = 0;
1331       re_token_type_t type = dfa->nodes[node].type;
1332 
1333 #ifdef RE_ENABLE_I18N
1334       if (dfa->nodes[node].accept_mb)
1335 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1336       else
1337 #endif /* RE_ENABLE_I18N */
1338       if (type == OP_BACK_REF)
1339 	{
1340 	  Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1341 	  naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1342 	  if (fs != NULL)
1343 	    {
1344 	      if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1345 		return REG_MISSING;
1346 	      else if (naccepted)
1347 		{
1348 		  char *buf = (char *) re_string_get_buffer (&mctx->input);
1349 		  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1350 			      naccepted) != 0)
1351 		    return REG_MISSING;
1352 		}
1353 	    }
1354 
1355 	  if (naccepted == 0)
1356 	    {
1357 	      Idx dest_node;
1358 	      ok = re_node_set_insert (eps_via_nodes, node);
1359 	      if (BE (! ok, 0))
1360 		return REG_ERROR;
1361 	      dest_node = dfa->edests[node].elems[0];
1362 	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1363 					dest_node))
1364 		return dest_node;
1365 	    }
1366 	}
1367 
1368       if (naccepted != 0
1369 	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
1370 	{
1371 	  Idx dest_node = dfa->nexts[node];
1372 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1373 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1374 		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1375 					       dest_node)))
1376 	    return REG_MISSING;
1377 	  re_node_set_empty (eps_via_nodes);
1378 	  return dest_node;
1379 	}
1380     }
1381   return REG_MISSING;
1382 }
1383 
1384 static reg_errcode_t
1385 internal_function __attribute_warn_unused_result__
push_fail_stack(struct re_fail_stack_t * fs,Idx str_idx,Idx dest_node,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1386 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1387 		 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1388 {
1389   reg_errcode_t err;
1390   Idx num = fs->num++;
1391   if (fs->num == fs->alloc)
1392     {
1393       struct re_fail_stack_ent_t *new_array;
1394       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1395 				       * fs->alloc * 2));
1396       if (new_array == NULL)
1397 	return REG_ESPACE;
1398       fs->alloc *= 2;
1399       fs->stack = new_array;
1400     }
1401   fs->stack[num].idx = str_idx;
1402   fs->stack[num].node = dest_node;
1403   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1404   if (fs->stack[num].regs == NULL)
1405     return REG_ESPACE;
1406   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1407   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1408   return err;
1409 }
1410 
1411 static Idx
1412 internal_function
pop_fail_stack(struct re_fail_stack_t * fs,Idx * pidx,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1413 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1414 		regmatch_t *regs, re_node_set *eps_via_nodes)
1415 {
1416   Idx num = --fs->num;
1417   assert (REG_VALID_INDEX (num));
1418   *pidx = fs->stack[num].idx;
1419   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1420   re_node_set_free (eps_via_nodes);
1421   re_free (fs->stack[num].regs);
1422   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1423   return fs->stack[num].node;
1424 }
1425 
1426 /* Set the positions where the subexpressions are starts/ends to registers
1427    PMATCH.
1428    Note: We assume that pmatch[0] is already set, and
1429    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1430 
1431 static reg_errcode_t
1432 internal_function __attribute_warn_unused_result__
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,bool fl_backtrack)1433 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1434 	  regmatch_t *pmatch, bool fl_backtrack)
1435 {
1436   const re_dfa_t *dfa = preg->buffer;
1437   Idx idx, cur_node;
1438   re_node_set eps_via_nodes;
1439   struct re_fail_stack_t *fs;
1440   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1441   regmatch_t *prev_idx_match;
1442   bool prev_idx_match_malloced = false;
1443 
1444 #ifdef DEBUG
1445   assert (nmatch > 1);
1446   assert (mctx->state_log != NULL);
1447 #endif
1448   if (fl_backtrack)
1449     {
1450       fs = &fs_body;
1451       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1452       if (fs->stack == NULL)
1453 	return REG_ESPACE;
1454     }
1455   else
1456     fs = NULL;
1457 
1458   cur_node = dfa->init_node;
1459   re_node_set_init_empty (&eps_via_nodes);
1460 
1461   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1462     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1463   else
1464     {
1465       prev_idx_match = re_malloc (regmatch_t, nmatch);
1466       if (prev_idx_match == NULL)
1467 	{
1468 	  free_fail_stack_return (fs);
1469 	  return REG_ESPACE;
1470 	}
1471       prev_idx_match_malloced = true;
1472     }
1473   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1474 
1475   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1476     {
1477       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1478 
1479       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1480 	{
1481 	  Idx reg_idx;
1482 	  if (fs)
1483 	    {
1484 	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1485 		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1486 		  break;
1487 	      if (reg_idx == nmatch)
1488 		{
1489 		  re_node_set_free (&eps_via_nodes);
1490 		  if (prev_idx_match_malloced)
1491 		    re_free (prev_idx_match);
1492 		  return free_fail_stack_return (fs);
1493 		}
1494 	      cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1495 					 &eps_via_nodes);
1496 	    }
1497 	  else
1498 	    {
1499 	      re_node_set_free (&eps_via_nodes);
1500 	      if (prev_idx_match_malloced)
1501 		re_free (prev_idx_match);
1502 	      return REG_NOERROR;
1503 	    }
1504 	}
1505 
1506       /* Proceed to next node.  */
1507       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1508 				    &eps_via_nodes, fs);
1509 
1510       if (BE (! REG_VALID_INDEX (cur_node), 0))
1511 	{
1512 	  if (BE (cur_node == REG_ERROR, 0))
1513 	    {
1514 	      re_node_set_free (&eps_via_nodes);
1515 	      if (prev_idx_match_malloced)
1516 		re_free (prev_idx_match);
1517 	      free_fail_stack_return (fs);
1518 	      return REG_ESPACE;
1519 	    }
1520 	  if (fs)
1521 	    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1522 				       &eps_via_nodes);
1523 	  else
1524 	    {
1525 	      re_node_set_free (&eps_via_nodes);
1526 	      if (prev_idx_match_malloced)
1527 		re_free (prev_idx_match);
1528 	      return REG_NOMATCH;
1529 	    }
1530 	}
1531     }
1532   re_node_set_free (&eps_via_nodes);
1533   if (prev_idx_match_malloced)
1534     re_free (prev_idx_match);
1535   return free_fail_stack_return (fs);
1536 }
1537 
1538 static reg_errcode_t
1539 internal_function
free_fail_stack_return(struct re_fail_stack_t * fs)1540 free_fail_stack_return (struct re_fail_stack_t *fs)
1541 {
1542   if (fs)
1543     {
1544       Idx fs_idx;
1545       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1546 	{
1547 	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1548 	  re_free (fs->stack[fs_idx].regs);
1549 	}
1550       re_free (fs->stack);
1551     }
1552   return REG_NOERROR;
1553 }
1554 
1555 static void
1556 internal_function
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,Idx cur_node,Idx cur_idx,Idx nmatch)1557 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1558 	     regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1559 {
1560   int type = dfa->nodes[cur_node].type;
1561   if (type == OP_OPEN_SUBEXP)
1562     {
1563       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1564 
1565       /* We are at the first node of this sub expression.  */
1566       if (reg_num < nmatch)
1567 	{
1568 	  pmatch[reg_num].rm_so = cur_idx;
1569 	  pmatch[reg_num].rm_eo = -1;
1570 	}
1571     }
1572   else if (type == OP_CLOSE_SUBEXP)
1573     {
1574       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1575       if (reg_num < nmatch)
1576 	{
1577 	  /* We are at the last node of this sub expression.  */
1578 	  if (pmatch[reg_num].rm_so < cur_idx)
1579 	    {
1580 	      pmatch[reg_num].rm_eo = cur_idx;
1581 	      /* This is a non-empty match or we are not inside an optional
1582 		 subexpression.  Accept this right away.  */
1583 	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1584 	    }
1585 	  else
1586 	    {
1587 	      if (dfa->nodes[cur_node].opt_subexp
1588 		  && prev_idx_match[reg_num].rm_so != -1)
1589 		/* We transited through an empty match for an optional
1590 		   subexpression, like (a?)*, and this is not the subexp's
1591 		   first match.  Copy back the old content of the registers
1592 		   so that matches of an inner subexpression are undone as
1593 		   well, like in ((a?))*.  */
1594 		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1595 	      else
1596 		/* We completed a subexpression, but it may be part of
1597 		   an optional one, so do not update PREV_IDX_MATCH.  */
1598 		pmatch[reg_num].rm_eo = cur_idx;
1599 	    }
1600 	}
1601     }
1602 }
1603 
1604 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1605    and sift the nodes in each states according to the following rules.
1606    Updated state_log will be wrote to STATE_LOG.
1607 
1608    Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1609      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1610 	If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1611 	the LAST_NODE, we throw away the node 'a'.
1612      2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1613 	string 's' and transit to 'b':
1614 	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1615 	   away the node 'a'.
1616 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1617 	    thrown away, we throw away the node 'a'.
1618      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1619 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1620 	   node 'a'.
1621 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1622 	    we throw away the node 'a'.  */
1623 
1624 #define STATE_NODE_CONTAINS(state,node) \
1625   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1626 
1627 static reg_errcode_t
1628 internal_function
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)1629 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1630 {
1631   reg_errcode_t err;
1632   int null_cnt = 0;
1633   Idx str_idx = sctx->last_str_idx;
1634   re_node_set cur_dest;
1635 
1636 #ifdef DEBUG
1637   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1638 #endif
1639 
1640   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1641      transit to the last_node and the last_node itself.  */
1642   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1643   if (BE (err != REG_NOERROR, 0))
1644     return err;
1645   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1646   if (BE (err != REG_NOERROR, 0))
1647     goto free_return;
1648 
1649   /* Then check each states in the state_log.  */
1650   while (str_idx > 0)
1651     {
1652       /* Update counters.  */
1653       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1654       if (null_cnt > mctx->max_mb_elem_len)
1655 	{
1656 	  memset (sctx->sifted_states, '\0',
1657 		  sizeof (re_dfastate_t *) * str_idx);
1658 	  re_node_set_free (&cur_dest);
1659 	  return REG_NOERROR;
1660 	}
1661       re_node_set_empty (&cur_dest);
1662       --str_idx;
1663 
1664       if (mctx->state_log[str_idx])
1665 	{
1666 	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1667 	  if (BE (err != REG_NOERROR, 0))
1668 	    goto free_return;
1669 	}
1670 
1671       /* Add all the nodes which satisfy the following conditions:
1672 	 - It can epsilon transit to a node in CUR_DEST.
1673 	 - It is in CUR_SRC.
1674 	 And update state_log.  */
1675       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1676       if (BE (err != REG_NOERROR, 0))
1677 	goto free_return;
1678     }
1679   err = REG_NOERROR;
1680  free_return:
1681   re_node_set_free (&cur_dest);
1682   return err;
1683 }
1684 
1685 static reg_errcode_t
1686 internal_function __attribute_warn_unused_result__
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * cur_dest)1687 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1688 		     Idx str_idx, re_node_set *cur_dest)
1689 {
1690   const re_dfa_t *const dfa = mctx->dfa;
1691   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1692   Idx i;
1693 
1694   /* Then build the next sifted state.
1695      We build the next sifted state on 'cur_dest', and update
1696      'sifted_states[str_idx]' with 'cur_dest'.
1697      Note:
1698      'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1699      'cur_src' points the node_set of the old 'state_log[str_idx]'
1700      (with the epsilon nodes pre-filtered out).  */
1701   for (i = 0; i < cur_src->nelem; i++)
1702     {
1703       Idx prev_node = cur_src->elems[i];
1704       int naccepted = 0;
1705       bool ok;
1706 
1707 #ifdef DEBUG
1708       re_token_type_t type = dfa->nodes[prev_node].type;
1709       assert (!IS_EPSILON_NODE (type));
1710 #endif
1711 #ifdef RE_ENABLE_I18N
1712       /* If the node may accept "multi byte".  */
1713       if (dfa->nodes[prev_node].accept_mb)
1714 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1715 					 str_idx, sctx->last_str_idx);
1716 #endif /* RE_ENABLE_I18N */
1717 
1718       /* We don't check backreferences here.
1719 	 See update_cur_sifted_state().  */
1720       if (!naccepted
1721 	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1722 	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1723 				  dfa->nexts[prev_node]))
1724 	naccepted = 1;
1725 
1726       if (naccepted == 0)
1727 	continue;
1728 
1729       if (sctx->limits.nelem)
1730 	{
1731 	  Idx to_idx = str_idx + naccepted;
1732 	  if (check_dst_limits (mctx, &sctx->limits,
1733 				dfa->nexts[prev_node], to_idx,
1734 				prev_node, str_idx))
1735 	    continue;
1736 	}
1737       ok = re_node_set_insert (cur_dest, prev_node);
1738       if (BE (! ok, 0))
1739 	return REG_ESPACE;
1740     }
1741 
1742   return REG_NOERROR;
1743 }
1744 
1745 /* Helper functions.  */
1746 
1747 static reg_errcode_t
1748 internal_function
clean_state_log_if_needed(re_match_context_t * mctx,Idx next_state_log_idx)1749 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1750 {
1751   Idx top = mctx->state_log_top;
1752 
1753   if ((next_state_log_idx >= mctx->input.bufs_len
1754        && mctx->input.bufs_len < mctx->input.len)
1755       || (next_state_log_idx >= mctx->input.valid_len
1756 	  && mctx->input.valid_len < mctx->input.len))
1757     {
1758       reg_errcode_t err;
1759       err = extend_buffers (mctx, next_state_log_idx + 1);
1760       if (BE (err != REG_NOERROR, 0))
1761 	return err;
1762     }
1763 
1764   if (top < next_state_log_idx)
1765     {
1766       memset (mctx->state_log + top + 1, '\0',
1767 	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1768       mctx->state_log_top = next_state_log_idx;
1769     }
1770   return REG_NOERROR;
1771 }
1772 
1773 static reg_errcode_t
1774 internal_function
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,Idx num)1775 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1776 		   re_dfastate_t **src, Idx num)
1777 {
1778   Idx st_idx;
1779   reg_errcode_t err;
1780   for (st_idx = 0; st_idx < num; ++st_idx)
1781     {
1782       if (dst[st_idx] == NULL)
1783 	dst[st_idx] = src[st_idx];
1784       else if (src[st_idx] != NULL)
1785 	{
1786 	  re_node_set merged_set;
1787 	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1788 					&src[st_idx]->nodes);
1789 	  if (BE (err != REG_NOERROR, 0))
1790 	    return err;
1791 	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1792 	  re_node_set_free (&merged_set);
1793 	  if (BE (err != REG_NOERROR, 0))
1794 	    return err;
1795 	}
1796     }
1797   return REG_NOERROR;
1798 }
1799 
1800 static reg_errcode_t
1801 internal_function
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * dest_nodes)1802 update_cur_sifted_state (const re_match_context_t *mctx,
1803 			 re_sift_context_t *sctx, Idx str_idx,
1804 			 re_node_set *dest_nodes)
1805 {
1806   const re_dfa_t *const dfa = mctx->dfa;
1807   reg_errcode_t err = REG_NOERROR;
1808   const re_node_set *candidates;
1809   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1810 		: &mctx->state_log[str_idx]->nodes);
1811 
1812   if (dest_nodes->nelem == 0)
1813     sctx->sifted_states[str_idx] = NULL;
1814   else
1815     {
1816       if (candidates)
1817 	{
1818 	  /* At first, add the nodes which can epsilon transit to a node in
1819 	     DEST_NODE.  */
1820 	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1821 	  if (BE (err != REG_NOERROR, 0))
1822 	    return err;
1823 
1824 	  /* Then, check the limitations in the current sift_context.  */
1825 	  if (sctx->limits.nelem)
1826 	    {
1827 	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1828 					 mctx->bkref_ents, str_idx);
1829 	      if (BE (err != REG_NOERROR, 0))
1830 		return err;
1831 	    }
1832 	}
1833 
1834       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1835       if (BE (err != REG_NOERROR, 0))
1836 	return err;
1837     }
1838 
1839   if (candidates && mctx->state_log[str_idx]->has_backref)
1840     {
1841       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1842       if (BE (err != REG_NOERROR, 0))
1843 	return err;
1844     }
1845   return REG_NOERROR;
1846 }
1847 
1848 static reg_errcode_t
1849 internal_function __attribute_warn_unused_result__
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)1850 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1851 		       const re_node_set *candidates)
1852 {
1853   reg_errcode_t err = REG_NOERROR;
1854   Idx i;
1855 
1856   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1857   if (BE (err != REG_NOERROR, 0))
1858     return err;
1859 
1860   if (!state->inveclosure.alloc)
1861     {
1862       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1863       if (BE (err != REG_NOERROR, 0))
1864 	return REG_ESPACE;
1865       for (i = 0; i < dest_nodes->nelem; i++)
1866 	{
1867 	  err = re_node_set_merge (&state->inveclosure,
1868 				   dfa->inveclosures + dest_nodes->elems[i]);
1869 	  if (BE (err != REG_NOERROR, 0))
1870 	    return REG_ESPACE;
1871 	}
1872     }
1873   return re_node_set_add_intersect (dest_nodes, candidates,
1874 				    &state->inveclosure);
1875 }
1876 
1877 static reg_errcode_t
1878 internal_function
sub_epsilon_src_nodes(const re_dfa_t * dfa,Idx node,re_node_set * dest_nodes,const re_node_set * candidates)1879 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1880 		       const re_node_set *candidates)
1881 {
1882     Idx ecl_idx;
1883     reg_errcode_t err;
1884     re_node_set *inv_eclosure = dfa->inveclosures + node;
1885     re_node_set except_nodes;
1886     re_node_set_init_empty (&except_nodes);
1887     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1888       {
1889 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1890 	if (cur_node == node)
1891 	  continue;
1892 	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1893 	  {
1894 	    Idx edst1 = dfa->edests[cur_node].elems[0];
1895 	    Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1896 			 ? dfa->edests[cur_node].elems[1] : REG_MISSING);
1897 	    if ((!re_node_set_contains (inv_eclosure, edst1)
1898 		 && re_node_set_contains (dest_nodes, edst1))
1899 		|| (REG_VALID_NONZERO_INDEX (edst2)
1900 		    && !re_node_set_contains (inv_eclosure, edst2)
1901 		    && re_node_set_contains (dest_nodes, edst2)))
1902 	      {
1903 		err = re_node_set_add_intersect (&except_nodes, candidates,
1904 						 dfa->inveclosures + cur_node);
1905 		if (BE (err != REG_NOERROR, 0))
1906 		  {
1907 		    re_node_set_free (&except_nodes);
1908 		    return err;
1909 		  }
1910 	      }
1911 	  }
1912       }
1913     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1914       {
1915 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1916 	if (!re_node_set_contains (&except_nodes, cur_node))
1917 	  {
1918 	    Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1919 	    re_node_set_remove_at (dest_nodes, idx);
1920 	  }
1921       }
1922     re_node_set_free (&except_nodes);
1923     return REG_NOERROR;
1924 }
1925 
1926 static bool
1927 internal_function
check_dst_limits(const re_match_context_t * mctx,const re_node_set * limits,Idx dst_node,Idx dst_idx,Idx src_node,Idx src_idx)1928 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1929 		  Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1930 {
1931   const re_dfa_t *const dfa = mctx->dfa;
1932   Idx lim_idx, src_pos, dst_pos;
1933 
1934   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1935   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1936   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1937     {
1938       Idx subexp_idx;
1939       struct re_backref_cache_entry *ent;
1940       ent = mctx->bkref_ents + limits->elems[lim_idx];
1941       subexp_idx = dfa->nodes[ent->node].opr.idx;
1942 
1943       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1944 					   subexp_idx, dst_node, dst_idx,
1945 					   dst_bkref_idx);
1946       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1947 					   subexp_idx, src_node, src_idx,
1948 					   src_bkref_idx);
1949 
1950       /* In case of:
1951 	 <src> <dst> ( <subexp> )
1952 	 ( <subexp> ) <src> <dst>
1953 	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1954       if (src_pos == dst_pos)
1955 	continue; /* This is unrelated limitation.  */
1956       else
1957 	return true;
1958     }
1959   return false;
1960 }
1961 
1962 static int
1963 internal_function
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,Idx subexp_idx,Idx from_node,Idx bkref_idx)1964 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1965 			     Idx subexp_idx, Idx from_node, Idx bkref_idx)
1966 {
1967   const re_dfa_t *const dfa = mctx->dfa;
1968   const re_node_set *eclosures = dfa->eclosures + from_node;
1969   Idx node_idx;
1970 
1971   /* Else, we are on the boundary: examine the nodes on the epsilon
1972      closure.  */
1973   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1974     {
1975       Idx node = eclosures->elems[node_idx];
1976       switch (dfa->nodes[node].type)
1977 	{
1978 	case OP_BACK_REF:
1979 	  if (bkref_idx != REG_MISSING)
1980 	    {
1981 	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1982 	      do
1983 		{
1984 		  Idx dst;
1985 		  int cpos;
1986 
1987 		  if (ent->node != node)
1988 		    continue;
1989 
1990 		  if (subexp_idx < BITSET_WORD_BITS
1991 		      && !(ent->eps_reachable_subexps_map
1992 			   & ((bitset_word_t) 1 << subexp_idx)))
1993 		    continue;
1994 
1995 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1996 		     OP_CLOSE_SUBEXP cases below.  But, if the
1997 		     destination node is the same node as the source
1998 		     node, don't recurse because it would cause an
1999 		     infinite loop: a regex that exhibits this behavior
2000 		     is ()\1*\1*  */
2001 		  dst = dfa->edests[node].elems[0];
2002 		  if (dst == from_node)
2003 		    {
2004 		      if (boundaries & 1)
2005 			return -1;
2006 		      else /* if (boundaries & 2) */
2007 			return 0;
2008 		    }
2009 
2010 		  cpos =
2011 		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2012 						 dst, bkref_idx);
2013 		  if (cpos == -1 /* && (boundaries & 1) */)
2014 		    return -1;
2015 		  if (cpos == 0 && (boundaries & 2))
2016 		    return 0;
2017 
2018 		  if (subexp_idx < BITSET_WORD_BITS)
2019 		    ent->eps_reachable_subexps_map
2020 		      &= ~((bitset_word_t) 1 << subexp_idx);
2021 		}
2022 	      while (ent++->more);
2023 	    }
2024 	  break;
2025 
2026 	case OP_OPEN_SUBEXP:
2027 	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2028 	    return -1;
2029 	  break;
2030 
2031 	case OP_CLOSE_SUBEXP:
2032 	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2033 	    return 0;
2034 	  break;
2035 
2036 	default:
2037 	    break;
2038 	}
2039     }
2040 
2041   return (boundaries & 2) ? 1 : 0;
2042 }
2043 
2044 static int
2045 internal_function
check_dst_limits_calc_pos(const re_match_context_t * mctx,Idx limit,Idx subexp_idx,Idx from_node,Idx str_idx,Idx bkref_idx)2046 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
2047 			   Idx subexp_idx, Idx from_node, Idx str_idx,
2048 			   Idx bkref_idx)
2049 {
2050   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2051   int boundaries;
2052 
2053   /* If we are outside the range of the subexpression, return -1 or 1.  */
2054   if (str_idx < lim->subexp_from)
2055     return -1;
2056 
2057   if (lim->subexp_to < str_idx)
2058     return 1;
2059 
2060   /* If we are within the subexpression, return 0.  */
2061   boundaries = (str_idx == lim->subexp_from);
2062   boundaries |= (str_idx == lim->subexp_to) << 1;
2063   if (boundaries == 0)
2064     return 0;
2065 
2066   /* Else, examine epsilon closure.  */
2067   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2068 				      from_node, bkref_idx);
2069 }
2070 
2071 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2072    which are against limitations from DEST_NODES. */
2073 
2074 static reg_errcode_t
2075 internal_function
check_subexp_limits(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,Idx str_idx)2076 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2077 		     const re_node_set *candidates, re_node_set *limits,
2078 		     struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2079 {
2080   reg_errcode_t err;
2081   Idx node_idx, lim_idx;
2082 
2083   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2084     {
2085       Idx subexp_idx;
2086       struct re_backref_cache_entry *ent;
2087       ent = bkref_ents + limits->elems[lim_idx];
2088 
2089       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2090 	continue; /* This is unrelated limitation.  */
2091 
2092       subexp_idx = dfa->nodes[ent->node].opr.idx;
2093       if (ent->subexp_to == str_idx)
2094 	{
2095 	  Idx ops_node = REG_MISSING;
2096 	  Idx cls_node = REG_MISSING;
2097 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2098 	    {
2099 	      Idx node = dest_nodes->elems[node_idx];
2100 	      re_token_type_t type = dfa->nodes[node].type;
2101 	      if (type == OP_OPEN_SUBEXP
2102 		  && subexp_idx == dfa->nodes[node].opr.idx)
2103 		ops_node = node;
2104 	      else if (type == OP_CLOSE_SUBEXP
2105 		       && subexp_idx == dfa->nodes[node].opr.idx)
2106 		cls_node = node;
2107 	    }
2108 
2109 	  /* Check the limitation of the open subexpression.  */
2110 	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2111 	  if (REG_VALID_INDEX (ops_node))
2112 	    {
2113 	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2114 					   candidates);
2115 	      if (BE (err != REG_NOERROR, 0))
2116 		return err;
2117 	    }
2118 
2119 	  /* Check the limitation of the close subexpression.  */
2120 	  if (REG_VALID_INDEX (cls_node))
2121 	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2122 	      {
2123 		Idx node = dest_nodes->elems[node_idx];
2124 		if (!re_node_set_contains (dfa->inveclosures + node,
2125 					   cls_node)
2126 		    && !re_node_set_contains (dfa->eclosures + node,
2127 					      cls_node))
2128 		  {
2129 		    /* It is against this limitation.
2130 		       Remove it form the current sifted state.  */
2131 		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2132 						 candidates);
2133 		    if (BE (err != REG_NOERROR, 0))
2134 		      return err;
2135 		    --node_idx;
2136 		  }
2137 	      }
2138 	}
2139       else /* (ent->subexp_to != str_idx)  */
2140 	{
2141 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2142 	    {
2143 	      Idx node = dest_nodes->elems[node_idx];
2144 	      re_token_type_t type = dfa->nodes[node].type;
2145 	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2146 		{
2147 		  if (subexp_idx != dfa->nodes[node].opr.idx)
2148 		    continue;
2149 		  /* It is against this limitation.
2150 		     Remove it form the current sifted state.  */
2151 		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2152 					       candidates);
2153 		  if (BE (err != REG_NOERROR, 0))
2154 		    return err;
2155 		}
2156 	    }
2157 	}
2158     }
2159   return REG_NOERROR;
2160 }
2161 
2162 static reg_errcode_t
2163 internal_function __attribute_warn_unused_result__
sift_states_bkref(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,const re_node_set * candidates)2164 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2165 		   Idx str_idx, const re_node_set *candidates)
2166 {
2167   const re_dfa_t *const dfa = mctx->dfa;
2168   reg_errcode_t err;
2169   Idx node_idx, node;
2170   re_sift_context_t local_sctx;
2171   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2172 
2173   if (first_idx == REG_MISSING)
2174     return REG_NOERROR;
2175 
2176   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2177 
2178   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2179     {
2180       Idx enabled_idx;
2181       re_token_type_t type;
2182       struct re_backref_cache_entry *entry;
2183       node = candidates->elems[node_idx];
2184       type = dfa->nodes[node].type;
2185       /* Avoid infinite loop for the REs like "()\1+".  */
2186       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2187 	continue;
2188       if (type != OP_BACK_REF)
2189 	continue;
2190 
2191       entry = mctx->bkref_ents + first_idx;
2192       enabled_idx = first_idx;
2193       do
2194 	{
2195 	  Idx subexp_len;
2196 	  Idx to_idx;
2197 	  Idx dst_node;
2198 	  bool ok;
2199 	  re_dfastate_t *cur_state;
2200 
2201 	  if (entry->node != node)
2202 	    continue;
2203 	  subexp_len = entry->subexp_to - entry->subexp_from;
2204 	  to_idx = str_idx + subexp_len;
2205 	  dst_node = (subexp_len ? dfa->nexts[node]
2206 		      : dfa->edests[node].elems[0]);
2207 
2208 	  if (to_idx > sctx->last_str_idx
2209 	      || sctx->sifted_states[to_idx] == NULL
2210 	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2211 	      || check_dst_limits (mctx, &sctx->limits, node,
2212 				   str_idx, dst_node, to_idx))
2213 	    continue;
2214 
2215 	  if (local_sctx.sifted_states == NULL)
2216 	    {
2217 	      local_sctx = *sctx;
2218 	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2219 	      if (BE (err != REG_NOERROR, 0))
2220 		goto free_return;
2221 	    }
2222 	  local_sctx.last_node = node;
2223 	  local_sctx.last_str_idx = str_idx;
2224 	  ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2225 	  if (BE (! ok, 0))
2226 	    {
2227 	      err = REG_ESPACE;
2228 	      goto free_return;
2229 	    }
2230 	  cur_state = local_sctx.sifted_states[str_idx];
2231 	  err = sift_states_backward (mctx, &local_sctx);
2232 	  if (BE (err != REG_NOERROR, 0))
2233 	    goto free_return;
2234 	  if (sctx->limited_states != NULL)
2235 	    {
2236 	      err = merge_state_array (dfa, sctx->limited_states,
2237 				       local_sctx.sifted_states,
2238 				       str_idx + 1);
2239 	      if (BE (err != REG_NOERROR, 0))
2240 		goto free_return;
2241 	    }
2242 	  local_sctx.sifted_states[str_idx] = cur_state;
2243 	  re_node_set_remove (&local_sctx.limits, enabled_idx);
2244 
2245 	  /* mctx->bkref_ents may have changed, reload the pointer.  */
2246 	  entry = mctx->bkref_ents + enabled_idx;
2247 	}
2248       while (enabled_idx++, entry++->more);
2249     }
2250   err = REG_NOERROR;
2251  free_return:
2252   if (local_sctx.sifted_states != NULL)
2253     {
2254       re_node_set_free (&local_sctx.limits);
2255     }
2256 
2257   return err;
2258 }
2259 
2260 
2261 #ifdef RE_ENABLE_I18N
2262 static int
2263 internal_function
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx node_idx,Idx str_idx,Idx max_str_idx)2264 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2265 		     Idx node_idx, Idx str_idx, Idx max_str_idx)
2266 {
2267   const re_dfa_t *const dfa = mctx->dfa;
2268   int naccepted;
2269   /* Check the node can accept "multi byte".  */
2270   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2271   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2272       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2273 			    dfa->nexts[node_idx]))
2274     /* The node can't accept the "multi byte", or the
2275        destination was already thrown away, then the node
2276        could't accept the current input "multi byte".   */
2277     naccepted = 0;
2278   /* Otherwise, it is sure that the node could accept
2279      'naccepted' bytes input.  */
2280   return naccepted;
2281 }
2282 #endif /* RE_ENABLE_I18N */
2283 
2284 
2285 /* Functions for state transition.  */
2286 
2287 /* Return the next state to which the current state STATE will transit by
2288    accepting the current input byte, and update STATE_LOG if necessary.
2289    If STATE can accept a multibyte char/collating element/back reference
2290    update the destination of STATE_LOG.  */
2291 
2292 static re_dfastate_t *
2293 internal_function __attribute_warn_unused_result__
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)2294 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2295 	       re_dfastate_t *state)
2296 {
2297   re_dfastate_t **trtable;
2298   unsigned char ch;
2299 
2300 #ifdef RE_ENABLE_I18N
2301   /* If the current state can accept multibyte.  */
2302   if (BE (state->accept_mb, 0))
2303     {
2304       *err = transit_state_mb (mctx, state);
2305       if (BE (*err != REG_NOERROR, 0))
2306 	return NULL;
2307     }
2308 #endif /* RE_ENABLE_I18N */
2309 
2310   /* Then decide the next state with the single byte.  */
2311 #if 0
2312   if (0)
2313     /* don't use transition table  */
2314     return transit_state_sb (err, mctx, state);
2315 #endif
2316 
2317   /* Use transition table  */
2318   ch = re_string_fetch_byte (&mctx->input);
2319   for (;;)
2320     {
2321       trtable = state->trtable;
2322       if (BE (trtable != NULL, 1))
2323 	return trtable[ch];
2324 
2325       trtable = state->word_trtable;
2326       if (BE (trtable != NULL, 1))
2327 	{
2328 	  unsigned int context;
2329 	  context
2330 	    = re_string_context_at (&mctx->input,
2331 				    re_string_cur_idx (&mctx->input) - 1,
2332 				    mctx->eflags);
2333 	  if (IS_WORD_CONTEXT (context))
2334 	    return trtable[ch + SBC_MAX];
2335 	  else
2336 	    return trtable[ch];
2337 	}
2338 
2339       if (!build_trtable (mctx->dfa, state))
2340 	{
2341 	  *err = REG_ESPACE;
2342 	  return NULL;
2343 	}
2344 
2345       /* Retry, we now have a transition table.  */
2346     }
2347 }
2348 
2349 /* Update the state_log if we need */
2350 static re_dfastate_t *
2351 internal_function
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)2352 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2353 		      re_dfastate_t *next_state)
2354 {
2355   const re_dfa_t *const dfa = mctx->dfa;
2356   Idx cur_idx = re_string_cur_idx (&mctx->input);
2357 
2358   if (cur_idx > mctx->state_log_top)
2359     {
2360       mctx->state_log[cur_idx] = next_state;
2361       mctx->state_log_top = cur_idx;
2362     }
2363   else if (mctx->state_log[cur_idx] == 0)
2364     {
2365       mctx->state_log[cur_idx] = next_state;
2366     }
2367   else
2368     {
2369       re_dfastate_t *pstate;
2370       unsigned int context;
2371       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2372       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2373 	 the destination of a multibyte char/collating element/
2374 	 back reference.  Then the next state is the union set of
2375 	 these destinations and the results of the transition table.  */
2376       pstate = mctx->state_log[cur_idx];
2377       log_nodes = pstate->entrance_nodes;
2378       if (next_state != NULL)
2379 	{
2380 	  table_nodes = next_state->entrance_nodes;
2381 	  *err = re_node_set_init_union (&next_nodes, table_nodes,
2382 					     log_nodes);
2383 	  if (BE (*err != REG_NOERROR, 0))
2384 	    return NULL;
2385 	}
2386       else
2387 	next_nodes = *log_nodes;
2388       /* Note: We already add the nodes of the initial state,
2389 	 then we don't need to add them here.  */
2390 
2391       context = re_string_context_at (&mctx->input,
2392 				      re_string_cur_idx (&mctx->input) - 1,
2393 				      mctx->eflags);
2394       next_state = mctx->state_log[cur_idx]
2395 	= re_acquire_state_context (err, dfa, &next_nodes, context);
2396       /* We don't need to check errors here, since the return value of
2397 	 this function is next_state and ERR is already set.  */
2398 
2399       if (table_nodes != NULL)
2400 	re_node_set_free (&next_nodes);
2401     }
2402 
2403   if (BE (dfa->nbackref, 0) && next_state != NULL)
2404     {
2405       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2406 	 later.  We must check them here, since the back references in the
2407 	 next state might use them.  */
2408       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2409 					cur_idx);
2410       if (BE (*err != REG_NOERROR, 0))
2411 	return NULL;
2412 
2413       /* If the next state has back references.  */
2414       if (next_state->has_backref)
2415 	{
2416 	  *err = transit_state_bkref (mctx, &next_state->nodes);
2417 	  if (BE (*err != REG_NOERROR, 0))
2418 	    return NULL;
2419 	  next_state = mctx->state_log[cur_idx];
2420 	}
2421     }
2422 
2423   return next_state;
2424 }
2425 
2426 /* Skip bytes in the input that correspond to part of a
2427    multi-byte match, then look in the log for a state
2428    from which to restart matching.  */
2429 static re_dfastate_t *
2430 internal_function
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2431 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2432 {
2433   re_dfastate_t *cur_state;
2434   do
2435     {
2436       Idx max = mctx->state_log_top;
2437       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2438 
2439       do
2440 	{
2441 	  if (++cur_str_idx > max)
2442 	    return NULL;
2443 	  re_string_skip_bytes (&mctx->input, 1);
2444 	}
2445       while (mctx->state_log[cur_str_idx] == NULL);
2446 
2447       cur_state = merge_state_with_log (err, mctx, NULL);
2448     }
2449   while (*err == REG_NOERROR && cur_state == NULL);
2450   return cur_state;
2451 }
2452 
2453 /* Helper functions for transit_state.  */
2454 
2455 /* From the node set CUR_NODES, pick up the nodes whose types are
2456    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2457    expression. And register them to use them later for evaluating the
2458    corresponding back references.  */
2459 
2460 static reg_errcode_t
2461 internal_function
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,Idx str_idx)2462 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2463 			   Idx str_idx)
2464 {
2465   const re_dfa_t *const dfa = mctx->dfa;
2466   Idx node_idx;
2467   reg_errcode_t err;
2468 
2469   /* TODO: This isn't efficient.
2470 	   Because there might be more than one nodes whose types are
2471 	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2472 	   nodes.
2473 	   E.g. RE: (a){2}  */
2474   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2475     {
2476       Idx node = cur_nodes->elems[node_idx];
2477       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2478 	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2479 	  && (dfa->used_bkref_map
2480 	      & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2481 	{
2482 	  err = match_ctx_add_subtop (mctx, node, str_idx);
2483 	  if (BE (err != REG_NOERROR, 0))
2484 	    return err;
2485 	}
2486     }
2487   return REG_NOERROR;
2488 }
2489 
2490 #if 0
2491 /* Return the next state to which the current state STATE will transit by
2492    accepting the current input byte.  */
2493 
2494 static re_dfastate_t *
2495 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2496 		  re_dfastate_t *state)
2497 {
2498   const re_dfa_t *const dfa = mctx->dfa;
2499   re_node_set next_nodes;
2500   re_dfastate_t *next_state;
2501   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2502   unsigned int context;
2503 
2504   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2505   if (BE (*err != REG_NOERROR, 0))
2506     return NULL;
2507   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2508     {
2509       Idx cur_node = state->nodes.elems[node_cnt];
2510       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2511 	{
2512 	  *err = re_node_set_merge (&next_nodes,
2513 				    dfa->eclosures + dfa->nexts[cur_node]);
2514 	  if (BE (*err != REG_NOERROR, 0))
2515 	    {
2516 	      re_node_set_free (&next_nodes);
2517 	      return NULL;
2518 	    }
2519 	}
2520     }
2521   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2522   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2523   /* We don't need to check errors here, since the return value of
2524      this function is next_state and ERR is already set.  */
2525 
2526   re_node_set_free (&next_nodes);
2527   re_string_skip_bytes (&mctx->input, 1);
2528   return next_state;
2529 }
2530 #endif
2531 
2532 #ifdef RE_ENABLE_I18N
2533 static reg_errcode_t
2534 internal_function
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)2535 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2536 {
2537   const re_dfa_t *const dfa = mctx->dfa;
2538   reg_errcode_t err;
2539   Idx i;
2540 
2541   for (i = 0; i < pstate->nodes.nelem; ++i)
2542     {
2543       re_node_set dest_nodes, *new_nodes;
2544       Idx cur_node_idx = pstate->nodes.elems[i];
2545       int naccepted;
2546       Idx dest_idx;
2547       unsigned int context;
2548       re_dfastate_t *dest_state;
2549 
2550       if (!dfa->nodes[cur_node_idx].accept_mb)
2551 	continue;
2552 
2553       if (dfa->nodes[cur_node_idx].constraint)
2554 	{
2555 	  context = re_string_context_at (&mctx->input,
2556 					  re_string_cur_idx (&mctx->input),
2557 					  mctx->eflags);
2558 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2559 					   context))
2560 	    continue;
2561 	}
2562 
2563       /* How many bytes the node can accept?  */
2564       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2565 					   re_string_cur_idx (&mctx->input));
2566       if (naccepted == 0)
2567 	continue;
2568 
2569       /* The node can accepts 'naccepted' bytes.  */
2570       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2571       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2572 			       : mctx->max_mb_elem_len);
2573       err = clean_state_log_if_needed (mctx, dest_idx);
2574       if (BE (err != REG_NOERROR, 0))
2575 	return err;
2576 #ifdef DEBUG
2577       assert (dfa->nexts[cur_node_idx] != REG_MISSING);
2578 #endif
2579       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2580 
2581       dest_state = mctx->state_log[dest_idx];
2582       if (dest_state == NULL)
2583 	dest_nodes = *new_nodes;
2584       else
2585 	{
2586 	  err = re_node_set_init_union (&dest_nodes,
2587 					dest_state->entrance_nodes, new_nodes);
2588 	  if (BE (err != REG_NOERROR, 0))
2589 	    return err;
2590 	}
2591       context = re_string_context_at (&mctx->input, dest_idx - 1,
2592 				      mctx->eflags);
2593       mctx->state_log[dest_idx]
2594 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2595       if (dest_state != NULL)
2596 	re_node_set_free (&dest_nodes);
2597       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2598 	return err;
2599     }
2600   return REG_NOERROR;
2601 }
2602 #endif /* RE_ENABLE_I18N */
2603 
2604 static reg_errcode_t
2605 internal_function
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)2606 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2607 {
2608   const re_dfa_t *const dfa = mctx->dfa;
2609   reg_errcode_t err;
2610   Idx i;
2611   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2612 
2613   for (i = 0; i < nodes->nelem; ++i)
2614     {
2615       Idx dest_str_idx, prev_nelem, bkc_idx;
2616       Idx node_idx = nodes->elems[i];
2617       unsigned int context;
2618       const re_token_t *node = dfa->nodes + node_idx;
2619       re_node_set *new_dest_nodes;
2620 
2621       /* Check whether 'node' is a backreference or not.  */
2622       if (node->type != OP_BACK_REF)
2623 	continue;
2624 
2625       if (node->constraint)
2626 	{
2627 	  context = re_string_context_at (&mctx->input, cur_str_idx,
2628 					  mctx->eflags);
2629 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2630 	    continue;
2631 	}
2632 
2633       /* 'node' is a backreference.
2634 	 Check the substring which the substring matched.  */
2635       bkc_idx = mctx->nbkref_ents;
2636       err = get_subexp (mctx, node_idx, cur_str_idx);
2637       if (BE (err != REG_NOERROR, 0))
2638 	goto free_return;
2639 
2640       /* And add the epsilon closures (which is 'new_dest_nodes') of
2641 	 the backreference to appropriate state_log.  */
2642 #ifdef DEBUG
2643       assert (dfa->nexts[node_idx] != REG_MISSING);
2644 #endif
2645       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2646 	{
2647 	  Idx subexp_len;
2648 	  re_dfastate_t *dest_state;
2649 	  struct re_backref_cache_entry *bkref_ent;
2650 	  bkref_ent = mctx->bkref_ents + bkc_idx;
2651 	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2652 	    continue;
2653 	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2654 	  new_dest_nodes = (subexp_len == 0
2655 			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2656 			    : dfa->eclosures + dfa->nexts[node_idx]);
2657 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2658 			  - bkref_ent->subexp_from);
2659 	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2660 					  mctx->eflags);
2661 	  dest_state = mctx->state_log[dest_str_idx];
2662 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2663 			: mctx->state_log[cur_str_idx]->nodes.nelem);
2664 	  /* Add 'new_dest_node' to state_log.  */
2665 	  if (dest_state == NULL)
2666 	    {
2667 	      mctx->state_log[dest_str_idx]
2668 		= re_acquire_state_context (&err, dfa, new_dest_nodes,
2669 					    context);
2670 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2671 		      && err != REG_NOERROR, 0))
2672 		goto free_return;
2673 	    }
2674 	  else
2675 	    {
2676 	      re_node_set dest_nodes;
2677 	      err = re_node_set_init_union (&dest_nodes,
2678 					    dest_state->entrance_nodes,
2679 					    new_dest_nodes);
2680 	      if (BE (err != REG_NOERROR, 0))
2681 		{
2682 		  re_node_set_free (&dest_nodes);
2683 		  goto free_return;
2684 		}
2685 	      mctx->state_log[dest_str_idx]
2686 		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2687 	      re_node_set_free (&dest_nodes);
2688 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2689 		      && err != REG_NOERROR, 0))
2690 		goto free_return;
2691 	    }
2692 	  /* We need to check recursively if the backreference can epsilon
2693 	     transit.  */
2694 	  if (subexp_len == 0
2695 	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2696 	    {
2697 	      err = check_subexp_matching_top (mctx, new_dest_nodes,
2698 					       cur_str_idx);
2699 	      if (BE (err != REG_NOERROR, 0))
2700 		goto free_return;
2701 	      err = transit_state_bkref (mctx, new_dest_nodes);
2702 	      if (BE (err != REG_NOERROR, 0))
2703 		goto free_return;
2704 	    }
2705 	}
2706     }
2707   err = REG_NOERROR;
2708  free_return:
2709   return err;
2710 }
2711 
2712 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2713    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2714    Note that we might collect inappropriate candidates here.
2715    However, the cost of checking them strictly here is too high, then we
2716    delay these checking for prune_impossible_nodes().  */
2717 
2718 static reg_errcode_t
2719 internal_function __attribute_warn_unused_result__
get_subexp(re_match_context_t * mctx,Idx bkref_node,Idx bkref_str_idx)2720 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2721 {
2722   const re_dfa_t *const dfa = mctx->dfa;
2723   Idx subexp_num, sub_top_idx;
2724   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2725   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2726   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2727   if (cache_idx != REG_MISSING)
2728     {
2729       const struct re_backref_cache_entry *entry
2730 	= mctx->bkref_ents + cache_idx;
2731       do
2732 	if (entry->node == bkref_node)
2733 	  return REG_NOERROR; /* We already checked it.  */
2734       while (entry++->more);
2735     }
2736 
2737   subexp_num = dfa->nodes[bkref_node].opr.idx;
2738 
2739   /* For each sub expression  */
2740   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2741     {
2742       reg_errcode_t err;
2743       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2744       re_sub_match_last_t *sub_last;
2745       Idx sub_last_idx, sl_str, bkref_str_off;
2746 
2747       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2748 	continue; /* It isn't related.  */
2749 
2750       sl_str = sub_top->str_idx;
2751       bkref_str_off = bkref_str_idx;
2752       /* At first, check the last node of sub expressions we already
2753 	 evaluated.  */
2754       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2755 	{
2756 	  regoff_t sl_str_diff;
2757 	  sub_last = sub_top->lasts[sub_last_idx];
2758 	  sl_str_diff = sub_last->str_idx - sl_str;
2759 	  /* The matched string by the sub expression match with the substring
2760 	     at the back reference?  */
2761 	  if (sl_str_diff > 0)
2762 	    {
2763 	      if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2764 		{
2765 		  /* Not enough chars for a successful match.  */
2766 		  if (bkref_str_off + sl_str_diff > mctx->input.len)
2767 		    break;
2768 
2769 		  err = clean_state_log_if_needed (mctx,
2770 						   bkref_str_off
2771 						   + sl_str_diff);
2772 		  if (BE (err != REG_NOERROR, 0))
2773 		    return err;
2774 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2775 		}
2776 	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2777 		/* We don't need to search this sub expression any more.  */
2778 		break;
2779 	    }
2780 	  bkref_str_off += sl_str_diff;
2781 	  sl_str += sl_str_diff;
2782 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2783 				bkref_str_idx);
2784 
2785 	  /* Reload buf, since the preceding call might have reallocated
2786 	     the buffer.  */
2787 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2788 
2789 	  if (err == REG_NOMATCH)
2790 	    continue;
2791 	  if (BE (err != REG_NOERROR, 0))
2792 	    return err;
2793 	}
2794 
2795       if (sub_last_idx < sub_top->nlasts)
2796 	continue;
2797       if (sub_last_idx > 0)
2798 	++sl_str;
2799       /* Then, search for the other last nodes of the sub expression.  */
2800       for (; sl_str <= bkref_str_idx; ++sl_str)
2801 	{
2802 	  Idx cls_node;
2803 	  regoff_t sl_str_off;
2804 	  const re_node_set *nodes;
2805 	  sl_str_off = sl_str - sub_top->str_idx;
2806 	  /* The matched string by the sub expression match with the substring
2807 	     at the back reference?  */
2808 	  if (sl_str_off > 0)
2809 	    {
2810 	      if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2811 		{
2812 		  /* If we are at the end of the input, we cannot match.  */
2813 		  if (bkref_str_off >= mctx->input.len)
2814 		    break;
2815 
2816 		  err = extend_buffers (mctx, bkref_str_off + 1);
2817 		  if (BE (err != REG_NOERROR, 0))
2818 		    return err;
2819 
2820 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2821 		}
2822 	      if (buf [bkref_str_off++] != buf[sl_str - 1])
2823 		break; /* We don't need to search this sub expression
2824 			  any more.  */
2825 	    }
2826 	  if (mctx->state_log[sl_str] == NULL)
2827 	    continue;
2828 	  /* Does this state have a ')' of the sub expression?  */
2829 	  nodes = &mctx->state_log[sl_str]->nodes;
2830 	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
2831 				       OP_CLOSE_SUBEXP);
2832 	  if (cls_node == REG_MISSING)
2833 	    continue; /* No.  */
2834 	  if (sub_top->path == NULL)
2835 	    {
2836 	      sub_top->path = calloc (sizeof (state_array_t),
2837 				      sl_str - sub_top->str_idx + 1);
2838 	      if (sub_top->path == NULL)
2839 		return REG_ESPACE;
2840 	    }
2841 	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2842 	     in the current context?  */
2843 	  err = check_arrival (mctx, sub_top->path, sub_top->node,
2844 			       sub_top->str_idx, cls_node, sl_str,
2845 			       OP_CLOSE_SUBEXP);
2846 	  if (err == REG_NOMATCH)
2847 	      continue;
2848 	  if (BE (err != REG_NOERROR, 0))
2849 	      return err;
2850 	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2851 	  if (BE (sub_last == NULL, 0))
2852 	    return REG_ESPACE;
2853 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2854 				bkref_str_idx);
2855 	  if (err == REG_NOMATCH)
2856 	    continue;
2857 	}
2858     }
2859   return REG_NOERROR;
2860 }
2861 
2862 /* Helper functions for get_subexp().  */
2863 
2864 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2865    If it can arrive, register the sub expression expressed with SUB_TOP
2866    and SUB_LAST.  */
2867 
2868 static reg_errcode_t
2869 internal_function
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,Idx bkref_node,Idx bkref_str)2870 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2871 		re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2872 {
2873   reg_errcode_t err;
2874   Idx to_idx;
2875   /* Can the subexpression arrive the back reference?  */
2876   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2877 		       sub_last->str_idx, bkref_node, bkref_str,
2878 		       OP_OPEN_SUBEXP);
2879   if (err != REG_NOERROR)
2880     return err;
2881   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2882 			     sub_last->str_idx);
2883   if (BE (err != REG_NOERROR, 0))
2884     return err;
2885   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2886   return clean_state_log_if_needed (mctx, to_idx);
2887 }
2888 
2889 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2890    Search '(' if FL_OPEN, or search ')' otherwise.
2891    TODO: This function isn't efficient...
2892 	 Because there might be more than one nodes whose types are
2893 	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2894 	 nodes.
2895 	 E.g. RE: (a){2}  */
2896 
2897 static Idx
2898 internal_function
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,Idx subexp_idx,int type)2899 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2900 		  Idx subexp_idx, int type)
2901 {
2902   Idx cls_idx;
2903   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2904     {
2905       Idx cls_node = nodes->elems[cls_idx];
2906       const re_token_t *node = dfa->nodes + cls_node;
2907       if (node->type == type
2908 	  && node->opr.idx == subexp_idx)
2909 	return cls_node;
2910     }
2911   return REG_MISSING;
2912 }
2913 
2914 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2915    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2916    heavily reused.
2917    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2918 
2919 static reg_errcode_t
2920 internal_function __attribute_warn_unused_result__
check_arrival(re_match_context_t * mctx,state_array_t * path,Idx top_node,Idx top_str,Idx last_node,Idx last_str,int type)2921 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2922 	       Idx top_str, Idx last_node, Idx last_str, int type)
2923 {
2924   const re_dfa_t *const dfa = mctx->dfa;
2925   reg_errcode_t err = REG_NOERROR;
2926   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2927   re_dfastate_t *cur_state = NULL;
2928   re_node_set *cur_nodes, next_nodes;
2929   re_dfastate_t **backup_state_log;
2930   unsigned int context;
2931 
2932   subexp_num = dfa->nodes[top_node].opr.idx;
2933   /* Extend the buffer if we need.  */
2934   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2935     {
2936       re_dfastate_t **new_array;
2937       Idx old_alloc = path->alloc;
2938       Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2939       Idx new_alloc;
2940       if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
2941 	return REG_ESPACE;
2942       new_alloc = old_alloc + incr_alloc;
2943       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2944 	return REG_ESPACE;
2945       new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2946       if (BE (new_array == NULL, 0))
2947 	return REG_ESPACE;
2948       path->array = new_array;
2949       path->alloc = new_alloc;
2950       memset (new_array + old_alloc, '\0',
2951 	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2952     }
2953 
2954   str_idx = path->next_idx ? path->next_idx : top_str;
2955 
2956   /* Temporary modify MCTX.  */
2957   backup_state_log = mctx->state_log;
2958   backup_cur_idx = mctx->input.cur_idx;
2959   mctx->state_log = path->array;
2960   mctx->input.cur_idx = str_idx;
2961 
2962   /* Setup initial node set.  */
2963   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2964   if (str_idx == top_str)
2965     {
2966       err = re_node_set_init_1 (&next_nodes, top_node);
2967       if (BE (err != REG_NOERROR, 0))
2968 	return err;
2969       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2970       if (BE (err != REG_NOERROR, 0))
2971 	{
2972 	  re_node_set_free (&next_nodes);
2973 	  return err;
2974 	}
2975     }
2976   else
2977     {
2978       cur_state = mctx->state_log[str_idx];
2979       if (cur_state && cur_state->has_backref)
2980 	{
2981 	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2982 	  if (BE (err != REG_NOERROR, 0))
2983 	    return err;
2984 	}
2985       else
2986 	re_node_set_init_empty (&next_nodes);
2987     }
2988   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2989     {
2990       if (next_nodes.nelem)
2991 	{
2992 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2993 				    subexp_num, type);
2994 	  if (BE (err != REG_NOERROR, 0))
2995 	    {
2996 	      re_node_set_free (&next_nodes);
2997 	      return err;
2998 	    }
2999 	}
3000       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3001       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3002 	{
3003 	  re_node_set_free (&next_nodes);
3004 	  return err;
3005 	}
3006       mctx->state_log[str_idx] = cur_state;
3007     }
3008 
3009   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
3010     {
3011       re_node_set_empty (&next_nodes);
3012       if (mctx->state_log[str_idx + 1])
3013 	{
3014 	  err = re_node_set_merge (&next_nodes,
3015 				   &mctx->state_log[str_idx + 1]->nodes);
3016 	  if (BE (err != REG_NOERROR, 0))
3017 	    {
3018 	      re_node_set_free (&next_nodes);
3019 	      return err;
3020 	    }
3021 	}
3022       if (cur_state)
3023 	{
3024 	  err = check_arrival_add_next_nodes (mctx, str_idx,
3025 					      &cur_state->non_eps_nodes,
3026 					      &next_nodes);
3027 	  if (BE (err != REG_NOERROR, 0))
3028 	    {
3029 	      re_node_set_free (&next_nodes);
3030 	      return err;
3031 	    }
3032 	}
3033       ++str_idx;
3034       if (next_nodes.nelem)
3035 	{
3036 	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3037 	  if (BE (err != REG_NOERROR, 0))
3038 	    {
3039 	      re_node_set_free (&next_nodes);
3040 	      return err;
3041 	    }
3042 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3043 				    subexp_num, type);
3044 	  if (BE (err != REG_NOERROR, 0))
3045 	    {
3046 	      re_node_set_free (&next_nodes);
3047 	      return err;
3048 	    }
3049 	}
3050       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3051       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3052       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3053 	{
3054 	  re_node_set_free (&next_nodes);
3055 	  return err;
3056 	}
3057       mctx->state_log[str_idx] = cur_state;
3058       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3059     }
3060   re_node_set_free (&next_nodes);
3061   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3062 	       : &mctx->state_log[last_str]->nodes);
3063   path->next_idx = str_idx;
3064 
3065   /* Fix MCTX.  */
3066   mctx->state_log = backup_state_log;
3067   mctx->input.cur_idx = backup_cur_idx;
3068 
3069   /* Then check the current node set has the node LAST_NODE.  */
3070   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3071     return REG_NOERROR;
3072 
3073   return REG_NOMATCH;
3074 }
3075 
3076 /* Helper functions for check_arrival.  */
3077 
3078 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3079    to NEXT_NODES.
3080    TODO: This function is similar to the functions transit_state*(),
3081 	 however this function has many additional works.
3082 	 Can't we unify them?  */
3083 
3084 static reg_errcode_t
3085 internal_function __attribute_warn_unused_result__
check_arrival_add_next_nodes(re_match_context_t * mctx,Idx str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)3086 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3087 			      re_node_set *cur_nodes, re_node_set *next_nodes)
3088 {
3089   const re_dfa_t *const dfa = mctx->dfa;
3090   bool ok;
3091   Idx cur_idx;
3092 #ifdef RE_ENABLE_I18N
3093   reg_errcode_t err = REG_NOERROR;
3094 #endif
3095   re_node_set union_set;
3096   re_node_set_init_empty (&union_set);
3097   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3098     {
3099       int naccepted = 0;
3100       Idx cur_node = cur_nodes->elems[cur_idx];
3101 #ifdef DEBUG
3102       re_token_type_t type = dfa->nodes[cur_node].type;
3103       assert (!IS_EPSILON_NODE (type));
3104 #endif
3105 #ifdef RE_ENABLE_I18N
3106       /* If the node may accept "multi byte".  */
3107       if (dfa->nodes[cur_node].accept_mb)
3108 	{
3109 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3110 					       str_idx);
3111 	  if (naccepted > 1)
3112 	    {
3113 	      re_dfastate_t *dest_state;
3114 	      Idx next_node = dfa->nexts[cur_node];
3115 	      Idx next_idx = str_idx + naccepted;
3116 	      dest_state = mctx->state_log[next_idx];
3117 	      re_node_set_empty (&union_set);
3118 	      if (dest_state)
3119 		{
3120 		  err = re_node_set_merge (&union_set, &dest_state->nodes);
3121 		  if (BE (err != REG_NOERROR, 0))
3122 		    {
3123 		      re_node_set_free (&union_set);
3124 		      return err;
3125 		    }
3126 		}
3127 	      ok = re_node_set_insert (&union_set, next_node);
3128 	      if (BE (! ok, 0))
3129 		{
3130 		  re_node_set_free (&union_set);
3131 		  return REG_ESPACE;
3132 		}
3133 	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3134 							    &union_set);
3135 	      if (BE (mctx->state_log[next_idx] == NULL
3136 		      && err != REG_NOERROR, 0))
3137 		{
3138 		  re_node_set_free (&union_set);
3139 		  return err;
3140 		}
3141 	    }
3142 	}
3143 #endif /* RE_ENABLE_I18N */
3144       if (naccepted
3145 	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3146 	{
3147 	  ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3148 	  if (BE (! ok, 0))
3149 	    {
3150 	      re_node_set_free (&union_set);
3151 	      return REG_ESPACE;
3152 	    }
3153 	}
3154     }
3155   re_node_set_free (&union_set);
3156   return REG_NOERROR;
3157 }
3158 
3159 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3160    CUR_NODES, however exclude the nodes which are:
3161     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3162     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3163 */
3164 
3165 static reg_errcode_t
3166 internal_function
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,Idx ex_subexp,int type)3167 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3168 			  Idx ex_subexp, int type)
3169 {
3170   reg_errcode_t err;
3171   Idx idx, outside_node;
3172   re_node_set new_nodes;
3173 #ifdef DEBUG
3174   assert (cur_nodes->nelem);
3175 #endif
3176   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3177   if (BE (err != REG_NOERROR, 0))
3178     return err;
3179   /* Create a new node set NEW_NODES with the nodes which are epsilon
3180      closures of the node in CUR_NODES.  */
3181 
3182   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3183     {
3184       Idx cur_node = cur_nodes->elems[idx];
3185       const re_node_set *eclosure = dfa->eclosures + cur_node;
3186       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3187       if (outside_node == REG_MISSING)
3188 	{
3189 	  /* There are no problematic nodes, just merge them.  */
3190 	  err = re_node_set_merge (&new_nodes, eclosure);
3191 	  if (BE (err != REG_NOERROR, 0))
3192 	    {
3193 	      re_node_set_free (&new_nodes);
3194 	      return err;
3195 	    }
3196 	}
3197       else
3198 	{
3199 	  /* There are problematic nodes, re-calculate incrementally.  */
3200 	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3201 					      ex_subexp, type);
3202 	  if (BE (err != REG_NOERROR, 0))
3203 	    {
3204 	      re_node_set_free (&new_nodes);
3205 	      return err;
3206 	    }
3207 	}
3208     }
3209   re_node_set_free (cur_nodes);
3210   *cur_nodes = new_nodes;
3211   return REG_NOERROR;
3212 }
3213 
3214 /* Helper function for check_arrival_expand_ecl.
3215    Check incrementally the epsilon closure of TARGET, and if it isn't
3216    problematic append it to DST_NODES.  */
3217 
3218 static reg_errcode_t
3219 internal_function __attribute_warn_unused_result__
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,Idx target,Idx ex_subexp,int type)3220 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3221 			      Idx target, Idx ex_subexp, int type)
3222 {
3223   Idx cur_node;
3224   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3225     {
3226       bool ok;
3227 
3228       if (dfa->nodes[cur_node].type == type
3229 	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
3230 	{
3231 	  if (type == OP_CLOSE_SUBEXP)
3232 	    {
3233 	      ok = re_node_set_insert (dst_nodes, cur_node);
3234 	      if (BE (! ok, 0))
3235 		return REG_ESPACE;
3236 	    }
3237 	  break;
3238 	}
3239       ok = re_node_set_insert (dst_nodes, cur_node);
3240       if (BE (! ok, 0))
3241 	return REG_ESPACE;
3242       if (dfa->edests[cur_node].nelem == 0)
3243 	break;
3244       if (dfa->edests[cur_node].nelem == 2)
3245 	{
3246 	  reg_errcode_t err;
3247 	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3248 					      dfa->edests[cur_node].elems[1],
3249 					      ex_subexp, type);
3250 	  if (BE (err != REG_NOERROR, 0))
3251 	    return err;
3252 	}
3253       cur_node = dfa->edests[cur_node].elems[0];
3254     }
3255   return REG_NOERROR;
3256 }
3257 
3258 
3259 /* For all the back references in the current state, calculate the
3260    destination of the back references by the appropriate entry
3261    in MCTX->BKREF_ENTS.  */
3262 
3263 static reg_errcode_t
3264 internal_function __attribute_warn_unused_result__
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,Idx cur_str,Idx subexp_num,int type)3265 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3266 		    Idx cur_str, Idx subexp_num, int type)
3267 {
3268   const re_dfa_t *const dfa = mctx->dfa;
3269   reg_errcode_t err;
3270   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3271   struct re_backref_cache_entry *ent;
3272 
3273   if (cache_idx_start == REG_MISSING)
3274     return REG_NOERROR;
3275 
3276  restart:
3277   ent = mctx->bkref_ents + cache_idx_start;
3278   do
3279     {
3280       Idx to_idx, next_node;
3281 
3282       /* Is this entry ENT is appropriate?  */
3283       if (!re_node_set_contains (cur_nodes, ent->node))
3284 	continue; /* No.  */
3285 
3286       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3287       /* Calculate the destination of the back reference, and append it
3288 	 to MCTX->STATE_LOG.  */
3289       if (to_idx == cur_str)
3290 	{
3291 	  /* The backreference did epsilon transit, we must re-check all the
3292 	     node in the current state.  */
3293 	  re_node_set new_dests;
3294 	  reg_errcode_t err2, err3;
3295 	  next_node = dfa->edests[ent->node].elems[0];
3296 	  if (re_node_set_contains (cur_nodes, next_node))
3297 	    continue;
3298 	  err = re_node_set_init_1 (&new_dests, next_node);
3299 	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3300 	  err3 = re_node_set_merge (cur_nodes, &new_dests);
3301 	  re_node_set_free (&new_dests);
3302 	  if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3303 		  || err3 != REG_NOERROR, 0))
3304 	    {
3305 	      err = (err != REG_NOERROR ? err
3306 		     : (err2 != REG_NOERROR ? err2 : err3));
3307 	      return err;
3308 	    }
3309 	  /* TODO: It is still inefficient...  */
3310 	  goto restart;
3311 	}
3312       else
3313 	{
3314 	  re_node_set union_set;
3315 	  next_node = dfa->nexts[ent->node];
3316 	  if (mctx->state_log[to_idx])
3317 	    {
3318 	      bool ok;
3319 	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3320 					next_node))
3321 		continue;
3322 	      err = re_node_set_init_copy (&union_set,
3323 					   &mctx->state_log[to_idx]->nodes);
3324 	      ok = re_node_set_insert (&union_set, next_node);
3325 	      if (BE (err != REG_NOERROR || ! ok, 0))
3326 		{
3327 		  re_node_set_free (&union_set);
3328 		  err = err != REG_NOERROR ? err : REG_ESPACE;
3329 		  return err;
3330 		}
3331 	    }
3332 	  else
3333 	    {
3334 	      err = re_node_set_init_1 (&union_set, next_node);
3335 	      if (BE (err != REG_NOERROR, 0))
3336 		return err;
3337 	    }
3338 	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3339 	  re_node_set_free (&union_set);
3340 	  if (BE (mctx->state_log[to_idx] == NULL
3341 		  && err != REG_NOERROR, 0))
3342 	    return err;
3343 	}
3344     }
3345   while (ent++->more);
3346   return REG_NOERROR;
3347 }
3348 
3349 /* Build transition table for the state.
3350    Return true if successful.  */
3351 
3352 static bool
3353 internal_function
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)3354 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3355 {
3356   reg_errcode_t err;
3357   Idx i, j;
3358   int ch;
3359   bool need_word_trtable = false;
3360   bitset_word_t elem, mask;
3361   bool dests_node_malloced = false;
3362   bool dest_states_malloced = false;
3363   Idx ndests; /* Number of the destination states from 'state'.  */
3364   re_dfastate_t **trtable;
3365   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3366   re_node_set follows, *dests_node;
3367   bitset_t *dests_ch;
3368   bitset_t acceptable;
3369 
3370   struct dests_alloc
3371   {
3372     re_node_set dests_node[SBC_MAX];
3373     bitset_t dests_ch[SBC_MAX];
3374   } *dests_alloc;
3375 
3376   /* We build DFA states which corresponds to the destination nodes
3377      from 'state'.  'dests_node[i]' represents the nodes which i-th
3378      destination state contains, and 'dests_ch[i]' represents the
3379      characters which i-th destination state accepts.  */
3380   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3381     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3382   else
3383     {
3384       dests_alloc = re_malloc (struct dests_alloc, 1);
3385       if (BE (dests_alloc == NULL, 0))
3386 	return false;
3387       dests_node_malloced = true;
3388     }
3389   dests_node = dests_alloc->dests_node;
3390   dests_ch = dests_alloc->dests_ch;
3391 
3392   /* Initialize transition table.  */
3393   state->word_trtable = state->trtable = NULL;
3394 
3395   /* At first, group all nodes belonging to 'state' into several
3396      destinations.  */
3397   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3398   if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0))
3399     {
3400       if (dests_node_malloced)
3401 	free (dests_alloc);
3402       /* Return false in case of an error, true otherwise.  */
3403       if (ndests == 0)
3404 	{
3405 	  state->trtable = (re_dfastate_t **)
3406 	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
3407           if (BE (state->trtable == NULL, 0))
3408             return false;
3409 	  return true;
3410 	}
3411       return false;
3412     }
3413 
3414   err = re_node_set_alloc (&follows, ndests + 1);
3415   if (BE (err != REG_NOERROR, 0))
3416     goto out_free;
3417 
3418   /* Avoid arithmetic overflow in size calculation.  */
3419   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3420 	    / (3 * sizeof (re_dfastate_t *)))
3421 	   < ndests),
3422 	  0))
3423     goto out_free;
3424 
3425   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3426 			 + ndests * 3 * sizeof (re_dfastate_t *)))
3427     dest_states = (re_dfastate_t **)
3428       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3429   else
3430     {
3431       dest_states = (re_dfastate_t **)
3432 	malloc (ndests * 3 * sizeof (re_dfastate_t *));
3433       if (BE (dest_states == NULL, 0))
3434 	{
3435 out_free:
3436 	  if (dest_states_malloced)
3437 	    free (dest_states);
3438 	  re_node_set_free (&follows);
3439 	  for (i = 0; i < ndests; ++i)
3440 	    re_node_set_free (dests_node + i);
3441 	  if (dests_node_malloced)
3442 	    free (dests_alloc);
3443 	  return false;
3444 	}
3445       dest_states_malloced = true;
3446     }
3447   dest_states_word = dest_states + ndests;
3448   dest_states_nl = dest_states_word + ndests;
3449   bitset_empty (acceptable);
3450 
3451   /* Then build the states for all destinations.  */
3452   for (i = 0; i < ndests; ++i)
3453     {
3454       Idx next_node;
3455       re_node_set_empty (&follows);
3456       /* Merge the follows of this destination states.  */
3457       for (j = 0; j < dests_node[i].nelem; ++j)
3458 	{
3459 	  next_node = dfa->nexts[dests_node[i].elems[j]];
3460 	  if (next_node != REG_MISSING)
3461 	    {
3462 	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3463 	      if (BE (err != REG_NOERROR, 0))
3464 		goto out_free;
3465 	    }
3466 	}
3467       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3468       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3469 	goto out_free;
3470       /* If the new state has context constraint,
3471 	 build appropriate states for these contexts.  */
3472       if (dest_states[i]->has_constraint)
3473 	{
3474 	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3475 							  CONTEXT_WORD);
3476 	  if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3477 	    goto out_free;
3478 
3479 	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3480 	    need_word_trtable = true;
3481 
3482 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3483 							CONTEXT_NEWLINE);
3484 	  if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3485 	    goto out_free;
3486 	}
3487       else
3488 	{
3489 	  dest_states_word[i] = dest_states[i];
3490 	  dest_states_nl[i] = dest_states[i];
3491 	}
3492       bitset_merge (acceptable, dests_ch[i]);
3493     }
3494 
3495   if (!BE (need_word_trtable, 0))
3496     {
3497       /* We don't care about whether the following character is a word
3498 	 character, or we are in a single-byte character set so we can
3499 	 discern by looking at the character code: allocate a
3500 	 256-entry transition table.  */
3501       trtable = state->trtable =
3502 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3503       if (BE (trtable == NULL, 0))
3504 	goto out_free;
3505 
3506       /* For all characters ch...:  */
3507       for (i = 0; i < BITSET_WORDS; ++i)
3508 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3509 	     elem;
3510 	     mask <<= 1, elem >>= 1, ++ch)
3511 	  if (BE (elem & 1, 0))
3512 	    {
3513 	      /* There must be exactly one destination which accepts
3514 		 character ch.  See group_nodes_into_DFAstates.  */
3515 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3516 		;
3517 
3518 	      /* j-th destination accepts the word character ch.  */
3519 	      if (dfa->word_char[i] & mask)
3520 		trtable[ch] = dest_states_word[j];
3521 	      else
3522 		trtable[ch] = dest_states[j];
3523 	    }
3524     }
3525   else
3526     {
3527       /* We care about whether the following character is a word
3528 	 character, and we are in a multi-byte character set: discern
3529 	 by looking at the character code: build two 256-entry
3530 	 transition tables, one starting at trtable[0] and one
3531 	 starting at trtable[SBC_MAX].  */
3532       trtable = state->word_trtable =
3533 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3534       if (BE (trtable == NULL, 0))
3535 	goto out_free;
3536 
3537       /* For all characters ch...:  */
3538       for (i = 0; i < BITSET_WORDS; ++i)
3539 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3540 	     elem;
3541 	     mask <<= 1, elem >>= 1, ++ch)
3542 	  if (BE (elem & 1, 0))
3543 	    {
3544 	      /* There must be exactly one destination which accepts
3545 		 character ch.  See group_nodes_into_DFAstates.  */
3546 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3547 		;
3548 
3549 	      /* j-th destination accepts the word character ch.  */
3550 	      trtable[ch] = dest_states[j];
3551 	      trtable[ch + SBC_MAX] = dest_states_word[j];
3552 	    }
3553     }
3554 
3555   /* new line */
3556   if (bitset_contain (acceptable, NEWLINE_CHAR))
3557     {
3558       /* The current state accepts newline character.  */
3559       for (j = 0; j < ndests; ++j)
3560 	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3561 	  {
3562 	    /* k-th destination accepts newline character.  */
3563 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
3564 	    if (need_word_trtable)
3565 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3566 	    /* There must be only one destination which accepts
3567 	       newline.  See group_nodes_into_DFAstates.  */
3568 	    break;
3569 	  }
3570     }
3571 
3572   if (dest_states_malloced)
3573     free (dest_states);
3574 
3575   re_node_set_free (&follows);
3576   for (i = 0; i < ndests; ++i)
3577     re_node_set_free (dests_node + i);
3578 
3579   if (dests_node_malloced)
3580     free (dests_alloc);
3581 
3582   return true;
3583 }
3584 
3585 /* Group all nodes belonging to STATE into several destinations.
3586    Then for all destinations, set the nodes belonging to the destination
3587    to DESTS_NODE[i] and set the characters accepted by the destination
3588    to DEST_CH[i].  This function return the number of destinations.  */
3589 
3590 static Idx
3591 internal_function
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)3592 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3593 			    re_node_set *dests_node, bitset_t *dests_ch)
3594 {
3595   reg_errcode_t err;
3596   bool ok;
3597   Idx i, j, k;
3598   Idx ndests; /* Number of the destinations from 'state'.  */
3599   bitset_t accepts; /* Characters a node can accept.  */
3600   const re_node_set *cur_nodes = &state->nodes;
3601   bitset_empty (accepts);
3602   ndests = 0;
3603 
3604   /* For all the nodes belonging to 'state',  */
3605   for (i = 0; i < cur_nodes->nelem; ++i)
3606     {
3607       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3608       re_token_type_t type = node->type;
3609       unsigned int constraint = node->constraint;
3610 
3611       /* Enumerate all single byte character this node can accept.  */
3612       if (type == CHARACTER)
3613 	bitset_set (accepts, node->opr.c);
3614       else if (type == SIMPLE_BRACKET)
3615 	{
3616 	  bitset_merge (accepts, node->opr.sbcset);
3617 	}
3618       else if (type == OP_PERIOD)
3619 	{
3620 #ifdef RE_ENABLE_I18N
3621 	  if (dfa->mb_cur_max > 1)
3622 	    bitset_merge (accepts, dfa->sb_char);
3623 	  else
3624 #endif
3625 	    bitset_set_all (accepts);
3626 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3627 	    bitset_clear (accepts, '\n');
3628 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3629 	    bitset_clear (accepts, '\0');
3630 	}
3631 #ifdef RE_ENABLE_I18N
3632       else if (type == OP_UTF8_PERIOD)
3633 	{
3634 	  if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3635 	    memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3636 	  else
3637 	    bitset_merge (accepts, utf8_sb_map);
3638 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3639 	    bitset_clear (accepts, '\n');
3640 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3641 	    bitset_clear (accepts, '\0');
3642 	}
3643 #endif
3644       else
3645 	continue;
3646 
3647       /* Check the 'accepts' and sift the characters which are not
3648 	 match it the context.  */
3649       if (constraint)
3650 	{
3651 	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
3652 	    {
3653 	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3654 	      bitset_empty (accepts);
3655 	      if (accepts_newline)
3656 		bitset_set (accepts, NEWLINE_CHAR);
3657 	      else
3658 		continue;
3659 	    }
3660 	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
3661 	    {
3662 	      bitset_empty (accepts);
3663 	      continue;
3664 	    }
3665 
3666 	  if (constraint & NEXT_WORD_CONSTRAINT)
3667 	    {
3668 	      bitset_word_t any_set = 0;
3669 	      if (type == CHARACTER && !node->word_char)
3670 		{
3671 		  bitset_empty (accepts);
3672 		  continue;
3673 		}
3674 #ifdef RE_ENABLE_I18N
3675 	      if (dfa->mb_cur_max > 1)
3676 		for (j = 0; j < BITSET_WORDS; ++j)
3677 		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3678 	      else
3679 #endif
3680 		for (j = 0; j < BITSET_WORDS; ++j)
3681 		  any_set |= (accepts[j] &= dfa->word_char[j]);
3682 	      if (!any_set)
3683 		continue;
3684 	    }
3685 	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
3686 	    {
3687 	      bitset_word_t any_set = 0;
3688 	      if (type == CHARACTER && node->word_char)
3689 		{
3690 		  bitset_empty (accepts);
3691 		  continue;
3692 		}
3693 #ifdef RE_ENABLE_I18N
3694 	      if (dfa->mb_cur_max > 1)
3695 		for (j = 0; j < BITSET_WORDS; ++j)
3696 		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3697 	      else
3698 #endif
3699 		for (j = 0; j < BITSET_WORDS; ++j)
3700 		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3701 	      if (!any_set)
3702 		continue;
3703 	    }
3704 	}
3705 
3706       /* Then divide 'accepts' into DFA states, or create a new
3707 	 state.  Above, we make sure that accepts is not empty.  */
3708       for (j = 0; j < ndests; ++j)
3709 	{
3710 	  bitset_t intersec; /* Intersection sets, see below.  */
3711 	  bitset_t remains;
3712 	  /* Flags, see below.  */
3713 	  bitset_word_t has_intersec, not_subset, not_consumed;
3714 
3715 	  /* Optimization, skip if this state doesn't accept the character.  */
3716 	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3717 	    continue;
3718 
3719 	  /* Enumerate the intersection set of this state and 'accepts'.  */
3720 	  has_intersec = 0;
3721 	  for (k = 0; k < BITSET_WORDS; ++k)
3722 	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3723 	  /* And skip if the intersection set is empty.  */
3724 	  if (!has_intersec)
3725 	    continue;
3726 
3727 	  /* Then check if this state is a subset of 'accepts'.  */
3728 	  not_subset = not_consumed = 0;
3729 	  for (k = 0; k < BITSET_WORDS; ++k)
3730 	    {
3731 	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3732 	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3733 	    }
3734 
3735 	  /* If this state isn't a subset of 'accepts', create a
3736 	     new group state, which has the 'remains'. */
3737 	  if (not_subset)
3738 	    {
3739 	      bitset_copy (dests_ch[ndests], remains);
3740 	      bitset_copy (dests_ch[j], intersec);
3741 	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3742 	      if (BE (err != REG_NOERROR, 0))
3743 		goto error_return;
3744 	      ++ndests;
3745 	    }
3746 
3747 	  /* Put the position in the current group. */
3748 	  ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3749 	  if (BE (! ok, 0))
3750 	    goto error_return;
3751 
3752 	  /* If all characters are consumed, go to next node. */
3753 	  if (!not_consumed)
3754 	    break;
3755 	}
3756       /* Some characters remain, create a new group. */
3757       if (j == ndests)
3758 	{
3759 	  bitset_copy (dests_ch[ndests], accepts);
3760 	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3761 	  if (BE (err != REG_NOERROR, 0))
3762 	    goto error_return;
3763 	  ++ndests;
3764 	  bitset_empty (accepts);
3765 	}
3766     }
3767   return ndests;
3768  error_return:
3769   for (j = 0; j < ndests; ++j)
3770     re_node_set_free (dests_node + j);
3771   return REG_MISSING;
3772 }
3773 
3774 #ifdef RE_ENABLE_I18N
3775 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3776    Return the number of the bytes the node accepts.
3777    STR_IDX is the current index of the input string.
3778 
3779    This function handles the nodes which can accept one character, or
3780    one collating element like '.', '[a-z]', opposite to the other nodes
3781    can only accept one byte.  */
3782 
3783 static int
3784 internal_function
check_node_accept_bytes(const re_dfa_t * dfa,Idx node_idx,const re_string_t * input,Idx str_idx)3785 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3786 			 const re_string_t *input, Idx str_idx)
3787 {
3788   const re_token_t *node = dfa->nodes + node_idx;
3789   int char_len, elem_len;
3790   Idx i;
3791 
3792   if (BE (node->type == OP_UTF8_PERIOD, 0))
3793     {
3794       unsigned char c = re_string_byte_at (input, str_idx), d;
3795       if (BE (c < 0xc2, 1))
3796 	return 0;
3797 
3798       if (str_idx + 2 > input->len)
3799 	return 0;
3800 
3801       d = re_string_byte_at (input, str_idx + 1);
3802       if (c < 0xe0)
3803 	return (d < 0x80 || d > 0xbf) ? 0 : 2;
3804       else if (c < 0xf0)
3805 	{
3806 	  char_len = 3;
3807 	  if (c == 0xe0 && d < 0xa0)
3808 	    return 0;
3809 	}
3810       else if (c < 0xf8)
3811 	{
3812 	  char_len = 4;
3813 	  if (c == 0xf0 && d < 0x90)
3814 	    return 0;
3815 	}
3816       else if (c < 0xfc)
3817 	{
3818 	  char_len = 5;
3819 	  if (c == 0xf8 && d < 0x88)
3820 	    return 0;
3821 	}
3822       else if (c < 0xfe)
3823 	{
3824 	  char_len = 6;
3825 	  if (c == 0xfc && d < 0x84)
3826 	    return 0;
3827 	}
3828       else
3829 	return 0;
3830 
3831       if (str_idx + char_len > input->len)
3832 	return 0;
3833 
3834       for (i = 1; i < char_len; ++i)
3835 	{
3836 	  d = re_string_byte_at (input, str_idx + i);
3837 	  if (d < 0x80 || d > 0xbf)
3838 	    return 0;
3839 	}
3840       return char_len;
3841     }
3842 
3843   char_len = re_string_char_size_at (input, str_idx);
3844   if (node->type == OP_PERIOD)
3845     {
3846       if (char_len <= 1)
3847 	return 0;
3848       /* FIXME: I don't think this if is needed, as both '\n'
3849 	 and '\0' are char_len == 1.  */
3850       /* '.' accepts any one character except the following two cases.  */
3851       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3852 	   re_string_byte_at (input, str_idx) == '\n') ||
3853 	  ((dfa->syntax & RE_DOT_NOT_NULL) &&
3854 	   re_string_byte_at (input, str_idx) == '\0'))
3855 	return 0;
3856       return char_len;
3857     }
3858 
3859   elem_len = re_string_elem_size_at (input, str_idx);
3860   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3861     return 0;
3862 
3863   if (node->type == COMPLEX_BRACKET)
3864     {
3865       const re_charset_t *cset = node->opr.mbcset;
3866 # ifdef _LIBC
3867       const unsigned char *pin
3868 	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3869       Idx j;
3870       uint32_t nrules;
3871 # endif /* _LIBC */
3872       int match_len = 0;
3873       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3874 		    ? re_string_wchar_at (input, str_idx) : 0);
3875 
3876       /* match with multibyte character?  */
3877       for (i = 0; i < cset->nmbchars; ++i)
3878 	if (wc == cset->mbchars[i])
3879 	  {
3880 	    match_len = char_len;
3881 	    goto check_node_accept_bytes_match;
3882 	  }
3883       /* match with character_class?  */
3884       for (i = 0; i < cset->nchar_classes; ++i)
3885 	{
3886 	  wctype_t wt = cset->char_classes[i];
3887 	  if (__iswctype (wc, wt))
3888 	    {
3889 	      match_len = char_len;
3890 	      goto check_node_accept_bytes_match;
3891 	    }
3892 	}
3893 
3894 # ifdef _LIBC
3895       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3896       if (nrules != 0)
3897 	{
3898 	  unsigned int in_collseq = 0;
3899 	  const int32_t *table, *indirect;
3900 	  const unsigned char *weights, *extra;
3901 	  const char *collseqwc;
3902 	  /* This #include defines a local function!  */
3903 #  include <locale/weight.h>
3904 
3905 	  /* match with collating_symbol?  */
3906 	  if (cset->ncoll_syms)
3907 	    extra = (const unsigned char *)
3908 	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3909 	  for (i = 0; i < cset->ncoll_syms; ++i)
3910 	    {
3911 	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
3912 	      /* Compare the length of input collating element and
3913 		 the length of current collating element.  */
3914 	      if (*coll_sym != elem_len)
3915 		continue;
3916 	      /* Compare each bytes.  */
3917 	      for (j = 0; j < *coll_sym; j++)
3918 		if (pin[j] != coll_sym[1 + j])
3919 		  break;
3920 	      if (j == *coll_sym)
3921 		{
3922 		  /* Match if every bytes is equal.  */
3923 		  match_len = j;
3924 		  goto check_node_accept_bytes_match;
3925 		}
3926 	    }
3927 
3928 	  if (cset->nranges)
3929 	    {
3930 	      if (elem_len <= char_len)
3931 		{
3932 		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3933 		  in_collseq = __collseq_table_lookup (collseqwc, wc);
3934 		}
3935 	      else
3936 		in_collseq = find_collation_sequence_value (pin, elem_len);
3937 	    }
3938 	  /* match with range expression?  */
3939 	  /* FIXME: Implement rational ranges here, too.  */
3940 	  for (i = 0; i < cset->nranges; ++i)
3941 	    if (cset->range_starts[i] <= in_collseq
3942 		&& in_collseq <= cset->range_ends[i])
3943 	      {
3944 		match_len = elem_len;
3945 		goto check_node_accept_bytes_match;
3946 	      }
3947 
3948 	  /* match with equivalence_class?  */
3949 	  if (cset->nequiv_classes)
3950 	    {
3951 	      const unsigned char *cp = pin;
3952 	      table = (const int32_t *)
3953 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3954 	      weights = (const unsigned char *)
3955 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3956 	      extra = (const unsigned char *)
3957 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3958 	      indirect = (const int32_t *)
3959 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3960 	      int32_t idx = findidx (&cp, elem_len);
3961 	      if (idx > 0)
3962 		for (i = 0; i < cset->nequiv_classes; ++i)
3963 		  {
3964 		    int32_t equiv_class_idx = cset->equiv_classes[i];
3965 		    size_t weight_len = weights[idx & 0xffffff];
3966 		    if (weight_len == weights[equiv_class_idx & 0xffffff]
3967 			&& (idx >> 24) == (equiv_class_idx >> 24))
3968 		      {
3969 			Idx cnt = 0;
3970 
3971 			idx &= 0xffffff;
3972 			equiv_class_idx &= 0xffffff;
3973 
3974 			while (cnt <= weight_len
3975 			       && (weights[equiv_class_idx + 1 + cnt]
3976 				   == weights[idx + 1 + cnt]))
3977 			  ++cnt;
3978 			if (cnt > weight_len)
3979 			  {
3980 			    match_len = elem_len;
3981 			    goto check_node_accept_bytes_match;
3982 			  }
3983 		      }
3984 		  }
3985 	    }
3986 	}
3987       else
3988 # endif /* _LIBC */
3989 	{
3990 	  /* match with range expression?  */
3991 	  for (i = 0; i < cset->nranges; ++i)
3992 	    {
3993 	      if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3994 		{
3995 		  match_len = char_len;
3996 		  goto check_node_accept_bytes_match;
3997 		}
3998 	    }
3999 	}
4000     check_node_accept_bytes_match:
4001       if (!cset->non_match)
4002 	return match_len;
4003       else
4004 	{
4005 	  if (match_len > 0)
4006 	    return 0;
4007 	  else
4008 	    return (elem_len > char_len) ? elem_len : char_len;
4009 	}
4010     }
4011   return 0;
4012 }
4013 
4014 # ifdef _LIBC
4015 static unsigned int
4016 internal_function
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)4017 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
4018 {
4019   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4020   if (nrules == 0)
4021     {
4022       if (mbs_len == 1)
4023 	{
4024 	  /* No valid character.  Match it as a single byte character.  */
4025 	  const unsigned char *collseq = (const unsigned char *)
4026 	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4027 	  return collseq[mbs[0]];
4028 	}
4029       return UINT_MAX;
4030     }
4031   else
4032     {
4033       int32_t idx;
4034       const unsigned char *extra = (const unsigned char *)
4035 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4036       int32_t extrasize = (const unsigned char *)
4037 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4038 
4039       for (idx = 0; idx < extrasize;)
4040 	{
4041 	  int mbs_cnt;
4042 	  bool found = false;
4043 	  int32_t elem_mbs_len;
4044 	  /* Skip the name of collating element name.  */
4045 	  idx = idx + extra[idx] + 1;
4046 	  elem_mbs_len = extra[idx++];
4047 	  if (mbs_len == elem_mbs_len)
4048 	    {
4049 	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4050 		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4051 		  break;
4052 	      if (mbs_cnt == elem_mbs_len)
4053 		/* Found the entry.  */
4054 		found = true;
4055 	    }
4056 	  /* Skip the byte sequence of the collating element.  */
4057 	  idx += elem_mbs_len;
4058 	  /* Adjust for the alignment.  */
4059 	  idx = (idx + 3) & ~3;
4060 	  /* Skip the collation sequence value.  */
4061 	  idx += sizeof (uint32_t);
4062 	  /* Skip the wide char sequence of the collating element.  */
4063 	  idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4064 	  /* If we found the entry, return the sequence value.  */
4065 	  if (found)
4066 	    return *(uint32_t *) (extra + idx);
4067 	  /* Skip the collation sequence value.  */
4068 	  idx += sizeof (uint32_t);
4069 	}
4070       return UINT_MAX;
4071     }
4072 }
4073 # endif /* _LIBC */
4074 #endif /* RE_ENABLE_I18N */
4075 
4076 /* Check whether the node accepts the byte which is IDX-th
4077    byte of the INPUT.  */
4078 
4079 static bool
4080 internal_function
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,Idx idx)4081 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4082 		   Idx idx)
4083 {
4084   unsigned char ch;
4085   ch = re_string_byte_at (&mctx->input, idx);
4086   switch (node->type)
4087     {
4088     case CHARACTER:
4089       if (node->opr.c != ch)
4090         return false;
4091       break;
4092 
4093     case SIMPLE_BRACKET:
4094       if (!bitset_contain (node->opr.sbcset, ch))
4095         return false;
4096       break;
4097 
4098 #ifdef RE_ENABLE_I18N
4099     case OP_UTF8_PERIOD:
4100       if (ch >= ASCII_CHARS)
4101         return false;
4102       /* FALLTHROUGH */
4103 #endif
4104     case OP_PERIOD:
4105       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4106 	  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4107 	return false;
4108       break;
4109 
4110     default:
4111       return false;
4112     }
4113 
4114   if (node->constraint)
4115     {
4116       /* The node has constraints.  Check whether the current context
4117 	 satisfies the constraints.  */
4118       unsigned int context = re_string_context_at (&mctx->input, idx,
4119 						   mctx->eflags);
4120       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4121 	return false;
4122     }
4123 
4124   return true;
4125 }
4126 
4127 /* Extend the buffers, if the buffers have run out.  */
4128 
4129 static reg_errcode_t
4130 internal_function __attribute_warn_unused_result__
extend_buffers(re_match_context_t * mctx,int min_len)4131 extend_buffers (re_match_context_t *mctx, int min_len)
4132 {
4133   reg_errcode_t ret;
4134   re_string_t *pstr = &mctx->input;
4135 
4136   /* Avoid overflow.  */
4137   if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4138           <= pstr->bufs_len, 0))
4139     return REG_ESPACE;
4140 
4141   /* Double the lengths of the buffers, but allocate at least MIN_LEN.  */
4142   ret = re_string_realloc_buffers (pstr,
4143 				   MAX (min_len,
4144 					MIN (pstr->len, pstr->bufs_len * 2)));
4145   if (BE (ret != REG_NOERROR, 0))
4146     return ret;
4147 
4148   if (mctx->state_log != NULL)
4149     {
4150       /* And double the length of state_log.  */
4151       /* XXX We have no indication of the size of this buffer.  If this
4152 	 allocation fail we have no indication that the state_log array
4153 	 does not have the right size.  */
4154       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4155 					      pstr->bufs_len + 1);
4156       if (BE (new_array == NULL, 0))
4157 	return REG_ESPACE;
4158       mctx->state_log = new_array;
4159     }
4160 
4161   /* Then reconstruct the buffers.  */
4162   if (pstr->icase)
4163     {
4164 #ifdef RE_ENABLE_I18N
4165       if (pstr->mb_cur_max > 1)
4166 	{
4167 	  ret = build_wcs_upper_buffer (pstr);
4168 	  if (BE (ret != REG_NOERROR, 0))
4169 	    return ret;
4170 	}
4171       else
4172 #endif /* RE_ENABLE_I18N  */
4173 	build_upper_buffer (pstr);
4174     }
4175   else
4176     {
4177 #ifdef RE_ENABLE_I18N
4178       if (pstr->mb_cur_max > 1)
4179 	build_wcs_buffer (pstr);
4180       else
4181 #endif /* RE_ENABLE_I18N  */
4182 	{
4183 	  if (pstr->trans != NULL)
4184 	    re_string_translate_buffer (pstr);
4185 	}
4186     }
4187   return REG_NOERROR;
4188 }
4189 
4190 
4191 /* Functions for matching context.  */
4192 
4193 /* Initialize MCTX.  */
4194 
4195 static reg_errcode_t
4196 internal_function __attribute_warn_unused_result__
match_ctx_init(re_match_context_t * mctx,int eflags,Idx n)4197 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4198 {
4199   mctx->eflags = eflags;
4200   mctx->match_last = REG_MISSING;
4201   if (n > 0)
4202     {
4203       /* Avoid overflow.  */
4204       size_t max_object_size =
4205 	MAX (sizeof (struct re_backref_cache_entry),
4206 	     sizeof (re_sub_match_top_t *));
4207       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
4208 	return REG_ESPACE;
4209 
4210       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4211       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4212       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4213 	return REG_ESPACE;
4214     }
4215   /* Already zero-ed by the caller.
4216      else
4217        mctx->bkref_ents = NULL;
4218      mctx->nbkref_ents = 0;
4219      mctx->nsub_tops = 0;  */
4220   mctx->abkref_ents = n;
4221   mctx->max_mb_elem_len = 1;
4222   mctx->asub_tops = n;
4223   return REG_NOERROR;
4224 }
4225 
4226 /* Clean the entries which depend on the current input in MCTX.
4227    This function must be invoked when the matcher changes the start index
4228    of the input, or changes the input string.  */
4229 
4230 static void
4231 internal_function
match_ctx_clean(re_match_context_t * mctx)4232 match_ctx_clean (re_match_context_t *mctx)
4233 {
4234   Idx st_idx;
4235   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4236     {
4237       Idx sl_idx;
4238       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4239       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4240 	{
4241 	  re_sub_match_last_t *last = top->lasts[sl_idx];
4242 	  re_free (last->path.array);
4243 	  re_free (last);
4244 	}
4245       re_free (top->lasts);
4246       if (top->path)
4247 	{
4248 	  re_free (top->path->array);
4249 	  re_free (top->path);
4250 	}
4251       free (top);
4252     }
4253 
4254   mctx->nsub_tops = 0;
4255   mctx->nbkref_ents = 0;
4256 }
4257 
4258 /* Free all the memory associated with MCTX.  */
4259 
4260 static void
4261 internal_function
match_ctx_free(re_match_context_t * mctx)4262 match_ctx_free (re_match_context_t *mctx)
4263 {
4264   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4265   match_ctx_clean (mctx);
4266   re_free (mctx->sub_tops);
4267   re_free (mctx->bkref_ents);
4268 }
4269 
4270 /* Add a new backreference entry to MCTX.
4271    Note that we assume that caller never call this function with duplicate
4272    entry, and call with STR_IDX which isn't smaller than any existing entry.
4273 */
4274 
4275 static reg_errcode_t
4276 internal_function __attribute_warn_unused_result__
match_ctx_add_entry(re_match_context_t * mctx,Idx node,Idx str_idx,Idx from,Idx to)4277 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4278 		     Idx to)
4279 {
4280   if (mctx->nbkref_ents >= mctx->abkref_ents)
4281     {
4282       struct re_backref_cache_entry* new_entry;
4283       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4284 			      mctx->abkref_ents * 2);
4285       if (BE (new_entry == NULL, 0))
4286 	{
4287 	  re_free (mctx->bkref_ents);
4288 	  return REG_ESPACE;
4289 	}
4290       mctx->bkref_ents = new_entry;
4291       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4292 	      sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4293       mctx->abkref_ents *= 2;
4294     }
4295   if (mctx->nbkref_ents > 0
4296       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4297     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4298 
4299   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4300   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4301   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4302   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4303 
4304   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4305      If bit N is clear, means that this entry won't epsilon-transition to
4306      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4307      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4308      such node.
4309 
4310      A backreference does not epsilon-transition unless it is empty, so set
4311      to all zeros if FROM != TO.  */
4312   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4313     = (from == to ? -1 : 0);
4314 
4315   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4316   if (mctx->max_mb_elem_len < to - from)
4317     mctx->max_mb_elem_len = to - from;
4318   return REG_NOERROR;
4319 }
4320 
4321 /* Return the first entry with the same str_idx, or REG_MISSING if none is
4322    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4323 
4324 static Idx
4325 internal_function
search_cur_bkref_entry(const re_match_context_t * mctx,Idx str_idx)4326 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4327 {
4328   Idx left, right, mid, last;
4329   last = right = mctx->nbkref_ents;
4330   for (left = 0; left < right;)
4331     {
4332       mid = (left + right) / 2;
4333       if (mctx->bkref_ents[mid].str_idx < str_idx)
4334 	left = mid + 1;
4335       else
4336 	right = mid;
4337     }
4338   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4339     return left;
4340   else
4341     return REG_MISSING;
4342 }
4343 
4344 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4345    at STR_IDX.  */
4346 
4347 static reg_errcode_t
4348 internal_function __attribute_warn_unused_result__
match_ctx_add_subtop(re_match_context_t * mctx,Idx node,Idx str_idx)4349 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4350 {
4351 #ifdef DEBUG
4352   assert (mctx->sub_tops != NULL);
4353   assert (mctx->asub_tops > 0);
4354 #endif
4355   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4356     {
4357       Idx new_asub_tops = mctx->asub_tops * 2;
4358       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4359 						   re_sub_match_top_t *,
4360 						   new_asub_tops);
4361       if (BE (new_array == NULL, 0))
4362 	return REG_ESPACE;
4363       mctx->sub_tops = new_array;
4364       mctx->asub_tops = new_asub_tops;
4365     }
4366   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4367   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4368     return REG_ESPACE;
4369   mctx->sub_tops[mctx->nsub_tops]->node = node;
4370   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4371   return REG_NOERROR;
4372 }
4373 
4374 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4375    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4376 
4377 static re_sub_match_last_t *
4378 internal_function
match_ctx_add_sublast(re_sub_match_top_t * subtop,Idx node,Idx str_idx)4379 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4380 {
4381   re_sub_match_last_t *new_entry;
4382   if (BE (subtop->nlasts == subtop->alasts, 0))
4383     {
4384       Idx new_alasts = 2 * subtop->alasts + 1;
4385       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4386 						    re_sub_match_last_t *,
4387 						    new_alasts);
4388       if (BE (new_array == NULL, 0))
4389 	return NULL;
4390       subtop->lasts = new_array;
4391       subtop->alasts = new_alasts;
4392     }
4393   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4394   if (BE (new_entry != NULL, 1))
4395     {
4396       subtop->lasts[subtop->nlasts] = new_entry;
4397       new_entry->node = node;
4398       new_entry->str_idx = str_idx;
4399       ++subtop->nlasts;
4400     }
4401   return new_entry;
4402 }
4403 
4404 static void
4405 internal_function
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,Idx last_node,Idx last_str_idx)4406 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4407 	       re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4408 {
4409   sctx->sifted_states = sifted_sts;
4410   sctx->limited_states = limited_sts;
4411   sctx->last_node = last_node;
4412   sctx->last_str_idx = last_str_idx;
4413   re_node_set_init_empty (&sctx->limits);
4414 }
4415