1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2018 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <https://www.gnu.org/licenses/>.  */
19 
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 				     Idx n);
22 static void match_ctx_clean (re_match_context_t *mctx);
23 static void match_ctx_free (re_match_context_t *cache);
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 					  Idx str_idx, Idx from, Idx to);
26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 					   Idx str_idx);
29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 						    Idx node, Idx str_idx);
31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 			   re_dfastate_t **limited_sts, Idx last_node,
33 			   Idx last_str_idx);
34 static reg_errcode_t re_search_internal (const regex_t *preg,
35 					 const char *string, Idx length,
36 					 Idx start, Idx last_start, Idx stop,
37 					 size_t nmatch, regmatch_t pmatch[],
38 					 int eflags);
39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 				  const char *string1, Idx length1,
41 				  const char *string2, Idx length2,
42 				  Idx start, regoff_t range,
43 				  struct re_registers *regs,
44 				  Idx stop, bool ret_len);
45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 				const char *string, Idx length, Idx start,
47 				regoff_t range, Idx stop,
48 				struct re_registers *regs,
49 				bool ret_len);
50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51                               Idx nregs, int regs_allocated);
52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 			   Idx *p_match_first);
55 static Idx check_halt_state_context (const re_match_context_t *mctx,
56 				     const re_dfastate_t *state, Idx idx);
57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 			 regmatch_t *prev_idx_match, Idx cur_node,
59 			 Idx cur_idx, Idx nmatch);
60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 				      Idx str_idx, Idx dest_node, Idx nregs,
62 				      regmatch_t *regs,
63 				      re_node_set *eps_via_nodes);
64 static reg_errcode_t set_regs (const regex_t *preg,
65 			       const re_match_context_t *mctx,
66 			       size_t nmatch, regmatch_t *pmatch,
67 			       bool fl_backtrack);
68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69 
70 #ifdef RE_ENABLE_I18N
71 static int sift_states_iter_mb (const re_match_context_t *mctx,
72 				re_sift_context_t *sctx,
73 				Idx node_idx, Idx str_idx, Idx max_str_idx);
74 #endif /* RE_ENABLE_I18N */
75 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
76 					   re_sift_context_t *sctx);
77 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
78 					  re_sift_context_t *sctx, Idx str_idx,
79 					  re_node_set *cur_dest);
80 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
81 					      re_sift_context_t *sctx,
82 					      Idx str_idx,
83 					      re_node_set *dest_nodes);
84 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
85 					    re_node_set *dest_nodes,
86 					    const re_node_set *candidates);
87 static bool check_dst_limits (const re_match_context_t *mctx,
88 			      const re_node_set *limits,
89 			      Idx dst_node, Idx dst_idx, Idx src_node,
90 			      Idx src_idx);
91 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
92 					int boundaries, Idx subexp_idx,
93 					Idx from_node, Idx bkref_idx);
94 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
95 				      Idx limit, Idx subexp_idx,
96 				      Idx node, Idx str_idx,
97 				      Idx bkref_idx);
98 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
99 					  re_node_set *dest_nodes,
100 					  const re_node_set *candidates,
101 					  re_node_set *limits,
102 					  struct re_backref_cache_entry *bkref_ents,
103 					  Idx str_idx);
104 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
105 					re_sift_context_t *sctx,
106 					Idx str_idx, const re_node_set *candidates);
107 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
108 					re_dfastate_t **dst,
109 					re_dfastate_t **src, Idx num);
110 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111 					 re_match_context_t *mctx);
112 static re_dfastate_t *transit_state (reg_errcode_t *err,
113 				     re_match_context_t *mctx,
114 				     re_dfastate_t *state);
115 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
116 					    re_match_context_t *mctx,
117 					    re_dfastate_t *next_state);
118 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
119 						re_node_set *cur_nodes,
120 						Idx str_idx);
121 #if 0
122 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
123 					re_match_context_t *mctx,
124 					re_dfastate_t *pstate);
125 #endif
126 #ifdef RE_ENABLE_I18N
127 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128 				       re_dfastate_t *pstate);
129 #endif /* RE_ENABLE_I18N */
130 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131 					  const re_node_set *nodes);
132 static reg_errcode_t get_subexp (re_match_context_t *mctx,
133 				 Idx bkref_node, Idx bkref_str_idx);
134 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
135 				     const re_sub_match_top_t *sub_top,
136 				     re_sub_match_last_t *sub_last,
137 				     Idx bkref_node, Idx bkref_str);
138 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139 			     Idx subexp_idx, int type);
140 static reg_errcode_t check_arrival (re_match_context_t *mctx,
141 				    state_array_t *path, Idx top_node,
142 				    Idx top_str, Idx last_node, Idx last_str,
143 				    int type);
144 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
145 						   Idx str_idx,
146 						   re_node_set *cur_nodes,
147 						   re_node_set *next_nodes);
148 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
149 					       re_node_set *cur_nodes,
150 					       Idx ex_subexp, int type);
151 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
152 						   re_node_set *dst_nodes,
153 						   Idx target, Idx ex_subexp,
154 						   int type);
155 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
156 					 re_node_set *cur_nodes, Idx cur_str,
157 					 Idx subexp_num, int type);
158 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
159 #ifdef RE_ENABLE_I18N
160 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161 				    const re_string_t *input, Idx idx);
162 # ifdef _LIBC
163 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
164 						   size_t name_len);
165 # endif /* _LIBC */
166 #endif /* RE_ENABLE_I18N */
167 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
168 				       const re_dfastate_t *state,
169 				       re_node_set *states_node,
170 				       bitset_t *states_ch);
171 static bool check_node_accept (const re_match_context_t *mctx,
172 			       const re_token_t *node, Idx idx);
173 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
174 
175 /* Entry point for POSIX code.  */
176 
177 /* regexec searches for a given pattern, specified by PREG, in the
178    string STRING.
179 
180    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
181    'regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
182    least NMATCH elements, and we set them to the offsets of the
183    corresponding matched substrings.
184 
185    EFLAGS specifies "execution flags" which affect matching: if
186    REG_NOTBOL is set, then ^ does not match at the beginning of the
187    string; if REG_NOTEOL is set, then $ does not match at the end.
188 
189    We return 0 if we find a match and REG_NOMATCH if not.  */
190 
191 int
regexec(const regex_t * _Restrict_ preg,const char * _Restrict_ string,size_t nmatch,regmatch_t pmatch[],int eflags)192 regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
193 	 size_t nmatch, regmatch_t pmatch[], int eflags)
194 {
195   reg_errcode_t err;
196   Idx start, length;
197   re_dfa_t *dfa = preg->buffer;
198 
199   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
200     return REG_BADPAT;
201 
202   if (eflags & REG_STARTEND)
203     {
204       start = pmatch[0].rm_so;
205       length = pmatch[0].rm_eo;
206     }
207   else
208     {
209       start = 0;
210       length = strlen (string);
211     }
212 
213   lock_lock (dfa->lock);
214   if (preg->no_sub)
215     err = re_search_internal (preg, string, length, start, length,
216 			      length, 0, NULL, eflags);
217   else
218     err = re_search_internal (preg, string, length, start, length,
219 			      length, nmatch, pmatch, eflags);
220   lock_unlock (dfa->lock);
221   return err != REG_NOERROR;
222 }
223 
224 #ifdef _LIBC
225 libc_hidden_def (__regexec)
226 
227 # include <shlib-compat.h>
228 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
229 
230 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
231 __typeof__ (__regexec) __compat_regexec;
232 
233 int
234 attribute_compat_text_section
__compat_regexec(const regex_t * _Restrict_ preg,const char * _Restrict_ string,size_t nmatch,regmatch_t pmatch[],int eflags)235 __compat_regexec (const regex_t *_Restrict_ preg,
236 		  const char *_Restrict_ string, size_t nmatch,
237 		  regmatch_t pmatch[], int eflags)
238 {
239   return regexec (preg, string, nmatch, pmatch,
240 		  eflags & (REG_NOTBOL | REG_NOTEOL));
241 }
242 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
243 # endif
244 #endif
245 
246 /* Entry points for GNU code.  */
247 
248 /* re_match, re_search, re_match_2, re_search_2
249 
250    The former two functions operate on STRING with length LENGTH,
251    while the later two operate on concatenation of STRING1 and STRING2
252    with lengths LENGTH1 and LENGTH2, respectively.
253 
254    re_match() matches the compiled pattern in BUFP against the string,
255    starting at index START.
256 
257    re_search() first tries matching at index START, then it tries to match
258    starting from index START + 1, and so on.  The last start position tried
259    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
260    way as re_match().)
261 
262    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
263    the first STOP characters of the concatenation of the strings should be
264    concerned.
265 
266    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
267    and all groups is stored in REGS.  (For the "_2" variants, the offsets are
268    computed relative to the concatenation, not relative to the individual
269    strings.)
270 
271    On success, re_match* functions return the length of the match, re_search*
272    return the position of the start of the match.  Return value -1 means no
273    match was found and -2 indicates an internal error.  */
274 
275 regoff_t
re_match(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,struct re_registers * regs)276 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
277 	  Idx start, struct re_registers *regs)
278 {
279   return re_search_stub (bufp, string, length, start, 0, length, regs, true);
280 }
281 #ifdef _LIBC
weak_alias(__re_match,re_match)282 weak_alias (__re_match, re_match)
283 #endif
284 
285 regoff_t
286 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
287 	   Idx start, regoff_t range, struct re_registers *regs)
288 {
289   return re_search_stub (bufp, string, length, start, range, length, regs,
290 			 false);
291 }
292 #ifdef _LIBC
weak_alias(__re_search,re_search)293 weak_alias (__re_search, re_search)
294 #endif
295 
296 regoff_t
297 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
298 	    const char *string2, Idx length2, Idx start,
299 	    struct re_registers *regs, Idx stop)
300 {
301   return re_search_2_stub (bufp, string1, length1, string2, length2,
302 			   start, 0, regs, stop, true);
303 }
304 #ifdef _LIBC
weak_alias(__re_match_2,re_match_2)305 weak_alias (__re_match_2, re_match_2)
306 #endif
307 
308 regoff_t
309 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
310 	     const char *string2, Idx length2, Idx start, regoff_t range,
311 	     struct re_registers *regs, Idx stop)
312 {
313   return re_search_2_stub (bufp, string1, length1, string2, length2,
314 			   start, range, regs, stop, false);
315 }
316 #ifdef _LIBC
weak_alias(__re_search_2,re_search_2)317 weak_alias (__re_search_2, re_search_2)
318 #endif
319 
320 static regoff_t
321 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
322 		  Idx length1, const char *string2, Idx length2, Idx start,
323 		  regoff_t range, struct re_registers *regs,
324 		  Idx stop, bool ret_len)
325 {
326   const char *str;
327   regoff_t rval;
328   Idx len;
329   char *s = NULL;
330 
331   if (BE ((length1 < 0 || length2 < 0 || stop < 0
332            || INT_ADD_WRAPV (length1, length2, &len)),
333           0))
334     return -2;
335 
336   /* Concatenate the strings.  */
337   if (length2 > 0)
338     if (length1 > 0)
339       {
340 	s = re_malloc (char, len);
341 
342 	if (BE (s == NULL, 0))
343 	  return -2;
344 #ifdef _LIBC
345 	memcpy (__mempcpy (s, string1, length1), string2, length2);
346 #else
347 	memcpy (s, string1, length1);
348 	memcpy (s + length1, string2, length2);
349 #endif
350 	str = s;
351       }
352     else
353       str = string2;
354   else
355     str = string1;
356 
357   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
358 			 ret_len);
359   re_free (s);
360   return rval;
361 }
362 
363 /* The parameters have the same meaning as those of re_search.
364    Additional parameters:
365    If RET_LEN is true the length of the match is returned (re_match style);
366    otherwise the position of the match is returned.  */
367 
368 static regoff_t
re_search_stub(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,regoff_t range,Idx stop,struct re_registers * regs,bool ret_len)369 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
370 		Idx start, regoff_t range, Idx stop, struct re_registers *regs,
371 		bool ret_len)
372 {
373   reg_errcode_t result;
374   regmatch_t *pmatch;
375   Idx nregs;
376   regoff_t rval;
377   int eflags = 0;
378   re_dfa_t *dfa = bufp->buffer;
379   Idx last_start = start + range;
380 
381   /* Check for out-of-range.  */
382   if (BE (start < 0 || start > length, 0))
383     return -1;
384   if (BE (length < last_start || (0 <= range && last_start < start), 0))
385     last_start = length;
386   else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
387     last_start = 0;
388 
389   lock_lock (dfa->lock);
390 
391   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
392   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
393 
394   /* Compile fastmap if we haven't yet.  */
395   if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
396     re_compile_fastmap (bufp);
397 
398   if (BE (bufp->no_sub, 0))
399     regs = NULL;
400 
401   /* We need at least 1 register.  */
402   if (regs == NULL)
403     nregs = 1;
404   else if (BE (bufp->regs_allocated == REGS_FIXED
405 	       && regs->num_regs <= bufp->re_nsub, 0))
406     {
407       nregs = regs->num_regs;
408       if (BE (nregs < 1, 0))
409 	{
410 	  /* Nothing can be copied to regs.  */
411 	  regs = NULL;
412 	  nregs = 1;
413 	}
414     }
415   else
416     nregs = bufp->re_nsub + 1;
417   pmatch = re_malloc (regmatch_t, nregs);
418   if (BE (pmatch == NULL, 0))
419     {
420       rval = -2;
421       goto out;
422     }
423 
424   result = re_search_internal (bufp, string, length, start, last_start, stop,
425 			       nregs, pmatch, eflags);
426 
427   rval = 0;
428 
429   /* I hope we needn't fill their regs with -1's when no match was found.  */
430   if (result != REG_NOERROR)
431     rval = result == REG_NOMATCH ? -1 : -2;
432   else if (regs != NULL)
433     {
434       /* If caller wants register contents data back, copy them.  */
435       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
436 					   bufp->regs_allocated);
437       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
438 	rval = -2;
439     }
440 
441   if (BE (rval == 0, 1))
442     {
443       if (ret_len)
444 	{
445 	  assert (pmatch[0].rm_so == start);
446 	  rval = pmatch[0].rm_eo - start;
447 	}
448       else
449 	rval = pmatch[0].rm_so;
450     }
451   re_free (pmatch);
452  out:
453   lock_unlock (dfa->lock);
454   return rval;
455 }
456 
457 static unsigned
re_copy_regs(struct re_registers * regs,regmatch_t * pmatch,Idx nregs,int regs_allocated)458 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
459 	      int regs_allocated)
460 {
461   int rval = REGS_REALLOCATE;
462   Idx i;
463   Idx need_regs = nregs + 1;
464   /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
465      uses.  */
466 
467   /* Have the register data arrays been allocated?  */
468   if (regs_allocated == REGS_UNALLOCATED)
469     { /* No.  So allocate them with malloc.  */
470       regs->start = re_malloc (regoff_t, need_regs);
471       if (BE (regs->start == NULL, 0))
472 	return REGS_UNALLOCATED;
473       regs->end = re_malloc (regoff_t, need_regs);
474       if (BE (regs->end == NULL, 0))
475 	{
476 	  re_free (regs->start);
477 	  return REGS_UNALLOCATED;
478 	}
479       regs->num_regs = need_regs;
480     }
481   else if (regs_allocated == REGS_REALLOCATE)
482     { /* Yes.  If we need more elements than were already
483 	 allocated, reallocate them.  If we need fewer, just
484 	 leave it alone.  */
485       if (BE (need_regs > regs->num_regs, 0))
486 	{
487 	  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
488 	  regoff_t *new_end;
489 	  if (BE (new_start == NULL, 0))
490 	    return REGS_UNALLOCATED;
491 	  new_end = re_realloc (regs->end, regoff_t, need_regs);
492 	  if (BE (new_end == NULL, 0))
493 	    {
494 	      re_free (new_start);
495 	      return REGS_UNALLOCATED;
496 	    }
497 	  regs->start = new_start;
498 	  regs->end = new_end;
499 	  regs->num_regs = need_regs;
500 	}
501     }
502   else
503     {
504       assert (regs_allocated == REGS_FIXED);
505       /* This function may not be called with REGS_FIXED and nregs too big.  */
506       assert (regs->num_regs >= nregs);
507       rval = REGS_FIXED;
508     }
509 
510   /* Copy the regs.  */
511   for (i = 0; i < nregs; ++i)
512     {
513       regs->start[i] = pmatch[i].rm_so;
514       regs->end[i] = pmatch[i].rm_eo;
515     }
516   for ( ; i < regs->num_regs; ++i)
517     regs->start[i] = regs->end[i] = -1;
518 
519   return rval;
520 }
521 
522 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
523    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
524    this memory for recording register information.  STARTS and ENDS
525    must be allocated using the malloc library routine, and must each
526    be at least NUM_REGS * sizeof (regoff_t) bytes long.
527 
528    If NUM_REGS == 0, then subsequent matches should allocate their own
529    register data.
530 
531    Unless this function is called, the first search or match using
532    PATTERN_BUFFER will allocate its own register data, without
533    freeing the old data.  */
534 
535 void
re_set_registers(struct re_pattern_buffer * bufp,struct re_registers * regs,__re_size_t num_regs,regoff_t * starts,regoff_t * ends)536 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
537 		  __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
538 {
539   if (num_regs)
540     {
541       bufp->regs_allocated = REGS_REALLOCATE;
542       regs->num_regs = num_regs;
543       regs->start = starts;
544       regs->end = ends;
545     }
546   else
547     {
548       bufp->regs_allocated = REGS_UNALLOCATED;
549       regs->num_regs = 0;
550       regs->start = regs->end = NULL;
551     }
552 }
553 #ifdef _LIBC
weak_alias(__re_set_registers,re_set_registers)554 weak_alias (__re_set_registers, re_set_registers)
555 #endif
556 
557 /* Entry points compatible with 4.2 BSD regex library.  We don't define
558    them unless specifically requested.  */
559 
560 #if defined _REGEX_RE_COMP || defined _LIBC
561 int
562 # ifdef _LIBC
563 weak_function
564 # endif
565 re_exec (const char *s)
566 {
567   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
568 }
569 #endif /* _REGEX_RE_COMP */
570 
571 /* Internal entry point.  */
572 
573 /* Searches for a compiled pattern PREG in the string STRING, whose
574    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
575    meaning as with regexec.  LAST_START is START + RANGE, where
576    START and RANGE have the same meaning as with re_search.
577    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
578    otherwise return the error code.
579    Note: We assume front end functions already check ranges.
580    (0 <= LAST_START && LAST_START <= LENGTH)  */
581 
582 static reg_errcode_t
583 __attribute_warn_unused_result__
re_search_internal(const regex_t * preg,const char * string,Idx length,Idx start,Idx last_start,Idx stop,size_t nmatch,regmatch_t pmatch[],int eflags)584 re_search_internal (const regex_t *preg, const char *string, Idx length,
585 		    Idx start, Idx last_start, Idx stop, size_t nmatch,
586 		    regmatch_t pmatch[], int eflags)
587 {
588   reg_errcode_t err;
589   const re_dfa_t *dfa = preg->buffer;
590   Idx left_lim, right_lim;
591   int incr;
592   bool fl_longest_match;
593   int match_kind;
594   Idx match_first;
595   Idx match_last = -1;
596   Idx extra_nmatch;
597   bool sb;
598   int ch;
599 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
600   re_match_context_t mctx = { .dfa = dfa };
601 #else
602   re_match_context_t mctx;
603 #endif
604   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
605 		    && start != last_start && !preg->can_be_null)
606 		   ? preg->fastmap : NULL);
607   RE_TRANSLATE_TYPE t = preg->translate;
608 
609 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
610   memset (&mctx, '\0', sizeof (re_match_context_t));
611   mctx.dfa = dfa;
612 #endif
613 
614   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
615   nmatch -= extra_nmatch;
616 
617   /* Check if the DFA haven't been compiled.  */
618   if (BE (preg->used == 0 || dfa->init_state == NULL
619 	  || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
620 	  || dfa->init_state_begbuf == NULL, 0))
621     return REG_NOMATCH;
622 
623 #ifdef DEBUG
624   /* We assume front-end functions already check them.  */
625   assert (0 <= last_start && last_start <= length);
626 #endif
627 
628   /* If initial states with non-begbuf contexts have no elements,
629      the regex must be anchored.  If preg->newline_anchor is set,
630      we'll never use init_state_nl, so do not check it.  */
631   if (dfa->init_state->nodes.nelem == 0
632       && dfa->init_state_word->nodes.nelem == 0
633       && (dfa->init_state_nl->nodes.nelem == 0
634 	  || !preg->newline_anchor))
635     {
636       if (start != 0 && last_start != 0)
637         return REG_NOMATCH;
638       start = last_start = 0;
639     }
640 
641   /* We must check the longest matching, if nmatch > 0.  */
642   fl_longest_match = (nmatch != 0 || dfa->nbackref);
643 
644   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
645 			    preg->translate, (preg->syntax & RE_ICASE) != 0,
646 			    dfa);
647   if (BE (err != REG_NOERROR, 0))
648     goto free_return;
649   mctx.input.stop = stop;
650   mctx.input.raw_stop = stop;
651   mctx.input.newline_anchor = preg->newline_anchor;
652 
653   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
654   if (BE (err != REG_NOERROR, 0))
655     goto free_return;
656 
657   /* We will log all the DFA states through which the dfa pass,
658      if nmatch > 1, or this dfa has "multibyte node", which is a
659      back-reference or a node which can accept multibyte character or
660      multi character collating element.  */
661   if (nmatch > 1 || dfa->has_mb_node)
662     {
663       /* Avoid overflow.  */
664       if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
665                <= mctx.input.bufs_len), 0))
666 	{
667 	  err = REG_ESPACE;
668 	  goto free_return;
669 	}
670 
671       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
672       if (BE (mctx.state_log == NULL, 0))
673 	{
674 	  err = REG_ESPACE;
675 	  goto free_return;
676 	}
677     }
678   else
679     mctx.state_log = NULL;
680 
681   match_first = start;
682   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
683 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
684 
685   /* Check incrementally whether the input string matches.  */
686   incr = (last_start < start) ? -1 : 1;
687   left_lim = (last_start < start) ? last_start : start;
688   right_lim = (last_start < start) ? start : last_start;
689   sb = dfa->mb_cur_max == 1;
690   match_kind =
691     (fastmap
692      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
693 	| (start <= last_start ? 2 : 0)
694 	| (t != NULL ? 1 : 0))
695      : 8);
696 
697   for (;; match_first += incr)
698     {
699       err = REG_NOMATCH;
700       if (match_first < left_lim || right_lim < match_first)
701 	goto free_return;
702 
703       /* Advance as rapidly as possible through the string, until we
704 	 find a plausible place to start matching.  This may be done
705 	 with varying efficiency, so there are various possibilities:
706 	 only the most common of them are specialized, in order to
707 	 save on code size.  We use a switch statement for speed.  */
708       switch (match_kind)
709 	{
710 	case 8:
711 	  /* No fastmap.  */
712 	  break;
713 
714 	case 7:
715 	  /* Fastmap with single-byte translation, match forward.  */
716 	  while (BE (match_first < right_lim, 1)
717 		 && !fastmap[t[(unsigned char) string[match_first]]])
718 	    ++match_first;
719 	  goto forward_match_found_start_or_reached_end;
720 
721 	case 6:
722 	  /* Fastmap without translation, match forward.  */
723 	  while (BE (match_first < right_lim, 1)
724 		 && !fastmap[(unsigned char) string[match_first]])
725 	    ++match_first;
726 
727 	forward_match_found_start_or_reached_end:
728 	  if (BE (match_first == right_lim, 0))
729 	    {
730 	      ch = match_first >= length
731 		       ? 0 : (unsigned char) string[match_first];
732 	      if (!fastmap[t ? t[ch] : ch])
733 		goto free_return;
734 	    }
735 	  break;
736 
737 	case 4:
738 	case 5:
739 	  /* Fastmap without multi-byte translation, match backwards.  */
740 	  while (match_first >= left_lim)
741 	    {
742 	      ch = match_first >= length
743 		       ? 0 : (unsigned char) string[match_first];
744 	      if (fastmap[t ? t[ch] : ch])
745 		break;
746 	      --match_first;
747 	    }
748 	  if (match_first < left_lim)
749 	    goto free_return;
750 	  break;
751 
752 	default:
753 	  /* In this case, we can't determine easily the current byte,
754 	     since it might be a component byte of a multibyte
755 	     character.  Then we use the constructed buffer instead.  */
756 	  for (;;)
757 	    {
758 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
759 		 buffers.  */
760 	      __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
761 	      if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
762 		{
763 		  err = re_string_reconstruct (&mctx.input, match_first,
764 					       eflags);
765 		  if (BE (err != REG_NOERROR, 0))
766 		    goto free_return;
767 
768 		  offset = match_first - mctx.input.raw_mbs_idx;
769 		}
770 	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
771 		 Note that MATCH_FIRST must not be smaller than 0.  */
772 	      ch = (match_first >= length
773 		    ? 0 : re_string_byte_at (&mctx.input, offset));
774 	      if (fastmap[ch])
775 		break;
776 	      match_first += incr;
777 	      if (match_first < left_lim || match_first > right_lim)
778 		{
779 		  err = REG_NOMATCH;
780 		  goto free_return;
781 		}
782 	    }
783 	  break;
784 	}
785 
786       /* Reconstruct the buffers so that the matcher can assume that
787 	 the matching starts from the beginning of the buffer.  */
788       err = re_string_reconstruct (&mctx.input, match_first, eflags);
789       if (BE (err != REG_NOERROR, 0))
790 	goto free_return;
791 
792 #ifdef RE_ENABLE_I18N
793      /* Don't consider this char as a possible match start if it part,
794 	yet isn't the head, of a multibyte character.  */
795       if (!sb && !re_string_first_byte (&mctx.input, 0))
796 	continue;
797 #endif
798 
799       /* It seems to be appropriate one, then use the matcher.  */
800       /* We assume that the matching starts from 0.  */
801       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
802       match_last = check_matching (&mctx, fl_longest_match,
803 				   start <= last_start ? &match_first : NULL);
804       if (match_last != -1)
805 	{
806 	  if (BE (match_last == -2, 0))
807 	    {
808 	      err = REG_ESPACE;
809 	      goto free_return;
810 	    }
811 	  else
812 	    {
813 	      mctx.match_last = match_last;
814 	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
815 		{
816 		  re_dfastate_t *pstate = mctx.state_log[match_last];
817 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
818 							     match_last);
819 		}
820 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
821 		  || dfa->nbackref)
822 		{
823 		  err = prune_impossible_nodes (&mctx);
824 		  if (err == REG_NOERROR)
825 		    break;
826 		  if (BE (err != REG_NOMATCH, 0))
827 		    goto free_return;
828 		  match_last = -1;
829 		}
830 	      else
831 		break; /* We found a match.  */
832 	    }
833 	}
834 
835       match_ctx_clean (&mctx);
836     }
837 
838 #ifdef DEBUG
839   assert (match_last != -1);
840   assert (err == REG_NOERROR);
841 #endif
842 
843   /* Set pmatch[] if we need.  */
844   if (nmatch > 0)
845     {
846       Idx reg_idx;
847 
848       /* Initialize registers.  */
849       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
850 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
851 
852       /* Set the points where matching start/end.  */
853       pmatch[0].rm_so = 0;
854       pmatch[0].rm_eo = mctx.match_last;
855       /* FIXME: This function should fail if mctx.match_last exceeds
856 	 the maximum possible regoff_t value.  We need a new error
857 	 code REG_OVERFLOW.  */
858 
859       if (!preg->no_sub && nmatch > 1)
860 	{
861 	  err = set_regs (preg, &mctx, nmatch, pmatch,
862 			  dfa->has_plural_match && dfa->nbackref > 0);
863 	  if (BE (err != REG_NOERROR, 0))
864 	    goto free_return;
865 	}
866 
867       /* At last, add the offset to each register, since we slid
868 	 the buffers so that we could assume that the matching starts
869 	 from 0.  */
870       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
871 	if (pmatch[reg_idx].rm_so != -1)
872 	  {
873 #ifdef RE_ENABLE_I18N
874 	    if (BE (mctx.input.offsets_needed != 0, 0))
875 	      {
876 		pmatch[reg_idx].rm_so =
877 		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
878 		   ? mctx.input.valid_raw_len
879 		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
880 		pmatch[reg_idx].rm_eo =
881 		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
882 		   ? mctx.input.valid_raw_len
883 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
884 	      }
885 #else
886 	    assert (mctx.input.offsets_needed == 0);
887 #endif
888 	    pmatch[reg_idx].rm_so += match_first;
889 	    pmatch[reg_idx].rm_eo += match_first;
890 	  }
891       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
892 	{
893 	  pmatch[nmatch + reg_idx].rm_so = -1;
894 	  pmatch[nmatch + reg_idx].rm_eo = -1;
895 	}
896 
897       if (dfa->subexp_map)
898 	for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
899 	  if (dfa->subexp_map[reg_idx] != reg_idx)
900 	    {
901 	      pmatch[reg_idx + 1].rm_so
902 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
903 	      pmatch[reg_idx + 1].rm_eo
904 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
905 	    }
906     }
907 
908  free_return:
909   re_free (mctx.state_log);
910   if (dfa->nbackref)
911     match_ctx_free (&mctx);
912   re_string_destruct (&mctx.input);
913   return err;
914 }
915 
916 static reg_errcode_t
917 __attribute_warn_unused_result__
prune_impossible_nodes(re_match_context_t * mctx)918 prune_impossible_nodes (re_match_context_t *mctx)
919 {
920   const re_dfa_t *const dfa = mctx->dfa;
921   Idx halt_node, match_last;
922   reg_errcode_t ret;
923   re_dfastate_t **sifted_states;
924   re_dfastate_t **lim_states = NULL;
925   re_sift_context_t sctx;
926 #ifdef DEBUG
927   assert (mctx->state_log != NULL);
928 #endif
929   match_last = mctx->match_last;
930   halt_node = mctx->last_node;
931 
932   /* Avoid overflow.  */
933   if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
934     return REG_ESPACE;
935 
936   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
937   if (BE (sifted_states == NULL, 0))
938     {
939       ret = REG_ESPACE;
940       goto free_return;
941     }
942   if (dfa->nbackref)
943     {
944       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
945       if (BE (lim_states == NULL, 0))
946 	{
947 	  ret = REG_ESPACE;
948 	  goto free_return;
949 	}
950       while (1)
951 	{
952 	  memset (lim_states, '\0',
953 		  sizeof (re_dfastate_t *) * (match_last + 1));
954 	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
955 			 match_last);
956 	  ret = sift_states_backward (mctx, &sctx);
957 	  re_node_set_free (&sctx.limits);
958 	  if (BE (ret != REG_NOERROR, 0))
959 	      goto free_return;
960 	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
961 	    break;
962 	  do
963 	    {
964 	      --match_last;
965 	      if (match_last < 0)
966 		{
967 		  ret = REG_NOMATCH;
968 		  goto free_return;
969 		}
970 	    } while (mctx->state_log[match_last] == NULL
971 		     || !mctx->state_log[match_last]->halt);
972 	  halt_node = check_halt_state_context (mctx,
973 						mctx->state_log[match_last],
974 						match_last);
975 	}
976       ret = merge_state_array (dfa, sifted_states, lim_states,
977 			       match_last + 1);
978       re_free (lim_states);
979       lim_states = NULL;
980       if (BE (ret != REG_NOERROR, 0))
981 	goto free_return;
982     }
983   else
984     {
985       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
986       ret = sift_states_backward (mctx, &sctx);
987       re_node_set_free (&sctx.limits);
988       if (BE (ret != REG_NOERROR, 0))
989 	goto free_return;
990       if (sifted_states[0] == NULL)
991 	{
992 	  ret = REG_NOMATCH;
993 	  goto free_return;
994 	}
995     }
996   re_free (mctx->state_log);
997   mctx->state_log = sifted_states;
998   sifted_states = NULL;
999   mctx->last_node = halt_node;
1000   mctx->match_last = match_last;
1001   ret = REG_NOERROR;
1002  free_return:
1003   re_free (sifted_states);
1004   re_free (lim_states);
1005   return ret;
1006 }
1007 
1008 /* Acquire an initial state and return it.
1009    We must select appropriate initial state depending on the context,
1010    since initial states may have constraints like "\<", "^", etc..  */
1011 
1012 static inline re_dfastate_t *
1013 __attribute__ ((always_inline))
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,Idx idx)1014 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1015 			    Idx idx)
1016 {
1017   const re_dfa_t *const dfa = mctx->dfa;
1018   if (dfa->init_state->has_constraint)
1019     {
1020       unsigned int context;
1021       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1022       if (IS_WORD_CONTEXT (context))
1023 	return dfa->init_state_word;
1024       else if (IS_ORDINARY_CONTEXT (context))
1025 	return dfa->init_state;
1026       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1027 	return dfa->init_state_begbuf;
1028       else if (IS_NEWLINE_CONTEXT (context))
1029 	return dfa->init_state_nl;
1030       else if (IS_BEGBUF_CONTEXT (context))
1031 	{
1032 	  /* It is relatively rare case, then calculate on demand.  */
1033 	  return re_acquire_state_context (err, dfa,
1034 					   dfa->init_state->entrance_nodes,
1035 					   context);
1036 	}
1037       else
1038 	/* Must not happen?  */
1039 	return dfa->init_state;
1040     }
1041   else
1042     return dfa->init_state;
1043 }
1044 
1045 /* Check whether the regular expression match input string INPUT or not,
1046    and return the index where the matching end.  Return -1 if
1047    there is no match, and return -2 in case of an error.
1048    FL_LONGEST_MATCH means we want the POSIX longest matching.
1049    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1050    next place where we may want to try matching.
1051    Note that the matcher assumes that the matching starts from the current
1052    index of the buffer.  */
1053 
1054 static Idx
1055 __attribute_warn_unused_result__
check_matching(re_match_context_t * mctx,bool fl_longest_match,Idx * p_match_first)1056 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1057 		Idx *p_match_first)
1058 {
1059   const re_dfa_t *const dfa = mctx->dfa;
1060   reg_errcode_t err;
1061   Idx match = 0;
1062   Idx match_last = -1;
1063   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1064   re_dfastate_t *cur_state;
1065   bool at_init_state = p_match_first != NULL;
1066   Idx next_start_idx = cur_str_idx;
1067 
1068   err = REG_NOERROR;
1069   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1070   /* An initial state must not be NULL (invalid).  */
1071   if (BE (cur_state == NULL, 0))
1072     {
1073       assert (err == REG_ESPACE);
1074       return -2;
1075     }
1076 
1077   if (mctx->state_log != NULL)
1078     {
1079       mctx->state_log[cur_str_idx] = cur_state;
1080 
1081       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1082 	 later.  E.g. Processing back references.  */
1083       if (BE (dfa->nbackref, 0))
1084 	{
1085 	  at_init_state = false;
1086 	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1087 	  if (BE (err != REG_NOERROR, 0))
1088 	    return err;
1089 
1090 	  if (cur_state->has_backref)
1091 	    {
1092 	      err = transit_state_bkref (mctx, &cur_state->nodes);
1093 	      if (BE (err != REG_NOERROR, 0))
1094 		return err;
1095 	    }
1096 	}
1097     }
1098 
1099   /* If the RE accepts NULL string.  */
1100   if (BE (cur_state->halt, 0))
1101     {
1102       if (!cur_state->has_constraint
1103 	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
1104 	{
1105 	  if (!fl_longest_match)
1106 	    return cur_str_idx;
1107 	  else
1108 	    {
1109 	      match_last = cur_str_idx;
1110 	      match = 1;
1111 	    }
1112 	}
1113     }
1114 
1115   while (!re_string_eoi (&mctx->input))
1116     {
1117       re_dfastate_t *old_state = cur_state;
1118       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1119 
1120       if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1121 	   && mctx->input.bufs_len < mctx->input.len)
1122 	  || (BE (next_char_idx >= mctx->input.valid_len, 0)
1123 	      && mctx->input.valid_len < mctx->input.len))
1124 	{
1125 	  err = extend_buffers (mctx, next_char_idx + 1);
1126 	  if (BE (err != REG_NOERROR, 0))
1127 	    {
1128 	      assert (err == REG_ESPACE);
1129 	      return -2;
1130 	    }
1131 	}
1132 
1133       cur_state = transit_state (&err, mctx, cur_state);
1134       if (mctx->state_log != NULL)
1135 	cur_state = merge_state_with_log (&err, mctx, cur_state);
1136 
1137       if (cur_state == NULL)
1138 	{
1139 	  /* Reached the invalid state or an error.  Try to recover a valid
1140 	     state using the state log, if available and if we have not
1141 	     already found a valid (even if not the longest) match.  */
1142 	  if (BE (err != REG_NOERROR, 0))
1143 	    return -2;
1144 
1145 	  if (mctx->state_log == NULL
1146 	      || (match && !fl_longest_match)
1147 	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
1148 	    break;
1149 	}
1150 
1151       if (BE (at_init_state, 0))
1152 	{
1153 	  if (old_state == cur_state)
1154 	    next_start_idx = next_char_idx;
1155 	  else
1156 	    at_init_state = false;
1157 	}
1158 
1159       if (cur_state->halt)
1160 	{
1161 	  /* Reached a halt state.
1162 	     Check the halt state can satisfy the current context.  */
1163 	  if (!cur_state->has_constraint
1164 	      || check_halt_state_context (mctx, cur_state,
1165 					   re_string_cur_idx (&mctx->input)))
1166 	    {
1167 	      /* We found an appropriate halt state.  */
1168 	      match_last = re_string_cur_idx (&mctx->input);
1169 	      match = 1;
1170 
1171 	      /* We found a match, do not modify match_first below.  */
1172 	      p_match_first = NULL;
1173 	      if (!fl_longest_match)
1174 		break;
1175 	    }
1176 	}
1177     }
1178 
1179   if (p_match_first)
1180     *p_match_first += next_start_idx;
1181 
1182   return match_last;
1183 }
1184 
1185 /* Check NODE match the current context.  */
1186 
1187 static bool
check_halt_node_context(const re_dfa_t * dfa,Idx node,unsigned int context)1188 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1189 {
1190   re_token_type_t type = dfa->nodes[node].type;
1191   unsigned int constraint = dfa->nodes[node].constraint;
1192   if (type != END_OF_RE)
1193     return false;
1194   if (!constraint)
1195     return true;
1196   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1197     return false;
1198   return true;
1199 }
1200 
1201 /* Check the halt state STATE match the current context.
1202    Return 0 if not match, if the node, STATE has, is a halt node and
1203    match the context, return the node.  */
1204 
1205 static Idx
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,Idx idx)1206 check_halt_state_context (const re_match_context_t *mctx,
1207 			  const re_dfastate_t *state, Idx idx)
1208 {
1209   Idx i;
1210   unsigned int context;
1211 #ifdef DEBUG
1212   assert (state->halt);
1213 #endif
1214   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1215   for (i = 0; i < state->nodes.nelem; ++i)
1216     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1217       return state->nodes.elems[i];
1218   return 0;
1219 }
1220 
1221 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1222    corresponding to the DFA).
1223    Return the destination node, and update EPS_VIA_NODES;
1224    return -1 in case of errors.  */
1225 
1226 static Idx
proceed_next_node(const re_match_context_t * mctx,Idx nregs,regmatch_t * regs,Idx * pidx,Idx node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)1227 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1228 		   Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1229 		   struct re_fail_stack_t *fs)
1230 {
1231   const re_dfa_t *const dfa = mctx->dfa;
1232   Idx i;
1233   bool ok;
1234   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1235     {
1236       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1237       re_node_set *edests = &dfa->edests[node];
1238       Idx dest_node;
1239       ok = re_node_set_insert (eps_via_nodes, node);
1240       if (BE (! ok, 0))
1241 	return -2;
1242       /* Pick up a valid destination, or return -1 if none
1243 	 is found.  */
1244       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1245 	{
1246 	  Idx candidate = edests->elems[i];
1247 	  if (!re_node_set_contains (cur_nodes, candidate))
1248 	    continue;
1249           if (dest_node == -1)
1250 	    dest_node = candidate;
1251 
1252 	  else
1253 	    {
1254 	      /* In order to avoid infinite loop like "(a*)*", return the second
1255 		 epsilon-transition if the first was already considered.  */
1256 	      if (re_node_set_contains (eps_via_nodes, dest_node))
1257 		return candidate;
1258 
1259 	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
1260 	      else if (fs != NULL
1261 		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1262 					   eps_via_nodes))
1263 		return -2;
1264 
1265 	      /* We know we are going to exit.  */
1266 	      break;
1267 	    }
1268 	}
1269       return dest_node;
1270     }
1271   else
1272     {
1273       Idx naccepted = 0;
1274       re_token_type_t type = dfa->nodes[node].type;
1275 
1276 #ifdef RE_ENABLE_I18N
1277       if (dfa->nodes[node].accept_mb)
1278 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1279       else
1280 #endif /* RE_ENABLE_I18N */
1281       if (type == OP_BACK_REF)
1282 	{
1283 	  Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1284 	  naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1285 	  if (fs != NULL)
1286 	    {
1287 	      if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1288 		return -1;
1289 	      else if (naccepted)
1290 		{
1291 		  char *buf = (char *) re_string_get_buffer (&mctx->input);
1292 		  if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1293 			      naccepted) != 0)
1294 		    return -1;
1295 		}
1296 	    }
1297 
1298 	  if (naccepted == 0)
1299 	    {
1300 	      Idx dest_node;
1301 	      ok = re_node_set_insert (eps_via_nodes, node);
1302 	      if (BE (! ok, 0))
1303 		return -2;
1304 	      dest_node = dfa->edests[node].elems[0];
1305 	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1306 					dest_node))
1307 		return dest_node;
1308 	    }
1309 	}
1310 
1311       if (naccepted != 0
1312 	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
1313 	{
1314 	  Idx dest_node = dfa->nexts[node];
1315 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1316 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1317 		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1318 					       dest_node)))
1319 	    return -1;
1320 	  re_node_set_empty (eps_via_nodes);
1321 	  return dest_node;
1322 	}
1323     }
1324   return -1;
1325 }
1326 
1327 static reg_errcode_t
1328 __attribute_warn_unused_result__
push_fail_stack(struct re_fail_stack_t * fs,Idx str_idx,Idx dest_node,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1329 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1330 		 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1331 {
1332   reg_errcode_t err;
1333   Idx num = fs->num++;
1334   if (fs->num == fs->alloc)
1335     {
1336       struct re_fail_stack_ent_t *new_array;
1337       new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1338                               fs->alloc * 2);
1339       if (new_array == NULL)
1340 	return REG_ESPACE;
1341       fs->alloc *= 2;
1342       fs->stack = new_array;
1343     }
1344   fs->stack[num].idx = str_idx;
1345   fs->stack[num].node = dest_node;
1346   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1347   if (fs->stack[num].regs == NULL)
1348     return REG_ESPACE;
1349   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1350   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1351   return err;
1352 }
1353 
1354 static Idx
pop_fail_stack(struct re_fail_stack_t * fs,Idx * pidx,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1355 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1356 		regmatch_t *regs, re_node_set *eps_via_nodes)
1357 {
1358   Idx num = --fs->num;
1359   assert (num >= 0);
1360   *pidx = fs->stack[num].idx;
1361   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1362   re_node_set_free (eps_via_nodes);
1363   re_free (fs->stack[num].regs);
1364   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1365   return fs->stack[num].node;
1366 }
1367 
1368 /* Set the positions where the subexpressions are starts/ends to registers
1369    PMATCH.
1370    Note: We assume that pmatch[0] is already set, and
1371    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1372 
1373 static reg_errcode_t
1374 __attribute_warn_unused_result__
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,bool fl_backtrack)1375 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1376 	  regmatch_t *pmatch, bool fl_backtrack)
1377 {
1378   const re_dfa_t *dfa = preg->buffer;
1379   Idx idx, cur_node;
1380   re_node_set eps_via_nodes;
1381   struct re_fail_stack_t *fs;
1382   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1383   regmatch_t *prev_idx_match;
1384   bool prev_idx_match_malloced = false;
1385 
1386 #ifdef DEBUG
1387   assert (nmatch > 1);
1388   assert (mctx->state_log != NULL);
1389 #endif
1390   if (fl_backtrack)
1391     {
1392       fs = &fs_body;
1393       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1394       if (fs->stack == NULL)
1395 	return REG_ESPACE;
1396     }
1397   else
1398     fs = NULL;
1399 
1400   cur_node = dfa->init_node;
1401   re_node_set_init_empty (&eps_via_nodes);
1402 
1403   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1404     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1405   else
1406     {
1407       prev_idx_match = re_malloc (regmatch_t, nmatch);
1408       if (prev_idx_match == NULL)
1409 	{
1410 	  free_fail_stack_return (fs);
1411 	  return REG_ESPACE;
1412 	}
1413       prev_idx_match_malloced = true;
1414     }
1415   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1416 
1417   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1418     {
1419       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1420 
1421       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1422 	{
1423 	  Idx reg_idx;
1424 	  if (fs)
1425 	    {
1426 	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1427 		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1428 		  break;
1429 	      if (reg_idx == nmatch)
1430 		{
1431 		  re_node_set_free (&eps_via_nodes);
1432 		  if (prev_idx_match_malloced)
1433 		    re_free (prev_idx_match);
1434 		  return free_fail_stack_return (fs);
1435 		}
1436 	      cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1437 					 &eps_via_nodes);
1438 	    }
1439 	  else
1440 	    {
1441 	      re_node_set_free (&eps_via_nodes);
1442 	      if (prev_idx_match_malloced)
1443 		re_free (prev_idx_match);
1444 	      return REG_NOERROR;
1445 	    }
1446 	}
1447 
1448       /* Proceed to next node.  */
1449       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1450 				    &eps_via_nodes, fs);
1451 
1452       if (BE (cur_node < 0, 0))
1453 	{
1454 	  if (BE (cur_node == -2, 0))
1455 	    {
1456 	      re_node_set_free (&eps_via_nodes);
1457 	      if (prev_idx_match_malloced)
1458 		re_free (prev_idx_match);
1459 	      free_fail_stack_return (fs);
1460 	      return REG_ESPACE;
1461 	    }
1462 	  if (fs)
1463 	    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1464 				       &eps_via_nodes);
1465 	  else
1466 	    {
1467 	      re_node_set_free (&eps_via_nodes);
1468 	      if (prev_idx_match_malloced)
1469 		re_free (prev_idx_match);
1470 	      return REG_NOMATCH;
1471 	    }
1472 	}
1473     }
1474   re_node_set_free (&eps_via_nodes);
1475   if (prev_idx_match_malloced)
1476     re_free (prev_idx_match);
1477   return free_fail_stack_return (fs);
1478 }
1479 
1480 static reg_errcode_t
free_fail_stack_return(struct re_fail_stack_t * fs)1481 free_fail_stack_return (struct re_fail_stack_t *fs)
1482 {
1483   if (fs)
1484     {
1485       Idx fs_idx;
1486       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1487 	{
1488 	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1489 	  re_free (fs->stack[fs_idx].regs);
1490 	}
1491       re_free (fs->stack);
1492     }
1493   return REG_NOERROR;
1494 }
1495 
1496 static void
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,Idx cur_node,Idx cur_idx,Idx nmatch)1497 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1498 	     regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1499 {
1500   int type = dfa->nodes[cur_node].type;
1501   if (type == OP_OPEN_SUBEXP)
1502     {
1503       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1504 
1505       /* We are at the first node of this sub expression.  */
1506       if (reg_num < nmatch)
1507 	{
1508 	  pmatch[reg_num].rm_so = cur_idx;
1509 	  pmatch[reg_num].rm_eo = -1;
1510 	}
1511     }
1512   else if (type == OP_CLOSE_SUBEXP)
1513     {
1514       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1515       if (reg_num < nmatch)
1516 	{
1517 	  /* We are at the last node of this sub expression.  */
1518 	  if (pmatch[reg_num].rm_so < cur_idx)
1519 	    {
1520 	      pmatch[reg_num].rm_eo = cur_idx;
1521 	      /* This is a non-empty match or we are not inside an optional
1522 		 subexpression.  Accept this right away.  */
1523 	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1524 	    }
1525 	  else
1526 	    {
1527 	      if (dfa->nodes[cur_node].opt_subexp
1528 		  && prev_idx_match[reg_num].rm_so != -1)
1529 		/* We transited through an empty match for an optional
1530 		   subexpression, like (a?)*, and this is not the subexp's
1531 		   first match.  Copy back the old content of the registers
1532 		   so that matches of an inner subexpression are undone as
1533 		   well, like in ((a?))*.  */
1534 		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1535 	      else
1536 		/* We completed a subexpression, but it may be part of
1537 		   an optional one, so do not update PREV_IDX_MATCH.  */
1538 		pmatch[reg_num].rm_eo = cur_idx;
1539 	    }
1540 	}
1541     }
1542 }
1543 
1544 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1545    and sift the nodes in each states according to the following rules.
1546    Updated state_log will be wrote to STATE_LOG.
1547 
1548    Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1549      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1550 	If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1551 	the LAST_NODE, we throw away the node 'a'.
1552      2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1553 	string 's' and transit to 'b':
1554 	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1555 	   away the node 'a'.
1556 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1557 	    thrown away, we throw away the node 'a'.
1558      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1559 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1560 	   node 'a'.
1561 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1562 	    we throw away the node 'a'.  */
1563 
1564 #define STATE_NODE_CONTAINS(state,node) \
1565   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1566 
1567 static reg_errcode_t
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)1568 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1569 {
1570   reg_errcode_t err;
1571   int null_cnt = 0;
1572   Idx str_idx = sctx->last_str_idx;
1573   re_node_set cur_dest;
1574 
1575 #ifdef DEBUG
1576   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1577 #endif
1578 
1579   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1580      transit to the last_node and the last_node itself.  */
1581   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1582   if (BE (err != REG_NOERROR, 0))
1583     return err;
1584   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1585   if (BE (err != REG_NOERROR, 0))
1586     goto free_return;
1587 
1588   /* Then check each states in the state_log.  */
1589   while (str_idx > 0)
1590     {
1591       /* Update counters.  */
1592       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1593       if (null_cnt > mctx->max_mb_elem_len)
1594 	{
1595 	  memset (sctx->sifted_states, '\0',
1596 		  sizeof (re_dfastate_t *) * str_idx);
1597 	  re_node_set_free (&cur_dest);
1598 	  return REG_NOERROR;
1599 	}
1600       re_node_set_empty (&cur_dest);
1601       --str_idx;
1602 
1603       if (mctx->state_log[str_idx])
1604 	{
1605 	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1606 	  if (BE (err != REG_NOERROR, 0))
1607 	    goto free_return;
1608 	}
1609 
1610       /* Add all the nodes which satisfy the following conditions:
1611 	 - It can epsilon transit to a node in CUR_DEST.
1612 	 - It is in CUR_SRC.
1613 	 And update state_log.  */
1614       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1615       if (BE (err != REG_NOERROR, 0))
1616 	goto free_return;
1617     }
1618   err = REG_NOERROR;
1619  free_return:
1620   re_node_set_free (&cur_dest);
1621   return err;
1622 }
1623 
1624 static reg_errcode_t
1625 __attribute_warn_unused_result__
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * cur_dest)1626 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1627 		     Idx str_idx, re_node_set *cur_dest)
1628 {
1629   const re_dfa_t *const dfa = mctx->dfa;
1630   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1631   Idx i;
1632 
1633   /* Then build the next sifted state.
1634      We build the next sifted state on 'cur_dest', and update
1635      'sifted_states[str_idx]' with 'cur_dest'.
1636      Note:
1637      'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1638      'cur_src' points the node_set of the old 'state_log[str_idx]'
1639      (with the epsilon nodes pre-filtered out).  */
1640   for (i = 0; i < cur_src->nelem; i++)
1641     {
1642       Idx prev_node = cur_src->elems[i];
1643       int naccepted = 0;
1644       bool ok;
1645 
1646 #ifdef DEBUG
1647       re_token_type_t type = dfa->nodes[prev_node].type;
1648       assert (!IS_EPSILON_NODE (type));
1649 #endif
1650 #ifdef RE_ENABLE_I18N
1651       /* If the node may accept "multi byte".  */
1652       if (dfa->nodes[prev_node].accept_mb)
1653 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1654 					 str_idx, sctx->last_str_idx);
1655 #endif /* RE_ENABLE_I18N */
1656 
1657       /* We don't check backreferences here.
1658 	 See update_cur_sifted_state().  */
1659       if (!naccepted
1660 	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1661 	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1662 				  dfa->nexts[prev_node]))
1663 	naccepted = 1;
1664 
1665       if (naccepted == 0)
1666 	continue;
1667 
1668       if (sctx->limits.nelem)
1669 	{
1670 	  Idx to_idx = str_idx + naccepted;
1671 	  if (check_dst_limits (mctx, &sctx->limits,
1672 				dfa->nexts[prev_node], to_idx,
1673 				prev_node, str_idx))
1674 	    continue;
1675 	}
1676       ok = re_node_set_insert (cur_dest, prev_node);
1677       if (BE (! ok, 0))
1678 	return REG_ESPACE;
1679     }
1680 
1681   return REG_NOERROR;
1682 }
1683 
1684 /* Helper functions.  */
1685 
1686 static reg_errcode_t
clean_state_log_if_needed(re_match_context_t * mctx,Idx next_state_log_idx)1687 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1688 {
1689   Idx top = mctx->state_log_top;
1690 
1691   if ((next_state_log_idx >= mctx->input.bufs_len
1692        && mctx->input.bufs_len < mctx->input.len)
1693       || (next_state_log_idx >= mctx->input.valid_len
1694 	  && mctx->input.valid_len < mctx->input.len))
1695     {
1696       reg_errcode_t err;
1697       err = extend_buffers (mctx, next_state_log_idx + 1);
1698       if (BE (err != REG_NOERROR, 0))
1699 	return err;
1700     }
1701 
1702   if (top < next_state_log_idx)
1703     {
1704       memset (mctx->state_log + top + 1, '\0',
1705 	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1706       mctx->state_log_top = next_state_log_idx;
1707     }
1708   return REG_NOERROR;
1709 }
1710 
1711 static reg_errcode_t
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,Idx num)1712 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1713 		   re_dfastate_t **src, Idx num)
1714 {
1715   Idx st_idx;
1716   reg_errcode_t err;
1717   for (st_idx = 0; st_idx < num; ++st_idx)
1718     {
1719       if (dst[st_idx] == NULL)
1720 	dst[st_idx] = src[st_idx];
1721       else if (src[st_idx] != NULL)
1722 	{
1723 	  re_node_set merged_set;
1724 	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1725 					&src[st_idx]->nodes);
1726 	  if (BE (err != REG_NOERROR, 0))
1727 	    return err;
1728 	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1729 	  re_node_set_free (&merged_set);
1730 	  if (BE (err != REG_NOERROR, 0))
1731 	    return err;
1732 	}
1733     }
1734   return REG_NOERROR;
1735 }
1736 
1737 static reg_errcode_t
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * dest_nodes)1738 update_cur_sifted_state (const re_match_context_t *mctx,
1739 			 re_sift_context_t *sctx, Idx str_idx,
1740 			 re_node_set *dest_nodes)
1741 {
1742   const re_dfa_t *const dfa = mctx->dfa;
1743   reg_errcode_t err = REG_NOERROR;
1744   const re_node_set *candidates;
1745   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1746 		: &mctx->state_log[str_idx]->nodes);
1747 
1748   if (dest_nodes->nelem == 0)
1749     sctx->sifted_states[str_idx] = NULL;
1750   else
1751     {
1752       if (candidates)
1753 	{
1754 	  /* At first, add the nodes which can epsilon transit to a node in
1755 	     DEST_NODE.  */
1756 	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1757 	  if (BE (err != REG_NOERROR, 0))
1758 	    return err;
1759 
1760 	  /* Then, check the limitations in the current sift_context.  */
1761 	  if (sctx->limits.nelem)
1762 	    {
1763 	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1764 					 mctx->bkref_ents, str_idx);
1765 	      if (BE (err != REG_NOERROR, 0))
1766 		return err;
1767 	    }
1768 	}
1769 
1770       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1771       if (BE (err != REG_NOERROR, 0))
1772 	return err;
1773     }
1774 
1775   if (candidates && mctx->state_log[str_idx]->has_backref)
1776     {
1777       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1778       if (BE (err != REG_NOERROR, 0))
1779 	return err;
1780     }
1781   return REG_NOERROR;
1782 }
1783 
1784 static reg_errcode_t
1785 __attribute_warn_unused_result__
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)1786 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1787 		       const re_node_set *candidates)
1788 {
1789   reg_errcode_t err = REG_NOERROR;
1790   Idx i;
1791 
1792   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1793   if (BE (err != REG_NOERROR, 0))
1794     return err;
1795 
1796   if (!state->inveclosure.alloc)
1797     {
1798       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1799       if (BE (err != REG_NOERROR, 0))
1800 	return REG_ESPACE;
1801       for (i = 0; i < dest_nodes->nelem; i++)
1802 	{
1803 	  err = re_node_set_merge (&state->inveclosure,
1804 				   dfa->inveclosures + dest_nodes->elems[i]);
1805 	  if (BE (err != REG_NOERROR, 0))
1806 	    return REG_ESPACE;
1807 	}
1808     }
1809   return re_node_set_add_intersect (dest_nodes, candidates,
1810 				    &state->inveclosure);
1811 }
1812 
1813 static reg_errcode_t
sub_epsilon_src_nodes(const re_dfa_t * dfa,Idx node,re_node_set * dest_nodes,const re_node_set * candidates)1814 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1815 		       const re_node_set *candidates)
1816 {
1817     Idx ecl_idx;
1818     reg_errcode_t err;
1819     re_node_set *inv_eclosure = dfa->inveclosures + node;
1820     re_node_set except_nodes;
1821     re_node_set_init_empty (&except_nodes);
1822     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1823       {
1824 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1825 	if (cur_node == node)
1826 	  continue;
1827 	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1828 	  {
1829 	    Idx edst1 = dfa->edests[cur_node].elems[0];
1830 	    Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1831 			 ? dfa->edests[cur_node].elems[1] : -1);
1832 	    if ((!re_node_set_contains (inv_eclosure, edst1)
1833 		 && re_node_set_contains (dest_nodes, edst1))
1834 		|| (edst2 > 0
1835 		    && !re_node_set_contains (inv_eclosure, edst2)
1836 		    && re_node_set_contains (dest_nodes, edst2)))
1837 	      {
1838 		err = re_node_set_add_intersect (&except_nodes, candidates,
1839 						 dfa->inveclosures + cur_node);
1840 		if (BE (err != REG_NOERROR, 0))
1841 		  {
1842 		    re_node_set_free (&except_nodes);
1843 		    return err;
1844 		  }
1845 	      }
1846 	  }
1847       }
1848     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1849       {
1850 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1851 	if (!re_node_set_contains (&except_nodes, cur_node))
1852 	  {
1853 	    Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1854 	    re_node_set_remove_at (dest_nodes, idx);
1855 	  }
1856       }
1857     re_node_set_free (&except_nodes);
1858     return REG_NOERROR;
1859 }
1860 
1861 static bool
check_dst_limits(const re_match_context_t * mctx,const re_node_set * limits,Idx dst_node,Idx dst_idx,Idx src_node,Idx src_idx)1862 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1863 		  Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1864 {
1865   const re_dfa_t *const dfa = mctx->dfa;
1866   Idx lim_idx, src_pos, dst_pos;
1867 
1868   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1869   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1870   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1871     {
1872       Idx subexp_idx;
1873       struct re_backref_cache_entry *ent;
1874       ent = mctx->bkref_ents + limits->elems[lim_idx];
1875       subexp_idx = dfa->nodes[ent->node].opr.idx;
1876 
1877       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1878 					   subexp_idx, dst_node, dst_idx,
1879 					   dst_bkref_idx);
1880       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1881 					   subexp_idx, src_node, src_idx,
1882 					   src_bkref_idx);
1883 
1884       /* In case of:
1885 	 <src> <dst> ( <subexp> )
1886 	 ( <subexp> ) <src> <dst>
1887 	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1888       if (src_pos == dst_pos)
1889 	continue; /* This is unrelated limitation.  */
1890       else
1891 	return true;
1892     }
1893   return false;
1894 }
1895 
1896 static int
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,Idx subexp_idx,Idx from_node,Idx bkref_idx)1897 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1898 			     Idx subexp_idx, Idx from_node, Idx bkref_idx)
1899 {
1900   const re_dfa_t *const dfa = mctx->dfa;
1901   const re_node_set *eclosures = dfa->eclosures + from_node;
1902   Idx node_idx;
1903 
1904   /* Else, we are on the boundary: examine the nodes on the epsilon
1905      closure.  */
1906   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1907     {
1908       Idx node = eclosures->elems[node_idx];
1909       switch (dfa->nodes[node].type)
1910 	{
1911 	case OP_BACK_REF:
1912 	  if (bkref_idx != -1)
1913 	    {
1914 	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1915 	      do
1916 		{
1917 		  Idx dst;
1918 		  int cpos;
1919 
1920 		  if (ent->node != node)
1921 		    continue;
1922 
1923 		  if (subexp_idx < BITSET_WORD_BITS
1924 		      && !(ent->eps_reachable_subexps_map
1925 			   & ((bitset_word_t) 1 << subexp_idx)))
1926 		    continue;
1927 
1928 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1929 		     OP_CLOSE_SUBEXP cases below.  But, if the
1930 		     destination node is the same node as the source
1931 		     node, don't recurse because it would cause an
1932 		     infinite loop: a regex that exhibits this behavior
1933 		     is ()\1*\1*  */
1934 		  dst = dfa->edests[node].elems[0];
1935 		  if (dst == from_node)
1936 		    {
1937 		      if (boundaries & 1)
1938 			return -1;
1939 		      else /* if (boundaries & 2) */
1940 			return 0;
1941 		    }
1942 
1943 		  cpos =
1944 		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1945 						 dst, bkref_idx);
1946 		  if (cpos == -1 /* && (boundaries & 1) */)
1947 		    return -1;
1948 		  if (cpos == 0 && (boundaries & 2))
1949 		    return 0;
1950 
1951 		  if (subexp_idx < BITSET_WORD_BITS)
1952 		    ent->eps_reachable_subexps_map
1953 		      &= ~((bitset_word_t) 1 << subexp_idx);
1954 		}
1955 	      while (ent++->more);
1956 	    }
1957 	  break;
1958 
1959 	case OP_OPEN_SUBEXP:
1960 	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1961 	    return -1;
1962 	  break;
1963 
1964 	case OP_CLOSE_SUBEXP:
1965 	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1966 	    return 0;
1967 	  break;
1968 
1969 	default:
1970 	    break;
1971 	}
1972     }
1973 
1974   return (boundaries & 2) ? 1 : 0;
1975 }
1976 
1977 static int
check_dst_limits_calc_pos(const re_match_context_t * mctx,Idx limit,Idx subexp_idx,Idx from_node,Idx str_idx,Idx bkref_idx)1978 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1979 			   Idx subexp_idx, Idx from_node, Idx str_idx,
1980 			   Idx bkref_idx)
1981 {
1982   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1983   int boundaries;
1984 
1985   /* If we are outside the range of the subexpression, return -1 or 1.  */
1986   if (str_idx < lim->subexp_from)
1987     return -1;
1988 
1989   if (lim->subexp_to < str_idx)
1990     return 1;
1991 
1992   /* If we are within the subexpression, return 0.  */
1993   boundaries = (str_idx == lim->subexp_from);
1994   boundaries |= (str_idx == lim->subexp_to) << 1;
1995   if (boundaries == 0)
1996     return 0;
1997 
1998   /* Else, examine epsilon closure.  */
1999   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2000 				      from_node, bkref_idx);
2001 }
2002 
2003 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2004    which are against limitations from DEST_NODES. */
2005 
2006 static reg_errcode_t
check_subexp_limits(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,Idx str_idx)2007 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2008 		     const re_node_set *candidates, re_node_set *limits,
2009 		     struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2010 {
2011   reg_errcode_t err;
2012   Idx node_idx, lim_idx;
2013 
2014   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2015     {
2016       Idx subexp_idx;
2017       struct re_backref_cache_entry *ent;
2018       ent = bkref_ents + limits->elems[lim_idx];
2019 
2020       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2021 	continue; /* This is unrelated limitation.  */
2022 
2023       subexp_idx = dfa->nodes[ent->node].opr.idx;
2024       if (ent->subexp_to == str_idx)
2025 	{
2026 	  Idx ops_node = -1;
2027 	  Idx cls_node = -1;
2028 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2029 	    {
2030 	      Idx node = dest_nodes->elems[node_idx];
2031 	      re_token_type_t type = dfa->nodes[node].type;
2032 	      if (type == OP_OPEN_SUBEXP
2033 		  && subexp_idx == dfa->nodes[node].opr.idx)
2034 		ops_node = node;
2035 	      else if (type == OP_CLOSE_SUBEXP
2036 		       && subexp_idx == dfa->nodes[node].opr.idx)
2037 		cls_node = node;
2038 	    }
2039 
2040 	  /* Check the limitation of the open subexpression.  */
2041 	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2042 	  if (ops_node >= 0)
2043 	    {
2044 	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2045 					   candidates);
2046 	      if (BE (err != REG_NOERROR, 0))
2047 		return err;
2048 	    }
2049 
2050 	  /* Check the limitation of the close subexpression.  */
2051 	  if (cls_node >= 0)
2052 	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2053 	      {
2054 		Idx node = dest_nodes->elems[node_idx];
2055 		if (!re_node_set_contains (dfa->inveclosures + node,
2056 					   cls_node)
2057 		    && !re_node_set_contains (dfa->eclosures + node,
2058 					      cls_node))
2059 		  {
2060 		    /* It is against this limitation.
2061 		       Remove it form the current sifted state.  */
2062 		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2063 						 candidates);
2064 		    if (BE (err != REG_NOERROR, 0))
2065 		      return err;
2066 		    --node_idx;
2067 		  }
2068 	      }
2069 	}
2070       else /* (ent->subexp_to != str_idx)  */
2071 	{
2072 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2073 	    {
2074 	      Idx node = dest_nodes->elems[node_idx];
2075 	      re_token_type_t type = dfa->nodes[node].type;
2076 	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2077 		{
2078 		  if (subexp_idx != dfa->nodes[node].opr.idx)
2079 		    continue;
2080 		  /* It is against this limitation.
2081 		     Remove it form the current sifted state.  */
2082 		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2083 					       candidates);
2084 		  if (BE (err != REG_NOERROR, 0))
2085 		    return err;
2086 		}
2087 	    }
2088 	}
2089     }
2090   return REG_NOERROR;
2091 }
2092 
2093 static reg_errcode_t
2094 __attribute_warn_unused_result__
sift_states_bkref(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,const re_node_set * candidates)2095 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2096 		   Idx str_idx, const re_node_set *candidates)
2097 {
2098   const re_dfa_t *const dfa = mctx->dfa;
2099   reg_errcode_t err;
2100   Idx node_idx, node;
2101   re_sift_context_t local_sctx;
2102   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2103 
2104   if (first_idx == -1)
2105     return REG_NOERROR;
2106 
2107   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2108 
2109   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2110     {
2111       Idx enabled_idx;
2112       re_token_type_t type;
2113       struct re_backref_cache_entry *entry;
2114       node = candidates->elems[node_idx];
2115       type = dfa->nodes[node].type;
2116       /* Avoid infinite loop for the REs like "()\1+".  */
2117       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2118 	continue;
2119       if (type != OP_BACK_REF)
2120 	continue;
2121 
2122       entry = mctx->bkref_ents + first_idx;
2123       enabled_idx = first_idx;
2124       do
2125 	{
2126 	  Idx subexp_len;
2127 	  Idx to_idx;
2128 	  Idx dst_node;
2129 	  bool ok;
2130 	  re_dfastate_t *cur_state;
2131 
2132 	  if (entry->node != node)
2133 	    continue;
2134 	  subexp_len = entry->subexp_to - entry->subexp_from;
2135 	  to_idx = str_idx + subexp_len;
2136 	  dst_node = (subexp_len ? dfa->nexts[node]
2137 		      : dfa->edests[node].elems[0]);
2138 
2139 	  if (to_idx > sctx->last_str_idx
2140 	      || sctx->sifted_states[to_idx] == NULL
2141 	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2142 	      || check_dst_limits (mctx, &sctx->limits, node,
2143 				   str_idx, dst_node, to_idx))
2144 	    continue;
2145 
2146 	  if (local_sctx.sifted_states == NULL)
2147 	    {
2148 	      local_sctx = *sctx;
2149 	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2150 	      if (BE (err != REG_NOERROR, 0))
2151 		goto free_return;
2152 	    }
2153 	  local_sctx.last_node = node;
2154 	  local_sctx.last_str_idx = str_idx;
2155 	  ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2156 	  if (BE (! ok, 0))
2157 	    {
2158 	      err = REG_ESPACE;
2159 	      goto free_return;
2160 	    }
2161 	  cur_state = local_sctx.sifted_states[str_idx];
2162 	  err = sift_states_backward (mctx, &local_sctx);
2163 	  if (BE (err != REG_NOERROR, 0))
2164 	    goto free_return;
2165 	  if (sctx->limited_states != NULL)
2166 	    {
2167 	      err = merge_state_array (dfa, sctx->limited_states,
2168 				       local_sctx.sifted_states,
2169 				       str_idx + 1);
2170 	      if (BE (err != REG_NOERROR, 0))
2171 		goto free_return;
2172 	    }
2173 	  local_sctx.sifted_states[str_idx] = cur_state;
2174 	  re_node_set_remove (&local_sctx.limits, enabled_idx);
2175 
2176 	  /* mctx->bkref_ents may have changed, reload the pointer.  */
2177 	  entry = mctx->bkref_ents + enabled_idx;
2178 	}
2179       while (enabled_idx++, entry++->more);
2180     }
2181   err = REG_NOERROR;
2182  free_return:
2183   if (local_sctx.sifted_states != NULL)
2184     {
2185       re_node_set_free (&local_sctx.limits);
2186     }
2187 
2188   return err;
2189 }
2190 
2191 
2192 #ifdef RE_ENABLE_I18N
2193 static int
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx node_idx,Idx str_idx,Idx max_str_idx)2194 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2195 		     Idx node_idx, Idx str_idx, Idx max_str_idx)
2196 {
2197   const re_dfa_t *const dfa = mctx->dfa;
2198   int naccepted;
2199   /* Check the node can accept "multi byte".  */
2200   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2201   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2202       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2203 			    dfa->nexts[node_idx]))
2204     /* The node can't accept the "multi byte", or the
2205        destination was already thrown away, then the node
2206        could't accept the current input "multi byte".   */
2207     naccepted = 0;
2208   /* Otherwise, it is sure that the node could accept
2209      'naccepted' bytes input.  */
2210   return naccepted;
2211 }
2212 #endif /* RE_ENABLE_I18N */
2213 
2214 
2215 /* Functions for state transition.  */
2216 
2217 /* Return the next state to which the current state STATE will transit by
2218    accepting the current input byte, and update STATE_LOG if necessary.
2219    If STATE can accept a multibyte char/collating element/back reference
2220    update the destination of STATE_LOG.  */
2221 
2222 static re_dfastate_t *
2223 __attribute_warn_unused_result__
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)2224 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2225 	       re_dfastate_t *state)
2226 {
2227   re_dfastate_t **trtable;
2228   unsigned char ch;
2229 
2230 #ifdef RE_ENABLE_I18N
2231   /* If the current state can accept multibyte.  */
2232   if (BE (state->accept_mb, 0))
2233     {
2234       *err = transit_state_mb (mctx, state);
2235       if (BE (*err != REG_NOERROR, 0))
2236 	return NULL;
2237     }
2238 #endif /* RE_ENABLE_I18N */
2239 
2240   /* Then decide the next state with the single byte.  */
2241 #if 0
2242   if (0)
2243     /* don't use transition table  */
2244     return transit_state_sb (err, mctx, state);
2245 #endif
2246 
2247   /* Use transition table  */
2248   ch = re_string_fetch_byte (&mctx->input);
2249   for (;;)
2250     {
2251       trtable = state->trtable;
2252       if (BE (trtable != NULL, 1))
2253 	return trtable[ch];
2254 
2255       trtable = state->word_trtable;
2256       if (BE (trtable != NULL, 1))
2257 	{
2258 	  unsigned int context;
2259 	  context
2260 	    = re_string_context_at (&mctx->input,
2261 				    re_string_cur_idx (&mctx->input) - 1,
2262 				    mctx->eflags);
2263 	  if (IS_WORD_CONTEXT (context))
2264 	    return trtable[ch + SBC_MAX];
2265 	  else
2266 	    return trtable[ch];
2267 	}
2268 
2269       if (!build_trtable (mctx->dfa, state))
2270 	{
2271 	  *err = REG_ESPACE;
2272 	  return NULL;
2273 	}
2274 
2275       /* Retry, we now have a transition table.  */
2276     }
2277 }
2278 
2279 /* Update the state_log if we need */
2280 static re_dfastate_t *
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)2281 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2282 		      re_dfastate_t *next_state)
2283 {
2284   const re_dfa_t *const dfa = mctx->dfa;
2285   Idx cur_idx = re_string_cur_idx (&mctx->input);
2286 
2287   if (cur_idx > mctx->state_log_top)
2288     {
2289       mctx->state_log[cur_idx] = next_state;
2290       mctx->state_log_top = cur_idx;
2291     }
2292   else if (mctx->state_log[cur_idx] == 0)
2293     {
2294       mctx->state_log[cur_idx] = next_state;
2295     }
2296   else
2297     {
2298       re_dfastate_t *pstate;
2299       unsigned int context;
2300       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2301       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2302 	 the destination of a multibyte char/collating element/
2303 	 back reference.  Then the next state is the union set of
2304 	 these destinations and the results of the transition table.  */
2305       pstate = mctx->state_log[cur_idx];
2306       log_nodes = pstate->entrance_nodes;
2307       if (next_state != NULL)
2308 	{
2309 	  table_nodes = next_state->entrance_nodes;
2310 	  *err = re_node_set_init_union (&next_nodes, table_nodes,
2311 					     log_nodes);
2312 	  if (BE (*err != REG_NOERROR, 0))
2313 	    return NULL;
2314 	}
2315       else
2316 	next_nodes = *log_nodes;
2317       /* Note: We already add the nodes of the initial state,
2318 	 then we don't need to add them here.  */
2319 
2320       context = re_string_context_at (&mctx->input,
2321 				      re_string_cur_idx (&mctx->input) - 1,
2322 				      mctx->eflags);
2323       next_state = mctx->state_log[cur_idx]
2324 	= re_acquire_state_context (err, dfa, &next_nodes, context);
2325       /* We don't need to check errors here, since the return value of
2326 	 this function is next_state and ERR is already set.  */
2327 
2328       if (table_nodes != NULL)
2329 	re_node_set_free (&next_nodes);
2330     }
2331 
2332   if (BE (dfa->nbackref, 0) && next_state != NULL)
2333     {
2334       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2335 	 later.  We must check them here, since the back references in the
2336 	 next state might use them.  */
2337       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2338 					cur_idx);
2339       if (BE (*err != REG_NOERROR, 0))
2340 	return NULL;
2341 
2342       /* If the next state has back references.  */
2343       if (next_state->has_backref)
2344 	{
2345 	  *err = transit_state_bkref (mctx, &next_state->nodes);
2346 	  if (BE (*err != REG_NOERROR, 0))
2347 	    return NULL;
2348 	  next_state = mctx->state_log[cur_idx];
2349 	}
2350     }
2351 
2352   return next_state;
2353 }
2354 
2355 /* Skip bytes in the input that correspond to part of a
2356    multi-byte match, then look in the log for a state
2357    from which to restart matching.  */
2358 static re_dfastate_t *
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2359 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2360 {
2361   re_dfastate_t *cur_state;
2362   do
2363     {
2364       Idx max = mctx->state_log_top;
2365       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2366 
2367       do
2368 	{
2369 	  if (++cur_str_idx > max)
2370 	    return NULL;
2371 	  re_string_skip_bytes (&mctx->input, 1);
2372 	}
2373       while (mctx->state_log[cur_str_idx] == NULL);
2374 
2375       cur_state = merge_state_with_log (err, mctx, NULL);
2376     }
2377   while (*err == REG_NOERROR && cur_state == NULL);
2378   return cur_state;
2379 }
2380 
2381 /* Helper functions for transit_state.  */
2382 
2383 /* From the node set CUR_NODES, pick up the nodes whose types are
2384    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2385    expression. And register them to use them later for evaluating the
2386    corresponding back references.  */
2387 
2388 static reg_errcode_t
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,Idx str_idx)2389 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2390 			   Idx str_idx)
2391 {
2392   const re_dfa_t *const dfa = mctx->dfa;
2393   Idx node_idx;
2394   reg_errcode_t err;
2395 
2396   /* TODO: This isn't efficient.
2397 	   Because there might be more than one nodes whose types are
2398 	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2399 	   nodes.
2400 	   E.g. RE: (a){2}  */
2401   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2402     {
2403       Idx node = cur_nodes->elems[node_idx];
2404       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2405 	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2406 	  && (dfa->used_bkref_map
2407 	      & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2408 	{
2409 	  err = match_ctx_add_subtop (mctx, node, str_idx);
2410 	  if (BE (err != REG_NOERROR, 0))
2411 	    return err;
2412 	}
2413     }
2414   return REG_NOERROR;
2415 }
2416 
2417 #if 0
2418 /* Return the next state to which the current state STATE will transit by
2419    accepting the current input byte.  */
2420 
2421 static re_dfastate_t *
2422 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2423 		  re_dfastate_t *state)
2424 {
2425   const re_dfa_t *const dfa = mctx->dfa;
2426   re_node_set next_nodes;
2427   re_dfastate_t *next_state;
2428   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2429   unsigned int context;
2430 
2431   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2432   if (BE (*err != REG_NOERROR, 0))
2433     return NULL;
2434   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2435     {
2436       Idx cur_node = state->nodes.elems[node_cnt];
2437       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2438 	{
2439 	  *err = re_node_set_merge (&next_nodes,
2440 				    dfa->eclosures + dfa->nexts[cur_node]);
2441 	  if (BE (*err != REG_NOERROR, 0))
2442 	    {
2443 	      re_node_set_free (&next_nodes);
2444 	      return NULL;
2445 	    }
2446 	}
2447     }
2448   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2449   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2450   /* We don't need to check errors here, since the return value of
2451      this function is next_state and ERR is already set.  */
2452 
2453   re_node_set_free (&next_nodes);
2454   re_string_skip_bytes (&mctx->input, 1);
2455   return next_state;
2456 }
2457 #endif
2458 
2459 #ifdef RE_ENABLE_I18N
2460 static reg_errcode_t
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)2461 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2462 {
2463   const re_dfa_t *const dfa = mctx->dfa;
2464   reg_errcode_t err;
2465   Idx i;
2466 
2467   for (i = 0; i < pstate->nodes.nelem; ++i)
2468     {
2469       re_node_set dest_nodes, *new_nodes;
2470       Idx cur_node_idx = pstate->nodes.elems[i];
2471       int naccepted;
2472       Idx dest_idx;
2473       unsigned int context;
2474       re_dfastate_t *dest_state;
2475 
2476       if (!dfa->nodes[cur_node_idx].accept_mb)
2477 	continue;
2478 
2479       if (dfa->nodes[cur_node_idx].constraint)
2480 	{
2481 	  context = re_string_context_at (&mctx->input,
2482 					  re_string_cur_idx (&mctx->input),
2483 					  mctx->eflags);
2484 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2485 					   context))
2486 	    continue;
2487 	}
2488 
2489       /* How many bytes the node can accept?  */
2490       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2491 					   re_string_cur_idx (&mctx->input));
2492       if (naccepted == 0)
2493 	continue;
2494 
2495       /* The node can accepts 'naccepted' bytes.  */
2496       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2497       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2498 			       : mctx->max_mb_elem_len);
2499       err = clean_state_log_if_needed (mctx, dest_idx);
2500       if (BE (err != REG_NOERROR, 0))
2501 	return err;
2502 #ifdef DEBUG
2503       assert (dfa->nexts[cur_node_idx] != -1);
2504 #endif
2505       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2506 
2507       dest_state = mctx->state_log[dest_idx];
2508       if (dest_state == NULL)
2509 	dest_nodes = *new_nodes;
2510       else
2511 	{
2512 	  err = re_node_set_init_union (&dest_nodes,
2513 					dest_state->entrance_nodes, new_nodes);
2514 	  if (BE (err != REG_NOERROR, 0))
2515 	    return err;
2516 	}
2517       context = re_string_context_at (&mctx->input, dest_idx - 1,
2518 				      mctx->eflags);
2519       mctx->state_log[dest_idx]
2520 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2521       if (dest_state != NULL)
2522 	re_node_set_free (&dest_nodes);
2523       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2524 	return err;
2525     }
2526   return REG_NOERROR;
2527 }
2528 #endif /* RE_ENABLE_I18N */
2529 
2530 static reg_errcode_t
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)2531 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2532 {
2533   const re_dfa_t *const dfa = mctx->dfa;
2534   reg_errcode_t err;
2535   Idx i;
2536   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2537 
2538   for (i = 0; i < nodes->nelem; ++i)
2539     {
2540       Idx dest_str_idx, prev_nelem, bkc_idx;
2541       Idx node_idx = nodes->elems[i];
2542       unsigned int context;
2543       const re_token_t *node = dfa->nodes + node_idx;
2544       re_node_set *new_dest_nodes;
2545 
2546       /* Check whether 'node' is a backreference or not.  */
2547       if (node->type != OP_BACK_REF)
2548 	continue;
2549 
2550       if (node->constraint)
2551 	{
2552 	  context = re_string_context_at (&mctx->input, cur_str_idx,
2553 					  mctx->eflags);
2554 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2555 	    continue;
2556 	}
2557 
2558       /* 'node' is a backreference.
2559 	 Check the substring which the substring matched.  */
2560       bkc_idx = mctx->nbkref_ents;
2561       err = get_subexp (mctx, node_idx, cur_str_idx);
2562       if (BE (err != REG_NOERROR, 0))
2563 	goto free_return;
2564 
2565       /* And add the epsilon closures (which is 'new_dest_nodes') of
2566 	 the backreference to appropriate state_log.  */
2567 #ifdef DEBUG
2568       assert (dfa->nexts[node_idx] != -1);
2569 #endif
2570       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2571 	{
2572 	  Idx subexp_len;
2573 	  re_dfastate_t *dest_state;
2574 	  struct re_backref_cache_entry *bkref_ent;
2575 	  bkref_ent = mctx->bkref_ents + bkc_idx;
2576 	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2577 	    continue;
2578 	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2579 	  new_dest_nodes = (subexp_len == 0
2580 			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2581 			    : dfa->eclosures + dfa->nexts[node_idx]);
2582 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2583 			  - bkref_ent->subexp_from);
2584 	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2585 					  mctx->eflags);
2586 	  dest_state = mctx->state_log[dest_str_idx];
2587 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2588 			: mctx->state_log[cur_str_idx]->nodes.nelem);
2589 	  /* Add 'new_dest_node' to state_log.  */
2590 	  if (dest_state == NULL)
2591 	    {
2592 	      mctx->state_log[dest_str_idx]
2593 		= re_acquire_state_context (&err, dfa, new_dest_nodes,
2594 					    context);
2595 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2596 		      && err != REG_NOERROR, 0))
2597 		goto free_return;
2598 	    }
2599 	  else
2600 	    {
2601 	      re_node_set dest_nodes;
2602 	      err = re_node_set_init_union (&dest_nodes,
2603 					    dest_state->entrance_nodes,
2604 					    new_dest_nodes);
2605 	      if (BE (err != REG_NOERROR, 0))
2606 		{
2607 		  re_node_set_free (&dest_nodes);
2608 		  goto free_return;
2609 		}
2610 	      mctx->state_log[dest_str_idx]
2611 		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2612 	      re_node_set_free (&dest_nodes);
2613 	      if (BE (mctx->state_log[dest_str_idx] == NULL
2614 		      && err != REG_NOERROR, 0))
2615 		goto free_return;
2616 	    }
2617 	  /* We need to check recursively if the backreference can epsilon
2618 	     transit.  */
2619 	  if (subexp_len == 0
2620 	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2621 	    {
2622 	      err = check_subexp_matching_top (mctx, new_dest_nodes,
2623 					       cur_str_idx);
2624 	      if (BE (err != REG_NOERROR, 0))
2625 		goto free_return;
2626 	      err = transit_state_bkref (mctx, new_dest_nodes);
2627 	      if (BE (err != REG_NOERROR, 0))
2628 		goto free_return;
2629 	    }
2630 	}
2631     }
2632   err = REG_NOERROR;
2633  free_return:
2634   return err;
2635 }
2636 
2637 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2638    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2639    Note that we might collect inappropriate candidates here.
2640    However, the cost of checking them strictly here is too high, then we
2641    delay these checking for prune_impossible_nodes().  */
2642 
2643 static reg_errcode_t
2644 __attribute_warn_unused_result__
get_subexp(re_match_context_t * mctx,Idx bkref_node,Idx bkref_str_idx)2645 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2646 {
2647   const re_dfa_t *const dfa = mctx->dfa;
2648   Idx subexp_num, sub_top_idx;
2649   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2650   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2651   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2652   if (cache_idx != -1)
2653     {
2654       const struct re_backref_cache_entry *entry
2655 	= mctx->bkref_ents + cache_idx;
2656       do
2657 	if (entry->node == bkref_node)
2658 	  return REG_NOERROR; /* We already checked it.  */
2659       while (entry++->more);
2660     }
2661 
2662   subexp_num = dfa->nodes[bkref_node].opr.idx;
2663 
2664   /* For each sub expression  */
2665   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2666     {
2667       reg_errcode_t err;
2668       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2669       re_sub_match_last_t *sub_last;
2670       Idx sub_last_idx, sl_str, bkref_str_off;
2671 
2672       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2673 	continue; /* It isn't related.  */
2674 
2675       sl_str = sub_top->str_idx;
2676       bkref_str_off = bkref_str_idx;
2677       /* At first, check the last node of sub expressions we already
2678 	 evaluated.  */
2679       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2680 	{
2681 	  regoff_t sl_str_diff;
2682 	  sub_last = sub_top->lasts[sub_last_idx];
2683 	  sl_str_diff = sub_last->str_idx - sl_str;
2684 	  /* The matched string by the sub expression match with the substring
2685 	     at the back reference?  */
2686 	  if (sl_str_diff > 0)
2687 	    {
2688 	      if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2689 		{
2690 		  /* Not enough chars for a successful match.  */
2691 		  if (bkref_str_off + sl_str_diff > mctx->input.len)
2692 		    break;
2693 
2694 		  err = clean_state_log_if_needed (mctx,
2695 						   bkref_str_off
2696 						   + sl_str_diff);
2697 		  if (BE (err != REG_NOERROR, 0))
2698 		    return err;
2699 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2700 		}
2701 	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2702 		/* We don't need to search this sub expression any more.  */
2703 		break;
2704 	    }
2705 	  bkref_str_off += sl_str_diff;
2706 	  sl_str += sl_str_diff;
2707 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2708 				bkref_str_idx);
2709 
2710 	  /* Reload buf, since the preceding call might have reallocated
2711 	     the buffer.  */
2712 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2713 
2714 	  if (err == REG_NOMATCH)
2715 	    continue;
2716 	  if (BE (err != REG_NOERROR, 0))
2717 	    return err;
2718 	}
2719 
2720       if (sub_last_idx < sub_top->nlasts)
2721 	continue;
2722       if (sub_last_idx > 0)
2723 	++sl_str;
2724       /* Then, search for the other last nodes of the sub expression.  */
2725       for (; sl_str <= bkref_str_idx; ++sl_str)
2726 	{
2727 	  Idx cls_node;
2728 	  regoff_t sl_str_off;
2729 	  const re_node_set *nodes;
2730 	  sl_str_off = sl_str - sub_top->str_idx;
2731 	  /* The matched string by the sub expression match with the substring
2732 	     at the back reference?  */
2733 	  if (sl_str_off > 0)
2734 	    {
2735 	      if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2736 		{
2737 		  /* If we are at the end of the input, we cannot match.  */
2738 		  if (bkref_str_off >= mctx->input.len)
2739 		    break;
2740 
2741 		  err = extend_buffers (mctx, bkref_str_off + 1);
2742 		  if (BE (err != REG_NOERROR, 0))
2743 		    return err;
2744 
2745 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2746 		}
2747 	      if (buf [bkref_str_off++] != buf[sl_str - 1])
2748 		break; /* We don't need to search this sub expression
2749 			  any more.  */
2750 	    }
2751 	  if (mctx->state_log[sl_str] == NULL)
2752 	    continue;
2753 	  /* Does this state have a ')' of the sub expression?  */
2754 	  nodes = &mctx->state_log[sl_str]->nodes;
2755 	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
2756 				       OP_CLOSE_SUBEXP);
2757 	  if (cls_node == -1)
2758 	    continue; /* No.  */
2759 	  if (sub_top->path == NULL)
2760 	    {
2761 	      sub_top->path = calloc (sizeof (state_array_t),
2762 				      sl_str - sub_top->str_idx + 1);
2763 	      if (sub_top->path == NULL)
2764 		return REG_ESPACE;
2765 	    }
2766 	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2767 	     in the current context?  */
2768 	  err = check_arrival (mctx, sub_top->path, sub_top->node,
2769 			       sub_top->str_idx, cls_node, sl_str,
2770 			       OP_CLOSE_SUBEXP);
2771 	  if (err == REG_NOMATCH)
2772 	      continue;
2773 	  if (BE (err != REG_NOERROR, 0))
2774 	      return err;
2775 	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2776 	  if (BE (sub_last == NULL, 0))
2777 	    return REG_ESPACE;
2778 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2779 				bkref_str_idx);
2780 	  if (err == REG_NOMATCH)
2781 	    continue;
2782 	}
2783     }
2784   return REG_NOERROR;
2785 }
2786 
2787 /* Helper functions for get_subexp().  */
2788 
2789 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2790    If it can arrive, register the sub expression expressed with SUB_TOP
2791    and SUB_LAST.  */
2792 
2793 static reg_errcode_t
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,Idx bkref_node,Idx bkref_str)2794 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2795 		re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2796 {
2797   reg_errcode_t err;
2798   Idx to_idx;
2799   /* Can the subexpression arrive the back reference?  */
2800   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2801 		       sub_last->str_idx, bkref_node, bkref_str,
2802 		       OP_OPEN_SUBEXP);
2803   if (err != REG_NOERROR)
2804     return err;
2805   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2806 			     sub_last->str_idx);
2807   if (BE (err != REG_NOERROR, 0))
2808     return err;
2809   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2810   return clean_state_log_if_needed (mctx, to_idx);
2811 }
2812 
2813 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2814    Search '(' if FL_OPEN, or search ')' otherwise.
2815    TODO: This function isn't efficient...
2816 	 Because there might be more than one nodes whose types are
2817 	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2818 	 nodes.
2819 	 E.g. RE: (a){2}  */
2820 
2821 static Idx
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,Idx subexp_idx,int type)2822 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2823 		  Idx subexp_idx, int type)
2824 {
2825   Idx cls_idx;
2826   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2827     {
2828       Idx cls_node = nodes->elems[cls_idx];
2829       const re_token_t *node = dfa->nodes + cls_node;
2830       if (node->type == type
2831 	  && node->opr.idx == subexp_idx)
2832 	return cls_node;
2833     }
2834   return -1;
2835 }
2836 
2837 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2838    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2839    heavily reused.
2840    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2841 
2842 static reg_errcode_t
2843 __attribute_warn_unused_result__
check_arrival(re_match_context_t * mctx,state_array_t * path,Idx top_node,Idx top_str,Idx last_node,Idx last_str,int type)2844 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2845 	       Idx top_str, Idx last_node, Idx last_str, int type)
2846 {
2847   const re_dfa_t *const dfa = mctx->dfa;
2848   reg_errcode_t err = REG_NOERROR;
2849   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2850   re_dfastate_t *cur_state = NULL;
2851   re_node_set *cur_nodes, next_nodes;
2852   re_dfastate_t **backup_state_log;
2853   unsigned int context;
2854 
2855   subexp_num = dfa->nodes[top_node].opr.idx;
2856   /* Extend the buffer if we need.  */
2857   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2858     {
2859       re_dfastate_t **new_array;
2860       Idx old_alloc = path->alloc;
2861       Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2862       Idx new_alloc;
2863       if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
2864 	return REG_ESPACE;
2865       new_alloc = old_alloc + incr_alloc;
2866       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2867 	return REG_ESPACE;
2868       new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2869       if (BE (new_array == NULL, 0))
2870 	return REG_ESPACE;
2871       path->array = new_array;
2872       path->alloc = new_alloc;
2873       memset (new_array + old_alloc, '\0',
2874 	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2875     }
2876 
2877   str_idx = path->next_idx ? path->next_idx : top_str;
2878 
2879   /* Temporary modify MCTX.  */
2880   backup_state_log = mctx->state_log;
2881   backup_cur_idx = mctx->input.cur_idx;
2882   mctx->state_log = path->array;
2883   mctx->input.cur_idx = str_idx;
2884 
2885   /* Setup initial node set.  */
2886   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2887   if (str_idx == top_str)
2888     {
2889       err = re_node_set_init_1 (&next_nodes, top_node);
2890       if (BE (err != REG_NOERROR, 0))
2891 	return err;
2892       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2893       if (BE (err != REG_NOERROR, 0))
2894 	{
2895 	  re_node_set_free (&next_nodes);
2896 	  return err;
2897 	}
2898     }
2899   else
2900     {
2901       cur_state = mctx->state_log[str_idx];
2902       if (cur_state && cur_state->has_backref)
2903 	{
2904 	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2905 	  if (BE (err != REG_NOERROR, 0))
2906 	    return err;
2907 	}
2908       else
2909 	re_node_set_init_empty (&next_nodes);
2910     }
2911   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2912     {
2913       if (next_nodes.nelem)
2914 	{
2915 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2916 				    subexp_num, type);
2917 	  if (BE (err != REG_NOERROR, 0))
2918 	    {
2919 	      re_node_set_free (&next_nodes);
2920 	      return err;
2921 	    }
2922 	}
2923       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2924       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2925 	{
2926 	  re_node_set_free (&next_nodes);
2927 	  return err;
2928 	}
2929       mctx->state_log[str_idx] = cur_state;
2930     }
2931 
2932   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2933     {
2934       re_node_set_empty (&next_nodes);
2935       if (mctx->state_log[str_idx + 1])
2936 	{
2937 	  err = re_node_set_merge (&next_nodes,
2938 				   &mctx->state_log[str_idx + 1]->nodes);
2939 	  if (BE (err != REG_NOERROR, 0))
2940 	    {
2941 	      re_node_set_free (&next_nodes);
2942 	      return err;
2943 	    }
2944 	}
2945       if (cur_state)
2946 	{
2947 	  err = check_arrival_add_next_nodes (mctx, str_idx,
2948 					      &cur_state->non_eps_nodes,
2949 					      &next_nodes);
2950 	  if (BE (err != REG_NOERROR, 0))
2951 	    {
2952 	      re_node_set_free (&next_nodes);
2953 	      return err;
2954 	    }
2955 	}
2956       ++str_idx;
2957       if (next_nodes.nelem)
2958 	{
2959 	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2960 	  if (BE (err != REG_NOERROR, 0))
2961 	    {
2962 	      re_node_set_free (&next_nodes);
2963 	      return err;
2964 	    }
2965 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2966 				    subexp_num, type);
2967 	  if (BE (err != REG_NOERROR, 0))
2968 	    {
2969 	      re_node_set_free (&next_nodes);
2970 	      return err;
2971 	    }
2972 	}
2973       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2974       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2975       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2976 	{
2977 	  re_node_set_free (&next_nodes);
2978 	  return err;
2979 	}
2980       mctx->state_log[str_idx] = cur_state;
2981       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2982     }
2983   re_node_set_free (&next_nodes);
2984   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2985 	       : &mctx->state_log[last_str]->nodes);
2986   path->next_idx = str_idx;
2987 
2988   /* Fix MCTX.  */
2989   mctx->state_log = backup_state_log;
2990   mctx->input.cur_idx = backup_cur_idx;
2991 
2992   /* Then check the current node set has the node LAST_NODE.  */
2993   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2994     return REG_NOERROR;
2995 
2996   return REG_NOMATCH;
2997 }
2998 
2999 /* Helper functions for check_arrival.  */
3000 
3001 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3002    to NEXT_NODES.
3003    TODO: This function is similar to the functions transit_state*(),
3004 	 however this function has many additional works.
3005 	 Can't we unify them?  */
3006 
3007 static reg_errcode_t
3008 __attribute_warn_unused_result__
check_arrival_add_next_nodes(re_match_context_t * mctx,Idx str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)3009 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3010 			      re_node_set *cur_nodes, re_node_set *next_nodes)
3011 {
3012   const re_dfa_t *const dfa = mctx->dfa;
3013   bool ok;
3014   Idx cur_idx;
3015 #ifdef RE_ENABLE_I18N
3016   reg_errcode_t err = REG_NOERROR;
3017 #endif
3018   re_node_set union_set;
3019   re_node_set_init_empty (&union_set);
3020   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3021     {
3022       int naccepted = 0;
3023       Idx cur_node = cur_nodes->elems[cur_idx];
3024 #ifdef DEBUG
3025       re_token_type_t type = dfa->nodes[cur_node].type;
3026       assert (!IS_EPSILON_NODE (type));
3027 #endif
3028 #ifdef RE_ENABLE_I18N
3029       /* If the node may accept "multi byte".  */
3030       if (dfa->nodes[cur_node].accept_mb)
3031 	{
3032 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3033 					       str_idx);
3034 	  if (naccepted > 1)
3035 	    {
3036 	      re_dfastate_t *dest_state;
3037 	      Idx next_node = dfa->nexts[cur_node];
3038 	      Idx next_idx = str_idx + naccepted;
3039 	      dest_state = mctx->state_log[next_idx];
3040 	      re_node_set_empty (&union_set);
3041 	      if (dest_state)
3042 		{
3043 		  err = re_node_set_merge (&union_set, &dest_state->nodes);
3044 		  if (BE (err != REG_NOERROR, 0))
3045 		    {
3046 		      re_node_set_free (&union_set);
3047 		      return err;
3048 		    }
3049 		}
3050 	      ok = re_node_set_insert (&union_set, next_node);
3051 	      if (BE (! ok, 0))
3052 		{
3053 		  re_node_set_free (&union_set);
3054 		  return REG_ESPACE;
3055 		}
3056 	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3057 							    &union_set);
3058 	      if (BE (mctx->state_log[next_idx] == NULL
3059 		      && err != REG_NOERROR, 0))
3060 		{
3061 		  re_node_set_free (&union_set);
3062 		  return err;
3063 		}
3064 	    }
3065 	}
3066 #endif /* RE_ENABLE_I18N */
3067       if (naccepted
3068 	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3069 	{
3070 	  ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3071 	  if (BE (! ok, 0))
3072 	    {
3073 	      re_node_set_free (&union_set);
3074 	      return REG_ESPACE;
3075 	    }
3076 	}
3077     }
3078   re_node_set_free (&union_set);
3079   return REG_NOERROR;
3080 }
3081 
3082 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3083    CUR_NODES, however exclude the nodes which are:
3084     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3085     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3086 */
3087 
3088 static reg_errcode_t
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,Idx ex_subexp,int type)3089 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3090 			  Idx ex_subexp, int type)
3091 {
3092   reg_errcode_t err;
3093   Idx idx, outside_node;
3094   re_node_set new_nodes;
3095 #ifdef DEBUG
3096   assert (cur_nodes->nelem);
3097 #endif
3098   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3099   if (BE (err != REG_NOERROR, 0))
3100     return err;
3101   /* Create a new node set NEW_NODES with the nodes which are epsilon
3102      closures of the node in CUR_NODES.  */
3103 
3104   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3105     {
3106       Idx cur_node = cur_nodes->elems[idx];
3107       const re_node_set *eclosure = dfa->eclosures + cur_node;
3108       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3109       if (outside_node == -1)
3110 	{
3111 	  /* There are no problematic nodes, just merge them.  */
3112 	  err = re_node_set_merge (&new_nodes, eclosure);
3113 	  if (BE (err != REG_NOERROR, 0))
3114 	    {
3115 	      re_node_set_free (&new_nodes);
3116 	      return err;
3117 	    }
3118 	}
3119       else
3120 	{
3121 	  /* There are problematic nodes, re-calculate incrementally.  */
3122 	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3123 					      ex_subexp, type);
3124 	  if (BE (err != REG_NOERROR, 0))
3125 	    {
3126 	      re_node_set_free (&new_nodes);
3127 	      return err;
3128 	    }
3129 	}
3130     }
3131   re_node_set_free (cur_nodes);
3132   *cur_nodes = new_nodes;
3133   return REG_NOERROR;
3134 }
3135 
3136 /* Helper function for check_arrival_expand_ecl.
3137    Check incrementally the epsilon closure of TARGET, and if it isn't
3138    problematic append it to DST_NODES.  */
3139 
3140 static reg_errcode_t
3141 __attribute_warn_unused_result__
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,Idx target,Idx ex_subexp,int type)3142 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3143 			      Idx target, Idx ex_subexp, int type)
3144 {
3145   Idx cur_node;
3146   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3147     {
3148       bool ok;
3149 
3150       if (dfa->nodes[cur_node].type == type
3151 	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
3152 	{
3153 	  if (type == OP_CLOSE_SUBEXP)
3154 	    {
3155 	      ok = re_node_set_insert (dst_nodes, cur_node);
3156 	      if (BE (! ok, 0))
3157 		return REG_ESPACE;
3158 	    }
3159 	  break;
3160 	}
3161       ok = re_node_set_insert (dst_nodes, cur_node);
3162       if (BE (! ok, 0))
3163 	return REG_ESPACE;
3164       if (dfa->edests[cur_node].nelem == 0)
3165 	break;
3166       if (dfa->edests[cur_node].nelem == 2)
3167 	{
3168 	  reg_errcode_t err;
3169 	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3170 					      dfa->edests[cur_node].elems[1],
3171 					      ex_subexp, type);
3172 	  if (BE (err != REG_NOERROR, 0))
3173 	    return err;
3174 	}
3175       cur_node = dfa->edests[cur_node].elems[0];
3176     }
3177   return REG_NOERROR;
3178 }
3179 
3180 
3181 /* For all the back references in the current state, calculate the
3182    destination of the back references by the appropriate entry
3183    in MCTX->BKREF_ENTS.  */
3184 
3185 static reg_errcode_t
3186 __attribute_warn_unused_result__
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,Idx cur_str,Idx subexp_num,int type)3187 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3188 		    Idx cur_str, Idx subexp_num, int type)
3189 {
3190   const re_dfa_t *const dfa = mctx->dfa;
3191   reg_errcode_t err;
3192   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3193   struct re_backref_cache_entry *ent;
3194 
3195   if (cache_idx_start == -1)
3196     return REG_NOERROR;
3197 
3198  restart:
3199   ent = mctx->bkref_ents + cache_idx_start;
3200   do
3201     {
3202       Idx to_idx, next_node;
3203 
3204       /* Is this entry ENT is appropriate?  */
3205       if (!re_node_set_contains (cur_nodes, ent->node))
3206 	continue; /* No.  */
3207 
3208       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3209       /* Calculate the destination of the back reference, and append it
3210 	 to MCTX->STATE_LOG.  */
3211       if (to_idx == cur_str)
3212 	{
3213 	  /* The backreference did epsilon transit, we must re-check all the
3214 	     node in the current state.  */
3215 	  re_node_set new_dests;
3216 	  reg_errcode_t err2, err3;
3217 	  next_node = dfa->edests[ent->node].elems[0];
3218 	  if (re_node_set_contains (cur_nodes, next_node))
3219 	    continue;
3220 	  err = re_node_set_init_1 (&new_dests, next_node);
3221 	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3222 	  err3 = re_node_set_merge (cur_nodes, &new_dests);
3223 	  re_node_set_free (&new_dests);
3224 	  if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3225 		  || err3 != REG_NOERROR, 0))
3226 	    {
3227 	      err = (err != REG_NOERROR ? err
3228 		     : (err2 != REG_NOERROR ? err2 : err3));
3229 	      return err;
3230 	    }
3231 	  /* TODO: It is still inefficient...  */
3232 	  goto restart;
3233 	}
3234       else
3235 	{
3236 	  re_node_set union_set;
3237 	  next_node = dfa->nexts[ent->node];
3238 	  if (mctx->state_log[to_idx])
3239 	    {
3240 	      bool ok;
3241 	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3242 					next_node))
3243 		continue;
3244 	      err = re_node_set_init_copy (&union_set,
3245 					   &mctx->state_log[to_idx]->nodes);
3246 	      ok = re_node_set_insert (&union_set, next_node);
3247 	      if (BE (err != REG_NOERROR || ! ok, 0))
3248 		{
3249 		  re_node_set_free (&union_set);
3250 		  err = err != REG_NOERROR ? err : REG_ESPACE;
3251 		  return err;
3252 		}
3253 	    }
3254 	  else
3255 	    {
3256 	      err = re_node_set_init_1 (&union_set, next_node);
3257 	      if (BE (err != REG_NOERROR, 0))
3258 		return err;
3259 	    }
3260 	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3261 	  re_node_set_free (&union_set);
3262 	  if (BE (mctx->state_log[to_idx] == NULL
3263 		  && err != REG_NOERROR, 0))
3264 	    return err;
3265 	}
3266     }
3267   while (ent++->more);
3268   return REG_NOERROR;
3269 }
3270 
3271 /* Build transition table for the state.
3272    Return true if successful.  */
3273 
3274 static bool
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)3275 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3276 {
3277   reg_errcode_t err;
3278   Idx i, j;
3279   int ch;
3280   bool need_word_trtable = false;
3281   bitset_word_t elem, mask;
3282   bool dests_node_malloced = false;
3283   bool dest_states_malloced = false;
3284   Idx ndests; /* Number of the destination states from 'state'.  */
3285   re_dfastate_t **trtable;
3286   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3287   re_node_set follows, *dests_node;
3288   bitset_t *dests_ch;
3289   bitset_t acceptable;
3290 
3291   struct dests_alloc
3292   {
3293     re_node_set dests_node[SBC_MAX];
3294     bitset_t dests_ch[SBC_MAX];
3295   } *dests_alloc;
3296 
3297   /* We build DFA states which corresponds to the destination nodes
3298      from 'state'.  'dests_node[i]' represents the nodes which i-th
3299      destination state contains, and 'dests_ch[i]' represents the
3300      characters which i-th destination state accepts.  */
3301   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3302     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3303   else
3304     {
3305       dests_alloc = re_malloc (struct dests_alloc, 1);
3306       if (BE (dests_alloc == NULL, 0))
3307 	return false;
3308       dests_node_malloced = true;
3309     }
3310   dests_node = dests_alloc->dests_node;
3311   dests_ch = dests_alloc->dests_ch;
3312 
3313   /* Initialize transition table.  */
3314   state->word_trtable = state->trtable = NULL;
3315 
3316   /* At first, group all nodes belonging to 'state' into several
3317      destinations.  */
3318   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3319   if (BE (ndests <= 0, 0))
3320     {
3321       if (dests_node_malloced)
3322 	re_free (dests_alloc);
3323       /* Return false in case of an error, true otherwise.  */
3324       if (ndests == 0)
3325 	{
3326 	  state->trtable = (re_dfastate_t **)
3327 	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
3328           if (BE (state->trtable == NULL, 0))
3329             return false;
3330 	  return true;
3331 	}
3332       return false;
3333     }
3334 
3335   err = re_node_set_alloc (&follows, ndests + 1);
3336   if (BE (err != REG_NOERROR, 0))
3337     goto out_free;
3338 
3339   /* Avoid arithmetic overflow in size calculation.  */
3340   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3341 	    / (3 * sizeof (re_dfastate_t *)))
3342 	   < ndests),
3343 	  0))
3344     goto out_free;
3345 
3346   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3347 			 + ndests * 3 * sizeof (re_dfastate_t *)))
3348     dest_states = (re_dfastate_t **)
3349       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3350   else
3351     {
3352       dest_states = re_malloc (re_dfastate_t *, ndests * 3);
3353       if (BE (dest_states == NULL, 0))
3354 	{
3355 out_free:
3356 	  if (dest_states_malloced)
3357 	    re_free (dest_states);
3358 	  re_node_set_free (&follows);
3359 	  for (i = 0; i < ndests; ++i)
3360 	    re_node_set_free (dests_node + i);
3361 	  if (dests_node_malloced)
3362 	    re_free (dests_alloc);
3363 	  return false;
3364 	}
3365       dest_states_malloced = true;
3366     }
3367   dest_states_word = dest_states + ndests;
3368   dest_states_nl = dest_states_word + ndests;
3369   bitset_empty (acceptable);
3370 
3371   /* Then build the states for all destinations.  */
3372   for (i = 0; i < ndests; ++i)
3373     {
3374       Idx next_node;
3375       re_node_set_empty (&follows);
3376       /* Merge the follows of this destination states.  */
3377       for (j = 0; j < dests_node[i].nelem; ++j)
3378 	{
3379 	  next_node = dfa->nexts[dests_node[i].elems[j]];
3380 	  if (next_node != -1)
3381 	    {
3382 	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3383 	      if (BE (err != REG_NOERROR, 0))
3384 		goto out_free;
3385 	    }
3386 	}
3387       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3388       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3389 	goto out_free;
3390       /* If the new state has context constraint,
3391 	 build appropriate states for these contexts.  */
3392       if (dest_states[i]->has_constraint)
3393 	{
3394 	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3395 							  CONTEXT_WORD);
3396 	  if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3397 	    goto out_free;
3398 
3399 	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3400 	    need_word_trtable = true;
3401 
3402 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3403 							CONTEXT_NEWLINE);
3404 	  if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3405 	    goto out_free;
3406 	}
3407       else
3408 	{
3409 	  dest_states_word[i] = dest_states[i];
3410 	  dest_states_nl[i] = dest_states[i];
3411 	}
3412       bitset_merge (acceptable, dests_ch[i]);
3413     }
3414 
3415   if (!BE (need_word_trtable, 0))
3416     {
3417       /* We don't care about whether the following character is a word
3418 	 character, or we are in a single-byte character set so we can
3419 	 discern by looking at the character code: allocate a
3420 	 256-entry transition table.  */
3421       trtable = state->trtable =
3422 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3423       if (BE (trtable == NULL, 0))
3424 	goto out_free;
3425 
3426       /* For all characters ch...:  */
3427       for (i = 0; i < BITSET_WORDS; ++i)
3428 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3429 	     elem;
3430 	     mask <<= 1, elem >>= 1, ++ch)
3431 	  if (BE (elem & 1, 0))
3432 	    {
3433 	      /* There must be exactly one destination which accepts
3434 		 character ch.  See group_nodes_into_DFAstates.  */
3435 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3436 		;
3437 
3438 	      /* j-th destination accepts the word character ch.  */
3439 	      if (dfa->word_char[i] & mask)
3440 		trtable[ch] = dest_states_word[j];
3441 	      else
3442 		trtable[ch] = dest_states[j];
3443 	    }
3444     }
3445   else
3446     {
3447       /* We care about whether the following character is a word
3448 	 character, and we are in a multi-byte character set: discern
3449 	 by looking at the character code: build two 256-entry
3450 	 transition tables, one starting at trtable[0] and one
3451 	 starting at trtable[SBC_MAX].  */
3452       trtable = state->word_trtable =
3453 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3454       if (BE (trtable == NULL, 0))
3455 	goto out_free;
3456 
3457       /* For all characters ch...:  */
3458       for (i = 0; i < BITSET_WORDS; ++i)
3459 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3460 	     elem;
3461 	     mask <<= 1, elem >>= 1, ++ch)
3462 	  if (BE (elem & 1, 0))
3463 	    {
3464 	      /* There must be exactly one destination which accepts
3465 		 character ch.  See group_nodes_into_DFAstates.  */
3466 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3467 		;
3468 
3469 	      /* j-th destination accepts the word character ch.  */
3470 	      trtable[ch] = dest_states[j];
3471 	      trtable[ch + SBC_MAX] = dest_states_word[j];
3472 	    }
3473     }
3474 
3475   /* new line */
3476   if (bitset_contain (acceptable, NEWLINE_CHAR))
3477     {
3478       /* The current state accepts newline character.  */
3479       for (j = 0; j < ndests; ++j)
3480 	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3481 	  {
3482 	    /* k-th destination accepts newline character.  */
3483 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
3484 	    if (need_word_trtable)
3485 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3486 	    /* There must be only one destination which accepts
3487 	       newline.  See group_nodes_into_DFAstates.  */
3488 	    break;
3489 	  }
3490     }
3491 
3492   if (dest_states_malloced)
3493     re_free (dest_states);
3494 
3495   re_node_set_free (&follows);
3496   for (i = 0; i < ndests; ++i)
3497     re_node_set_free (dests_node + i);
3498 
3499   if (dests_node_malloced)
3500     re_free (dests_alloc);
3501 
3502   return true;
3503 }
3504 
3505 /* Group all nodes belonging to STATE into several destinations.
3506    Then for all destinations, set the nodes belonging to the destination
3507    to DESTS_NODE[i] and set the characters accepted by the destination
3508    to DEST_CH[i].  This function return the number of destinations.  */
3509 
3510 static Idx
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)3511 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3512 			    re_node_set *dests_node, bitset_t *dests_ch)
3513 {
3514   reg_errcode_t err;
3515   bool ok;
3516   Idx i, j, k;
3517   Idx ndests; /* Number of the destinations from 'state'.  */
3518   bitset_t accepts; /* Characters a node can accept.  */
3519   const re_node_set *cur_nodes = &state->nodes;
3520   bitset_empty (accepts);
3521   ndests = 0;
3522 
3523   /* For all the nodes belonging to 'state',  */
3524   for (i = 0; i < cur_nodes->nelem; ++i)
3525     {
3526       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3527       re_token_type_t type = node->type;
3528       unsigned int constraint = node->constraint;
3529 
3530       /* Enumerate all single byte character this node can accept.  */
3531       if (type == CHARACTER)
3532 	bitset_set (accepts, node->opr.c);
3533       else if (type == SIMPLE_BRACKET)
3534 	{
3535 	  bitset_merge (accepts, node->opr.sbcset);
3536 	}
3537       else if (type == OP_PERIOD)
3538 	{
3539 #ifdef RE_ENABLE_I18N
3540 	  if (dfa->mb_cur_max > 1)
3541 	    bitset_merge (accepts, dfa->sb_char);
3542 	  else
3543 #endif
3544 	    bitset_set_all (accepts);
3545 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3546 	    bitset_clear (accepts, '\n');
3547 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3548 	    bitset_clear (accepts, '\0');
3549 	}
3550 #ifdef RE_ENABLE_I18N
3551       else if (type == OP_UTF8_PERIOD)
3552 	{
3553 	  if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3554 	    memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3555 	  else
3556 	    bitset_merge (accepts, utf8_sb_map);
3557 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3558 	    bitset_clear (accepts, '\n');
3559 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3560 	    bitset_clear (accepts, '\0');
3561 	}
3562 #endif
3563       else
3564 	continue;
3565 
3566       /* Check the 'accepts' and sift the characters which are not
3567 	 match it the context.  */
3568       if (constraint)
3569 	{
3570 	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
3571 	    {
3572 	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3573 	      bitset_empty (accepts);
3574 	      if (accepts_newline)
3575 		bitset_set (accepts, NEWLINE_CHAR);
3576 	      else
3577 		continue;
3578 	    }
3579 	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
3580 	    {
3581 	      bitset_empty (accepts);
3582 	      continue;
3583 	    }
3584 
3585 	  if (constraint & NEXT_WORD_CONSTRAINT)
3586 	    {
3587 	      bitset_word_t any_set = 0;
3588 	      if (type == CHARACTER && !node->word_char)
3589 		{
3590 		  bitset_empty (accepts);
3591 		  continue;
3592 		}
3593 #ifdef RE_ENABLE_I18N
3594 	      if (dfa->mb_cur_max > 1)
3595 		for (j = 0; j < BITSET_WORDS; ++j)
3596 		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3597 	      else
3598 #endif
3599 		for (j = 0; j < BITSET_WORDS; ++j)
3600 		  any_set |= (accepts[j] &= dfa->word_char[j]);
3601 	      if (!any_set)
3602 		continue;
3603 	    }
3604 	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
3605 	    {
3606 	      bitset_word_t any_set = 0;
3607 	      if (type == CHARACTER && node->word_char)
3608 		{
3609 		  bitset_empty (accepts);
3610 		  continue;
3611 		}
3612 #ifdef RE_ENABLE_I18N
3613 	      if (dfa->mb_cur_max > 1)
3614 		for (j = 0; j < BITSET_WORDS; ++j)
3615 		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3616 	      else
3617 #endif
3618 		for (j = 0; j < BITSET_WORDS; ++j)
3619 		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3620 	      if (!any_set)
3621 		continue;
3622 	    }
3623 	}
3624 
3625       /* Then divide 'accepts' into DFA states, or create a new
3626 	 state.  Above, we make sure that accepts is not empty.  */
3627       for (j = 0; j < ndests; ++j)
3628 	{
3629 	  bitset_t intersec; /* Intersection sets, see below.  */
3630 	  bitset_t remains;
3631 	  /* Flags, see below.  */
3632 	  bitset_word_t has_intersec, not_subset, not_consumed;
3633 
3634 	  /* Optimization, skip if this state doesn't accept the character.  */
3635 	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3636 	    continue;
3637 
3638 	  /* Enumerate the intersection set of this state and 'accepts'.  */
3639 	  has_intersec = 0;
3640 	  for (k = 0; k < BITSET_WORDS; ++k)
3641 	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3642 	  /* And skip if the intersection set is empty.  */
3643 	  if (!has_intersec)
3644 	    continue;
3645 
3646 	  /* Then check if this state is a subset of 'accepts'.  */
3647 	  not_subset = not_consumed = 0;
3648 	  for (k = 0; k < BITSET_WORDS; ++k)
3649 	    {
3650 	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3651 	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3652 	    }
3653 
3654 	  /* If this state isn't a subset of 'accepts', create a
3655 	     new group state, which has the 'remains'. */
3656 	  if (not_subset)
3657 	    {
3658 	      bitset_copy (dests_ch[ndests], remains);
3659 	      bitset_copy (dests_ch[j], intersec);
3660 	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3661 	      if (BE (err != REG_NOERROR, 0))
3662 		goto error_return;
3663 	      ++ndests;
3664 	    }
3665 
3666 	  /* Put the position in the current group. */
3667 	  ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3668 	  if (BE (! ok, 0))
3669 	    goto error_return;
3670 
3671 	  /* If all characters are consumed, go to next node. */
3672 	  if (!not_consumed)
3673 	    break;
3674 	}
3675       /* Some characters remain, create a new group. */
3676       if (j == ndests)
3677 	{
3678 	  bitset_copy (dests_ch[ndests], accepts);
3679 	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3680 	  if (BE (err != REG_NOERROR, 0))
3681 	    goto error_return;
3682 	  ++ndests;
3683 	  bitset_empty (accepts);
3684 	}
3685     }
3686   return ndests;
3687  error_return:
3688   for (j = 0; j < ndests; ++j)
3689     re_node_set_free (dests_node + j);
3690   return -1;
3691 }
3692 
3693 #ifdef RE_ENABLE_I18N
3694 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3695    Return the number of the bytes the node accepts.
3696    STR_IDX is the current index of the input string.
3697 
3698    This function handles the nodes which can accept one character, or
3699    one collating element like '.', '[a-z]', opposite to the other nodes
3700    can only accept one byte.  */
3701 
3702 # ifdef _LIBC
3703 #  include <locale/weight.h>
3704 # endif
3705 
3706 static int
check_node_accept_bytes(const re_dfa_t * dfa,Idx node_idx,const re_string_t * input,Idx str_idx)3707 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3708 			 const re_string_t *input, Idx str_idx)
3709 {
3710   const re_token_t *node = dfa->nodes + node_idx;
3711   int char_len, elem_len;
3712   Idx i;
3713 
3714   if (BE (node->type == OP_UTF8_PERIOD, 0))
3715     {
3716       unsigned char c = re_string_byte_at (input, str_idx), d;
3717       if (BE (c < 0xc2, 1))
3718 	return 0;
3719 
3720       if (str_idx + 2 > input->len)
3721 	return 0;
3722 
3723       d = re_string_byte_at (input, str_idx + 1);
3724       if (c < 0xe0)
3725 	return (d < 0x80 || d > 0xbf) ? 0 : 2;
3726       else if (c < 0xf0)
3727 	{
3728 	  char_len = 3;
3729 	  if (c == 0xe0 && d < 0xa0)
3730 	    return 0;
3731 	}
3732       else if (c < 0xf8)
3733 	{
3734 	  char_len = 4;
3735 	  if (c == 0xf0 && d < 0x90)
3736 	    return 0;
3737 	}
3738       else if (c < 0xfc)
3739 	{
3740 	  char_len = 5;
3741 	  if (c == 0xf8 && d < 0x88)
3742 	    return 0;
3743 	}
3744       else if (c < 0xfe)
3745 	{
3746 	  char_len = 6;
3747 	  if (c == 0xfc && d < 0x84)
3748 	    return 0;
3749 	}
3750       else
3751 	return 0;
3752 
3753       if (str_idx + char_len > input->len)
3754 	return 0;
3755 
3756       for (i = 1; i < char_len; ++i)
3757 	{
3758 	  d = re_string_byte_at (input, str_idx + i);
3759 	  if (d < 0x80 || d > 0xbf)
3760 	    return 0;
3761 	}
3762       return char_len;
3763     }
3764 
3765   char_len = re_string_char_size_at (input, str_idx);
3766   if (node->type == OP_PERIOD)
3767     {
3768       if (char_len <= 1)
3769 	return 0;
3770       /* FIXME: I don't think this if is needed, as both '\n'
3771 	 and '\0' are char_len == 1.  */
3772       /* '.' accepts any one character except the following two cases.  */
3773       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3774 	   re_string_byte_at (input, str_idx) == '\n') ||
3775 	  ((dfa->syntax & RE_DOT_NOT_NULL) &&
3776 	   re_string_byte_at (input, str_idx) == '\0'))
3777 	return 0;
3778       return char_len;
3779     }
3780 
3781   elem_len = re_string_elem_size_at (input, str_idx);
3782   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3783     return 0;
3784 
3785   if (node->type == COMPLEX_BRACKET)
3786     {
3787       const re_charset_t *cset = node->opr.mbcset;
3788 # ifdef _LIBC
3789       const unsigned char *pin
3790 	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3791       Idx j;
3792       uint32_t nrules;
3793 # endif /* _LIBC */
3794       int match_len = 0;
3795       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3796 		    ? re_string_wchar_at (input, str_idx) : 0);
3797 
3798       /* match with multibyte character?  */
3799       for (i = 0; i < cset->nmbchars; ++i)
3800 	if (wc == cset->mbchars[i])
3801 	  {
3802 	    match_len = char_len;
3803 	    goto check_node_accept_bytes_match;
3804 	  }
3805       /* match with character_class?  */
3806       for (i = 0; i < cset->nchar_classes; ++i)
3807 	{
3808 	  wctype_t wt = cset->char_classes[i];
3809 	  if (__iswctype (wc, wt))
3810 	    {
3811 	      match_len = char_len;
3812 	      goto check_node_accept_bytes_match;
3813 	    }
3814 	}
3815 
3816 # ifdef _LIBC
3817       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3818       if (nrules != 0)
3819 	{
3820 	  unsigned int in_collseq = 0;
3821 	  const int32_t *table, *indirect;
3822 	  const unsigned char *weights, *extra;
3823 	  const char *collseqwc;
3824 
3825 	  /* match with collating_symbol?  */
3826 	  if (cset->ncoll_syms)
3827 	    extra = (const unsigned char *)
3828 	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3829 	  for (i = 0; i < cset->ncoll_syms; ++i)
3830 	    {
3831 	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
3832 	      /* Compare the length of input collating element and
3833 		 the length of current collating element.  */
3834 	      if (*coll_sym != elem_len)
3835 		continue;
3836 	      /* Compare each bytes.  */
3837 	      for (j = 0; j < *coll_sym; j++)
3838 		if (pin[j] != coll_sym[1 + j])
3839 		  break;
3840 	      if (j == *coll_sym)
3841 		{
3842 		  /* Match if every bytes is equal.  */
3843 		  match_len = j;
3844 		  goto check_node_accept_bytes_match;
3845 		}
3846 	    }
3847 
3848 	  if (cset->nranges)
3849 	    {
3850 	      if (elem_len <= char_len)
3851 		{
3852 		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3853 		  in_collseq = __collseq_table_lookup (collseqwc, wc);
3854 		}
3855 	      else
3856 		in_collseq = find_collation_sequence_value (pin, elem_len);
3857 	    }
3858 	  /* match with range expression?  */
3859 	  /* FIXME: Implement rational ranges here, too.  */
3860 	  for (i = 0; i < cset->nranges; ++i)
3861 	    if (cset->range_starts[i] <= in_collseq
3862 		&& in_collseq <= cset->range_ends[i])
3863 	      {
3864 		match_len = elem_len;
3865 		goto check_node_accept_bytes_match;
3866 	      }
3867 
3868 	  /* match with equivalence_class?  */
3869 	  if (cset->nequiv_classes)
3870 	    {
3871 	      const unsigned char *cp = pin;
3872 	      table = (const int32_t *)
3873 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3874 	      weights = (const unsigned char *)
3875 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3876 	      extra = (const unsigned char *)
3877 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3878 	      indirect = (const int32_t *)
3879 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3880 	      int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3881 	      int32_t rule = idx >> 24;
3882 	      idx &= 0xffffff;
3883 	      if (idx > 0)
3884 		{
3885 		  size_t weight_len = weights[idx];
3886 		  for (i = 0; i < cset->nequiv_classes; ++i)
3887 		    {
3888 		      int32_t equiv_class_idx = cset->equiv_classes[i];
3889 		      int32_t equiv_class_rule = equiv_class_idx >> 24;
3890 		      equiv_class_idx &= 0xffffff;
3891 		      if (weights[equiv_class_idx] == weight_len
3892 			  && equiv_class_rule == rule
3893 			  && memcmp (weights + idx + 1,
3894 				     weights + equiv_class_idx + 1,
3895 				     weight_len) == 0)
3896 			{
3897 			  match_len = elem_len;
3898 			  goto check_node_accept_bytes_match;
3899 			}
3900 		    }
3901 		}
3902 	    }
3903 	}
3904       else
3905 # endif /* _LIBC */
3906 	{
3907 	  /* match with range expression?  */
3908 	  for (i = 0; i < cset->nranges; ++i)
3909 	    {
3910 	      if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3911 		{
3912 		  match_len = char_len;
3913 		  goto check_node_accept_bytes_match;
3914 		}
3915 	    }
3916 	}
3917     check_node_accept_bytes_match:
3918       if (!cset->non_match)
3919 	return match_len;
3920       else
3921 	{
3922 	  if (match_len > 0)
3923 	    return 0;
3924 	  else
3925 	    return (elem_len > char_len) ? elem_len : char_len;
3926 	}
3927     }
3928   return 0;
3929 }
3930 
3931 # ifdef _LIBC
3932 static unsigned int
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)3933 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3934 {
3935   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3936   if (nrules == 0)
3937     {
3938       if (mbs_len == 1)
3939 	{
3940 	  /* No valid character.  Match it as a single byte character.  */
3941 	  const unsigned char *collseq = (const unsigned char *)
3942 	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3943 	  return collseq[mbs[0]];
3944 	}
3945       return UINT_MAX;
3946     }
3947   else
3948     {
3949       int32_t idx;
3950       const unsigned char *extra = (const unsigned char *)
3951 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3952       int32_t extrasize = (const unsigned char *)
3953 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3954 
3955       for (idx = 0; idx < extrasize;)
3956 	{
3957 	  int mbs_cnt;
3958 	  bool found = false;
3959 	  int32_t elem_mbs_len;
3960 	  /* Skip the name of collating element name.  */
3961 	  idx = idx + extra[idx] + 1;
3962 	  elem_mbs_len = extra[idx++];
3963 	  if (mbs_len == elem_mbs_len)
3964 	    {
3965 	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3966 		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3967 		  break;
3968 	      if (mbs_cnt == elem_mbs_len)
3969 		/* Found the entry.  */
3970 		found = true;
3971 	    }
3972 	  /* Skip the byte sequence of the collating element.  */
3973 	  idx += elem_mbs_len;
3974 	  /* Adjust for the alignment.  */
3975 	  idx = (idx + 3) & ~3;
3976 	  /* Skip the collation sequence value.  */
3977 	  idx += sizeof (uint32_t);
3978 	  /* Skip the wide char sequence of the collating element.  */
3979 	  idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3980 	  /* If we found the entry, return the sequence value.  */
3981 	  if (found)
3982 	    return *(uint32_t *) (extra + idx);
3983 	  /* Skip the collation sequence value.  */
3984 	  idx += sizeof (uint32_t);
3985 	}
3986       return UINT_MAX;
3987     }
3988 }
3989 # endif /* _LIBC */
3990 #endif /* RE_ENABLE_I18N */
3991 
3992 /* Check whether the node accepts the byte which is IDX-th
3993    byte of the INPUT.  */
3994 
3995 static bool
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,Idx idx)3996 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3997 		   Idx idx)
3998 {
3999   unsigned char ch;
4000   ch = re_string_byte_at (&mctx->input, idx);
4001   switch (node->type)
4002     {
4003     case CHARACTER:
4004       if (node->opr.c != ch)
4005         return false;
4006       break;
4007 
4008     case SIMPLE_BRACKET:
4009       if (!bitset_contain (node->opr.sbcset, ch))
4010         return false;
4011       break;
4012 
4013 #ifdef RE_ENABLE_I18N
4014     case OP_UTF8_PERIOD:
4015       if (ch >= ASCII_CHARS)
4016         return false;
4017       FALLTHROUGH;
4018 #endif
4019     case OP_PERIOD:
4020       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4021 	  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4022 	return false;
4023       break;
4024 
4025     default:
4026       return false;
4027     }
4028 
4029   if (node->constraint)
4030     {
4031       /* The node has constraints.  Check whether the current context
4032 	 satisfies the constraints.  */
4033       unsigned int context = re_string_context_at (&mctx->input, idx,
4034 						   mctx->eflags);
4035       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4036 	return false;
4037     }
4038 
4039   return true;
4040 }
4041 
4042 /* Extend the buffers, if the buffers have run out.  */
4043 
4044 static reg_errcode_t
4045 __attribute_warn_unused_result__
extend_buffers(re_match_context_t * mctx,int min_len)4046 extend_buffers (re_match_context_t *mctx, int min_len)
4047 {
4048   reg_errcode_t ret;
4049   re_string_t *pstr = &mctx->input;
4050 
4051   /* Avoid overflow.  */
4052   if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4053           <= pstr->bufs_len, 0))
4054     return REG_ESPACE;
4055 
4056   /* Double the lengths of the buffers, but allocate at least MIN_LEN.  */
4057   ret = re_string_realloc_buffers (pstr,
4058 				   MAX (min_len,
4059 					MIN (pstr->len, pstr->bufs_len * 2)));
4060   if (BE (ret != REG_NOERROR, 0))
4061     return ret;
4062 
4063   if (mctx->state_log != NULL)
4064     {
4065       /* And double the length of state_log.  */
4066       /* XXX We have no indication of the size of this buffer.  If this
4067 	 allocation fail we have no indication that the state_log array
4068 	 does not have the right size.  */
4069       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4070 					      pstr->bufs_len + 1);
4071       if (BE (new_array == NULL, 0))
4072 	return REG_ESPACE;
4073       mctx->state_log = new_array;
4074     }
4075 
4076   /* Then reconstruct the buffers.  */
4077   if (pstr->icase)
4078     {
4079 #ifdef RE_ENABLE_I18N
4080       if (pstr->mb_cur_max > 1)
4081 	{
4082 	  ret = build_wcs_upper_buffer (pstr);
4083 	  if (BE (ret != REG_NOERROR, 0))
4084 	    return ret;
4085 	}
4086       else
4087 #endif /* RE_ENABLE_I18N  */
4088 	build_upper_buffer (pstr);
4089     }
4090   else
4091     {
4092 #ifdef RE_ENABLE_I18N
4093       if (pstr->mb_cur_max > 1)
4094 	build_wcs_buffer (pstr);
4095       else
4096 #endif /* RE_ENABLE_I18N  */
4097 	{
4098 	  if (pstr->trans != NULL)
4099 	    re_string_translate_buffer (pstr);
4100 	}
4101     }
4102   return REG_NOERROR;
4103 }
4104 
4105 
4106 /* Functions for matching context.  */
4107 
4108 /* Initialize MCTX.  */
4109 
4110 static reg_errcode_t
4111 __attribute_warn_unused_result__
match_ctx_init(re_match_context_t * mctx,int eflags,Idx n)4112 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4113 {
4114   mctx->eflags = eflags;
4115   mctx->match_last = -1;
4116   if (n > 0)
4117     {
4118       /* Avoid overflow.  */
4119       size_t max_object_size =
4120 	MAX (sizeof (struct re_backref_cache_entry),
4121 	     sizeof (re_sub_match_top_t *));
4122       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
4123 	return REG_ESPACE;
4124 
4125       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4126       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4127       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4128 	return REG_ESPACE;
4129     }
4130   /* Already zero-ed by the caller.
4131      else
4132        mctx->bkref_ents = NULL;
4133      mctx->nbkref_ents = 0;
4134      mctx->nsub_tops = 0;  */
4135   mctx->abkref_ents = n;
4136   mctx->max_mb_elem_len = 1;
4137   mctx->asub_tops = n;
4138   return REG_NOERROR;
4139 }
4140 
4141 /* Clean the entries which depend on the current input in MCTX.
4142    This function must be invoked when the matcher changes the start index
4143    of the input, or changes the input string.  */
4144 
4145 static void
match_ctx_clean(re_match_context_t * mctx)4146 match_ctx_clean (re_match_context_t *mctx)
4147 {
4148   Idx st_idx;
4149   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4150     {
4151       Idx sl_idx;
4152       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4153       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4154 	{
4155 	  re_sub_match_last_t *last = top->lasts[sl_idx];
4156 	  re_free (last->path.array);
4157 	  re_free (last);
4158 	}
4159       re_free (top->lasts);
4160       if (top->path)
4161 	{
4162 	  re_free (top->path->array);
4163 	  re_free (top->path);
4164 	}
4165       re_free (top);
4166     }
4167 
4168   mctx->nsub_tops = 0;
4169   mctx->nbkref_ents = 0;
4170 }
4171 
4172 /* Free all the memory associated with MCTX.  */
4173 
4174 static void
match_ctx_free(re_match_context_t * mctx)4175 match_ctx_free (re_match_context_t *mctx)
4176 {
4177   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4178   match_ctx_clean (mctx);
4179   re_free (mctx->sub_tops);
4180   re_free (mctx->bkref_ents);
4181 }
4182 
4183 /* Add a new backreference entry to MCTX.
4184    Note that we assume that caller never call this function with duplicate
4185    entry, and call with STR_IDX which isn't smaller than any existing entry.
4186 */
4187 
4188 static reg_errcode_t
4189 __attribute_warn_unused_result__
match_ctx_add_entry(re_match_context_t * mctx,Idx node,Idx str_idx,Idx from,Idx to)4190 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4191 		     Idx to)
4192 {
4193   if (mctx->nbkref_ents >= mctx->abkref_ents)
4194     {
4195       struct re_backref_cache_entry* new_entry;
4196       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4197 			      mctx->abkref_ents * 2);
4198       if (BE (new_entry == NULL, 0))
4199 	{
4200 	  re_free (mctx->bkref_ents);
4201 	  return REG_ESPACE;
4202 	}
4203       mctx->bkref_ents = new_entry;
4204       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4205 	      sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4206       mctx->abkref_ents *= 2;
4207     }
4208   if (mctx->nbkref_ents > 0
4209       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4210     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4211 
4212   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4213   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4214   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4215   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4216 
4217   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4218      If bit N is clear, means that this entry won't epsilon-transition to
4219      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4220      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4221      such node.
4222 
4223      A backreference does not epsilon-transition unless it is empty, so set
4224      to all zeros if FROM != TO.  */
4225   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4226     = (from == to ? -1 : 0);
4227 
4228   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4229   if (mctx->max_mb_elem_len < to - from)
4230     mctx->max_mb_elem_len = to - from;
4231   return REG_NOERROR;
4232 }
4233 
4234 /* Return the first entry with the same str_idx, or -1 if none is
4235    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4236 
4237 static Idx
search_cur_bkref_entry(const re_match_context_t * mctx,Idx str_idx)4238 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4239 {
4240   Idx left, right, mid, last;
4241   last = right = mctx->nbkref_ents;
4242   for (left = 0; left < right;)
4243     {
4244       mid = (left + right) / 2;
4245       if (mctx->bkref_ents[mid].str_idx < str_idx)
4246 	left = mid + 1;
4247       else
4248 	right = mid;
4249     }
4250   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4251     return left;
4252   else
4253     return -1;
4254 }
4255 
4256 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4257    at STR_IDX.  */
4258 
4259 static reg_errcode_t
4260 __attribute_warn_unused_result__
match_ctx_add_subtop(re_match_context_t * mctx,Idx node,Idx str_idx)4261 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4262 {
4263 #ifdef DEBUG
4264   assert (mctx->sub_tops != NULL);
4265   assert (mctx->asub_tops > 0);
4266 #endif
4267   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4268     {
4269       Idx new_asub_tops = mctx->asub_tops * 2;
4270       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4271 						   re_sub_match_top_t *,
4272 						   new_asub_tops);
4273       if (BE (new_array == NULL, 0))
4274 	return REG_ESPACE;
4275       mctx->sub_tops = new_array;
4276       mctx->asub_tops = new_asub_tops;
4277     }
4278   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4279   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4280     return REG_ESPACE;
4281   mctx->sub_tops[mctx->nsub_tops]->node = node;
4282   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4283   return REG_NOERROR;
4284 }
4285 
4286 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4287    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4288 
4289 static re_sub_match_last_t *
match_ctx_add_sublast(re_sub_match_top_t * subtop,Idx node,Idx str_idx)4290 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4291 {
4292   re_sub_match_last_t *new_entry;
4293   if (BE (subtop->nlasts == subtop->alasts, 0))
4294     {
4295       Idx new_alasts = 2 * subtop->alasts + 1;
4296       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4297 						    re_sub_match_last_t *,
4298 						    new_alasts);
4299       if (BE (new_array == NULL, 0))
4300 	return NULL;
4301       subtop->lasts = new_array;
4302       subtop->alasts = new_alasts;
4303     }
4304   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4305   if (BE (new_entry != NULL, 1))
4306     {
4307       subtop->lasts[subtop->nlasts] = new_entry;
4308       new_entry->node = node;
4309       new_entry->str_idx = str_idx;
4310       ++subtop->nlasts;
4311     }
4312   return new_entry;
4313 }
4314 
4315 static void
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,Idx last_node,Idx last_str_idx)4316 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4317 	       re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4318 {
4319   sctx->sifted_states = sifted_sts;
4320   sctx->limited_states = limited_sts;
4321   sctx->last_node = last_node;
4322   sctx->last_str_idx = last_str_idx;
4323   re_node_set_init_empty (&sctx->limits);
4324 }
4325