1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2021 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU General Public
8    License as published by the Free Software Foundation; either
9    version 3 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    General Public License for more details.
15 
16    You should have received a copy of the GNU General Public
17    License along with the GNU C Library; if not, see
18    <https://www.gnu.org/licenses/>.  */
19 
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 				     Idx n);
22 static void match_ctx_clean (re_match_context_t *mctx);
23 static void match_ctx_free (re_match_context_t *cache);
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 					  Idx str_idx, Idx from, Idx to);
26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 					   Idx str_idx);
29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 						    Idx node, Idx str_idx);
31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 			   re_dfastate_t **limited_sts, Idx last_node,
33 			   Idx last_str_idx);
34 static reg_errcode_t re_search_internal (const regex_t *preg,
35 					 const char *string, Idx length,
36 					 Idx start, Idx last_start, Idx stop,
37 					 size_t nmatch, regmatch_t pmatch[],
38 					 int eflags);
39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 				  const char *string1, Idx length1,
41 				  const char *string2, Idx length2,
42 				  Idx start, regoff_t range,
43 				  struct re_registers *regs,
44 				  Idx stop, bool ret_len);
45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 				const char *string, Idx length, Idx start,
47 				regoff_t range, Idx stop,
48 				struct re_registers *regs,
49 				bool ret_len);
50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51                               Idx nregs, int regs_allocated);
52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 			   Idx *p_match_first);
55 static Idx check_halt_state_context (const re_match_context_t *mctx,
56 				     const re_dfastate_t *state, Idx idx);
57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 			 regmatch_t *prev_idx_match, Idx cur_node,
59 			 Idx cur_idx, Idx nmatch);
60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 				      Idx str_idx, Idx dest_node, Idx nregs,
62 				      regmatch_t *regs, regmatch_t *prevregs,
63 				      re_node_set *eps_via_nodes);
64 static reg_errcode_t set_regs (const regex_t *preg,
65 			       const re_match_context_t *mctx,
66 			       size_t nmatch, regmatch_t *pmatch,
67 			       bool fl_backtrack);
68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69 
70 #ifdef RE_ENABLE_I18N
71 static int sift_states_iter_mb (const re_match_context_t *mctx,
72 				re_sift_context_t *sctx,
73 				Idx node_idx, Idx str_idx, Idx max_str_idx);
74 #endif /* RE_ENABLE_I18N */
75 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
76 					   re_sift_context_t *sctx);
77 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
78 					  re_sift_context_t *sctx, Idx str_idx,
79 					  re_node_set *cur_dest);
80 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
81 					      re_sift_context_t *sctx,
82 					      Idx str_idx,
83 					      re_node_set *dest_nodes);
84 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
85 					    re_node_set *dest_nodes,
86 					    const re_node_set *candidates);
87 static bool check_dst_limits (const re_match_context_t *mctx,
88 			      const re_node_set *limits,
89 			      Idx dst_node, Idx dst_idx, Idx src_node,
90 			      Idx src_idx);
91 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
92 					int boundaries, Idx subexp_idx,
93 					Idx from_node, Idx bkref_idx);
94 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
95 				      Idx limit, Idx subexp_idx,
96 				      Idx node, Idx str_idx,
97 				      Idx bkref_idx);
98 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
99 					  re_node_set *dest_nodes,
100 					  const re_node_set *candidates,
101 					  re_node_set *limits,
102 					  struct re_backref_cache_entry *bkref_ents,
103 					  Idx str_idx);
104 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
105 					re_sift_context_t *sctx,
106 					Idx str_idx, const re_node_set *candidates);
107 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
108 					re_dfastate_t **dst,
109 					re_dfastate_t **src, Idx num);
110 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111 					 re_match_context_t *mctx);
112 static re_dfastate_t *transit_state (reg_errcode_t *err,
113 				     re_match_context_t *mctx,
114 				     re_dfastate_t *state);
115 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
116 					    re_match_context_t *mctx,
117 					    re_dfastate_t *next_state);
118 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
119 						re_node_set *cur_nodes,
120 						Idx str_idx);
121 #if 0
122 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
123 					re_match_context_t *mctx,
124 					re_dfastate_t *pstate);
125 #endif
126 #ifdef RE_ENABLE_I18N
127 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128 				       re_dfastate_t *pstate);
129 #endif /* RE_ENABLE_I18N */
130 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131 					  const re_node_set *nodes);
132 static reg_errcode_t get_subexp (re_match_context_t *mctx,
133 				 Idx bkref_node, Idx bkref_str_idx);
134 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
135 				     const re_sub_match_top_t *sub_top,
136 				     re_sub_match_last_t *sub_last,
137 				     Idx bkref_node, Idx bkref_str);
138 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139 			     Idx subexp_idx, int type);
140 static reg_errcode_t check_arrival (re_match_context_t *mctx,
141 				    state_array_t *path, Idx top_node,
142 				    Idx top_str, Idx last_node, Idx last_str,
143 				    int type);
144 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
145 						   Idx str_idx,
146 						   re_node_set *cur_nodes,
147 						   re_node_set *next_nodes);
148 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
149 					       re_node_set *cur_nodes,
150 					       Idx ex_subexp, int type);
151 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
152 						   re_node_set *dst_nodes,
153 						   Idx target, Idx ex_subexp,
154 						   int type);
155 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
156 					 re_node_set *cur_nodes, Idx cur_str,
157 					 Idx subexp_num, int type);
158 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
159 #ifdef RE_ENABLE_I18N
160 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161 				    const re_string_t *input, Idx idx);
162 # ifdef _LIBC
163 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
164 						   size_t name_len);
165 # endif /* _LIBC */
166 #endif /* RE_ENABLE_I18N */
167 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
168 				       const re_dfastate_t *state,
169 				       re_node_set *states_node,
170 				       bitset_t *states_ch);
171 static bool check_node_accept (const re_match_context_t *mctx,
172 			       const re_token_t *node, Idx idx);
173 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
174 
175 /* Entry point for POSIX code.  */
176 
177 /* regexec searches for a given pattern, specified by PREG, in the
178    string STRING.
179 
180    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
181    'regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
182    least NMATCH elements, and we set them to the offsets of the
183    corresponding matched substrings.
184 
185    EFLAGS specifies "execution flags" which affect matching: if
186    REG_NOTBOL is set, then ^ does not match at the beginning of the
187    string; if REG_NOTEOL is set, then $ does not match at the end.
188 
189    Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
190    EFLAGS is invalid.  */
191 
192 int
regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)193 regexec (const regex_t *__restrict preg, const char *__restrict string,
194 	 size_t nmatch, regmatch_t pmatch[], int eflags)
195 {
196   reg_errcode_t err;
197   Idx start, length;
198   re_dfa_t *dfa = preg->buffer;
199 
200   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
201     return REG_BADPAT;
202 
203   if (eflags & REG_STARTEND)
204     {
205       start = pmatch[0].rm_so;
206       length = pmatch[0].rm_eo;
207     }
208   else
209     {
210       start = 0;
211       length = strlen (string);
212     }
213 
214   lock_lock (dfa->lock);
215   if (preg->no_sub)
216     err = re_search_internal (preg, string, length, start, length,
217 			      length, 0, NULL, eflags);
218   else
219     err = re_search_internal (preg, string, length, start, length,
220 			      length, nmatch, pmatch, eflags);
221   lock_unlock (dfa->lock);
222   return err != REG_NOERROR;
223 }
224 
225 #ifdef _LIBC
226 libc_hidden_def (__regexec)
227 
228 # include <shlib-compat.h>
229 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
230 
231 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
232 __typeof__ (__regexec) __compat_regexec;
233 
234 int
235 attribute_compat_text_section
__compat_regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)236 __compat_regexec (const regex_t *__restrict preg,
237 		  const char *__restrict string, size_t nmatch,
238 		  regmatch_t pmatch[], int eflags)
239 {
240   return regexec (preg, string, nmatch, pmatch,
241 		  eflags & (REG_NOTBOL | REG_NOTEOL));
242 }
243 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
244 # endif
245 #endif
246 
247 /* Entry points for GNU code.  */
248 
249 /* re_match, re_search, re_match_2, re_search_2
250 
251    The former two functions operate on STRING with length LENGTH,
252    while the later two operate on concatenation of STRING1 and STRING2
253    with lengths LENGTH1 and LENGTH2, respectively.
254 
255    re_match() matches the compiled pattern in BUFP against the string,
256    starting at index START.
257 
258    re_search() first tries matching at index START, then it tries to match
259    starting from index START + 1, and so on.  The last start position tried
260    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
261    way as re_match().)
262 
263    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
264    the first STOP characters of the concatenation of the strings should be
265    concerned.
266 
267    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
268    and all groups is stored in REGS.  (For the "_2" variants, the offsets are
269    computed relative to the concatenation, not relative to the individual
270    strings.)
271 
272    On success, re_match* functions return the length of the match, re_search*
273    return the position of the start of the match.  They return -1 on
274    match failure, -2 on error.  */
275 
276 regoff_t
re_match(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,struct re_registers * regs)277 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
278 	  Idx start, struct re_registers *regs)
279 {
280   return re_search_stub (bufp, string, length, start, 0, length, regs, true);
281 }
282 #ifdef _LIBC
weak_alias(__re_match,re_match)283 weak_alias (__re_match, re_match)
284 #endif
285 
286 regoff_t
287 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
288 	   Idx start, regoff_t range, struct re_registers *regs)
289 {
290   return re_search_stub (bufp, string, length, start, range, length, regs,
291 			 false);
292 }
293 #ifdef _LIBC
weak_alias(__re_search,re_search)294 weak_alias (__re_search, re_search)
295 #endif
296 
297 regoff_t
298 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
299 	    const char *string2, Idx length2, Idx start,
300 	    struct re_registers *regs, Idx stop)
301 {
302   return re_search_2_stub (bufp, string1, length1, string2, length2,
303 			   start, 0, regs, stop, true);
304 }
305 #ifdef _LIBC
weak_alias(__re_match_2,re_match_2)306 weak_alias (__re_match_2, re_match_2)
307 #endif
308 
309 regoff_t
310 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
311 	     const char *string2, Idx length2, Idx start, regoff_t range,
312 	     struct re_registers *regs, Idx stop)
313 {
314   return re_search_2_stub (bufp, string1, length1, string2, length2,
315 			   start, range, regs, stop, false);
316 }
317 #ifdef _LIBC
weak_alias(__re_search_2,re_search_2)318 weak_alias (__re_search_2, re_search_2)
319 #endif
320 
321 static regoff_t
322 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
323 		  Idx length1, const char *string2, Idx length2, Idx start,
324 		  regoff_t range, struct re_registers *regs,
325 		  Idx stop, bool ret_len)
326 {
327   const char *str;
328   regoff_t rval;
329   Idx len;
330   char *s = NULL;
331 
332   if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
333 			 || INT_ADD_WRAPV (length1, length2, &len))))
334     return -2;
335 
336   /* Concatenate the strings.  */
337   if (length2 > 0)
338     if (length1 > 0)
339       {
340 	s = re_malloc (char, len);
341 
342 	if (__glibc_unlikely (s == NULL))
343 	  return -2;
344 #ifdef _LIBC
345 	memcpy (__mempcpy (s, string1, length1), string2, length2);
346 #else
347 	memcpy (s, string1, length1);
348 	memcpy (s + length1, string2, length2);
349 #endif
350 	str = s;
351       }
352     else
353       str = string2;
354   else
355     str = string1;
356 
357   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
358 			 ret_len);
359   re_free (s);
360   return rval;
361 }
362 
363 /* The parameters have the same meaning as those of re_search.
364    Additional parameters:
365    If RET_LEN is true the length of the match is returned (re_match style);
366    otherwise the position of the match is returned.  */
367 
368 static regoff_t
re_search_stub(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,regoff_t range,Idx stop,struct re_registers * regs,bool ret_len)369 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
370 		Idx start, regoff_t range, Idx stop, struct re_registers *regs,
371 		bool ret_len)
372 {
373   reg_errcode_t result;
374   regmatch_t *pmatch;
375   Idx nregs;
376   regoff_t rval;
377   int eflags = 0;
378   re_dfa_t *dfa = bufp->buffer;
379   Idx last_start = start + range;
380 
381   /* Check for out-of-range.  */
382   if (__glibc_unlikely (start < 0 || start > length))
383     return -1;
384   if (__glibc_unlikely (length < last_start
385 			|| (0 <= range && last_start < start)))
386     last_start = length;
387   else if (__glibc_unlikely (last_start < 0
388 			     || (range < 0 && start <= last_start)))
389     last_start = 0;
390 
391   lock_lock (dfa->lock);
392 
393   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
394   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
395 
396   /* Compile fastmap if we haven't yet.  */
397   if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
398     re_compile_fastmap (bufp);
399 
400   if (__glibc_unlikely (bufp->no_sub))
401     regs = NULL;
402 
403   /* We need at least 1 register.  */
404   if (regs == NULL)
405     nregs = 1;
406   else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
407 			     && regs->num_regs <= bufp->re_nsub))
408     {
409       nregs = regs->num_regs;
410       if (__glibc_unlikely (nregs < 1))
411 	{
412 	  /* Nothing can be copied to regs.  */
413 	  regs = NULL;
414 	  nregs = 1;
415 	}
416     }
417   else
418     nregs = bufp->re_nsub + 1;
419   pmatch = re_malloc (regmatch_t, nregs);
420   if (__glibc_unlikely (pmatch == NULL))
421     {
422       rval = -2;
423       goto out;
424     }
425 
426   result = re_search_internal (bufp, string, length, start, last_start, stop,
427 			       nregs, pmatch, eflags);
428 
429   rval = 0;
430 
431   /* I hope we needn't fill their regs with -1's when no match was found.  */
432   if (result != REG_NOERROR)
433     rval = result == REG_NOMATCH ? -1 : -2;
434   else if (regs != NULL)
435     {
436       /* If caller wants register contents data back, copy them.  */
437       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
438 					   bufp->regs_allocated);
439       if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
440 	rval = -2;
441     }
442 
443   if (__glibc_likely (rval == 0))
444     {
445       if (ret_len)
446 	{
447 	  DEBUG_ASSERT (pmatch[0].rm_so == start);
448 	  rval = pmatch[0].rm_eo - start;
449 	}
450       else
451 	rval = pmatch[0].rm_so;
452     }
453   re_free (pmatch);
454  out:
455   lock_unlock (dfa->lock);
456   return rval;
457 }
458 
459 static unsigned
re_copy_regs(struct re_registers * regs,regmatch_t * pmatch,Idx nregs,int regs_allocated)460 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
461 	      int regs_allocated)
462 {
463   int rval = REGS_REALLOCATE;
464   Idx i;
465   Idx need_regs = nregs + 1;
466   /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
467      uses.  */
468 
469   /* Have the register data arrays been allocated?  */
470   if (regs_allocated == REGS_UNALLOCATED)
471     { /* No.  So allocate them with malloc.  */
472       regs->start = re_malloc (regoff_t, need_regs);
473       if (__glibc_unlikely (regs->start == NULL))
474 	return REGS_UNALLOCATED;
475       regs->end = re_malloc (regoff_t, need_regs);
476       if (__glibc_unlikely (regs->end == NULL))
477 	{
478 	  re_free (regs->start);
479 	  return REGS_UNALLOCATED;
480 	}
481       regs->num_regs = need_regs;
482     }
483   else if (regs_allocated == REGS_REALLOCATE)
484     { /* Yes.  If we need more elements than were already
485 	 allocated, reallocate them.  If we need fewer, just
486 	 leave it alone.  */
487       if (__glibc_unlikely (need_regs > regs->num_regs))
488 	{
489 	  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
490 	  regoff_t *new_end;
491 	  if (__glibc_unlikely (new_start == NULL))
492 	    return REGS_UNALLOCATED;
493 	  new_end = re_realloc (regs->end, regoff_t, need_regs);
494 	  if (__glibc_unlikely (new_end == NULL))
495 	    {
496 	      re_free (new_start);
497 	      return REGS_UNALLOCATED;
498 	    }
499 	  regs->start = new_start;
500 	  regs->end = new_end;
501 	  regs->num_regs = need_regs;
502 	}
503     }
504   else
505     {
506       DEBUG_ASSERT (regs_allocated == REGS_FIXED);
507       /* This function may not be called with REGS_FIXED and nregs too big.  */
508       DEBUG_ASSERT (nregs <= regs->num_regs);
509       rval = REGS_FIXED;
510     }
511 
512   /* Copy the regs.  */
513   for (i = 0; i < nregs; ++i)
514     {
515       regs->start[i] = pmatch[i].rm_so;
516       regs->end[i] = pmatch[i].rm_eo;
517     }
518   for ( ; i < regs->num_regs; ++i)
519     regs->start[i] = regs->end[i] = -1;
520 
521   return rval;
522 }
523 
524 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
525    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
526    this memory for recording register information.  STARTS and ENDS
527    must be allocated using the malloc library routine, and must each
528    be at least NUM_REGS * sizeof (regoff_t) bytes long.
529 
530    If NUM_REGS == 0, then subsequent matches should allocate their own
531    register data.
532 
533    Unless this function is called, the first search or match using
534    PATTERN_BUFFER will allocate its own register data, without
535    freeing the old data.  */
536 
537 void
re_set_registers(struct re_pattern_buffer * bufp,struct re_registers * regs,__re_size_t num_regs,regoff_t * starts,regoff_t * ends)538 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
539 		  __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
540 {
541   if (num_regs)
542     {
543       bufp->regs_allocated = REGS_REALLOCATE;
544       regs->num_regs = num_regs;
545       regs->start = starts;
546       regs->end = ends;
547     }
548   else
549     {
550       bufp->regs_allocated = REGS_UNALLOCATED;
551       regs->num_regs = 0;
552       regs->start = regs->end = NULL;
553     }
554 }
555 #ifdef _LIBC
weak_alias(__re_set_registers,re_set_registers)556 weak_alias (__re_set_registers, re_set_registers)
557 #endif
558 
559 /* Entry points compatible with 4.2 BSD regex library.  We don't define
560    them unless specifically requested.  */
561 
562 #if defined _REGEX_RE_COMP || defined _LIBC
563 int
564 # ifdef _LIBC
565 weak_function
566 # endif
567 re_exec (const char *s)
568 {
569   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
570 }
571 #endif /* _REGEX_RE_COMP */
572 
573 /* Internal entry point.  */
574 
575 /* Searches for a compiled pattern PREG in the string STRING, whose
576    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
577    meaning as with regexec.  LAST_START is START + RANGE, where
578    START and RANGE have the same meaning as with re_search.
579    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
580    otherwise return the error code.
581    Note: We assume front end functions already check ranges.
582    (0 <= LAST_START && LAST_START <= LENGTH)  */
583 
584 static reg_errcode_t
585 __attribute_warn_unused_result__
re_search_internal(const regex_t * preg,const char * string,Idx length,Idx start,Idx last_start,Idx stop,size_t nmatch,regmatch_t pmatch[],int eflags)586 re_search_internal (const regex_t *preg, const char *string, Idx length,
587 		    Idx start, Idx last_start, Idx stop, size_t nmatch,
588 		    regmatch_t pmatch[], int eflags)
589 {
590   reg_errcode_t err;
591   const re_dfa_t *dfa = preg->buffer;
592   Idx left_lim, right_lim;
593   int incr;
594   bool fl_longest_match;
595   int match_kind;
596   Idx match_first;
597   Idx match_last = -1;
598   Idx extra_nmatch;
599   bool sb;
600   int ch;
601   re_match_context_t mctx = { .dfa = dfa };
602   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
603 		    && start != last_start && !preg->can_be_null)
604 		   ? preg->fastmap : NULL);
605   RE_TRANSLATE_TYPE t = preg->translate;
606 
607   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
608   nmatch -= extra_nmatch;
609 
610   /* Check if the DFA haven't been compiled.  */
611   if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
612 			|| dfa->init_state_word == NULL
613 			|| dfa->init_state_nl == NULL
614 			|| dfa->init_state_begbuf == NULL))
615     return REG_NOMATCH;
616 
617   /* We assume front-end functions already check them.  */
618   DEBUG_ASSERT (0 <= last_start && last_start <= length);
619 
620   /* If initial states with non-begbuf contexts have no elements,
621      the regex must be anchored.  If preg->newline_anchor is set,
622      we'll never use init_state_nl, so do not check it.  */
623   if (dfa->init_state->nodes.nelem == 0
624       && dfa->init_state_word->nodes.nelem == 0
625       && (dfa->init_state_nl->nodes.nelem == 0
626 	  || !preg->newline_anchor))
627     {
628       if (start != 0 && last_start != 0)
629         return REG_NOMATCH;
630       start = last_start = 0;
631     }
632 
633   /* We must check the longest matching, if nmatch > 0.  */
634   fl_longest_match = (nmatch != 0 || dfa->nbackref);
635 
636   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
637 			    preg->translate, (preg->syntax & RE_ICASE) != 0,
638 			    dfa);
639   if (__glibc_unlikely (err != REG_NOERROR))
640     goto free_return;
641   mctx.input.stop = stop;
642   mctx.input.raw_stop = stop;
643   mctx.input.newline_anchor = preg->newline_anchor;
644 
645   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
646   if (__glibc_unlikely (err != REG_NOERROR))
647     goto free_return;
648 
649   /* We will log all the DFA states through which the dfa pass,
650      if nmatch > 1, or this dfa has "multibyte node", which is a
651      back-reference or a node which can accept multibyte character or
652      multi character collating element.  */
653   if (nmatch > 1 || dfa->has_mb_node)
654     {
655       /* Avoid overflow.  */
656       if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
657 			     <= mctx.input.bufs_len)))
658 	{
659 	  err = REG_ESPACE;
660 	  goto free_return;
661 	}
662 
663       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
664       if (__glibc_unlikely (mctx.state_log == NULL))
665 	{
666 	  err = REG_ESPACE;
667 	  goto free_return;
668 	}
669     }
670 
671   match_first = start;
672   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
673 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
674 
675   /* Check incrementally whether the input string matches.  */
676   incr = (last_start < start) ? -1 : 1;
677   left_lim = (last_start < start) ? last_start : start;
678   right_lim = (last_start < start) ? start : last_start;
679   sb = dfa->mb_cur_max == 1;
680   match_kind =
681     (fastmap
682      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
683 	| (start <= last_start ? 2 : 0)
684 	| (t != NULL ? 1 : 0))
685      : 8);
686 
687   for (;; match_first += incr)
688     {
689       err = REG_NOMATCH;
690       if (match_first < left_lim || right_lim < match_first)
691 	goto free_return;
692 
693       /* Advance as rapidly as possible through the string, until we
694 	 find a plausible place to start matching.  This may be done
695 	 with varying efficiency, so there are various possibilities:
696 	 only the most common of them are specialized, in order to
697 	 save on code size.  We use a switch statement for speed.  */
698       switch (match_kind)
699 	{
700 	case 8:
701 	  /* No fastmap.  */
702 	  break;
703 
704 	case 7:
705 	  /* Fastmap with single-byte translation, match forward.  */
706 	  while (__glibc_likely (match_first < right_lim)
707 		 && !fastmap[t[(unsigned char) string[match_first]]])
708 	    ++match_first;
709 	  goto forward_match_found_start_or_reached_end;
710 
711 	case 6:
712 	  /* Fastmap without translation, match forward.  */
713 	  while (__glibc_likely (match_first < right_lim)
714 		 && !fastmap[(unsigned char) string[match_first]])
715 	    ++match_first;
716 
717 	forward_match_found_start_or_reached_end:
718 	  if (__glibc_unlikely (match_first == right_lim))
719 	    {
720 	      ch = match_first >= length
721 		       ? 0 : (unsigned char) string[match_first];
722 	      if (!fastmap[t ? t[ch] : ch])
723 		goto free_return;
724 	    }
725 	  break;
726 
727 	case 4:
728 	case 5:
729 	  /* Fastmap without multi-byte translation, match backwards.  */
730 	  while (match_first >= left_lim)
731 	    {
732 	      ch = match_first >= length
733 		       ? 0 : (unsigned char) string[match_first];
734 	      if (fastmap[t ? t[ch] : ch])
735 		break;
736 	      --match_first;
737 	    }
738 	  if (match_first < left_lim)
739 	    goto free_return;
740 	  break;
741 
742 	default:
743 	  /* In this case, we can't determine easily the current byte,
744 	     since it might be a component byte of a multibyte
745 	     character.  Then we use the constructed buffer instead.  */
746 	  for (;;)
747 	    {
748 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
749 		 buffers.  */
750 	      __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
751 	      if (__glibc_unlikely (offset
752 				    >= (__re_size_t) mctx.input.valid_raw_len))
753 		{
754 		  err = re_string_reconstruct (&mctx.input, match_first,
755 					       eflags);
756 		  if (__glibc_unlikely (err != REG_NOERROR))
757 		    goto free_return;
758 
759 		  offset = match_first - mctx.input.raw_mbs_idx;
760 		}
761 	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
762 		 Note that MATCH_FIRST must not be smaller than 0.  */
763 	      ch = (match_first >= length
764 		    ? 0 : re_string_byte_at (&mctx.input, offset));
765 	      if (fastmap[ch])
766 		break;
767 	      match_first += incr;
768 	      if (match_first < left_lim || match_first > right_lim)
769 		{
770 		  err = REG_NOMATCH;
771 		  goto free_return;
772 		}
773 	    }
774 	  break;
775 	}
776 
777       /* Reconstruct the buffers so that the matcher can assume that
778 	 the matching starts from the beginning of the buffer.  */
779       err = re_string_reconstruct (&mctx.input, match_first, eflags);
780       if (__glibc_unlikely (err != REG_NOERROR))
781 	goto free_return;
782 
783 #ifdef RE_ENABLE_I18N
784      /* Don't consider this char as a possible match start if it part,
785 	yet isn't the head, of a multibyte character.  */
786       if (!sb && !re_string_first_byte (&mctx.input, 0))
787 	continue;
788 #endif
789 
790       /* It seems to be appropriate one, then use the matcher.  */
791       /* We assume that the matching starts from 0.  */
792       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
793       match_last = check_matching (&mctx, fl_longest_match,
794 				   start <= last_start ? &match_first : NULL);
795       if (match_last != -1)
796 	{
797 	  if (__glibc_unlikely (match_last == -2))
798 	    {
799 	      err = REG_ESPACE;
800 	      goto free_return;
801 	    }
802 	  else
803 	    {
804 	      mctx.match_last = match_last;
805 	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
806 		{
807 		  re_dfastate_t *pstate = mctx.state_log[match_last];
808 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
809 							     match_last);
810 		}
811 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
812 		  || dfa->nbackref)
813 		{
814 		  err = prune_impossible_nodes (&mctx);
815 		  if (err == REG_NOERROR)
816 		    break;
817 		  if (__glibc_unlikely (err != REG_NOMATCH))
818 		    goto free_return;
819 		  match_last = -1;
820 		}
821 	      else
822 		break; /* We found a match.  */
823 	    }
824 	}
825 
826       match_ctx_clean (&mctx);
827     }
828 
829   DEBUG_ASSERT (match_last != -1);
830   DEBUG_ASSERT (err == REG_NOERROR);
831 
832   /* Set pmatch[] if we need.  */
833   if (nmatch > 0)
834     {
835       Idx reg_idx;
836 
837       /* Initialize registers.  */
838       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
839 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
840 
841       /* Set the points where matching start/end.  */
842       pmatch[0].rm_so = 0;
843       pmatch[0].rm_eo = mctx.match_last;
844       /* FIXME: This function should fail if mctx.match_last exceeds
845 	 the maximum possible regoff_t value.  We need a new error
846 	 code REG_OVERFLOW.  */
847 
848       if (!preg->no_sub && nmatch > 1)
849 	{
850 	  err = set_regs (preg, &mctx, nmatch, pmatch,
851 			  dfa->has_plural_match && dfa->nbackref > 0);
852 	  if (__glibc_unlikely (err != REG_NOERROR))
853 	    goto free_return;
854 	}
855 
856       /* At last, add the offset to each register, since we slid
857 	 the buffers so that we could assume that the matching starts
858 	 from 0.  */
859       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
860 	if (pmatch[reg_idx].rm_so != -1)
861 	  {
862 #ifdef RE_ENABLE_I18N
863 	    if (__glibc_unlikely (mctx.input.offsets_needed != 0))
864 	      {
865 		pmatch[reg_idx].rm_so =
866 		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
867 		   ? mctx.input.valid_raw_len
868 		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
869 		pmatch[reg_idx].rm_eo =
870 		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
871 		   ? mctx.input.valid_raw_len
872 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
873 	      }
874 #else
875 	    DEBUG_ASSERT (mctx.input.offsets_needed == 0);
876 #endif
877 	    pmatch[reg_idx].rm_so += match_first;
878 	    pmatch[reg_idx].rm_eo += match_first;
879 	  }
880       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
881 	{
882 	  pmatch[nmatch + reg_idx].rm_so = -1;
883 	  pmatch[nmatch + reg_idx].rm_eo = -1;
884 	}
885 
886       if (dfa->subexp_map)
887 	for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
888 	  if (dfa->subexp_map[reg_idx] != reg_idx)
889 	    {
890 	      pmatch[reg_idx + 1].rm_so
891 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
892 	      pmatch[reg_idx + 1].rm_eo
893 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
894 	    }
895     }
896 
897  free_return:
898   re_free (mctx.state_log);
899   if (dfa->nbackref)
900     match_ctx_free (&mctx);
901   re_string_destruct (&mctx.input);
902   return err;
903 }
904 
905 static reg_errcode_t
906 __attribute_warn_unused_result__
prune_impossible_nodes(re_match_context_t * mctx)907 prune_impossible_nodes (re_match_context_t *mctx)
908 {
909   const re_dfa_t *const dfa = mctx->dfa;
910   Idx halt_node, match_last;
911   reg_errcode_t ret;
912   re_dfastate_t **sifted_states;
913   re_dfastate_t **lim_states = NULL;
914   re_sift_context_t sctx;
915   DEBUG_ASSERT (mctx->state_log != NULL);
916   match_last = mctx->match_last;
917   halt_node = mctx->last_node;
918 
919   /* Avoid overflow.  */
920   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
921 			<= match_last))
922     return REG_ESPACE;
923 
924   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
925   if (__glibc_unlikely (sifted_states == NULL))
926     {
927       ret = REG_ESPACE;
928       goto free_return;
929     }
930   if (dfa->nbackref)
931     {
932       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
933       if (__glibc_unlikely (lim_states == NULL))
934 	{
935 	  ret = REG_ESPACE;
936 	  goto free_return;
937 	}
938       while (1)
939 	{
940 	  memset (lim_states, '\0',
941 		  sizeof (re_dfastate_t *) * (match_last + 1));
942 	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
943 			 match_last);
944 	  ret = sift_states_backward (mctx, &sctx);
945 	  re_node_set_free (&sctx.limits);
946 	  if (__glibc_unlikely (ret != REG_NOERROR))
947 	      goto free_return;
948 	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
949 	    break;
950 	  do
951 	    {
952 	      --match_last;
953 	      if (match_last < 0)
954 		{
955 		  ret = REG_NOMATCH;
956 		  goto free_return;
957 		}
958 	    } while (mctx->state_log[match_last] == NULL
959 		     || !mctx->state_log[match_last]->halt);
960 	  halt_node = check_halt_state_context (mctx,
961 						mctx->state_log[match_last],
962 						match_last);
963 	}
964       ret = merge_state_array (dfa, sifted_states, lim_states,
965 			       match_last + 1);
966       re_free (lim_states);
967       lim_states = NULL;
968       if (__glibc_unlikely (ret != REG_NOERROR))
969 	goto free_return;
970     }
971   else
972     {
973       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
974       ret = sift_states_backward (mctx, &sctx);
975       re_node_set_free (&sctx.limits);
976       if (__glibc_unlikely (ret != REG_NOERROR))
977 	goto free_return;
978       if (sifted_states[0] == NULL)
979 	{
980 	  ret = REG_NOMATCH;
981 	  goto free_return;
982 	}
983     }
984   re_free (mctx->state_log);
985   mctx->state_log = sifted_states;
986   sifted_states = NULL;
987   mctx->last_node = halt_node;
988   mctx->match_last = match_last;
989   ret = REG_NOERROR;
990  free_return:
991   re_free (sifted_states);
992   re_free (lim_states);
993   return ret;
994 }
995 
996 /* Acquire an initial state and return it.
997    We must select appropriate initial state depending on the context,
998    since initial states may have constraints like "\<", "^", etc..  */
999 
1000 static inline re_dfastate_t *
1001 __attribute__ ((always_inline))
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,Idx idx)1002 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1003 			    Idx idx)
1004 {
1005   const re_dfa_t *const dfa = mctx->dfa;
1006   if (dfa->init_state->has_constraint)
1007     {
1008       unsigned int context;
1009       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1010       if (IS_WORD_CONTEXT (context))
1011 	return dfa->init_state_word;
1012       else if (IS_ORDINARY_CONTEXT (context))
1013 	return dfa->init_state;
1014       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1015 	return dfa->init_state_begbuf;
1016       else if (IS_NEWLINE_CONTEXT (context))
1017 	return dfa->init_state_nl;
1018       else if (IS_BEGBUF_CONTEXT (context))
1019 	{
1020 	  /* It is relatively rare case, then calculate on demand.  */
1021 	  return re_acquire_state_context (err, dfa,
1022 					   dfa->init_state->entrance_nodes,
1023 					   context);
1024 	}
1025       else
1026 	/* Must not happen?  */
1027 	return dfa->init_state;
1028     }
1029   else
1030     return dfa->init_state;
1031 }
1032 
1033 /* Check whether the regular expression match input string INPUT or not,
1034    and return the index where the matching end.  Return -1 if
1035    there is no match, and return -2 in case of an error.
1036    FL_LONGEST_MATCH means we want the POSIX longest matching.
1037    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1038    next place where we may want to try matching.
1039    Note that the matcher assumes that the matching starts from the current
1040    index of the buffer.  */
1041 
1042 static Idx
1043 __attribute_warn_unused_result__
check_matching(re_match_context_t * mctx,bool fl_longest_match,Idx * p_match_first)1044 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1045 		Idx *p_match_first)
1046 {
1047   const re_dfa_t *const dfa = mctx->dfa;
1048   reg_errcode_t err;
1049   Idx match = 0;
1050   Idx match_last = -1;
1051   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1052   re_dfastate_t *cur_state;
1053   bool at_init_state = p_match_first != NULL;
1054   Idx next_start_idx = cur_str_idx;
1055 
1056   err = REG_NOERROR;
1057   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1058   /* An initial state must not be NULL (invalid).  */
1059   if (__glibc_unlikely (cur_state == NULL))
1060     {
1061       DEBUG_ASSERT (err == REG_ESPACE);
1062       return -2;
1063     }
1064 
1065   if (mctx->state_log != NULL)
1066     {
1067       mctx->state_log[cur_str_idx] = cur_state;
1068 
1069       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1070 	 later.  E.g. Processing back references.  */
1071       if (__glibc_unlikely (dfa->nbackref))
1072 	{
1073 	  at_init_state = false;
1074 	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1075 	  if (__glibc_unlikely (err != REG_NOERROR))
1076 	    return err;
1077 
1078 	  if (cur_state->has_backref)
1079 	    {
1080 	      err = transit_state_bkref (mctx, &cur_state->nodes);
1081 	      if (__glibc_unlikely (err != REG_NOERROR))
1082 		return err;
1083 	    }
1084 	}
1085     }
1086 
1087   /* If the RE accepts NULL string.  */
1088   if (__glibc_unlikely (cur_state->halt))
1089     {
1090       if (!cur_state->has_constraint
1091 	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
1092 	{
1093 	  if (!fl_longest_match)
1094 	    return cur_str_idx;
1095 	  else
1096 	    {
1097 	      match_last = cur_str_idx;
1098 	      match = 1;
1099 	    }
1100 	}
1101     }
1102 
1103   while (!re_string_eoi (&mctx->input))
1104     {
1105       re_dfastate_t *old_state = cur_state;
1106       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1107 
1108       if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1109 	   && mctx->input.bufs_len < mctx->input.len)
1110 	  || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1111 	      && mctx->input.valid_len < mctx->input.len))
1112 	{
1113 	  err = extend_buffers (mctx, next_char_idx + 1);
1114 	  if (__glibc_unlikely (err != REG_NOERROR))
1115 	    {
1116 	      DEBUG_ASSERT (err == REG_ESPACE);
1117 	      return -2;
1118 	    }
1119 	}
1120 
1121       cur_state = transit_state (&err, mctx, cur_state);
1122       if (mctx->state_log != NULL)
1123 	cur_state = merge_state_with_log (&err, mctx, cur_state);
1124 
1125       if (cur_state == NULL)
1126 	{
1127 	  /* Reached the invalid state or an error.  Try to recover a valid
1128 	     state using the state log, if available and if we have not
1129 	     already found a valid (even if not the longest) match.  */
1130 	  if (__glibc_unlikely (err != REG_NOERROR))
1131 	    return -2;
1132 
1133 	  if (mctx->state_log == NULL
1134 	      || (match && !fl_longest_match)
1135 	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
1136 	    break;
1137 	}
1138 
1139       if (__glibc_unlikely (at_init_state))
1140 	{
1141 	  if (old_state == cur_state)
1142 	    next_start_idx = next_char_idx;
1143 	  else
1144 	    at_init_state = false;
1145 	}
1146 
1147       if (cur_state->halt)
1148 	{
1149 	  /* Reached a halt state.
1150 	     Check the halt state can satisfy the current context.  */
1151 	  if (!cur_state->has_constraint
1152 	      || check_halt_state_context (mctx, cur_state,
1153 					   re_string_cur_idx (&mctx->input)))
1154 	    {
1155 	      /* We found an appropriate halt state.  */
1156 	      match_last = re_string_cur_idx (&mctx->input);
1157 	      match = 1;
1158 
1159 	      /* We found a match, do not modify match_first below.  */
1160 	      p_match_first = NULL;
1161 	      if (!fl_longest_match)
1162 		break;
1163 	    }
1164 	}
1165     }
1166 
1167   if (p_match_first)
1168     *p_match_first += next_start_idx;
1169 
1170   return match_last;
1171 }
1172 
1173 /* Check NODE match the current context.  */
1174 
1175 static bool
check_halt_node_context(const re_dfa_t * dfa,Idx node,unsigned int context)1176 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1177 {
1178   re_token_type_t type = dfa->nodes[node].type;
1179   unsigned int constraint = dfa->nodes[node].constraint;
1180   if (type != END_OF_RE)
1181     return false;
1182   if (!constraint)
1183     return true;
1184   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1185     return false;
1186   return true;
1187 }
1188 
1189 /* Check the halt state STATE match the current context.
1190    Return 0 if not match, if the node, STATE has, is a halt node and
1191    match the context, return the node.  */
1192 
1193 static Idx
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,Idx idx)1194 check_halt_state_context (const re_match_context_t *mctx,
1195 			  const re_dfastate_t *state, Idx idx)
1196 {
1197   Idx i;
1198   unsigned int context;
1199   DEBUG_ASSERT (state->halt);
1200   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1201   for (i = 0; i < state->nodes.nelem; ++i)
1202     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1203       return state->nodes.elems[i];
1204   return 0;
1205 }
1206 
1207 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1208    corresponding to the DFA).
1209    Return the destination node, and update EPS_VIA_NODES;
1210    return -1 on match failure, -2 on error.  */
1211 
1212 static Idx
proceed_next_node(const re_match_context_t * mctx,Idx nregs,regmatch_t * regs,regmatch_t * prevregs,Idx * pidx,Idx node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)1213 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1214 		   regmatch_t *prevregs,
1215 		   Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1216 		   struct re_fail_stack_t *fs)
1217 {
1218   const re_dfa_t *const dfa = mctx->dfa;
1219   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1220     {
1221       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1222       re_node_set *edests = &dfa->edests[node];
1223       bool ok = re_node_set_insert (eps_via_nodes, node);
1224       if (__glibc_unlikely (! ok))
1225 	return -2;
1226 
1227       /* Pick a valid destination, or return -1 if none is found.  */
1228       Idx dest_node = -1;
1229       for (Idx i = 0; i < edests->nelem; i++)
1230 	{
1231 	  Idx candidate = edests->elems[i];
1232 	  if (!re_node_set_contains (cur_nodes, candidate))
1233 	    continue;
1234           if (dest_node == -1)
1235 	    dest_node = candidate;
1236 
1237 	  else
1238 	    {
1239 	      /* In order to avoid infinite loop like "(a*)*", return the second
1240 		 epsilon-transition if the first was already considered.  */
1241 	      if (re_node_set_contains (eps_via_nodes, dest_node))
1242 		return candidate;
1243 
1244 	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
1245 	      else if (fs != NULL
1246 		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1247 					   prevregs, eps_via_nodes))
1248 		return -2;
1249 
1250 	      /* We know we are going to exit.  */
1251 	      break;
1252 	    }
1253 	}
1254       return dest_node;
1255     }
1256   else
1257     {
1258       Idx naccepted = 0;
1259       re_token_type_t type = dfa->nodes[node].type;
1260 
1261 #ifdef RE_ENABLE_I18N
1262       if (dfa->nodes[node].accept_mb)
1263 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1264       else
1265 #endif /* RE_ENABLE_I18N */
1266       if (type == OP_BACK_REF)
1267 	{
1268 	  Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1269 	  if (subexp_idx < nregs)
1270 	    naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1271 	  if (fs != NULL)
1272 	    {
1273 	      if (subexp_idx >= nregs
1274 		  || regs[subexp_idx].rm_so == -1
1275 		  || regs[subexp_idx].rm_eo == -1)
1276 		return -1;
1277 	      else if (naccepted)
1278 		{
1279 		  char *buf = (char *) re_string_get_buffer (&mctx->input);
1280 		  if (mctx->input.valid_len - *pidx < naccepted
1281 		      || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1282 				  naccepted)
1283 			  != 0))
1284 		    return -1;
1285 		}
1286 	    }
1287 
1288 	  if (naccepted == 0)
1289 	    {
1290 	      Idx dest_node;
1291 	      bool ok = re_node_set_insert (eps_via_nodes, node);
1292 	      if (__glibc_unlikely (! ok))
1293 		return -2;
1294 	      dest_node = dfa->edests[node].elems[0];
1295 	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1296 					dest_node))
1297 		return dest_node;
1298 	    }
1299 	}
1300 
1301       if (naccepted != 0
1302 	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
1303 	{
1304 	  Idx dest_node = dfa->nexts[node];
1305 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1306 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1307 		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1308 					       dest_node)))
1309 	    return -1;
1310 	  re_node_set_empty (eps_via_nodes);
1311 	  return dest_node;
1312 	}
1313     }
1314   return -1;
1315 }
1316 
1317 static reg_errcode_t
1318 __attribute_warn_unused_result__
push_fail_stack(struct re_fail_stack_t * fs,Idx str_idx,Idx dest_node,Idx nregs,regmatch_t * regs,regmatch_t * prevregs,re_node_set * eps_via_nodes)1319 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1320 		 Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
1321 		 re_node_set *eps_via_nodes)
1322 {
1323   reg_errcode_t err;
1324   Idx num = fs->num++;
1325   if (fs->num == fs->alloc)
1326     {
1327       struct re_fail_stack_ent_t *new_array;
1328       new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1329                               fs->alloc * 2);
1330       if (new_array == NULL)
1331 	return REG_ESPACE;
1332       fs->alloc *= 2;
1333       fs->stack = new_array;
1334     }
1335   fs->stack[num].idx = str_idx;
1336   fs->stack[num].node = dest_node;
1337   fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1338   if (fs->stack[num].regs == NULL)
1339     return REG_ESPACE;
1340   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1341   memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1342   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1343   return err;
1344 }
1345 
1346 static Idx
pop_fail_stack(struct re_fail_stack_t * fs,Idx * pidx,Idx nregs,regmatch_t * regs,regmatch_t * prevregs,re_node_set * eps_via_nodes)1347 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1348 		regmatch_t *regs, regmatch_t *prevregs,
1349 		re_node_set *eps_via_nodes)
1350 {
1351   if (fs == NULL || fs->num == 0)
1352     return -1;
1353   Idx num = --fs->num;
1354   *pidx = fs->stack[num].idx;
1355   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1356   memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
1357   re_node_set_free (eps_via_nodes);
1358   re_free (fs->stack[num].regs);
1359   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1360   DEBUG_ASSERT (0 <= fs->stack[num].node);
1361   return fs->stack[num].node;
1362 }
1363 
1364 
1365 #define DYNARRAY_STRUCT  regmatch_list
1366 #define DYNARRAY_ELEMENT regmatch_t
1367 #define DYNARRAY_PREFIX  regmatch_list_
1368 #include <malloc/dynarray-skeleton.c>
1369 
1370 /* Set the positions where the subexpressions are starts/ends to registers
1371    PMATCH.
1372    Note: We assume that pmatch[0] is already set, and
1373    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1374 
1375 static reg_errcode_t
1376 __attribute_warn_unused_result__
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,bool fl_backtrack)1377 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1378 	  regmatch_t *pmatch, bool fl_backtrack)
1379 {
1380   const re_dfa_t *dfa = preg->buffer;
1381   Idx idx, cur_node;
1382   re_node_set eps_via_nodes;
1383   struct re_fail_stack_t *fs;
1384   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1385   struct regmatch_list prev_match;
1386   regmatch_list_init (&prev_match);
1387 
1388   DEBUG_ASSERT (nmatch > 1);
1389   DEBUG_ASSERT (mctx->state_log != NULL);
1390   if (fl_backtrack)
1391     {
1392       fs = &fs_body;
1393       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1394       if (fs->stack == NULL)
1395 	return REG_ESPACE;
1396     }
1397   else
1398     fs = NULL;
1399 
1400   cur_node = dfa->init_node;
1401   re_node_set_init_empty (&eps_via_nodes);
1402 
1403   if (!regmatch_list_resize (&prev_match, nmatch))
1404     {
1405       regmatch_list_free (&prev_match);
1406       free_fail_stack_return (fs);
1407       return REG_ESPACE;
1408     }
1409   regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1410   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1411 
1412   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1413     {
1414       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1415 
1416       if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1417 	  || re_node_set_contains (&eps_via_nodes, cur_node))
1418 	{
1419 	  Idx reg_idx;
1420 	  cur_node = -1;
1421 	  if (fs)
1422 	    {
1423 	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1424 		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1425 		  {
1426 		    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1427 					       prev_idx_match, &eps_via_nodes);
1428 		    break;
1429 		  }
1430 	    }
1431 	  if (cur_node < 0)
1432 	    {
1433 	      re_node_set_free (&eps_via_nodes);
1434 	      regmatch_list_free (&prev_match);
1435 	      return free_fail_stack_return (fs);
1436 	    }
1437 	}
1438 
1439       /* Proceed to next node.  */
1440       cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1441 				    &idx, cur_node,
1442 				    &eps_via_nodes, fs);
1443 
1444       if (__glibc_unlikely (cur_node < 0))
1445 	{
1446 	  if (__glibc_unlikely (cur_node == -2))
1447 	    {
1448 	      re_node_set_free (&eps_via_nodes);
1449 	      regmatch_list_free (&prev_match);
1450 	      free_fail_stack_return (fs);
1451 	      return REG_ESPACE;
1452 	    }
1453 	  cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1454 				     prev_idx_match, &eps_via_nodes);
1455 	  if (cur_node < 0)
1456 	    {
1457 	      re_node_set_free (&eps_via_nodes);
1458 	      regmatch_list_free (&prev_match);
1459 	      free_fail_stack_return (fs);
1460 	      return REG_NOMATCH;
1461 	    }
1462 	}
1463     }
1464   re_node_set_free (&eps_via_nodes);
1465   regmatch_list_free (&prev_match);
1466   return free_fail_stack_return (fs);
1467 }
1468 
1469 static reg_errcode_t
free_fail_stack_return(struct re_fail_stack_t * fs)1470 free_fail_stack_return (struct re_fail_stack_t *fs)
1471 {
1472   if (fs)
1473     {
1474       Idx fs_idx;
1475       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1476 	{
1477 	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1478 	  re_free (fs->stack[fs_idx].regs);
1479 	}
1480       re_free (fs->stack);
1481     }
1482   return REG_NOERROR;
1483 }
1484 
1485 static void
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,Idx cur_node,Idx cur_idx,Idx nmatch)1486 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1487 	     regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1488 {
1489   int type = dfa->nodes[cur_node].type;
1490   if (type == OP_OPEN_SUBEXP)
1491     {
1492       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1493 
1494       /* We are at the first node of this sub expression.  */
1495       if (reg_num < nmatch)
1496 	{
1497 	  pmatch[reg_num].rm_so = cur_idx;
1498 	  pmatch[reg_num].rm_eo = -1;
1499 	}
1500     }
1501   else if (type == OP_CLOSE_SUBEXP)
1502     {
1503       /* We are at the last node of this sub expression.  */
1504       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1505       if (reg_num < nmatch)
1506 	{
1507 	  if (pmatch[reg_num].rm_so < cur_idx)
1508 	    {
1509 	      pmatch[reg_num].rm_eo = cur_idx;
1510 	      /* This is a non-empty match or we are not inside an optional
1511 		 subexpression.  Accept this right away.  */
1512 	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1513 	    }
1514 	  else
1515 	    {
1516 	      if (dfa->nodes[cur_node].opt_subexp
1517 		  && prev_idx_match[reg_num].rm_so != -1)
1518 		/* We transited through an empty match for an optional
1519 		   subexpression, like (a?)*, and this is not the subexp's
1520 		   first match.  Copy back the old content of the registers
1521 		   so that matches of an inner subexpression are undone as
1522 		   well, like in ((a?))*.  */
1523 		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1524 	      else
1525 		/* We completed a subexpression, but it may be part of
1526 		   an optional one, so do not update PREV_IDX_MATCH.  */
1527 		pmatch[reg_num].rm_eo = cur_idx;
1528 	    }
1529 	}
1530     }
1531 }
1532 
1533 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1534    and sift the nodes in each states according to the following rules.
1535    Updated state_log will be wrote to STATE_LOG.
1536 
1537    Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1538      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1539 	If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1540 	the LAST_NODE, we throw away the node 'a'.
1541      2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1542 	string 's' and transit to 'b':
1543 	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1544 	   away the node 'a'.
1545 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1546 	    thrown away, we throw away the node 'a'.
1547      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1548 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1549 	   node 'a'.
1550 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1551 	    we throw away the node 'a'.  */
1552 
1553 #define STATE_NODE_CONTAINS(state,node) \
1554   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1555 
1556 static reg_errcode_t
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)1557 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1558 {
1559   reg_errcode_t err;
1560   int null_cnt = 0;
1561   Idx str_idx = sctx->last_str_idx;
1562   re_node_set cur_dest;
1563 
1564   DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1565 
1566   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1567      transit to the last_node and the last_node itself.  */
1568   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1569   if (__glibc_unlikely (err != REG_NOERROR))
1570     return err;
1571   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1572   if (__glibc_unlikely (err != REG_NOERROR))
1573     goto free_return;
1574 
1575   /* Then check each states in the state_log.  */
1576   while (str_idx > 0)
1577     {
1578       /* Update counters.  */
1579       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1580       if (null_cnt > mctx->max_mb_elem_len)
1581 	{
1582 	  memset (sctx->sifted_states, '\0',
1583 		  sizeof (re_dfastate_t *) * str_idx);
1584 	  re_node_set_free (&cur_dest);
1585 	  return REG_NOERROR;
1586 	}
1587       re_node_set_empty (&cur_dest);
1588       --str_idx;
1589 
1590       if (mctx->state_log[str_idx])
1591 	{
1592 	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1593 	  if (__glibc_unlikely (err != REG_NOERROR))
1594 	    goto free_return;
1595 	}
1596 
1597       /* Add all the nodes which satisfy the following conditions:
1598 	 - It can epsilon transit to a node in CUR_DEST.
1599 	 - It is in CUR_SRC.
1600 	 And update state_log.  */
1601       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1602       if (__glibc_unlikely (err != REG_NOERROR))
1603 	goto free_return;
1604     }
1605   err = REG_NOERROR;
1606  free_return:
1607   re_node_set_free (&cur_dest);
1608   return err;
1609 }
1610 
1611 static reg_errcode_t
1612 __attribute_warn_unused_result__
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * cur_dest)1613 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1614 		     Idx str_idx, re_node_set *cur_dest)
1615 {
1616   const re_dfa_t *const dfa = mctx->dfa;
1617   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1618   Idx i;
1619 
1620   /* Then build the next sifted state.
1621      We build the next sifted state on 'cur_dest', and update
1622      'sifted_states[str_idx]' with 'cur_dest'.
1623      Note:
1624      'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1625      'cur_src' points the node_set of the old 'state_log[str_idx]'
1626      (with the epsilon nodes pre-filtered out).  */
1627   for (i = 0; i < cur_src->nelem; i++)
1628     {
1629       Idx prev_node = cur_src->elems[i];
1630       int naccepted = 0;
1631       bool ok;
1632       DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1633 
1634 #ifdef RE_ENABLE_I18N
1635       /* If the node may accept "multi byte".  */
1636       if (dfa->nodes[prev_node].accept_mb)
1637 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1638 					 str_idx, sctx->last_str_idx);
1639 #endif /* RE_ENABLE_I18N */
1640 
1641       /* We don't check backreferences here.
1642 	 See update_cur_sifted_state().  */
1643       if (!naccepted
1644 	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1645 	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1646 				  dfa->nexts[prev_node]))
1647 	naccepted = 1;
1648 
1649       if (naccepted == 0)
1650 	continue;
1651 
1652       if (sctx->limits.nelem)
1653 	{
1654 	  Idx to_idx = str_idx + naccepted;
1655 	  if (check_dst_limits (mctx, &sctx->limits,
1656 				dfa->nexts[prev_node], to_idx,
1657 				prev_node, str_idx))
1658 	    continue;
1659 	}
1660       ok = re_node_set_insert (cur_dest, prev_node);
1661       if (__glibc_unlikely (! ok))
1662 	return REG_ESPACE;
1663     }
1664 
1665   return REG_NOERROR;
1666 }
1667 
1668 /* Helper functions.  */
1669 
1670 static reg_errcode_t
clean_state_log_if_needed(re_match_context_t * mctx,Idx next_state_log_idx)1671 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1672 {
1673   Idx top = mctx->state_log_top;
1674 
1675   if ((next_state_log_idx >= mctx->input.bufs_len
1676        && mctx->input.bufs_len < mctx->input.len)
1677       || (next_state_log_idx >= mctx->input.valid_len
1678 	  && mctx->input.valid_len < mctx->input.len))
1679     {
1680       reg_errcode_t err;
1681       err = extend_buffers (mctx, next_state_log_idx + 1);
1682       if (__glibc_unlikely (err != REG_NOERROR))
1683 	return err;
1684     }
1685 
1686   if (top < next_state_log_idx)
1687     {
1688       memset (mctx->state_log + top + 1, '\0',
1689 	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1690       mctx->state_log_top = next_state_log_idx;
1691     }
1692   return REG_NOERROR;
1693 }
1694 
1695 static reg_errcode_t
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,Idx num)1696 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1697 		   re_dfastate_t **src, Idx num)
1698 {
1699   Idx st_idx;
1700   reg_errcode_t err;
1701   for (st_idx = 0; st_idx < num; ++st_idx)
1702     {
1703       if (dst[st_idx] == NULL)
1704 	dst[st_idx] = src[st_idx];
1705       else if (src[st_idx] != NULL)
1706 	{
1707 	  re_node_set merged_set;
1708 	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1709 					&src[st_idx]->nodes);
1710 	  if (__glibc_unlikely (err != REG_NOERROR))
1711 	    return err;
1712 	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1713 	  re_node_set_free (&merged_set);
1714 	  if (__glibc_unlikely (err != REG_NOERROR))
1715 	    return err;
1716 	}
1717     }
1718   return REG_NOERROR;
1719 }
1720 
1721 static reg_errcode_t
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * dest_nodes)1722 update_cur_sifted_state (const re_match_context_t *mctx,
1723 			 re_sift_context_t *sctx, Idx str_idx,
1724 			 re_node_set *dest_nodes)
1725 {
1726   const re_dfa_t *const dfa = mctx->dfa;
1727   reg_errcode_t err = REG_NOERROR;
1728   const re_node_set *candidates;
1729   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1730 		: &mctx->state_log[str_idx]->nodes);
1731 
1732   if (dest_nodes->nelem == 0)
1733     sctx->sifted_states[str_idx] = NULL;
1734   else
1735     {
1736       if (candidates)
1737 	{
1738 	  /* At first, add the nodes which can epsilon transit to a node in
1739 	     DEST_NODE.  */
1740 	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1741 	  if (__glibc_unlikely (err != REG_NOERROR))
1742 	    return err;
1743 
1744 	  /* Then, check the limitations in the current sift_context.  */
1745 	  if (sctx->limits.nelem)
1746 	    {
1747 	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1748 					 mctx->bkref_ents, str_idx);
1749 	      if (__glibc_unlikely (err != REG_NOERROR))
1750 		return err;
1751 	    }
1752 	}
1753 
1754       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1755       if (__glibc_unlikely (err != REG_NOERROR))
1756 	return err;
1757     }
1758 
1759   if (candidates && mctx->state_log[str_idx]->has_backref)
1760     {
1761       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1762       if (__glibc_unlikely (err != REG_NOERROR))
1763 	return err;
1764     }
1765   return REG_NOERROR;
1766 }
1767 
1768 static reg_errcode_t
1769 __attribute_warn_unused_result__
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)1770 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1771 		       const re_node_set *candidates)
1772 {
1773   reg_errcode_t err = REG_NOERROR;
1774   Idx i;
1775 
1776   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1777   if (__glibc_unlikely (err != REG_NOERROR))
1778     return err;
1779 
1780   if (!state->inveclosure.alloc)
1781     {
1782       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1783       if (__glibc_unlikely (err != REG_NOERROR))
1784 	return REG_ESPACE;
1785       for (i = 0; i < dest_nodes->nelem; i++)
1786 	{
1787 	  err = re_node_set_merge (&state->inveclosure,
1788 				   dfa->inveclosures + dest_nodes->elems[i]);
1789 	  if (__glibc_unlikely (err != REG_NOERROR))
1790 	    return REG_ESPACE;
1791 	}
1792     }
1793   return re_node_set_add_intersect (dest_nodes, candidates,
1794 				    &state->inveclosure);
1795 }
1796 
1797 static reg_errcode_t
sub_epsilon_src_nodes(const re_dfa_t * dfa,Idx node,re_node_set * dest_nodes,const re_node_set * candidates)1798 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1799 		       const re_node_set *candidates)
1800 {
1801     Idx ecl_idx;
1802     reg_errcode_t err;
1803     re_node_set *inv_eclosure = dfa->inveclosures + node;
1804     re_node_set except_nodes;
1805     re_node_set_init_empty (&except_nodes);
1806     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1807       {
1808 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1809 	if (cur_node == node)
1810 	  continue;
1811 	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1812 	  {
1813 	    Idx edst1 = dfa->edests[cur_node].elems[0];
1814 	    Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1815 			 ? dfa->edests[cur_node].elems[1] : -1);
1816 	    if ((!re_node_set_contains (inv_eclosure, edst1)
1817 		 && re_node_set_contains (dest_nodes, edst1))
1818 		|| (edst2 > 0
1819 		    && !re_node_set_contains (inv_eclosure, edst2)
1820 		    && re_node_set_contains (dest_nodes, edst2)))
1821 	      {
1822 		err = re_node_set_add_intersect (&except_nodes, candidates,
1823 						 dfa->inveclosures + cur_node);
1824 		if (__glibc_unlikely (err != REG_NOERROR))
1825 		  {
1826 		    re_node_set_free (&except_nodes);
1827 		    return err;
1828 		  }
1829 	      }
1830 	  }
1831       }
1832     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1833       {
1834 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1835 	if (!re_node_set_contains (&except_nodes, cur_node))
1836 	  {
1837 	    Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1838 	    re_node_set_remove_at (dest_nodes, idx);
1839 	  }
1840       }
1841     re_node_set_free (&except_nodes);
1842     return REG_NOERROR;
1843 }
1844 
1845 static bool
check_dst_limits(const re_match_context_t * mctx,const re_node_set * limits,Idx dst_node,Idx dst_idx,Idx src_node,Idx src_idx)1846 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1847 		  Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1848 {
1849   const re_dfa_t *const dfa = mctx->dfa;
1850   Idx lim_idx, src_pos, dst_pos;
1851 
1852   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1853   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1854   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1855     {
1856       Idx subexp_idx;
1857       struct re_backref_cache_entry *ent;
1858       ent = mctx->bkref_ents + limits->elems[lim_idx];
1859       subexp_idx = dfa->nodes[ent->node].opr.idx;
1860 
1861       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1862 					   subexp_idx, dst_node, dst_idx,
1863 					   dst_bkref_idx);
1864       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1865 					   subexp_idx, src_node, src_idx,
1866 					   src_bkref_idx);
1867 
1868       /* In case of:
1869 	 <src> <dst> ( <subexp> )
1870 	 ( <subexp> ) <src> <dst>
1871 	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1872       if (src_pos == dst_pos)
1873 	continue; /* This is unrelated limitation.  */
1874       else
1875 	return true;
1876     }
1877   return false;
1878 }
1879 
1880 static int
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,Idx subexp_idx,Idx from_node,Idx bkref_idx)1881 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1882 			     Idx subexp_idx, Idx from_node, Idx bkref_idx)
1883 {
1884   const re_dfa_t *const dfa = mctx->dfa;
1885   const re_node_set *eclosures = dfa->eclosures + from_node;
1886   Idx node_idx;
1887 
1888   /* Else, we are on the boundary: examine the nodes on the epsilon
1889      closure.  */
1890   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1891     {
1892       Idx node = eclosures->elems[node_idx];
1893       switch (dfa->nodes[node].type)
1894 	{
1895 	case OP_BACK_REF:
1896 	  if (bkref_idx != -1)
1897 	    {
1898 	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1899 	      do
1900 		{
1901 		  Idx dst;
1902 		  int cpos;
1903 
1904 		  if (ent->node != node)
1905 		    continue;
1906 
1907 		  if (subexp_idx < BITSET_WORD_BITS
1908 		      && !(ent->eps_reachable_subexps_map
1909 			   & ((bitset_word_t) 1 << subexp_idx)))
1910 		    continue;
1911 
1912 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1913 		     OP_CLOSE_SUBEXP cases below.  But, if the
1914 		     destination node is the same node as the source
1915 		     node, don't recurse because it would cause an
1916 		     infinite loop: a regex that exhibits this behavior
1917 		     is ()\1*\1*  */
1918 		  dst = dfa->edests[node].elems[0];
1919 		  if (dst == from_node)
1920 		    {
1921 		      if (boundaries & 1)
1922 			return -1;
1923 		      else /* if (boundaries & 2) */
1924 			return 0;
1925 		    }
1926 
1927 		  cpos =
1928 		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1929 						 dst, bkref_idx);
1930 		  if (cpos == -1 /* && (boundaries & 1) */)
1931 		    return -1;
1932 		  if (cpos == 0 && (boundaries & 2))
1933 		    return 0;
1934 
1935 		  if (subexp_idx < BITSET_WORD_BITS)
1936 		    ent->eps_reachable_subexps_map
1937 		      &= ~((bitset_word_t) 1 << subexp_idx);
1938 		}
1939 	      while (ent++->more);
1940 	    }
1941 	  break;
1942 
1943 	case OP_OPEN_SUBEXP:
1944 	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1945 	    return -1;
1946 	  break;
1947 
1948 	case OP_CLOSE_SUBEXP:
1949 	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1950 	    return 0;
1951 	  break;
1952 
1953 	default:
1954 	    break;
1955 	}
1956     }
1957 
1958   return (boundaries & 2) ? 1 : 0;
1959 }
1960 
1961 static int
check_dst_limits_calc_pos(const re_match_context_t * mctx,Idx limit,Idx subexp_idx,Idx from_node,Idx str_idx,Idx bkref_idx)1962 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1963 			   Idx subexp_idx, Idx from_node, Idx str_idx,
1964 			   Idx bkref_idx)
1965 {
1966   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1967   int boundaries;
1968 
1969   /* If we are outside the range of the subexpression, return -1 or 1.  */
1970   if (str_idx < lim->subexp_from)
1971     return -1;
1972 
1973   if (lim->subexp_to < str_idx)
1974     return 1;
1975 
1976   /* If we are within the subexpression, return 0.  */
1977   boundaries = (str_idx == lim->subexp_from);
1978   boundaries |= (str_idx == lim->subexp_to) << 1;
1979   if (boundaries == 0)
1980     return 0;
1981 
1982   /* Else, examine epsilon closure.  */
1983   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1984 				      from_node, bkref_idx);
1985 }
1986 
1987 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1988    which are against limitations from DEST_NODES. */
1989 
1990 static reg_errcode_t
check_subexp_limits(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,Idx str_idx)1991 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1992 		     const re_node_set *candidates, re_node_set *limits,
1993 		     struct re_backref_cache_entry *bkref_ents, Idx str_idx)
1994 {
1995   reg_errcode_t err;
1996   Idx node_idx, lim_idx;
1997 
1998   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1999     {
2000       Idx subexp_idx;
2001       struct re_backref_cache_entry *ent;
2002       ent = bkref_ents + limits->elems[lim_idx];
2003 
2004       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2005 	continue; /* This is unrelated limitation.  */
2006 
2007       subexp_idx = dfa->nodes[ent->node].opr.idx;
2008       if (ent->subexp_to == str_idx)
2009 	{
2010 	  Idx ops_node = -1;
2011 	  Idx cls_node = -1;
2012 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2013 	    {
2014 	      Idx node = dest_nodes->elems[node_idx];
2015 	      re_token_type_t type = dfa->nodes[node].type;
2016 	      if (type == OP_OPEN_SUBEXP
2017 		  && subexp_idx == dfa->nodes[node].opr.idx)
2018 		ops_node = node;
2019 	      else if (type == OP_CLOSE_SUBEXP
2020 		       && subexp_idx == dfa->nodes[node].opr.idx)
2021 		cls_node = node;
2022 	    }
2023 
2024 	  /* Check the limitation of the open subexpression.  */
2025 	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2026 	  if (ops_node >= 0)
2027 	    {
2028 	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2029 					   candidates);
2030 	      if (__glibc_unlikely (err != REG_NOERROR))
2031 		return err;
2032 	    }
2033 
2034 	  /* Check the limitation of the close subexpression.  */
2035 	  if (cls_node >= 0)
2036 	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2037 	      {
2038 		Idx node = dest_nodes->elems[node_idx];
2039 		if (!re_node_set_contains (dfa->inveclosures + node,
2040 					   cls_node)
2041 		    && !re_node_set_contains (dfa->eclosures + node,
2042 					      cls_node))
2043 		  {
2044 		    /* It is against this limitation.
2045 		       Remove it form the current sifted state.  */
2046 		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2047 						 candidates);
2048 		    if (__glibc_unlikely (err != REG_NOERROR))
2049 		      return err;
2050 		    --node_idx;
2051 		  }
2052 	      }
2053 	}
2054       else /* (ent->subexp_to != str_idx)  */
2055 	{
2056 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2057 	    {
2058 	      Idx node = dest_nodes->elems[node_idx];
2059 	      re_token_type_t type = dfa->nodes[node].type;
2060 	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2061 		{
2062 		  if (subexp_idx != dfa->nodes[node].opr.idx)
2063 		    continue;
2064 		  /* It is against this limitation.
2065 		     Remove it form the current sifted state.  */
2066 		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2067 					       candidates);
2068 		  if (__glibc_unlikely (err != REG_NOERROR))
2069 		    return err;
2070 		}
2071 	    }
2072 	}
2073     }
2074   return REG_NOERROR;
2075 }
2076 
2077 static reg_errcode_t
2078 __attribute_warn_unused_result__
sift_states_bkref(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,const re_node_set * candidates)2079 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2080 		   Idx str_idx, const re_node_set *candidates)
2081 {
2082   const re_dfa_t *const dfa = mctx->dfa;
2083   reg_errcode_t err;
2084   Idx node_idx, node;
2085   re_sift_context_t local_sctx;
2086   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2087 
2088   if (first_idx == -1)
2089     return REG_NOERROR;
2090 
2091   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2092 
2093   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2094     {
2095       Idx enabled_idx;
2096       re_token_type_t type;
2097       struct re_backref_cache_entry *entry;
2098       node = candidates->elems[node_idx];
2099       type = dfa->nodes[node].type;
2100       /* Avoid infinite loop for the REs like "()\1+".  */
2101       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2102 	continue;
2103       if (type != OP_BACK_REF)
2104 	continue;
2105 
2106       entry = mctx->bkref_ents + first_idx;
2107       enabled_idx = first_idx;
2108       do
2109 	{
2110 	  Idx subexp_len;
2111 	  Idx to_idx;
2112 	  Idx dst_node;
2113 	  bool ok;
2114 	  re_dfastate_t *cur_state;
2115 
2116 	  if (entry->node != node)
2117 	    continue;
2118 	  subexp_len = entry->subexp_to - entry->subexp_from;
2119 	  to_idx = str_idx + subexp_len;
2120 	  dst_node = (subexp_len ? dfa->nexts[node]
2121 		      : dfa->edests[node].elems[0]);
2122 
2123 	  if (to_idx > sctx->last_str_idx
2124 	      || sctx->sifted_states[to_idx] == NULL
2125 	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2126 	      || check_dst_limits (mctx, &sctx->limits, node,
2127 				   str_idx, dst_node, to_idx))
2128 	    continue;
2129 
2130 	  if (local_sctx.sifted_states == NULL)
2131 	    {
2132 	      local_sctx = *sctx;
2133 	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2134 	      if (__glibc_unlikely (err != REG_NOERROR))
2135 		goto free_return;
2136 	    }
2137 	  local_sctx.last_node = node;
2138 	  local_sctx.last_str_idx = str_idx;
2139 	  ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2140 	  if (__glibc_unlikely (! ok))
2141 	    {
2142 	      err = REG_ESPACE;
2143 	      goto free_return;
2144 	    }
2145 	  cur_state = local_sctx.sifted_states[str_idx];
2146 	  err = sift_states_backward (mctx, &local_sctx);
2147 	  if (__glibc_unlikely (err != REG_NOERROR))
2148 	    goto free_return;
2149 	  if (sctx->limited_states != NULL)
2150 	    {
2151 	      err = merge_state_array (dfa, sctx->limited_states,
2152 				       local_sctx.sifted_states,
2153 				       str_idx + 1);
2154 	      if (__glibc_unlikely (err != REG_NOERROR))
2155 		goto free_return;
2156 	    }
2157 	  local_sctx.sifted_states[str_idx] = cur_state;
2158 	  re_node_set_remove (&local_sctx.limits, enabled_idx);
2159 
2160 	  /* mctx->bkref_ents may have changed, reload the pointer.  */
2161 	  entry = mctx->bkref_ents + enabled_idx;
2162 	}
2163       while (enabled_idx++, entry++->more);
2164     }
2165   err = REG_NOERROR;
2166  free_return:
2167   if (local_sctx.sifted_states != NULL)
2168     {
2169       re_node_set_free (&local_sctx.limits);
2170     }
2171 
2172   return err;
2173 }
2174 
2175 
2176 #ifdef RE_ENABLE_I18N
2177 static int
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx node_idx,Idx str_idx,Idx max_str_idx)2178 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2179 		     Idx node_idx, Idx str_idx, Idx max_str_idx)
2180 {
2181   const re_dfa_t *const dfa = mctx->dfa;
2182   int naccepted;
2183   /* Check the node can accept "multi byte".  */
2184   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2185   if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2186       && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2187 			       dfa->nexts[node_idx]))
2188     /* The node can't accept the "multi byte", or the
2189        destination was already thrown away, then the node
2190        couldn't accept the current input "multi byte".   */
2191     naccepted = 0;
2192   /* Otherwise, it is sure that the node could accept
2193      'naccepted' bytes input.  */
2194   return naccepted;
2195 }
2196 #endif /* RE_ENABLE_I18N */
2197 
2198 
2199 /* Functions for state transition.  */
2200 
2201 /* Return the next state to which the current state STATE will transit by
2202    accepting the current input byte, and update STATE_LOG if necessary.
2203    Return NULL on failure.
2204    If STATE can accept a multibyte char/collating element/back reference
2205    update the destination of STATE_LOG.  */
2206 
2207 static re_dfastate_t *
2208 __attribute_warn_unused_result__
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)2209 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2210 	       re_dfastate_t *state)
2211 {
2212   re_dfastate_t **trtable;
2213   unsigned char ch;
2214 
2215 #ifdef RE_ENABLE_I18N
2216   /* If the current state can accept multibyte.  */
2217   if (__glibc_unlikely (state->accept_mb))
2218     {
2219       *err = transit_state_mb (mctx, state);
2220       if (__glibc_unlikely (*err != REG_NOERROR))
2221 	return NULL;
2222     }
2223 #endif /* RE_ENABLE_I18N */
2224 
2225   /* Then decide the next state with the single byte.  */
2226 #if 0
2227   if (0)
2228     /* don't use transition table  */
2229     return transit_state_sb (err, mctx, state);
2230 #endif
2231 
2232   /* Use transition table  */
2233   ch = re_string_fetch_byte (&mctx->input);
2234   for (;;)
2235     {
2236       trtable = state->trtable;
2237       if (__glibc_likely (trtable != NULL))
2238 	return trtable[ch];
2239 
2240       trtable = state->word_trtable;
2241       if (__glibc_likely (trtable != NULL))
2242 	{
2243 	  unsigned int context;
2244 	  context
2245 	    = re_string_context_at (&mctx->input,
2246 				    re_string_cur_idx (&mctx->input) - 1,
2247 				    mctx->eflags);
2248 	  if (IS_WORD_CONTEXT (context))
2249 	    return trtable[ch + SBC_MAX];
2250 	  else
2251 	    return trtable[ch];
2252 	}
2253 
2254       if (!build_trtable (mctx->dfa, state))
2255 	{
2256 	  *err = REG_ESPACE;
2257 	  return NULL;
2258 	}
2259 
2260       /* Retry, we now have a transition table.  */
2261     }
2262 }
2263 
2264 /* Update the state_log if we need */
2265 static re_dfastate_t *
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)2266 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2267 		      re_dfastate_t *next_state)
2268 {
2269   const re_dfa_t *const dfa = mctx->dfa;
2270   Idx cur_idx = re_string_cur_idx (&mctx->input);
2271 
2272   if (cur_idx > mctx->state_log_top)
2273     {
2274       mctx->state_log[cur_idx] = next_state;
2275       mctx->state_log_top = cur_idx;
2276     }
2277   else if (mctx->state_log[cur_idx] == 0)
2278     {
2279       mctx->state_log[cur_idx] = next_state;
2280     }
2281   else
2282     {
2283       re_dfastate_t *pstate;
2284       unsigned int context;
2285       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2286       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2287 	 the destination of a multibyte char/collating element/
2288 	 back reference.  Then the next state is the union set of
2289 	 these destinations and the results of the transition table.  */
2290       pstate = mctx->state_log[cur_idx];
2291       log_nodes = pstate->entrance_nodes;
2292       if (next_state != NULL)
2293 	{
2294 	  table_nodes = next_state->entrance_nodes;
2295 	  *err = re_node_set_init_union (&next_nodes, table_nodes,
2296 					     log_nodes);
2297 	  if (__glibc_unlikely (*err != REG_NOERROR))
2298 	    return NULL;
2299 	}
2300       else
2301 	next_nodes = *log_nodes;
2302       /* Note: We already add the nodes of the initial state,
2303 	 then we don't need to add them here.  */
2304 
2305       context = re_string_context_at (&mctx->input,
2306 				      re_string_cur_idx (&mctx->input) - 1,
2307 				      mctx->eflags);
2308       next_state = mctx->state_log[cur_idx]
2309 	= re_acquire_state_context (err, dfa, &next_nodes, context);
2310       /* We don't need to check errors here, since the return value of
2311 	 this function is next_state and ERR is already set.  */
2312 
2313       if (table_nodes != NULL)
2314 	re_node_set_free (&next_nodes);
2315     }
2316 
2317   if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2318     {
2319       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2320 	 later.  We must check them here, since the back references in the
2321 	 next state might use them.  */
2322       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2323 					cur_idx);
2324       if (__glibc_unlikely (*err != REG_NOERROR))
2325 	return NULL;
2326 
2327       /* If the next state has back references.  */
2328       if (next_state->has_backref)
2329 	{
2330 	  *err = transit_state_bkref (mctx, &next_state->nodes);
2331 	  if (__glibc_unlikely (*err != REG_NOERROR))
2332 	    return NULL;
2333 	  next_state = mctx->state_log[cur_idx];
2334 	}
2335     }
2336 
2337   return next_state;
2338 }
2339 
2340 /* Skip bytes in the input that correspond to part of a
2341    multi-byte match, then look in the log for a state
2342    from which to restart matching.  */
2343 static re_dfastate_t *
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2344 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2345 {
2346   re_dfastate_t *cur_state;
2347   do
2348     {
2349       Idx max = mctx->state_log_top;
2350       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2351 
2352       do
2353 	{
2354 	  if (++cur_str_idx > max)
2355 	    return NULL;
2356 	  re_string_skip_bytes (&mctx->input, 1);
2357 	}
2358       while (mctx->state_log[cur_str_idx] == NULL);
2359 
2360       cur_state = merge_state_with_log (err, mctx, NULL);
2361     }
2362   while (*err == REG_NOERROR && cur_state == NULL);
2363   return cur_state;
2364 }
2365 
2366 /* Helper functions for transit_state.  */
2367 
2368 /* From the node set CUR_NODES, pick up the nodes whose types are
2369    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2370    expression. And register them to use them later for evaluating the
2371    corresponding back references.  */
2372 
2373 static reg_errcode_t
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,Idx str_idx)2374 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2375 			   Idx str_idx)
2376 {
2377   const re_dfa_t *const dfa = mctx->dfa;
2378   Idx node_idx;
2379   reg_errcode_t err;
2380 
2381   /* TODO: This isn't efficient.
2382 	   Because there might be more than one nodes whose types are
2383 	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2384 	   nodes.
2385 	   E.g. RE: (a){2}  */
2386   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2387     {
2388       Idx node = cur_nodes->elems[node_idx];
2389       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2390 	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2391 	  && (dfa->used_bkref_map
2392 	      & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2393 	{
2394 	  err = match_ctx_add_subtop (mctx, node, str_idx);
2395 	  if (__glibc_unlikely (err != REG_NOERROR))
2396 	    return err;
2397 	}
2398     }
2399   return REG_NOERROR;
2400 }
2401 
2402 #if 0
2403 /* Return the next state to which the current state STATE will transit by
2404    accepting the current input byte.  Return NULL on failure.  */
2405 
2406 static re_dfastate_t *
2407 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2408 		  re_dfastate_t *state)
2409 {
2410   const re_dfa_t *const dfa = mctx->dfa;
2411   re_node_set next_nodes;
2412   re_dfastate_t *next_state;
2413   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2414   unsigned int context;
2415 
2416   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2417   if (__glibc_unlikely (*err != REG_NOERROR))
2418     return NULL;
2419   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2420     {
2421       Idx cur_node = state->nodes.elems[node_cnt];
2422       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2423 	{
2424 	  *err = re_node_set_merge (&next_nodes,
2425 				    dfa->eclosures + dfa->nexts[cur_node]);
2426 	  if (__glibc_unlikely (*err != REG_NOERROR))
2427 	    {
2428 	      re_node_set_free (&next_nodes);
2429 	      return NULL;
2430 	    }
2431 	}
2432     }
2433   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2434   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2435   /* We don't need to check errors here, since the return value of
2436      this function is next_state and ERR is already set.  */
2437 
2438   re_node_set_free (&next_nodes);
2439   re_string_skip_bytes (&mctx->input, 1);
2440   return next_state;
2441 }
2442 #endif
2443 
2444 #ifdef RE_ENABLE_I18N
2445 static reg_errcode_t
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)2446 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2447 {
2448   const re_dfa_t *const dfa = mctx->dfa;
2449   reg_errcode_t err;
2450   Idx i;
2451 
2452   for (i = 0; i < pstate->nodes.nelem; ++i)
2453     {
2454       re_node_set dest_nodes, *new_nodes;
2455       Idx cur_node_idx = pstate->nodes.elems[i];
2456       int naccepted;
2457       Idx dest_idx;
2458       unsigned int context;
2459       re_dfastate_t *dest_state;
2460 
2461       if (!dfa->nodes[cur_node_idx].accept_mb)
2462 	continue;
2463 
2464       if (dfa->nodes[cur_node_idx].constraint)
2465 	{
2466 	  context = re_string_context_at (&mctx->input,
2467 					  re_string_cur_idx (&mctx->input),
2468 					  mctx->eflags);
2469 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2470 					   context))
2471 	    continue;
2472 	}
2473 
2474       /* How many bytes the node can accept?  */
2475       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2476 					   re_string_cur_idx (&mctx->input));
2477       if (naccepted == 0)
2478 	continue;
2479 
2480       /* The node can accepts 'naccepted' bytes.  */
2481       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2482       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2483 			       : mctx->max_mb_elem_len);
2484       err = clean_state_log_if_needed (mctx, dest_idx);
2485       if (__glibc_unlikely (err != REG_NOERROR))
2486 	return err;
2487       DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2488       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2489 
2490       dest_state = mctx->state_log[dest_idx];
2491       if (dest_state == NULL)
2492 	dest_nodes = *new_nodes;
2493       else
2494 	{
2495 	  err = re_node_set_init_union (&dest_nodes,
2496 					dest_state->entrance_nodes, new_nodes);
2497 	  if (__glibc_unlikely (err != REG_NOERROR))
2498 	    return err;
2499 	}
2500       context = re_string_context_at (&mctx->input, dest_idx - 1,
2501 				      mctx->eflags);
2502       mctx->state_log[dest_idx]
2503 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2504       if (dest_state != NULL)
2505 	re_node_set_free (&dest_nodes);
2506       if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2507 			    && err != REG_NOERROR))
2508 	return err;
2509     }
2510   return REG_NOERROR;
2511 }
2512 #endif /* RE_ENABLE_I18N */
2513 
2514 static reg_errcode_t
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)2515 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2516 {
2517   const re_dfa_t *const dfa = mctx->dfa;
2518   reg_errcode_t err;
2519   Idx i;
2520   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2521 
2522   for (i = 0; i < nodes->nelem; ++i)
2523     {
2524       Idx dest_str_idx, prev_nelem, bkc_idx;
2525       Idx node_idx = nodes->elems[i];
2526       unsigned int context;
2527       const re_token_t *node = dfa->nodes + node_idx;
2528       re_node_set *new_dest_nodes;
2529 
2530       /* Check whether 'node' is a backreference or not.  */
2531       if (node->type != OP_BACK_REF)
2532 	continue;
2533 
2534       if (node->constraint)
2535 	{
2536 	  context = re_string_context_at (&mctx->input, cur_str_idx,
2537 					  mctx->eflags);
2538 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2539 	    continue;
2540 	}
2541 
2542       /* 'node' is a backreference.
2543 	 Check the substring which the substring matched.  */
2544       bkc_idx = mctx->nbkref_ents;
2545       err = get_subexp (mctx, node_idx, cur_str_idx);
2546       if (__glibc_unlikely (err != REG_NOERROR))
2547 	goto free_return;
2548 
2549       /* And add the epsilon closures (which is 'new_dest_nodes') of
2550 	 the backreference to appropriate state_log.  */
2551       DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2552       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2553 	{
2554 	  Idx subexp_len;
2555 	  re_dfastate_t *dest_state;
2556 	  struct re_backref_cache_entry *bkref_ent;
2557 	  bkref_ent = mctx->bkref_ents + bkc_idx;
2558 	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2559 	    continue;
2560 	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2561 	  new_dest_nodes = (subexp_len == 0
2562 			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2563 			    : dfa->eclosures + dfa->nexts[node_idx]);
2564 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2565 			  - bkref_ent->subexp_from);
2566 	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2567 					  mctx->eflags);
2568 	  dest_state = mctx->state_log[dest_str_idx];
2569 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2570 			: mctx->state_log[cur_str_idx]->nodes.nelem);
2571 	  /* Add 'new_dest_node' to state_log.  */
2572 	  if (dest_state == NULL)
2573 	    {
2574 	      mctx->state_log[dest_str_idx]
2575 		= re_acquire_state_context (&err, dfa, new_dest_nodes,
2576 					    context);
2577 	      if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2578 				    && err != REG_NOERROR))
2579 		goto free_return;
2580 	    }
2581 	  else
2582 	    {
2583 	      re_node_set dest_nodes;
2584 	      err = re_node_set_init_union (&dest_nodes,
2585 					    dest_state->entrance_nodes,
2586 					    new_dest_nodes);
2587 	      if (__glibc_unlikely (err != REG_NOERROR))
2588 		{
2589 		  re_node_set_free (&dest_nodes);
2590 		  goto free_return;
2591 		}
2592 	      mctx->state_log[dest_str_idx]
2593 		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2594 	      re_node_set_free (&dest_nodes);
2595 	      if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2596 				    && err != REG_NOERROR))
2597 		goto free_return;
2598 	    }
2599 	  /* We need to check recursively if the backreference can epsilon
2600 	     transit.  */
2601 	  if (subexp_len == 0
2602 	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2603 	    {
2604 	      err = check_subexp_matching_top (mctx, new_dest_nodes,
2605 					       cur_str_idx);
2606 	      if (__glibc_unlikely (err != REG_NOERROR))
2607 		goto free_return;
2608 	      err = transit_state_bkref (mctx, new_dest_nodes);
2609 	      if (__glibc_unlikely (err != REG_NOERROR))
2610 		goto free_return;
2611 	    }
2612 	}
2613     }
2614   err = REG_NOERROR;
2615  free_return:
2616   return err;
2617 }
2618 
2619 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2620    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2621    Note that we might collect inappropriate candidates here.
2622    However, the cost of checking them strictly here is too high, then we
2623    delay these checking for prune_impossible_nodes().  */
2624 
2625 static reg_errcode_t
2626 __attribute_warn_unused_result__
get_subexp(re_match_context_t * mctx,Idx bkref_node,Idx bkref_str_idx)2627 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2628 {
2629   const re_dfa_t *const dfa = mctx->dfa;
2630   Idx subexp_num, sub_top_idx;
2631   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2632   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2633   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2634   if (cache_idx != -1)
2635     {
2636       const struct re_backref_cache_entry *entry
2637 	= mctx->bkref_ents + cache_idx;
2638       do
2639 	if (entry->node == bkref_node)
2640 	  return REG_NOERROR; /* We already checked it.  */
2641       while (entry++->more);
2642     }
2643 
2644   subexp_num = dfa->nodes[bkref_node].opr.idx;
2645 
2646   /* For each sub expression  */
2647   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2648     {
2649       reg_errcode_t err;
2650       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2651       re_sub_match_last_t *sub_last;
2652       Idx sub_last_idx, sl_str, bkref_str_off;
2653 
2654       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2655 	continue; /* It isn't related.  */
2656 
2657       sl_str = sub_top->str_idx;
2658       bkref_str_off = bkref_str_idx;
2659       /* At first, check the last node of sub expressions we already
2660 	 evaluated.  */
2661       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2662 	{
2663 	  regoff_t sl_str_diff;
2664 	  sub_last = sub_top->lasts[sub_last_idx];
2665 	  sl_str_diff = sub_last->str_idx - sl_str;
2666 	  /* The matched string by the sub expression match with the substring
2667 	     at the back reference?  */
2668 	  if (sl_str_diff > 0)
2669 	    {
2670 	      if (__glibc_unlikely (bkref_str_off + sl_str_diff
2671 				    > mctx->input.valid_len))
2672 		{
2673 		  /* Not enough chars for a successful match.  */
2674 		  if (bkref_str_off + sl_str_diff > mctx->input.len)
2675 		    break;
2676 
2677 		  err = clean_state_log_if_needed (mctx,
2678 						   bkref_str_off
2679 						   + sl_str_diff);
2680 		  if (__glibc_unlikely (err != REG_NOERROR))
2681 		    return err;
2682 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2683 		}
2684 	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2685 		/* We don't need to search this sub expression any more.  */
2686 		break;
2687 	    }
2688 	  bkref_str_off += sl_str_diff;
2689 	  sl_str += sl_str_diff;
2690 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2691 				bkref_str_idx);
2692 
2693 	  /* Reload buf, since the preceding call might have reallocated
2694 	     the buffer.  */
2695 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2696 
2697 	  if (err == REG_NOMATCH)
2698 	    continue;
2699 	  if (__glibc_unlikely (err != REG_NOERROR))
2700 	    return err;
2701 	}
2702 
2703       if (sub_last_idx < sub_top->nlasts)
2704 	continue;
2705       if (sub_last_idx > 0)
2706 	++sl_str;
2707       /* Then, search for the other last nodes of the sub expression.  */
2708       for (; sl_str <= bkref_str_idx; ++sl_str)
2709 	{
2710 	  Idx cls_node;
2711 	  regoff_t sl_str_off;
2712 	  const re_node_set *nodes;
2713 	  sl_str_off = sl_str - sub_top->str_idx;
2714 	  /* The matched string by the sub expression match with the substring
2715 	     at the back reference?  */
2716 	  if (sl_str_off > 0)
2717 	    {
2718 	      if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2719 		{
2720 		  /* If we are at the end of the input, we cannot match.  */
2721 		  if (bkref_str_off >= mctx->input.len)
2722 		    break;
2723 
2724 		  err = extend_buffers (mctx, bkref_str_off + 1);
2725 		  if (__glibc_unlikely (err != REG_NOERROR))
2726 		    return err;
2727 
2728 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2729 		}
2730 	      if (buf [bkref_str_off++] != buf[sl_str - 1])
2731 		break; /* We don't need to search this sub expression
2732 			  any more.  */
2733 	    }
2734 	  if (mctx->state_log[sl_str] == NULL)
2735 	    continue;
2736 	  /* Does this state have a ')' of the sub expression?  */
2737 	  nodes = &mctx->state_log[sl_str]->nodes;
2738 	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
2739 				       OP_CLOSE_SUBEXP);
2740 	  if (cls_node == -1)
2741 	    continue; /* No.  */
2742 	  if (sub_top->path == NULL)
2743 	    {
2744 	      sub_top->path = calloc (sizeof (state_array_t),
2745 				      sl_str - sub_top->str_idx + 1);
2746 	      if (sub_top->path == NULL)
2747 		return REG_ESPACE;
2748 	    }
2749 	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2750 	     in the current context?  */
2751 	  err = check_arrival (mctx, sub_top->path, sub_top->node,
2752 			       sub_top->str_idx, cls_node, sl_str,
2753 			       OP_CLOSE_SUBEXP);
2754 	  if (err == REG_NOMATCH)
2755 	      continue;
2756 	  if (__glibc_unlikely (err != REG_NOERROR))
2757 	      return err;
2758 	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2759 	  if (__glibc_unlikely (sub_last == NULL))
2760 	    return REG_ESPACE;
2761 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2762 				bkref_str_idx);
2763 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2764 	  if (err == REG_NOMATCH)
2765 	    continue;
2766 	  if (__glibc_unlikely (err != REG_NOERROR))
2767 	    return err;
2768 	}
2769     }
2770   return REG_NOERROR;
2771 }
2772 
2773 /* Helper functions for get_subexp().  */
2774 
2775 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2776    If it can arrive, register the sub expression expressed with SUB_TOP
2777    and SUB_LAST.  */
2778 
2779 static reg_errcode_t
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,Idx bkref_node,Idx bkref_str)2780 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2781 		re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2782 {
2783   reg_errcode_t err;
2784   Idx to_idx;
2785   /* Can the subexpression arrive the back reference?  */
2786   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2787 		       sub_last->str_idx, bkref_node, bkref_str,
2788 		       OP_OPEN_SUBEXP);
2789   if (err != REG_NOERROR)
2790     return err;
2791   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2792 			     sub_last->str_idx);
2793   if (__glibc_unlikely (err != REG_NOERROR))
2794     return err;
2795   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2796   return clean_state_log_if_needed (mctx, to_idx);
2797 }
2798 
2799 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2800    Search '(' if FL_OPEN, or search ')' otherwise.
2801    TODO: This function isn't efficient...
2802 	 Because there might be more than one nodes whose types are
2803 	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2804 	 nodes.
2805 	 E.g. RE: (a){2}  */
2806 
2807 static Idx
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,Idx subexp_idx,int type)2808 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2809 		  Idx subexp_idx, int type)
2810 {
2811   Idx cls_idx;
2812   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2813     {
2814       Idx cls_node = nodes->elems[cls_idx];
2815       const re_token_t *node = dfa->nodes + cls_node;
2816       if (node->type == type
2817 	  && node->opr.idx == subexp_idx)
2818 	return cls_node;
2819     }
2820   return -1;
2821 }
2822 
2823 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2824    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2825    heavily reused.
2826    Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2827    REG_ESPACE if memory is exhausted.  */
2828 
2829 static reg_errcode_t
2830 __attribute_warn_unused_result__
check_arrival(re_match_context_t * mctx,state_array_t * path,Idx top_node,Idx top_str,Idx last_node,Idx last_str,int type)2831 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2832 	       Idx top_str, Idx last_node, Idx last_str, int type)
2833 {
2834   const re_dfa_t *const dfa = mctx->dfa;
2835   reg_errcode_t err = REG_NOERROR;
2836   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2837   re_dfastate_t *cur_state = NULL;
2838   re_node_set *cur_nodes, next_nodes;
2839   re_dfastate_t **backup_state_log;
2840   unsigned int context;
2841 
2842   subexp_num = dfa->nodes[top_node].opr.idx;
2843   /* Extend the buffer if we need.  */
2844   if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2845     {
2846       re_dfastate_t **new_array;
2847       Idx old_alloc = path->alloc;
2848       Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2849       Idx new_alloc;
2850       if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2851 	return REG_ESPACE;
2852       new_alloc = old_alloc + incr_alloc;
2853       if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2854 	return REG_ESPACE;
2855       new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2856       if (__glibc_unlikely (new_array == NULL))
2857 	return REG_ESPACE;
2858       path->array = new_array;
2859       path->alloc = new_alloc;
2860       memset (new_array + old_alloc, '\0',
2861 	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2862     }
2863 
2864   str_idx = path->next_idx ? path->next_idx : top_str;
2865 
2866   /* Temporary modify MCTX.  */
2867   backup_state_log = mctx->state_log;
2868   backup_cur_idx = mctx->input.cur_idx;
2869   mctx->state_log = path->array;
2870   mctx->input.cur_idx = str_idx;
2871 
2872   /* Setup initial node set.  */
2873   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2874   if (str_idx == top_str)
2875     {
2876       err = re_node_set_init_1 (&next_nodes, top_node);
2877       if (__glibc_unlikely (err != REG_NOERROR))
2878 	return err;
2879       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2880       if (__glibc_unlikely (err != REG_NOERROR))
2881 	{
2882 	  re_node_set_free (&next_nodes);
2883 	  return err;
2884 	}
2885     }
2886   else
2887     {
2888       cur_state = mctx->state_log[str_idx];
2889       if (cur_state && cur_state->has_backref)
2890 	{
2891 	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2892 	  if (__glibc_unlikely (err != REG_NOERROR))
2893 	    return err;
2894 	}
2895       else
2896 	re_node_set_init_empty (&next_nodes);
2897     }
2898   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2899     {
2900       if (next_nodes.nelem)
2901 	{
2902 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2903 				    subexp_num, type);
2904 	  if (__glibc_unlikely (err != REG_NOERROR))
2905 	    {
2906 	      re_node_set_free (&next_nodes);
2907 	      return err;
2908 	    }
2909 	}
2910       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2911       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2912 	{
2913 	  re_node_set_free (&next_nodes);
2914 	  return err;
2915 	}
2916       mctx->state_log[str_idx] = cur_state;
2917     }
2918 
2919   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2920     {
2921       re_node_set_empty (&next_nodes);
2922       if (mctx->state_log[str_idx + 1])
2923 	{
2924 	  err = re_node_set_merge (&next_nodes,
2925 				   &mctx->state_log[str_idx + 1]->nodes);
2926 	  if (__glibc_unlikely (err != REG_NOERROR))
2927 	    {
2928 	      re_node_set_free (&next_nodes);
2929 	      return err;
2930 	    }
2931 	}
2932       if (cur_state)
2933 	{
2934 	  err = check_arrival_add_next_nodes (mctx, str_idx,
2935 					      &cur_state->non_eps_nodes,
2936 					      &next_nodes);
2937 	  if (__glibc_unlikely (err != REG_NOERROR))
2938 	    {
2939 	      re_node_set_free (&next_nodes);
2940 	      return err;
2941 	    }
2942 	}
2943       ++str_idx;
2944       if (next_nodes.nelem)
2945 	{
2946 	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2947 	  if (__glibc_unlikely (err != REG_NOERROR))
2948 	    {
2949 	      re_node_set_free (&next_nodes);
2950 	      return err;
2951 	    }
2952 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2953 				    subexp_num, type);
2954 	  if (__glibc_unlikely (err != REG_NOERROR))
2955 	    {
2956 	      re_node_set_free (&next_nodes);
2957 	      return err;
2958 	    }
2959 	}
2960       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2961       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2962       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2963 	{
2964 	  re_node_set_free (&next_nodes);
2965 	  return err;
2966 	}
2967       mctx->state_log[str_idx] = cur_state;
2968       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2969     }
2970   re_node_set_free (&next_nodes);
2971   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2972 	       : &mctx->state_log[last_str]->nodes);
2973   path->next_idx = str_idx;
2974 
2975   /* Fix MCTX.  */
2976   mctx->state_log = backup_state_log;
2977   mctx->input.cur_idx = backup_cur_idx;
2978 
2979   /* Then check the current node set has the node LAST_NODE.  */
2980   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2981     return REG_NOERROR;
2982 
2983   return REG_NOMATCH;
2984 }
2985 
2986 /* Helper functions for check_arrival.  */
2987 
2988 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2989    to NEXT_NODES.
2990    TODO: This function is similar to the functions transit_state*(),
2991 	 however this function has many additional works.
2992 	 Can't we unify them?  */
2993 
2994 static reg_errcode_t
2995 __attribute_warn_unused_result__
check_arrival_add_next_nodes(re_match_context_t * mctx,Idx str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)2996 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
2997 			      re_node_set *cur_nodes, re_node_set *next_nodes)
2998 {
2999   const re_dfa_t *const dfa = mctx->dfa;
3000   bool ok;
3001   Idx cur_idx;
3002 #ifdef RE_ENABLE_I18N
3003   reg_errcode_t err = REG_NOERROR;
3004 #endif
3005   re_node_set union_set;
3006   re_node_set_init_empty (&union_set);
3007   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3008     {
3009       int naccepted = 0;
3010       Idx cur_node = cur_nodes->elems[cur_idx];
3011       DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
3012 
3013 #ifdef RE_ENABLE_I18N
3014       /* If the node may accept "multi byte".  */
3015       if (dfa->nodes[cur_node].accept_mb)
3016 	{
3017 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3018 					       str_idx);
3019 	  if (naccepted > 1)
3020 	    {
3021 	      re_dfastate_t *dest_state;
3022 	      Idx next_node = dfa->nexts[cur_node];
3023 	      Idx next_idx = str_idx + naccepted;
3024 	      dest_state = mctx->state_log[next_idx];
3025 	      re_node_set_empty (&union_set);
3026 	      if (dest_state)
3027 		{
3028 		  err = re_node_set_merge (&union_set, &dest_state->nodes);
3029 		  if (__glibc_unlikely (err != REG_NOERROR))
3030 		    {
3031 		      re_node_set_free (&union_set);
3032 		      return err;
3033 		    }
3034 		}
3035 	      ok = re_node_set_insert (&union_set, next_node);
3036 	      if (__glibc_unlikely (! ok))
3037 		{
3038 		  re_node_set_free (&union_set);
3039 		  return REG_ESPACE;
3040 		}
3041 	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3042 							    &union_set);
3043 	      if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3044 				    && err != REG_NOERROR))
3045 		{
3046 		  re_node_set_free (&union_set);
3047 		  return err;
3048 		}
3049 	    }
3050 	}
3051 #endif /* RE_ENABLE_I18N */
3052       if (naccepted
3053 	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3054 	{
3055 	  ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3056 	  if (__glibc_unlikely (! ok))
3057 	    {
3058 	      re_node_set_free (&union_set);
3059 	      return REG_ESPACE;
3060 	    }
3061 	}
3062     }
3063   re_node_set_free (&union_set);
3064   return REG_NOERROR;
3065 }
3066 
3067 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3068    CUR_NODES, however exclude the nodes which are:
3069     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3070     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3071 */
3072 
3073 static reg_errcode_t
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,Idx ex_subexp,int type)3074 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3075 			  Idx ex_subexp, int type)
3076 {
3077   reg_errcode_t err;
3078   Idx idx, outside_node;
3079   re_node_set new_nodes;
3080   DEBUG_ASSERT (cur_nodes->nelem);
3081   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3082   if (__glibc_unlikely (err != REG_NOERROR))
3083     return err;
3084   /* Create a new node set NEW_NODES with the nodes which are epsilon
3085      closures of the node in CUR_NODES.  */
3086 
3087   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3088     {
3089       Idx cur_node = cur_nodes->elems[idx];
3090       const re_node_set *eclosure = dfa->eclosures + cur_node;
3091       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3092       if (outside_node == -1)
3093 	{
3094 	  /* There are no problematic nodes, just merge them.  */
3095 	  err = re_node_set_merge (&new_nodes, eclosure);
3096 	  if (__glibc_unlikely (err != REG_NOERROR))
3097 	    {
3098 	      re_node_set_free (&new_nodes);
3099 	      return err;
3100 	    }
3101 	}
3102       else
3103 	{
3104 	  /* There are problematic nodes, re-calculate incrementally.  */
3105 	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3106 					      ex_subexp, type);
3107 	  if (__glibc_unlikely (err != REG_NOERROR))
3108 	    {
3109 	      re_node_set_free (&new_nodes);
3110 	      return err;
3111 	    }
3112 	}
3113     }
3114   re_node_set_free (cur_nodes);
3115   *cur_nodes = new_nodes;
3116   return REG_NOERROR;
3117 }
3118 
3119 /* Helper function for check_arrival_expand_ecl.
3120    Check incrementally the epsilon closure of TARGET, and if it isn't
3121    problematic append it to DST_NODES.  */
3122 
3123 static reg_errcode_t
3124 __attribute_warn_unused_result__
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,Idx target,Idx ex_subexp,int type)3125 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3126 			      Idx target, Idx ex_subexp, int type)
3127 {
3128   Idx cur_node;
3129   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3130     {
3131       bool ok;
3132 
3133       if (dfa->nodes[cur_node].type == type
3134 	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
3135 	{
3136 	  if (type == OP_CLOSE_SUBEXP)
3137 	    {
3138 	      ok = re_node_set_insert (dst_nodes, cur_node);
3139 	      if (__glibc_unlikely (! ok))
3140 		return REG_ESPACE;
3141 	    }
3142 	  break;
3143 	}
3144       ok = re_node_set_insert (dst_nodes, cur_node);
3145       if (__glibc_unlikely (! ok))
3146 	return REG_ESPACE;
3147       if (dfa->edests[cur_node].nelem == 0)
3148 	break;
3149       if (dfa->edests[cur_node].nelem == 2)
3150 	{
3151 	  reg_errcode_t err;
3152 	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3153 					      dfa->edests[cur_node].elems[1],
3154 					      ex_subexp, type);
3155 	  if (__glibc_unlikely (err != REG_NOERROR))
3156 	    return err;
3157 	}
3158       cur_node = dfa->edests[cur_node].elems[0];
3159     }
3160   return REG_NOERROR;
3161 }
3162 
3163 
3164 /* For all the back references in the current state, calculate the
3165    destination of the back references by the appropriate entry
3166    in MCTX->BKREF_ENTS.  */
3167 
3168 static reg_errcode_t
3169 __attribute_warn_unused_result__
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,Idx cur_str,Idx subexp_num,int type)3170 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3171 		    Idx cur_str, Idx subexp_num, int type)
3172 {
3173   const re_dfa_t *const dfa = mctx->dfa;
3174   reg_errcode_t err;
3175   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3176   struct re_backref_cache_entry *ent;
3177 
3178   if (cache_idx_start == -1)
3179     return REG_NOERROR;
3180 
3181  restart:
3182   ent = mctx->bkref_ents + cache_idx_start;
3183   do
3184     {
3185       Idx to_idx, next_node;
3186 
3187       /* Is this entry ENT is appropriate?  */
3188       if (!re_node_set_contains (cur_nodes, ent->node))
3189 	continue; /* No.  */
3190 
3191       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3192       /* Calculate the destination of the back reference, and append it
3193 	 to MCTX->STATE_LOG.  */
3194       if (to_idx == cur_str)
3195 	{
3196 	  /* The backreference did epsilon transit, we must re-check all the
3197 	     node in the current state.  */
3198 	  re_node_set new_dests;
3199 	  reg_errcode_t err2, err3;
3200 	  next_node = dfa->edests[ent->node].elems[0];
3201 	  if (re_node_set_contains (cur_nodes, next_node))
3202 	    continue;
3203 	  err = re_node_set_init_1 (&new_dests, next_node);
3204 	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3205 	  err3 = re_node_set_merge (cur_nodes, &new_dests);
3206 	  re_node_set_free (&new_dests);
3207 	  if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3208 				|| err3 != REG_NOERROR))
3209 	    {
3210 	      err = (err != REG_NOERROR ? err
3211 		     : (err2 != REG_NOERROR ? err2 : err3));
3212 	      return err;
3213 	    }
3214 	  /* TODO: It is still inefficient...  */
3215 	  goto restart;
3216 	}
3217       else
3218 	{
3219 	  re_node_set union_set;
3220 	  next_node = dfa->nexts[ent->node];
3221 	  if (mctx->state_log[to_idx])
3222 	    {
3223 	      bool ok;
3224 	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3225 					next_node))
3226 		continue;
3227 	      err = re_node_set_init_copy (&union_set,
3228 					   &mctx->state_log[to_idx]->nodes);
3229 	      ok = re_node_set_insert (&union_set, next_node);
3230 	      if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3231 		{
3232 		  re_node_set_free (&union_set);
3233 		  err = err != REG_NOERROR ? err : REG_ESPACE;
3234 		  return err;
3235 		}
3236 	    }
3237 	  else
3238 	    {
3239 	      err = re_node_set_init_1 (&union_set, next_node);
3240 	      if (__glibc_unlikely (err != REG_NOERROR))
3241 		return err;
3242 	    }
3243 	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3244 	  re_node_set_free (&union_set);
3245 	  if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3246 				&& err != REG_NOERROR))
3247 	    return err;
3248 	}
3249     }
3250   while (ent++->more);
3251   return REG_NOERROR;
3252 }
3253 
3254 /* Build transition table for the state.
3255    Return true if successful.  */
3256 
3257 static bool __attribute_noinline__
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)3258 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3259 {
3260   reg_errcode_t err;
3261   Idx i, j;
3262   int ch;
3263   bool need_word_trtable = false;
3264   bitset_word_t elem, mask;
3265   Idx ndests; /* Number of the destination states from 'state'.  */
3266   re_dfastate_t **trtable;
3267   re_dfastate_t *dest_states[SBC_MAX];
3268   re_dfastate_t *dest_states_word[SBC_MAX];
3269   re_dfastate_t *dest_states_nl[SBC_MAX];
3270   re_node_set follows;
3271   bitset_t acceptable;
3272 
3273   /* We build DFA states which corresponds to the destination nodes
3274      from 'state'.  'dests_node[i]' represents the nodes which i-th
3275      destination state contains, and 'dests_ch[i]' represents the
3276      characters which i-th destination state accepts.  */
3277   re_node_set dests_node[SBC_MAX];
3278   bitset_t dests_ch[SBC_MAX];
3279 
3280   /* Initialize transition table.  */
3281   state->word_trtable = state->trtable = NULL;
3282 
3283   /* At first, group all nodes belonging to 'state' into several
3284      destinations.  */
3285   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3286   if (__glibc_unlikely (ndests <= 0))
3287     {
3288       /* Return false in case of an error, true otherwise.  */
3289       if (ndests == 0)
3290 	{
3291 	  state->trtable = (re_dfastate_t **)
3292 	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
3293           if (__glibc_unlikely (state->trtable == NULL))
3294             return false;
3295 	  return true;
3296 	}
3297       return false;
3298     }
3299 
3300   err = re_node_set_alloc (&follows, ndests + 1);
3301   if (__glibc_unlikely (err != REG_NOERROR))
3302     {
3303     out_free:
3304       re_node_set_free (&follows);
3305       for (i = 0; i < ndests; ++i)
3306 	re_node_set_free (dests_node + i);
3307       return false;
3308     }
3309 
3310   bitset_empty (acceptable);
3311 
3312   /* Then build the states for all destinations.  */
3313   for (i = 0; i < ndests; ++i)
3314     {
3315       Idx next_node;
3316       re_node_set_empty (&follows);
3317       /* Merge the follows of this destination states.  */
3318       for (j = 0; j < dests_node[i].nelem; ++j)
3319 	{
3320 	  next_node = dfa->nexts[dests_node[i].elems[j]];
3321 	  if (next_node != -1)
3322 	    {
3323 	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3324 	      if (__glibc_unlikely (err != REG_NOERROR))
3325 		goto out_free;
3326 	    }
3327 	}
3328       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3329       if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3330 	goto out_free;
3331       /* If the new state has context constraint,
3332 	 build appropriate states for these contexts.  */
3333       if (dest_states[i]->has_constraint)
3334 	{
3335 	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3336 							  CONTEXT_WORD);
3337 	  if (__glibc_unlikely (dest_states_word[i] == NULL
3338 				&& err != REG_NOERROR))
3339 	    goto out_free;
3340 
3341 	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3342 	    need_word_trtable = true;
3343 
3344 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3345 							CONTEXT_NEWLINE);
3346 	  if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3347 	    goto out_free;
3348 	}
3349       else
3350 	{
3351 	  dest_states_word[i] = dest_states[i];
3352 	  dest_states_nl[i] = dest_states[i];
3353 	}
3354       bitset_merge (acceptable, dests_ch[i]);
3355     }
3356 
3357   if (!__glibc_unlikely (need_word_trtable))
3358     {
3359       /* We don't care about whether the following character is a word
3360 	 character, or we are in a single-byte character set so we can
3361 	 discern by looking at the character code: allocate a
3362 	 256-entry transition table.  */
3363       trtable = state->trtable =
3364 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3365       if (__glibc_unlikely (trtable == NULL))
3366 	goto out_free;
3367 
3368       /* For all characters ch...:  */
3369       for (i = 0; i < BITSET_WORDS; ++i)
3370 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3371 	     elem;
3372 	     mask <<= 1, elem >>= 1, ++ch)
3373 	  if (__glibc_unlikely (elem & 1))
3374 	    {
3375 	      /* There must be exactly one destination which accepts
3376 		 character ch.  See group_nodes_into_DFAstates.  */
3377 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3378 		;
3379 
3380 	      /* j-th destination accepts the word character ch.  */
3381 	      if (dfa->word_char[i] & mask)
3382 		trtable[ch] = dest_states_word[j];
3383 	      else
3384 		trtable[ch] = dest_states[j];
3385 	    }
3386     }
3387   else
3388     {
3389       /* We care about whether the following character is a word
3390 	 character, and we are in a multi-byte character set: discern
3391 	 by looking at the character code: build two 256-entry
3392 	 transition tables, one starting at trtable[0] and one
3393 	 starting at trtable[SBC_MAX].  */
3394       trtable = state->word_trtable =
3395 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3396       if (__glibc_unlikely (trtable == NULL))
3397 	goto out_free;
3398 
3399       /* For all characters ch...:  */
3400       for (i = 0; i < BITSET_WORDS; ++i)
3401 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3402 	     elem;
3403 	     mask <<= 1, elem >>= 1, ++ch)
3404 	  if (__glibc_unlikely (elem & 1))
3405 	    {
3406 	      /* There must be exactly one destination which accepts
3407 		 character ch.  See group_nodes_into_DFAstates.  */
3408 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3409 		;
3410 
3411 	      /* j-th destination accepts the word character ch.  */
3412 	      trtable[ch] = dest_states[j];
3413 	      trtable[ch + SBC_MAX] = dest_states_word[j];
3414 	    }
3415     }
3416 
3417   /* new line */
3418   if (bitset_contain (acceptable, NEWLINE_CHAR))
3419     {
3420       /* The current state accepts newline character.  */
3421       for (j = 0; j < ndests; ++j)
3422 	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3423 	  {
3424 	    /* k-th destination accepts newline character.  */
3425 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
3426 	    if (need_word_trtable)
3427 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3428 	    /* There must be only one destination which accepts
3429 	       newline.  See group_nodes_into_DFAstates.  */
3430 	    break;
3431 	  }
3432     }
3433 
3434   re_node_set_free (&follows);
3435   for (i = 0; i < ndests; ++i)
3436     re_node_set_free (dests_node + i);
3437   return true;
3438 }
3439 
3440 /* Group all nodes belonging to STATE into several destinations.
3441    Then for all destinations, set the nodes belonging to the destination
3442    to DESTS_NODE[i] and set the characters accepted by the destination
3443    to DEST_CH[i].  Return the number of destinations if successful,
3444    -1 on internal error.  */
3445 
3446 static Idx
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)3447 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3448 			    re_node_set *dests_node, bitset_t *dests_ch)
3449 {
3450   reg_errcode_t err;
3451   bool ok;
3452   Idx i, j, k;
3453   Idx ndests; /* Number of the destinations from 'state'.  */
3454   bitset_t accepts; /* Characters a node can accept.  */
3455   const re_node_set *cur_nodes = &state->nodes;
3456   bitset_empty (accepts);
3457   ndests = 0;
3458 
3459   /* For all the nodes belonging to 'state',  */
3460   for (i = 0; i < cur_nodes->nelem; ++i)
3461     {
3462       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3463       re_token_type_t type = node->type;
3464       unsigned int constraint = node->constraint;
3465 
3466       /* Enumerate all single byte character this node can accept.  */
3467       if (type == CHARACTER)
3468 	bitset_set (accepts, node->opr.c);
3469       else if (type == SIMPLE_BRACKET)
3470 	{
3471 	  bitset_merge (accepts, node->opr.sbcset);
3472 	}
3473       else if (type == OP_PERIOD)
3474 	{
3475 #ifdef RE_ENABLE_I18N
3476 	  if (dfa->mb_cur_max > 1)
3477 	    bitset_merge (accepts, dfa->sb_char);
3478 	  else
3479 #endif
3480 	    bitset_set_all (accepts);
3481 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3482 	    bitset_clear (accepts, '\n');
3483 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3484 	    bitset_clear (accepts, '\0');
3485 	}
3486 #ifdef RE_ENABLE_I18N
3487       else if (type == OP_UTF8_PERIOD)
3488 	{
3489 	  if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3490 	    memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3491 	  else
3492 	    bitset_merge (accepts, utf8_sb_map);
3493 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3494 	    bitset_clear (accepts, '\n');
3495 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3496 	    bitset_clear (accepts, '\0');
3497 	}
3498 #endif
3499       else
3500 	continue;
3501 
3502       /* Check the 'accepts' and sift the characters which are not
3503 	 match it the context.  */
3504       if (constraint)
3505 	{
3506 	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
3507 	    {
3508 	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3509 	      bitset_empty (accepts);
3510 	      if (accepts_newline)
3511 		bitset_set (accepts, NEWLINE_CHAR);
3512 	      else
3513 		continue;
3514 	    }
3515 	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
3516 	    {
3517 	      bitset_empty (accepts);
3518 	      continue;
3519 	    }
3520 
3521 	  if (constraint & NEXT_WORD_CONSTRAINT)
3522 	    {
3523 	      bitset_word_t any_set = 0;
3524 	      if (type == CHARACTER && !node->word_char)
3525 		{
3526 		  bitset_empty (accepts);
3527 		  continue;
3528 		}
3529 #ifdef RE_ENABLE_I18N
3530 	      if (dfa->mb_cur_max > 1)
3531 		for (j = 0; j < BITSET_WORDS; ++j)
3532 		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3533 	      else
3534 #endif
3535 		for (j = 0; j < BITSET_WORDS; ++j)
3536 		  any_set |= (accepts[j] &= dfa->word_char[j]);
3537 	      if (!any_set)
3538 		continue;
3539 	    }
3540 	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
3541 	    {
3542 	      bitset_word_t any_set = 0;
3543 	      if (type == CHARACTER && node->word_char)
3544 		{
3545 		  bitset_empty (accepts);
3546 		  continue;
3547 		}
3548 #ifdef RE_ENABLE_I18N
3549 	      if (dfa->mb_cur_max > 1)
3550 		for (j = 0; j < BITSET_WORDS; ++j)
3551 		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3552 	      else
3553 #endif
3554 		for (j = 0; j < BITSET_WORDS; ++j)
3555 		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3556 	      if (!any_set)
3557 		continue;
3558 	    }
3559 	}
3560 
3561       /* Then divide 'accepts' into DFA states, or create a new
3562 	 state.  Above, we make sure that accepts is not empty.  */
3563       for (j = 0; j < ndests; ++j)
3564 	{
3565 	  bitset_t intersec; /* Intersection sets, see below.  */
3566 	  bitset_t remains;
3567 	  /* Flags, see below.  */
3568 	  bitset_word_t has_intersec, not_subset, not_consumed;
3569 
3570 	  /* Optimization, skip if this state doesn't accept the character.  */
3571 	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3572 	    continue;
3573 
3574 	  /* Enumerate the intersection set of this state and 'accepts'.  */
3575 	  has_intersec = 0;
3576 	  for (k = 0; k < BITSET_WORDS; ++k)
3577 	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3578 	  /* And skip if the intersection set is empty.  */
3579 	  if (!has_intersec)
3580 	    continue;
3581 
3582 	  /* Then check if this state is a subset of 'accepts'.  */
3583 	  not_subset = not_consumed = 0;
3584 	  for (k = 0; k < BITSET_WORDS; ++k)
3585 	    {
3586 	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3587 	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3588 	    }
3589 
3590 	  /* If this state isn't a subset of 'accepts', create a
3591 	     new group state, which has the 'remains'. */
3592 	  if (not_subset)
3593 	    {
3594 	      bitset_copy (dests_ch[ndests], remains);
3595 	      bitset_copy (dests_ch[j], intersec);
3596 	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3597 	      if (__glibc_unlikely (err != REG_NOERROR))
3598 		goto error_return;
3599 	      ++ndests;
3600 	    }
3601 
3602 	  /* Put the position in the current group. */
3603 	  ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3604 	  if (__glibc_unlikely (! ok))
3605 	    goto error_return;
3606 
3607 	  /* If all characters are consumed, go to next node. */
3608 	  if (!not_consumed)
3609 	    break;
3610 	}
3611       /* Some characters remain, create a new group. */
3612       if (j == ndests)
3613 	{
3614 	  bitset_copy (dests_ch[ndests], accepts);
3615 	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3616 	  if (__glibc_unlikely (err != REG_NOERROR))
3617 	    goto error_return;
3618 	  ++ndests;
3619 	  bitset_empty (accepts);
3620 	}
3621     }
3622   assume (ndests <= SBC_MAX);
3623   return ndests;
3624  error_return:
3625   for (j = 0; j < ndests; ++j)
3626     re_node_set_free (dests_node + j);
3627   return -1;
3628 }
3629 
3630 #ifdef RE_ENABLE_I18N
3631 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3632    Return the number of the bytes the node accepts.
3633    STR_IDX is the current index of the input string.
3634 
3635    This function handles the nodes which can accept one character, or
3636    one collating element like '.', '[a-z]', opposite to the other nodes
3637    can only accept one byte.  */
3638 
3639 # ifdef _LIBC
3640 #  include <locale/weight.h>
3641 # endif
3642 
3643 static int
check_node_accept_bytes(const re_dfa_t * dfa,Idx node_idx,const re_string_t * input,Idx str_idx)3644 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3645 			 const re_string_t *input, Idx str_idx)
3646 {
3647   const re_token_t *node = dfa->nodes + node_idx;
3648   int char_len, elem_len;
3649   Idx i;
3650 
3651   if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3652     {
3653       unsigned char c = re_string_byte_at (input, str_idx), d;
3654       if (__glibc_likely (c < 0xc2))
3655 	return 0;
3656 
3657       if (str_idx + 2 > input->len)
3658 	return 0;
3659 
3660       d = re_string_byte_at (input, str_idx + 1);
3661       if (c < 0xe0)
3662 	return (d < 0x80 || d > 0xbf) ? 0 : 2;
3663       else if (c < 0xf0)
3664 	{
3665 	  char_len = 3;
3666 	  if (c == 0xe0 && d < 0xa0)
3667 	    return 0;
3668 	}
3669       else if (c < 0xf8)
3670 	{
3671 	  char_len = 4;
3672 	  if (c == 0xf0 && d < 0x90)
3673 	    return 0;
3674 	}
3675       else if (c < 0xfc)
3676 	{
3677 	  char_len = 5;
3678 	  if (c == 0xf8 && d < 0x88)
3679 	    return 0;
3680 	}
3681       else if (c < 0xfe)
3682 	{
3683 	  char_len = 6;
3684 	  if (c == 0xfc && d < 0x84)
3685 	    return 0;
3686 	}
3687       else
3688 	return 0;
3689 
3690       if (str_idx + char_len > input->len)
3691 	return 0;
3692 
3693       for (i = 1; i < char_len; ++i)
3694 	{
3695 	  d = re_string_byte_at (input, str_idx + i);
3696 	  if (d < 0x80 || d > 0xbf)
3697 	    return 0;
3698 	}
3699       return char_len;
3700     }
3701 
3702   char_len = re_string_char_size_at (input, str_idx);
3703   if (node->type == OP_PERIOD)
3704     {
3705       if (char_len <= 1)
3706 	return 0;
3707       /* FIXME: I don't think this if is needed, as both '\n'
3708 	 and '\0' are char_len == 1.  */
3709       /* '.' accepts any one character except the following two cases.  */
3710       if ((!(dfa->syntax & RE_DOT_NEWLINE)
3711 	   && re_string_byte_at (input, str_idx) == '\n')
3712 	  || ((dfa->syntax & RE_DOT_NOT_NULL)
3713 	      && re_string_byte_at (input, str_idx) == '\0'))
3714 	return 0;
3715       return char_len;
3716     }
3717 
3718   elem_len = re_string_elem_size_at (input, str_idx);
3719   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3720     return 0;
3721 
3722   if (node->type == COMPLEX_BRACKET)
3723     {
3724       const re_charset_t *cset = node->opr.mbcset;
3725 # ifdef _LIBC
3726       const unsigned char *pin
3727 	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3728       Idx j;
3729       uint32_t nrules;
3730 # endif /* _LIBC */
3731       int match_len = 0;
3732       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3733 		    ? re_string_wchar_at (input, str_idx) : 0);
3734 
3735       /* match with multibyte character?  */
3736       for (i = 0; i < cset->nmbchars; ++i)
3737 	if (wc == cset->mbchars[i])
3738 	  {
3739 	    match_len = char_len;
3740 	    goto check_node_accept_bytes_match;
3741 	  }
3742       /* match with character_class?  */
3743       for (i = 0; i < cset->nchar_classes; ++i)
3744 	{
3745 	  wctype_t wt = cset->char_classes[i];
3746 	  if (__iswctype (wc, wt))
3747 	    {
3748 	      match_len = char_len;
3749 	      goto check_node_accept_bytes_match;
3750 	    }
3751 	}
3752 
3753 # ifdef _LIBC
3754       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3755       if (nrules != 0)
3756 	{
3757 	  unsigned int in_collseq = 0;
3758 	  const int32_t *table, *indirect;
3759 	  const unsigned char *weights, *extra;
3760 	  const char *collseqwc;
3761 
3762 	  /* match with collating_symbol?  */
3763 	  if (cset->ncoll_syms)
3764 	    extra = (const unsigned char *)
3765 	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3766 	  for (i = 0; i < cset->ncoll_syms; ++i)
3767 	    {
3768 	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
3769 	      /* Compare the length of input collating element and
3770 		 the length of current collating element.  */
3771 	      if (*coll_sym != elem_len)
3772 		continue;
3773 	      /* Compare each bytes.  */
3774 	      for (j = 0; j < *coll_sym; j++)
3775 		if (pin[j] != coll_sym[1 + j])
3776 		  break;
3777 	      if (j == *coll_sym)
3778 		{
3779 		  /* Match if every bytes is equal.  */
3780 		  match_len = j;
3781 		  goto check_node_accept_bytes_match;
3782 		}
3783 	    }
3784 
3785 	  if (cset->nranges)
3786 	    {
3787 	      if (elem_len <= char_len)
3788 		{
3789 		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3790 		  in_collseq = __collseq_table_lookup (collseqwc, wc);
3791 		}
3792 	      else
3793 		in_collseq = find_collation_sequence_value (pin, elem_len);
3794 	    }
3795 	  /* match with range expression?  */
3796 	  /* FIXME: Implement rational ranges here, too.  */
3797 	  for (i = 0; i < cset->nranges; ++i)
3798 	    if (cset->range_starts[i] <= in_collseq
3799 		&& in_collseq <= cset->range_ends[i])
3800 	      {
3801 		match_len = elem_len;
3802 		goto check_node_accept_bytes_match;
3803 	      }
3804 
3805 	  /* match with equivalence_class?  */
3806 	  if (cset->nequiv_classes)
3807 	    {
3808 	      const unsigned char *cp = pin;
3809 	      table = (const int32_t *)
3810 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3811 	      weights = (const unsigned char *)
3812 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3813 	      extra = (const unsigned char *)
3814 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3815 	      indirect = (const int32_t *)
3816 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3817 	      int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3818 	      int32_t rule = idx >> 24;
3819 	      idx &= 0xffffff;
3820 	      if (idx > 0)
3821 		{
3822 		  size_t weight_len = weights[idx];
3823 		  for (i = 0; i < cset->nequiv_classes; ++i)
3824 		    {
3825 		      int32_t equiv_class_idx = cset->equiv_classes[i];
3826 		      int32_t equiv_class_rule = equiv_class_idx >> 24;
3827 		      equiv_class_idx &= 0xffffff;
3828 		      if (weights[equiv_class_idx] == weight_len
3829 			  && equiv_class_rule == rule
3830 			  && memcmp (weights + idx + 1,
3831 				     weights + equiv_class_idx + 1,
3832 				     weight_len) == 0)
3833 			{
3834 			  match_len = elem_len;
3835 			  goto check_node_accept_bytes_match;
3836 			}
3837 		    }
3838 		}
3839 	    }
3840 	}
3841       else
3842 # endif /* _LIBC */
3843 	{
3844 	  /* match with range expression?  */
3845 	  for (i = 0; i < cset->nranges; ++i)
3846 	    {
3847 	      if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3848 		{
3849 		  match_len = char_len;
3850 		  goto check_node_accept_bytes_match;
3851 		}
3852 	    }
3853 	}
3854     check_node_accept_bytes_match:
3855       if (!cset->non_match)
3856 	return match_len;
3857       else
3858 	{
3859 	  if (match_len > 0)
3860 	    return 0;
3861 	  else
3862 	    return (elem_len > char_len) ? elem_len : char_len;
3863 	}
3864     }
3865   return 0;
3866 }
3867 
3868 # ifdef _LIBC
3869 static unsigned int
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)3870 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3871 {
3872   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3873   if (nrules == 0)
3874     {
3875       if (mbs_len == 1)
3876 	{
3877 	  /* No valid character.  Match it as a single byte character.  */
3878 	  const unsigned char *collseq = (const unsigned char *)
3879 	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3880 	  return collseq[mbs[0]];
3881 	}
3882       return UINT_MAX;
3883     }
3884   else
3885     {
3886       int32_t idx;
3887       const unsigned char *extra = (const unsigned char *)
3888 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3889       int32_t extrasize = (const unsigned char *)
3890 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3891 
3892       for (idx = 0; idx < extrasize;)
3893 	{
3894 	  int mbs_cnt;
3895 	  bool found = false;
3896 	  int32_t elem_mbs_len;
3897 	  /* Skip the name of collating element name.  */
3898 	  idx = idx + extra[idx] + 1;
3899 	  elem_mbs_len = extra[idx++];
3900 	  if (mbs_len == elem_mbs_len)
3901 	    {
3902 	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3903 		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3904 		  break;
3905 	      if (mbs_cnt == elem_mbs_len)
3906 		/* Found the entry.  */
3907 		found = true;
3908 	    }
3909 	  /* Skip the byte sequence of the collating element.  */
3910 	  idx += elem_mbs_len;
3911 	  /* Adjust for the alignment.  */
3912 	  idx = (idx + 3) & ~3;
3913 	  /* Skip the collation sequence value.  */
3914 	  idx += sizeof (uint32_t);
3915 	  /* Skip the wide char sequence of the collating element.  */
3916 	  idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3917 	  /* If we found the entry, return the sequence value.  */
3918 	  if (found)
3919 	    return *(uint32_t *) (extra + idx);
3920 	  /* Skip the collation sequence value.  */
3921 	  idx += sizeof (uint32_t);
3922 	}
3923       return UINT_MAX;
3924     }
3925 }
3926 # endif /* _LIBC */
3927 #endif /* RE_ENABLE_I18N */
3928 
3929 /* Check whether the node accepts the byte which is IDX-th
3930    byte of the INPUT.  */
3931 
3932 static bool
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,Idx idx)3933 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3934 		   Idx idx)
3935 {
3936   unsigned char ch;
3937   ch = re_string_byte_at (&mctx->input, idx);
3938   switch (node->type)
3939     {
3940     case CHARACTER:
3941       if (node->opr.c != ch)
3942         return false;
3943       break;
3944 
3945     case SIMPLE_BRACKET:
3946       if (!bitset_contain (node->opr.sbcset, ch))
3947         return false;
3948       break;
3949 
3950 #ifdef RE_ENABLE_I18N
3951     case OP_UTF8_PERIOD:
3952       if (ch >= ASCII_CHARS)
3953         return false;
3954       FALLTHROUGH;
3955 #endif
3956     case OP_PERIOD:
3957       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3958 	  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3959 	return false;
3960       break;
3961 
3962     default:
3963       return false;
3964     }
3965 
3966   if (node->constraint)
3967     {
3968       /* The node has constraints.  Check whether the current context
3969 	 satisfies the constraints.  */
3970       unsigned int context = re_string_context_at (&mctx->input, idx,
3971 						   mctx->eflags);
3972       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3973 	return false;
3974     }
3975 
3976   return true;
3977 }
3978 
3979 /* Extend the buffers, if the buffers have run out.  */
3980 
3981 static reg_errcode_t
3982 __attribute_warn_unused_result__
extend_buffers(re_match_context_t * mctx,int min_len)3983 extend_buffers (re_match_context_t *mctx, int min_len)
3984 {
3985   reg_errcode_t ret;
3986   re_string_t *pstr = &mctx->input;
3987 
3988   /* Avoid overflow.  */
3989   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3990 			<= pstr->bufs_len))
3991     return REG_ESPACE;
3992 
3993   /* Double the lengths of the buffers, but allocate at least MIN_LEN.  */
3994   ret = re_string_realloc_buffers (pstr,
3995 				   MAX (min_len,
3996 					MIN (pstr->len, pstr->bufs_len * 2)));
3997   if (__glibc_unlikely (ret != REG_NOERROR))
3998     return ret;
3999 
4000   if (mctx->state_log != NULL)
4001     {
4002       /* And double the length of state_log.  */
4003       /* XXX We have no indication of the size of this buffer.  If this
4004 	 allocation fail we have no indication that the state_log array
4005 	 does not have the right size.  */
4006       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4007 					      pstr->bufs_len + 1);
4008       if (__glibc_unlikely (new_array == NULL))
4009 	return REG_ESPACE;
4010       mctx->state_log = new_array;
4011     }
4012 
4013   /* Then reconstruct the buffers.  */
4014   if (pstr->icase)
4015     {
4016 #ifdef RE_ENABLE_I18N
4017       if (pstr->mb_cur_max > 1)
4018 	{
4019 	  ret = build_wcs_upper_buffer (pstr);
4020 	  if (__glibc_unlikely (ret != REG_NOERROR))
4021 	    return ret;
4022 	}
4023       else
4024 #endif /* RE_ENABLE_I18N  */
4025 	build_upper_buffer (pstr);
4026     }
4027   else
4028     {
4029 #ifdef RE_ENABLE_I18N
4030       if (pstr->mb_cur_max > 1)
4031 	build_wcs_buffer (pstr);
4032       else
4033 #endif /* RE_ENABLE_I18N  */
4034 	{
4035 	  if (pstr->trans != NULL)
4036 	    re_string_translate_buffer (pstr);
4037 	}
4038     }
4039   return REG_NOERROR;
4040 }
4041 
4042 
4043 /* Functions for matching context.  */
4044 
4045 /* Initialize MCTX.  */
4046 
4047 static reg_errcode_t
4048 __attribute_warn_unused_result__
match_ctx_init(re_match_context_t * mctx,int eflags,Idx n)4049 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4050 {
4051   mctx->eflags = eflags;
4052   mctx->match_last = -1;
4053   if (n > 0)
4054     {
4055       /* Avoid overflow.  */
4056       size_t max_object_size =
4057 	MAX (sizeof (struct re_backref_cache_entry),
4058 	     sizeof (re_sub_match_top_t *));
4059       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4060 	return REG_ESPACE;
4061 
4062       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4063       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4064       if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4065 	return REG_ESPACE;
4066     }
4067   /* Already zero-ed by the caller.
4068      else
4069        mctx->bkref_ents = NULL;
4070      mctx->nbkref_ents = 0;
4071      mctx->nsub_tops = 0;  */
4072   mctx->abkref_ents = n;
4073   mctx->max_mb_elem_len = 1;
4074   mctx->asub_tops = n;
4075   return REG_NOERROR;
4076 }
4077 
4078 /* Clean the entries which depend on the current input in MCTX.
4079    This function must be invoked when the matcher changes the start index
4080    of the input, or changes the input string.  */
4081 
4082 static void
match_ctx_clean(re_match_context_t * mctx)4083 match_ctx_clean (re_match_context_t *mctx)
4084 {
4085   Idx st_idx;
4086   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4087     {
4088       Idx sl_idx;
4089       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4090       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4091 	{
4092 	  re_sub_match_last_t *last = top->lasts[sl_idx];
4093 	  re_free (last->path.array);
4094 	  re_free (last);
4095 	}
4096       re_free (top->lasts);
4097       if (top->path)
4098 	{
4099 	  re_free (top->path->array);
4100 	  re_free (top->path);
4101 	}
4102       re_free (top);
4103     }
4104 
4105   mctx->nsub_tops = 0;
4106   mctx->nbkref_ents = 0;
4107 }
4108 
4109 /* Free all the memory associated with MCTX.  */
4110 
4111 static void
match_ctx_free(re_match_context_t * mctx)4112 match_ctx_free (re_match_context_t *mctx)
4113 {
4114   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4115   match_ctx_clean (mctx);
4116   re_free (mctx->sub_tops);
4117   re_free (mctx->bkref_ents);
4118 }
4119 
4120 /* Add a new backreference entry to MCTX.
4121    Note that we assume that caller never call this function with duplicate
4122    entry, and call with STR_IDX which isn't smaller than any existing entry.
4123 */
4124 
4125 static reg_errcode_t
4126 __attribute_warn_unused_result__
match_ctx_add_entry(re_match_context_t * mctx,Idx node,Idx str_idx,Idx from,Idx to)4127 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4128 		     Idx to)
4129 {
4130   if (mctx->nbkref_ents >= mctx->abkref_ents)
4131     {
4132       struct re_backref_cache_entry* new_entry;
4133       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4134 			      mctx->abkref_ents * 2);
4135       if (__glibc_unlikely (new_entry == NULL))
4136 	{
4137 	  re_free (mctx->bkref_ents);
4138 	  return REG_ESPACE;
4139 	}
4140       mctx->bkref_ents = new_entry;
4141       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4142 	      sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4143       mctx->abkref_ents *= 2;
4144     }
4145   if (mctx->nbkref_ents > 0
4146       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4147     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4148 
4149   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4150   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4151   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4152   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4153 
4154   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4155      If bit N is clear, means that this entry won't epsilon-transition to
4156      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4157      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4158      such node.
4159 
4160      A backreference does not epsilon-transition unless it is empty, so set
4161      to all zeros if FROM != TO.  */
4162   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4163     = (from == to ? -1 : 0);
4164 
4165   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4166   if (mctx->max_mb_elem_len < to - from)
4167     mctx->max_mb_elem_len = to - from;
4168   return REG_NOERROR;
4169 }
4170 
4171 /* Return the first entry with the same str_idx, or -1 if none is
4172    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4173 
4174 static Idx
search_cur_bkref_entry(const re_match_context_t * mctx,Idx str_idx)4175 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4176 {
4177   Idx left, right, mid, last;
4178   last = right = mctx->nbkref_ents;
4179   for (left = 0; left < right;)
4180     {
4181       mid = (left + right) / 2;
4182       if (mctx->bkref_ents[mid].str_idx < str_idx)
4183 	left = mid + 1;
4184       else
4185 	right = mid;
4186     }
4187   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4188     return left;
4189   else
4190     return -1;
4191 }
4192 
4193 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4194    at STR_IDX.  */
4195 
4196 static reg_errcode_t
4197 __attribute_warn_unused_result__
match_ctx_add_subtop(re_match_context_t * mctx,Idx node,Idx str_idx)4198 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4199 {
4200   DEBUG_ASSERT (mctx->sub_tops != NULL);
4201   DEBUG_ASSERT (mctx->asub_tops > 0);
4202   if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4203     {
4204       Idx new_asub_tops = mctx->asub_tops * 2;
4205       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4206 						   re_sub_match_top_t *,
4207 						   new_asub_tops);
4208       if (__glibc_unlikely (new_array == NULL))
4209 	return REG_ESPACE;
4210       mctx->sub_tops = new_array;
4211       mctx->asub_tops = new_asub_tops;
4212     }
4213   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4214   if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4215     return REG_ESPACE;
4216   mctx->sub_tops[mctx->nsub_tops]->node = node;
4217   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4218   return REG_NOERROR;
4219 }
4220 
4221 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4222    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4223    Return the new entry if successful, NULL if memory is exhausted.  */
4224 
4225 static re_sub_match_last_t *
match_ctx_add_sublast(re_sub_match_top_t * subtop,Idx node,Idx str_idx)4226 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4227 {
4228   re_sub_match_last_t *new_entry;
4229   if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4230     {
4231       Idx new_alasts = 2 * subtop->alasts + 1;
4232       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4233 						    re_sub_match_last_t *,
4234 						    new_alasts);
4235       if (__glibc_unlikely (new_array == NULL))
4236 	return NULL;
4237       subtop->lasts = new_array;
4238       subtop->alasts = new_alasts;
4239     }
4240   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4241   if (__glibc_likely (new_entry != NULL))
4242     {
4243       subtop->lasts[subtop->nlasts] = new_entry;
4244       new_entry->node = node;
4245       new_entry->str_idx = str_idx;
4246       ++subtop->nlasts;
4247     }
4248   return new_entry;
4249 }
4250 
4251 static void
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,Idx last_node,Idx last_str_idx)4252 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4253 	       re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4254 {
4255   sctx->sifted_states = sifted_sts;
4256   sctx->limited_states = limited_sts;
4257   sctx->last_node = last_node;
4258   sctx->last_str_idx = last_str_idx;
4259   re_node_set_init_empty (&sctx->limits);
4260 }
4261