1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2021 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <https://www.gnu.org/licenses/>.  */
19 
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 				     Idx n);
22 static void match_ctx_clean (re_match_context_t *mctx);
23 static void match_ctx_free (re_match_context_t *cache);
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 					  Idx str_idx, Idx from, Idx to);
26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
28 					   Idx str_idx);
29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 						    Idx node, Idx str_idx);
31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 			   re_dfastate_t **limited_sts, Idx last_node,
33 			   Idx last_str_idx);
34 static reg_errcode_t re_search_internal (const regex_t *preg,
35 					 const char *string, Idx length,
36 					 Idx start, Idx last_start, Idx stop,
37 					 size_t nmatch, regmatch_t pmatch[],
38 					 int eflags);
39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 				  const char *string1, Idx length1,
41 				  const char *string2, Idx length2,
42 				  Idx start, regoff_t range,
43 				  struct re_registers *regs,
44 				  Idx stop, bool ret_len);
45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 				const char *string, Idx length, Idx start,
47 				regoff_t range, Idx stop,
48 				struct re_registers *regs,
49 				bool ret_len);
50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51                               Idx nregs, int regs_allocated);
52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
54 			   Idx *p_match_first);
55 static Idx check_halt_state_context (const re_match_context_t *mctx,
56 				     const re_dfastate_t *state, Idx idx);
57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 			 regmatch_t *prev_idx_match, Idx cur_node,
59 			 Idx cur_idx, Idx nmatch);
60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 				      Idx str_idx, Idx dest_node, Idx nregs,
62 				      regmatch_t *regs, regmatch_t *prevregs,
63 				      re_node_set *eps_via_nodes);
64 static reg_errcode_t set_regs (const regex_t *preg,
65 			       const re_match_context_t *mctx,
66 			       size_t nmatch, regmatch_t *pmatch,
67 			       bool fl_backtrack);
68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
69 
70 static int sift_states_iter_mb (const re_match_context_t *mctx,
71 				re_sift_context_t *sctx,
72 				Idx node_idx, Idx str_idx, Idx max_str_idx);
73 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
74 					   re_sift_context_t *sctx);
75 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
76 					  re_sift_context_t *sctx, Idx str_idx,
77 					  re_node_set *cur_dest);
78 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
79 					      re_sift_context_t *sctx,
80 					      Idx str_idx,
81 					      re_node_set *dest_nodes);
82 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
83 					    re_node_set *dest_nodes,
84 					    const re_node_set *candidates);
85 static bool check_dst_limits (const re_match_context_t *mctx,
86 			      const re_node_set *limits,
87 			      Idx dst_node, Idx dst_idx, Idx src_node,
88 			      Idx src_idx);
89 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
90 					int boundaries, Idx subexp_idx,
91 					Idx from_node, Idx bkref_idx);
92 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
93 				      Idx limit, Idx subexp_idx,
94 				      Idx node, Idx str_idx,
95 				      Idx bkref_idx);
96 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
97 					  re_node_set *dest_nodes,
98 					  const re_node_set *candidates,
99 					  re_node_set *limits,
100 					  struct re_backref_cache_entry *bkref_ents,
101 					  Idx str_idx);
102 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
103 					re_sift_context_t *sctx,
104 					Idx str_idx, const re_node_set *candidates);
105 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
106 					re_dfastate_t **dst,
107 					re_dfastate_t **src, Idx num);
108 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
109 					 re_match_context_t *mctx);
110 static re_dfastate_t *transit_state (reg_errcode_t *err,
111 				     re_match_context_t *mctx,
112 				     re_dfastate_t *state);
113 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
114 					    re_match_context_t *mctx,
115 					    re_dfastate_t *next_state);
116 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
117 						re_node_set *cur_nodes,
118 						Idx str_idx);
119 #if 0
120 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
121 					re_match_context_t *mctx,
122 					re_dfastate_t *pstate);
123 #endif
124 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
125 				       re_dfastate_t *pstate);
126 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
127 					  const re_node_set *nodes);
128 static reg_errcode_t get_subexp (re_match_context_t *mctx,
129 				 Idx bkref_node, Idx bkref_str_idx);
130 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
131 				     const re_sub_match_top_t *sub_top,
132 				     re_sub_match_last_t *sub_last,
133 				     Idx bkref_node, Idx bkref_str);
134 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
135 			     Idx subexp_idx, int type);
136 static reg_errcode_t check_arrival (re_match_context_t *mctx,
137 				    state_array_t *path, Idx top_node,
138 				    Idx top_str, Idx last_node, Idx last_str,
139 				    int type);
140 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
141 						   Idx str_idx,
142 						   re_node_set *cur_nodes,
143 						   re_node_set *next_nodes);
144 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
145 					       re_node_set *cur_nodes,
146 					       Idx ex_subexp, int type);
147 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
148 						   re_node_set *dst_nodes,
149 						   Idx target, Idx ex_subexp,
150 						   int type);
151 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
152 					 re_node_set *cur_nodes, Idx cur_str,
153 					 Idx subexp_num, int type);
154 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
155 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
156 				    const re_string_t *input, Idx idx);
157 #ifdef _LIBC
158 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
159 						   size_t name_len);
160 #endif
161 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
162 				       const re_dfastate_t *state,
163 				       re_node_set *states_node,
164 				       bitset_t *states_ch);
165 static bool check_node_accept (const re_match_context_t *mctx,
166 			       const re_token_t *node, Idx idx);
167 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
168 
169 /* Entry point for POSIX code.  */
170 
171 /* regexec searches for a given pattern, specified by PREG, in the
172    string STRING.
173 
174    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
175    'regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
176    least NMATCH elements, and we set them to the offsets of the
177    corresponding matched substrings.
178 
179    EFLAGS specifies "execution flags" which affect matching: if
180    REG_NOTBOL is set, then ^ does not match at the beginning of the
181    string; if REG_NOTEOL is set, then $ does not match at the end.
182 
183    Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
184    EFLAGS is invalid.  */
185 
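/* An illustrative usage sketch (not part of this library): the helper
   below and its pattern are hypothetical and only show how a caller
   typically drives regcomp/regexec from <regex.h>.  */
#if 0
static int
example_match (const char *text)
{
  regex_t re;
  regmatch_t m[2];

  /* Compile an ERE with one capturing group.  */
  if (regcomp (&re, "([0-9]+)", REG_EXTENDED) != 0)
    return -1;

  /* m[0] covers the whole match, m[1] the first group.  */
  int rc = regexec (&re, text, 2, m, 0);
  if (rc == 0)
    printf ("digits at [%ld, %ld)\n", (long) m[1].rm_so, (long) m[1].rm_eo);

  regfree (&re);
  return rc;
}
#endif
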
186 int
187 regexec (const regex_t *__restrict preg, const char *__restrict string,
188 	 size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
189 {
190   reg_errcode_t err;
191   Idx start, length;
192   re_dfa_t *dfa = preg->buffer;
193 
194   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
195     return REG_BADPAT;
196 
197   if (eflags & REG_STARTEND)
198     {
199       start = pmatch[0].rm_so;
200       length = pmatch[0].rm_eo;
201     }
202   else
203     {
204       start = 0;
205       length = strlen (string);
206     }
207 
208   lock_lock (dfa->lock);
209   if (preg->no_sub)
210     err = re_search_internal (preg, string, length, start, length,
211 			      length, 0, NULL, eflags);
212   else
213     err = re_search_internal (preg, string, length, start, length,
214 			      length, nmatch, pmatch, eflags);
215   lock_unlock (dfa->lock);
216   return err != REG_NOERROR;
217 }
218 
219 #ifdef _LIBC
220 libc_hidden_def (__regexec)
221 
222 # include <shlib-compat.h>
223 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
224 
225 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
226 __typeof__ (__regexec) __compat_regexec;
227 
228 int
229 attribute_compat_text_section
230 __compat_regexec (const regex_t *__restrict preg,
231 		  const char *__restrict string, size_t nmatch,
232 		  regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
233 {
234   return regexec (preg, string, nmatch, pmatch,
235 		  eflags & (REG_NOTBOL | REG_NOTEOL));
236 }
237 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
238 # endif
239 #endif
240 
241 /* Entry points for GNU code.  */
242 
243 /* re_match, re_search, re_match_2, re_search_2
244 
245    The former two functions operate on STRING with length LENGTH,
246    while the latter two operate on the concatenation of STRING1 and
247    STRING2, with lengths LENGTH1 and LENGTH2 respectively.
248 
249    re_match() matches the compiled pattern in BUFP against the string,
250    starting at index START.
251 
252    re_search() first tries matching at index START, then it tries to match
253    starting from index START + 1, and so on.  The last start position tried
254    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
255    way as re_match().)
256 
257    The parameter STOP of re_{match,search}_2 specifies that no match
258    extending beyond the first STOP characters of the concatenation of
259    the strings should be considered.
260 
261    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
262    and all groups are stored in REGS.  (For the "_2" variants, the offsets are
263    computed relative to the concatenation, not relative to the individual
264    strings.)
265 
266    On success, the re_match* functions return the length of the match and
267    the re_search* functions return the position of the start of the match.
268    They return -1 on match failure and -2 on error.  */
269 
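/* An illustrative usage sketch (not part of this library): the fragment
   below is hypothetical and only shows how the GNU re_search interface
   is typically driven by a caller.  */
#if 0
  struct re_pattern_buffer rb;
  struct re_registers regs;
  regoff_t pos;

  memset (&rb, 0, sizeof rb);
  re_syntax_options = RE_SYNTAX_POSIX_EXTENDED;

  /* re_compile_pattern returns NULL on success, an error string on failure.  */
  if (re_compile_pattern ("a+", 2, &rb) == NULL)
    {
      /* Search the whole 6-byte string; on success the start offset of the
	 match is returned and REGS describes the match and any groups.  */
      pos = re_search (&rb, "xxaaay", 6, 0, 6, &regs);
      regfree (&rb);
    }
#endif
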
270 regoff_t
271 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
272 	  Idx start, struct re_registers *regs)
273 {
274   return re_search_stub (bufp, string, length, start, 0, length, regs, true);
275 }
276 #ifdef _LIBC
277 weak_alias (__re_match, re_match)
278 #endif
279 
280 regoff_t
281 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
282 	   Idx start, regoff_t range, struct re_registers *regs)
283 {
284   return re_search_stub (bufp, string, length, start, range, length, regs,
285 			 false);
286 }
287 #ifdef _LIBC
288 weak_alias (__re_search, re_search)
289 #endif
290 
291 regoff_t
292 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
293 	    const char *string2, Idx length2, Idx start,
294 	    struct re_registers *regs, Idx stop)
295 {
296   return re_search_2_stub (bufp, string1, length1, string2, length2,
297 			   start, 0, regs, stop, true);
298 }
299 #ifdef _LIBC
300 weak_alias (__re_match_2, re_match_2)
301 #endif
302 
303 regoff_t
304 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
305 	     const char *string2, Idx length2, Idx start, regoff_t range,
306 	     struct re_registers *regs, Idx stop)
307 {
308   return re_search_2_stub (bufp, string1, length1, string2, length2,
309 			   start, range, regs, stop, false);
310 }
311 #ifdef _LIBC
312 weak_alias (__re_search_2, re_search_2)
313 #endif
314 
315 static regoff_t
316 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
317 		  Idx length1, const char *string2, Idx length2, Idx start,
318 		  regoff_t range, struct re_registers *regs,
319 		  Idx stop, bool ret_len)
320 {
321   const char *str;
322   regoff_t rval;
323   Idx len;
324   char *s = NULL;
325 
326   if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
327 			 || INT_ADD_WRAPV (length1, length2, &len))))
328     return -2;
329 
330   /* Concatenate the strings.  */
331   if (length2 > 0)
332     if (length1 > 0)
333       {
334 	s = re_malloc (char, len);
335 
336 	if (__glibc_unlikely (s == NULL))
337 	  return -2;
338 #ifdef _LIBC
339 	memcpy (__mempcpy (s, string1, length1), string2, length2);
340 #else
341 	memcpy (s, string1, length1);
342 	memcpy (s + length1, string2, length2);
343 #endif
344 	str = s;
345       }
346     else
347       str = string2;
348   else
349     str = string1;
350 
351   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
352 			 ret_len);
353   re_free (s);
354   return rval;
355 }
356 
357 /* The parameters have the same meaning as those of re_search.
358    Additional parameters:
359    If RET_LEN is true the length of the match is returned (re_match style);
360    otherwise the position of the match is returned.  */
361 
362 static regoff_t
363 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
364 		Idx start, regoff_t range, Idx stop, struct re_registers *regs,
365 		bool ret_len)
366 {
367   reg_errcode_t result;
368   regmatch_t *pmatch;
369   Idx nregs;
370   regoff_t rval;
371   int eflags = 0;
372   re_dfa_t *dfa = bufp->buffer;
373   Idx last_start = start + range;
374 
375   /* Check for out-of-range.  */
376   if (__glibc_unlikely (start < 0 || start > length))
377     return -1;
378   if (__glibc_unlikely (length < last_start
379 			|| (0 <= range && last_start < start)))
380     last_start = length;
381   else if (__glibc_unlikely (last_start < 0
382 			     || (range < 0 && start <= last_start)))
383     last_start = 0;
384 
385   lock_lock (dfa->lock);
386 
387   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
388   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
389 
390   /* Compile fastmap if we haven't yet.  */
391   if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
392     re_compile_fastmap (bufp);
393 
394   if (__glibc_unlikely (bufp->no_sub))
395     regs = NULL;
396 
397   /* We need at least 1 register.  */
398   if (regs == NULL)
399     nregs = 1;
400   else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
401 			     && regs->num_regs <= bufp->re_nsub))
402     {
403       nregs = regs->num_regs;
404       if (__glibc_unlikely (nregs < 1))
405 	{
406 	  /* Nothing can be copied to regs.  */
407 	  regs = NULL;
408 	  nregs = 1;
409 	}
410     }
411   else
412     nregs = bufp->re_nsub + 1;
413   pmatch = re_malloc (regmatch_t, nregs);
414   if (__glibc_unlikely (pmatch == NULL))
415     {
416       rval = -2;
417       goto out;
418     }
419 
420   result = re_search_internal (bufp, string, length, start, last_start, stop,
421 			       nregs, pmatch, eflags);
422 
423   rval = 0;
424 
425   /* I hope we needn't fill their regs with -1's when no match was found.  */
426   if (result != REG_NOERROR)
427     rval = result == REG_NOMATCH ? -1 : -2;
428   else if (regs != NULL)
429     {
430       /* If caller wants register contents data back, copy them.  */
431       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
432 					   bufp->regs_allocated);
433       if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
434 	rval = -2;
435     }
436 
437   if (__glibc_likely (rval == 0))
438     {
439       if (ret_len)
440 	{
441 	  DEBUG_ASSERT (pmatch[0].rm_so == start);
442 	  rval = pmatch[0].rm_eo - start;
443 	}
444       else
445 	rval = pmatch[0].rm_so;
446     }
447   re_free (pmatch);
448  out:
449   lock_unlock (dfa->lock);
450   return rval;
451 }
452 
453 static unsigned
454 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
455 	      int regs_allocated)
456 {
457   int rval = REGS_REALLOCATE;
458   Idx i;
459   Idx need_regs = nregs + 1;
460   /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
461      uses.  */
462 
463   /* Have the register data arrays been allocated?  */
464   if (regs_allocated == REGS_UNALLOCATED)
465     { /* No.  So allocate them with malloc.  */
466       regs->start = re_malloc (regoff_t, need_regs);
467       if (__glibc_unlikely (regs->start == NULL))
468 	return REGS_UNALLOCATED;
469       regs->end = re_malloc (regoff_t, need_regs);
470       if (__glibc_unlikely (regs->end == NULL))
471 	{
472 	  re_free (regs->start);
473 	  return REGS_UNALLOCATED;
474 	}
475       regs->num_regs = need_regs;
476     }
477   else if (regs_allocated == REGS_REALLOCATE)
478     { /* Yes.  If we need more elements than were already
479 	 allocated, reallocate them.  If we need fewer, just
480 	 leave it alone.  */
481       if (__glibc_unlikely (need_regs > regs->num_regs))
482 	{
483 	  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
484 	  regoff_t *new_end;
485 	  if (__glibc_unlikely (new_start == NULL))
486 	    return REGS_UNALLOCATED;
487 	  new_end = re_realloc (regs->end, regoff_t, need_regs);
488 	  if (__glibc_unlikely (new_end == NULL))
489 	    {
490 	      re_free (new_start);
491 	      return REGS_UNALLOCATED;
492 	    }
493 	  regs->start = new_start;
494 	  regs->end = new_end;
495 	  regs->num_regs = need_regs;
496 	}
497     }
498   else
499     {
500       DEBUG_ASSERT (regs_allocated == REGS_FIXED);
501       /* This function may not be called with REGS_FIXED and nregs too big.  */
502       DEBUG_ASSERT (nregs <= regs->num_regs);
503       rval = REGS_FIXED;
504     }
505 
506   /* Copy the regs.  */
507   for (i = 0; i < nregs; ++i)
508     {
509       regs->start[i] = pmatch[i].rm_so;
510       regs->end[i] = pmatch[i].rm_eo;
511     }
512   for ( ; i < regs->num_regs; ++i)
513     regs->start[i] = regs->end[i] = -1;
514 
515   return rval;
516 }
517 
518 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
519    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
520    this memory for recording register information.  STARTS and ENDS
521    must be allocated using the malloc library routine, and must each
522    be at least NUM_REGS * sizeof (regoff_t) bytes long.
523 
524    If NUM_REGS == 0, then subsequent matches should allocate their own
525    register data.
526 
527    Unless this function is called, the first search or match using
528    PATTERN_BUFFER will allocate its own register data, without
529    freeing the old data.  */
530 
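/* An illustrative usage sketch (not part of this library): the fragment
   below is hypothetical and shows a caller handing its own malloc'd
   arrays to re_set_registers so that later searches record offsets
   into storage the caller owns.  */
#if 0
  struct re_registers regs;
  __re_size_t nregs = bufp.re_nsub + 1;
  regoff_t *starts = malloc (nregs * sizeof *starts);
  regoff_t *ends = malloc (nregs * sizeof *ends);

  if (starts != NULL && ends != NULL)
    re_set_registers (&bufp, &regs, nregs, starts, ends);
  /* Subsequent re_search (&bufp, ..., &regs) calls fill STARTS and ENDS,
     reallocating them if more registers are needed.  */
#endif
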
531 void
532 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
533 		  __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
534 {
535   if (num_regs)
536     {
537       bufp->regs_allocated = REGS_REALLOCATE;
538       regs->num_regs = num_regs;
539       regs->start = starts;
540       regs->end = ends;
541     }
542   else
543     {
544       bufp->regs_allocated = REGS_UNALLOCATED;
545       regs->num_regs = 0;
546       regs->start = regs->end = NULL;
547     }
548 }
549 #ifdef _LIBC
550 weak_alias (__re_set_registers, re_set_registers)
551 #endif
552 
553 /* Entry points compatible with 4.2 BSD regex library.  We don't define
554    them unless specifically requested.  */
555 
556 #if defined _REGEX_RE_COMP || defined _LIBC
557 int
558 # ifdef _LIBC
559 weak_function
560 # endif
561 re_exec (const char *s)
562 {
563   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
564 }
565 #endif /* _REGEX_RE_COMP */
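/* An illustrative usage sketch (not part of this library), assuming the
   BSD-style re_comp from regcomp.c is also compiled in:

     if (re_comp ("^abc") == NULL && re_exec ("abcdef"))
       puts ("matched");

   re_comp returns NULL when the pattern compiles, and re_exec returns
   nonzero when the string matches the last compiled pattern.  */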
566 
567 /* Internal entry point.  */
568 
569 /* Searches for a compiled pattern PREG in the string STRING, whose
570    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
571    meaning as with regexec.  LAST_START is START + RANGE, where
572    START and RANGE have the same meaning as with re_search.
573    Return REG_NOERROR if we find a match, REG_NOMATCH if there is
574    no match, or the error code otherwise.
575    Note: We assume front end functions already check ranges.
576    (0 <= LAST_START && LAST_START <= LENGTH)  */
577 
578 static reg_errcode_t
579 __attribute_warn_unused_result__
580 re_search_internal (const regex_t *preg, const char *string, Idx length,
581 		    Idx start, Idx last_start, Idx stop, size_t nmatch,
582 		    regmatch_t pmatch[], int eflags)
583 {
584   reg_errcode_t err;
585   const re_dfa_t *dfa = preg->buffer;
586   Idx left_lim, right_lim;
587   int incr;
588   bool fl_longest_match;
589   int match_kind;
590   Idx match_first;
591   Idx match_last = -1;
592   Idx extra_nmatch;
593   bool sb;
594   int ch;
595   re_match_context_t mctx = { .dfa = dfa };
596   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
597 		    && start != last_start && !preg->can_be_null)
598 		   ? preg->fastmap : NULL);
599   RE_TRANSLATE_TYPE t = preg->translate;
600 
601   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
602   nmatch -= extra_nmatch;
603 
604   /* Check whether the DFA has been compiled.  */
605   if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
606 			|| dfa->init_state_word == NULL
607 			|| dfa->init_state_nl == NULL
608 			|| dfa->init_state_begbuf == NULL))
609     return REG_NOMATCH;
610 
611   /* We assume front-end functions already check them.  */
612   DEBUG_ASSERT (0 <= last_start && last_start <= length);
613 
614   /* If initial states with non-begbuf contexts have no elements,
615      the regex must be anchored.  If preg->newline_anchor is set,
616      we'll never use init_state_nl, so do not check it.  */
617   if (dfa->init_state->nodes.nelem == 0
618       && dfa->init_state_word->nodes.nelem == 0
619       && (dfa->init_state_nl->nodes.nelem == 0
620 	  || !preg->newline_anchor))
621     {
622       if (start != 0 && last_start != 0)
623         return REG_NOMATCH;
624       start = last_start = 0;
625     }
626 
627   /* We must check for the longest match if nmatch > 0 or there are back references.  */
628   fl_longest_match = (nmatch != 0 || dfa->nbackref);
629 
630   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
631 			    preg->translate, (preg->syntax & RE_ICASE) != 0,
632 			    dfa);
633   if (__glibc_unlikely (err != REG_NOERROR))
634     goto free_return;
635   mctx.input.stop = stop;
636   mctx.input.raw_stop = stop;
637   mctx.input.newline_anchor = preg->newline_anchor;
638 
639   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
640   if (__glibc_unlikely (err != REG_NOERROR))
641     goto free_return;
642 
643   /* We will log all the DFA states through which the DFA passes,
644      if nmatch > 1 or this DFA has a "multibyte node": a back reference,
645      or a node which can accept a multibyte character or a
646      multi-character collating element.  */
647   if (nmatch > 1 || dfa->has_mb_node)
648     {
649       /* Avoid overflow.  */
650       if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
651 			     <= mctx.input.bufs_len)))
652 	{
653 	  err = REG_ESPACE;
654 	  goto free_return;
655 	}
656 
657       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
658       if (__glibc_unlikely (mctx.state_log == NULL))
659 	{
660 	  err = REG_ESPACE;
661 	  goto free_return;
662 	}
663     }
664 
665   match_first = start;
666   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
667 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
668 
669   /* Check incrementally whether the input string matches.  */
670   incr = (last_start < start) ? -1 : 1;
671   left_lim = (last_start < start) ? last_start : start;
672   right_lim = (last_start < start) ? start : last_start;
673   sb = dfa->mb_cur_max == 1;
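  /* MATCH_KIND encodes how the fastmap scan below selects candidate start
     positions: 8 means no usable fastmap; otherwise bit 2 (value 4) means
     single-byte-safe scanning (single-byte encoding, or neither RE_ICASE
     nor a translation table), bit 1 (value 2) means searching forward,
     and bit 0 (value 1) means a translation table T is in use.  */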
674   match_kind =
675     (fastmap
676      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
677 	| (start <= last_start ? 2 : 0)
678 	| (t != NULL ? 1 : 0))
679      : 8);
680 
681   for (;; match_first += incr)
682     {
683       err = REG_NOMATCH;
684       if (match_first < left_lim || right_lim < match_first)
685 	goto free_return;
686 
687       /* Advance as rapidly as possible through the string, until we
688 	 find a plausible place to start matching.  This may be done
689 	 with varying efficiency, so there are various possibilities:
690 	 only the most common of them are specialized, in order to
691 	 save on code size.  We use a switch statement for speed.  */
692       switch (match_kind)
693 	{
694 	case 8:
695 	  /* No fastmap.  */
696 	  break;
697 
698 	case 7:
699 	  /* Fastmap with single-byte translation, match forward.  */
700 	  while (__glibc_likely (match_first < right_lim)
701 		 && !fastmap[t[(unsigned char) string[match_first]]])
702 	    ++match_first;
703 	  goto forward_match_found_start_or_reached_end;
704 
705 	case 6:
706 	  /* Fastmap without translation, match forward.  */
707 	  while (__glibc_likely (match_first < right_lim)
708 		 && !fastmap[(unsigned char) string[match_first]])
709 	    ++match_first;
710 
711 	forward_match_found_start_or_reached_end:
712 	  if (__glibc_unlikely (match_first == right_lim))
713 	    {
714 	      ch = match_first >= length
715 		       ? 0 : (unsigned char) string[match_first];
716 	      if (!fastmap[t ? t[ch] : ch])
717 		goto free_return;
718 	    }
719 	  break;
720 
721 	case 4:
722 	case 5:
723 	  /* Fastmap without multi-byte translation, match backwards.  */
724 	  while (match_first >= left_lim)
725 	    {
726 	      ch = match_first >= length
727 		       ? 0 : (unsigned char) string[match_first];
728 	      if (fastmap[t ? t[ch] : ch])
729 		break;
730 	      --match_first;
731 	    }
732 	  if (match_first < left_lim)
733 	    goto free_return;
734 	  break;
735 
736 	default:
737 	  /* In this case, we can't easily determine the current byte,
738 	     since it might be a component byte of a multibyte
739 	     character.  Use the constructed buffer instead.  */
740 	  for (;;)
741 	    {
742 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
743 		 buffers.  */
744 	      __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
745 	      if (__glibc_unlikely (offset
746 				    >= (__re_size_t) mctx.input.valid_raw_len))
747 		{
748 		  err = re_string_reconstruct (&mctx.input, match_first,
749 					       eflags);
750 		  if (__glibc_unlikely (err != REG_NOERROR))
751 		    goto free_return;
752 
753 		  offset = match_first - mctx.input.raw_mbs_idx;
754 		}
755 	      /* Use buffer byte if OFFSET is in buffer, otherwise '\0'.  */
756 	      ch = (offset < mctx.input.valid_len
757 		    ? re_string_byte_at (&mctx.input, offset) : 0);
758 	      if (fastmap[ch])
759 		break;
760 	      match_first += incr;
761 	      if (match_first < left_lim || match_first > right_lim)
762 		{
763 		  err = REG_NOMATCH;
764 		  goto free_return;
765 		}
766 	    }
767 	  break;
768 	}
769 
770       /* Reconstruct the buffers so that the matcher can assume that
771 	 the matching starts from the beginning of the buffer.  */
772       err = re_string_reconstruct (&mctx.input, match_first, eflags);
773       if (__glibc_unlikely (err != REG_NOERROR))
774 	goto free_return;
775 
776       /* Don't consider this char as a possible match start if it is
777          part, but not the head, of a multibyte character.  */
778       if (!sb && !re_string_first_byte (&mctx.input, 0))
779 	continue;
780 
781       /* It seems to be an appropriate start, so use the matcher.  */
782       /* We assume that the matching starts from 0.  */
783       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
784       match_last = check_matching (&mctx, fl_longest_match,
785 				   start <= last_start ? &match_first : NULL);
786       if (match_last != -1)
787 	{
788 	  if (__glibc_unlikely (match_last == -2))
789 	    {
790 	      err = REG_ESPACE;
791 	      goto free_return;
792 	    }
793 	  else
794 	    {
795 	      mctx.match_last = match_last;
796 	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
797 		{
798 		  re_dfastate_t *pstate = mctx.state_log[match_last];
799 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
800 							     match_last);
801 		}
802 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
803 		  || dfa->nbackref)
804 		{
805 		  err = prune_impossible_nodes (&mctx);
806 		  if (err == REG_NOERROR)
807 		    break;
808 		  if (__glibc_unlikely (err != REG_NOMATCH))
809 		    goto free_return;
810 		  match_last = -1;
811 		}
812 	      else
813 		break; /* We found a match.  */
814 	    }
815 	}
816 
817       match_ctx_clean (&mctx);
818     }
819 
820   DEBUG_ASSERT (match_last != -1);
821   DEBUG_ASSERT (err == REG_NOERROR);
822 
823   /* Set pmatch[] if needed.  */
824   if (nmatch > 0)
825     {
826       Idx reg_idx;
827 
828       /* Initialize registers.  */
829       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
830 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
831 
832       /* Set the points where the match starts/ends.  */
833       pmatch[0].rm_so = 0;
834       pmatch[0].rm_eo = mctx.match_last;
835       /* FIXME: This function should fail if mctx.match_last exceeds
836 	 the maximum possible regoff_t value.  We need a new error
837 	 code REG_OVERFLOW.  */
838 
839       if (!preg->no_sub && nmatch > 1)
840 	{
841 	  err = set_regs (preg, &mctx, nmatch, pmatch,
842 			  dfa->has_plural_match && dfa->nbackref > 0);
843 	  if (__glibc_unlikely (err != REG_NOERROR))
844 	    goto free_return;
845 	}
846 
847       /* At last, add the offset to each register, since we slid
848 	 the buffers so that we could assume that the matching starts
849 	 from 0.  */
850       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
851 	if (pmatch[reg_idx].rm_so != -1)
852 	  {
853 	    if (__glibc_unlikely (mctx.input.offsets_needed != 0))
854 	      {
855 		pmatch[reg_idx].rm_so =
856 		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
857 		   ? mctx.input.valid_raw_len
858 		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
859 		pmatch[reg_idx].rm_eo =
860 		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
861 		   ? mctx.input.valid_raw_len
862 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
863 	      }
864 	    pmatch[reg_idx].rm_so += match_first;
865 	    pmatch[reg_idx].rm_eo += match_first;
866 	  }
867       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
868 	{
869 	  pmatch[nmatch + reg_idx].rm_so = -1;
870 	  pmatch[nmatch + reg_idx].rm_eo = -1;
871 	}
872 
873       if (dfa->subexp_map)
874 	for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
875 	  if (dfa->subexp_map[reg_idx] != reg_idx)
876 	    {
877 	      pmatch[reg_idx + 1].rm_so
878 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
879 	      pmatch[reg_idx + 1].rm_eo
880 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
881 	    }
882     }
883 
884  free_return:
885   re_free (mctx.state_log);
886   if (dfa->nbackref)
887     match_ctx_free (&mctx);
888   re_string_destruct (&mctx.input);
889   return err;
890 }
891 
892 static reg_errcode_t
893 __attribute_warn_unused_result__
894 prune_impossible_nodes (re_match_context_t *mctx)
895 {
896   const re_dfa_t *const dfa = mctx->dfa;
897   Idx halt_node, match_last;
898   reg_errcode_t ret;
899   re_dfastate_t **sifted_states;
900   re_dfastate_t **lim_states = NULL;
901   re_sift_context_t sctx;
902   DEBUG_ASSERT (mctx->state_log != NULL);
903   match_last = mctx->match_last;
904   halt_node = mctx->last_node;
905 
906   /* Avoid overflow.  */
907   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
908 			<= match_last))
909     return REG_ESPACE;
910 
911   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
912   if (__glibc_unlikely (sifted_states == NULL))
913     {
914       ret = REG_ESPACE;
915       goto free_return;
916     }
917   if (dfa->nbackref)
918     {
919       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
920       if (__glibc_unlikely (lim_states == NULL))
921 	{
922 	  ret = REG_ESPACE;
923 	  goto free_return;
924 	}
925       while (1)
926 	{
927 	  memset (lim_states, '\0',
928 		  sizeof (re_dfastate_t *) * (match_last + 1));
929 	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
930 			 match_last);
931 	  ret = sift_states_backward (mctx, &sctx);
932 	  re_node_set_free (&sctx.limits);
933 	  if (__glibc_unlikely (ret != REG_NOERROR))
934 	      goto free_return;
935 	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
936 	    break;
937 	  do
938 	    {
939 	      --match_last;
940 	      if (match_last < 0)
941 		{
942 		  ret = REG_NOMATCH;
943 		  goto free_return;
944 		}
945 	    } while (mctx->state_log[match_last] == NULL
946 		     || !mctx->state_log[match_last]->halt);
947 	  halt_node = check_halt_state_context (mctx,
948 						mctx->state_log[match_last],
949 						match_last);
950 	}
951       ret = merge_state_array (dfa, sifted_states, lim_states,
952 			       match_last + 1);
953       re_free (lim_states);
954       lim_states = NULL;
955       if (__glibc_unlikely (ret != REG_NOERROR))
956 	goto free_return;
957     }
958   else
959     {
960       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
961       ret = sift_states_backward (mctx, &sctx);
962       re_node_set_free (&sctx.limits);
963       if (__glibc_unlikely (ret != REG_NOERROR))
964 	goto free_return;
965       if (sifted_states[0] == NULL)
966 	{
967 	  ret = REG_NOMATCH;
968 	  goto free_return;
969 	}
970     }
971   re_free (mctx->state_log);
972   mctx->state_log = sifted_states;
973   sifted_states = NULL;
974   mctx->last_node = halt_node;
975   mctx->match_last = match_last;
976   ret = REG_NOERROR;
977  free_return:
978   re_free (sifted_states);
979   re_free (lim_states);
980   return ret;
981 }
982 
983 /* Acquire an initial state and return it.
984    We must select an appropriate initial state depending on the context,
985    since initial states may have constraints like "\<", "^", etc.  */
986 
987 static __always_inline re_dfastate_t *
988 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
989 			    Idx idx)
990 {
991   const re_dfa_t *const dfa = mctx->dfa;
992   if (dfa->init_state->has_constraint)
993     {
994       unsigned int context;
995       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
996       if (IS_WORD_CONTEXT (context))
997 	return dfa->init_state_word;
998       else if (IS_ORDINARY_CONTEXT (context))
999 	return dfa->init_state;
1000       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1001 	return dfa->init_state_begbuf;
1002       else if (IS_NEWLINE_CONTEXT (context))
1003 	return dfa->init_state_nl;
1004       else if (IS_BEGBUF_CONTEXT (context))
1005 	{
1006 	  /* This is a relatively rare case, so calculate it on demand.  */
1007 	  return re_acquire_state_context (err, dfa,
1008 					   dfa->init_state->entrance_nodes,
1009 					   context);
1010 	}
1011       else
1012 	/* Must not happen?  */
1013 	return dfa->init_state;
1014     }
1015   else
1016     return dfa->init_state;
1017 }
1018 
1019 /* Check whether the regular expression matches the input string INPUT,
1020    and return the index where the match ends.  Return -1 if
1021    there is no match, and return -2 in case of an error.
1022    FL_LONGEST_MATCH means we want the POSIX longest match.
1023    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1024    next place where we may want to try matching.
1025    Note that the matcher assumes that the matching starts from the current
1026    index of the buffer.  */
1027 
1028 static Idx
1029 __attribute_warn_unused_result__
1030 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1031 		Idx *p_match_first)
1032 {
1033   const re_dfa_t *const dfa = mctx->dfa;
1034   reg_errcode_t err;
1035   Idx match = 0;
1036   Idx match_last = -1;
1037   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1038   re_dfastate_t *cur_state;
1039   bool at_init_state = p_match_first != NULL;
1040   Idx next_start_idx = cur_str_idx;
1041 
1042   err = REG_NOERROR;
1043   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1044   /* An initial state must not be NULL (invalid).  */
1045   if (__glibc_unlikely (cur_state == NULL))
1046     {
1047       DEBUG_ASSERT (err == REG_ESPACE);
1048       return -2;
1049     }
1050 
1051   if (mctx->state_log != NULL)
1052     {
1053       mctx->state_log[cur_str_idx] = cur_state;
1054 
1055       /* Check OP_OPEN_SUBEXP in the initial state in case we use them
1056 	 later, e.g. when processing back references.  */
1057       if (__glibc_unlikely (dfa->nbackref))
1058 	{
1059 	  at_init_state = false;
1060 	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1061 	  if (__glibc_unlikely (err != REG_NOERROR))
1062 	    return err;
1063 
1064 	  if (cur_state->has_backref)
1065 	    {
1066 	      err = transit_state_bkref (mctx, &cur_state->nodes);
1067 	      if (__glibc_unlikely (err != REG_NOERROR))
1068 		return err;
1069 	    }
1070 	}
1071     }
1072 
1073   /* If the RE accepts the empty string.  */
1074   if (__glibc_unlikely (cur_state->halt))
1075     {
1076       if (!cur_state->has_constraint
1077 	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
1078 	{
1079 	  if (!fl_longest_match)
1080 	    return cur_str_idx;
1081 	  else
1082 	    {
1083 	      match_last = cur_str_idx;
1084 	      match = 1;
1085 	    }
1086 	}
1087     }
1088 
1089   while (!re_string_eoi (&mctx->input))
1090     {
1091       re_dfastate_t *old_state = cur_state;
1092       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1093 
1094       if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1095 	   && mctx->input.bufs_len < mctx->input.len)
1096 	  || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1097 	      && mctx->input.valid_len < mctx->input.len))
1098 	{
1099 	  err = extend_buffers (mctx, next_char_idx + 1);
1100 	  if (__glibc_unlikely (err != REG_NOERROR))
1101 	    {
1102 	      DEBUG_ASSERT (err == REG_ESPACE);
1103 	      return -2;
1104 	    }
1105 	}
1106 
1107       cur_state = transit_state (&err, mctx, cur_state);
1108       if (mctx->state_log != NULL)
1109 	cur_state = merge_state_with_log (&err, mctx, cur_state);
1110 
1111       if (cur_state == NULL)
1112 	{
1113 	  /* Reached the invalid state or an error.  Try to recover a valid
1114 	     state using the state log, if available and if we have not
1115 	     already found a valid (even if not the longest) match.  */
1116 	  if (__glibc_unlikely (err != REG_NOERROR))
1117 	    return -2;
1118 
1119 	  if (mctx->state_log == NULL
1120 	      || (match && !fl_longest_match)
1121 	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
1122 	    break;
1123 	}
1124 
1125       if (__glibc_unlikely (at_init_state))
1126 	{
1127 	  if (old_state == cur_state)
1128 	    next_start_idx = next_char_idx;
1129 	  else
1130 	    at_init_state = false;
1131 	}
1132 
1133       if (cur_state->halt)
1134 	{
1135 	  /* Reached a halt state.
1136 	     Check whether the halt state satisfies the current context.  */
1137 	  if (!cur_state->has_constraint
1138 	      || check_halt_state_context (mctx, cur_state,
1139 					   re_string_cur_idx (&mctx->input)))
1140 	    {
1141 	      /* We found an appropriate halt state.  */
1142 	      match_last = re_string_cur_idx (&mctx->input);
1143 	      match = 1;
1144 
1145 	      /* We found a match, do not modify match_first below.  */
1146 	      p_match_first = NULL;
1147 	      if (!fl_longest_match)
1148 		break;
1149 	    }
1150 	}
1151     }
1152 
1153   if (p_match_first)
1154     *p_match_first += next_start_idx;
1155 
1156   return match_last;
1157 }
1158 
1159 /* Check whether NODE matches the current context.  */
1160 
1161 static bool
1162 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1163 {
1164   re_token_type_t type = dfa->nodes[node].type;
1165   unsigned int constraint = dfa->nodes[node].constraint;
1166   if (type != END_OF_RE)
1167     return false;
1168   if (!constraint)
1169     return true;
1170   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1171     return false;
1172   return true;
1173 }
1174 
1175 /* Check whether the halt state STATE matches the current context.
1176    Return 0 if it does not match; if one of STATE's nodes is a halt node
1177    that matches the context, return that node.  */
1178 
1179 static Idx
1180 check_halt_state_context (const re_match_context_t *mctx,
1181 			  const re_dfastate_t *state, Idx idx)
1182 {
1183   Idx i;
1184   unsigned int context;
1185   DEBUG_ASSERT (state->halt);
1186   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1187   for (i = 0; i < state->nodes.nelem; ++i)
1188     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1189       return state->nodes.elems[i];
1190   return 0;
1191 }
1192 
1193 /* Compute the next node to which the "NFA" transits from NODE (the "NFA"
1194    is an NFA corresponding to the DFA).
1195    Return the destination node, and update EPS_VIA_NODES;
1196    return -1 on match failure, -2 on error.  */
1197 
1198 static Idx
1199 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1200 		   regmatch_t *prevregs,
1201 		   Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1202 		   struct re_fail_stack_t *fs)
1203 {
1204   const re_dfa_t *const dfa = mctx->dfa;
1205   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1206     {
1207       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1208       re_node_set *edests = &dfa->edests[node];
1209 
1210       if (! re_node_set_contains (eps_via_nodes, node))
1211         {
1212           bool ok = re_node_set_insert (eps_via_nodes, node);
1213           if (__glibc_unlikely (! ok))
1214             return -2;
1215         }
1216 
1217       /* Pick a valid destination, or return -1 if none is found.  */
1218       Idx dest_node = -1;
1219       for (Idx i = 0; i < edests->nelem; i++)
1220 	{
1221 	  Idx candidate = edests->elems[i];
1222 	  if (!re_node_set_contains (cur_nodes, candidate))
1223 	    continue;
1224           if (dest_node == -1)
1225 	    dest_node = candidate;
1226 
1227 	  else
1228 	    {
1229 	      /* In order to avoid an infinite loop like "(a*)*", return the second
1230 		 epsilon-transition if the first was already considered.  */
1231 	      if (re_node_set_contains (eps_via_nodes, dest_node))
1232 		return candidate;
1233 
1234 	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
1235 	      else if (fs != NULL
1236 		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1237 					   prevregs, eps_via_nodes))
1238 		return -2;
1239 
1240 	      /* We know we are going to exit.  */
1241 	      break;
1242 	    }
1243 	}
1244       return dest_node;
1245     }
1246   else
1247     {
1248       Idx naccepted = 0;
1249       re_token_type_t type = dfa->nodes[node].type;
1250 
1251       if (dfa->nodes[node].accept_mb)
1252 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1253       else if (type == OP_BACK_REF)
1254 	{
1255 	  Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1256 	  if (subexp_idx < nregs)
1257 	    naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1258 	  if (fs != NULL)
1259 	    {
1260 	      if (subexp_idx >= nregs
1261 		  || regs[subexp_idx].rm_so == -1
1262 		  || regs[subexp_idx].rm_eo == -1)
1263 		return -1;
1264 	      else if (naccepted)
1265 		{
1266 		  char *buf = (char *) re_string_get_buffer (&mctx->input);
1267 		  if (mctx->input.valid_len - *pidx < naccepted
1268 		      || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1269 				  naccepted)
1270 			  != 0))
1271 		    return -1;
1272 		}
1273 	    }
1274 
1275 	  if (naccepted == 0)
1276 	    {
1277 	      Idx dest_node;
1278 	      bool ok = re_node_set_insert (eps_via_nodes, node);
1279 	      if (__glibc_unlikely (! ok))
1280 		return -2;
1281 	      dest_node = dfa->edests[node].elems[0];
1282 	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1283 					dest_node))
1284 		return dest_node;
1285 	    }
1286 	}
1287 
1288       if (naccepted != 0
1289 	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
1290 	{
1291 	  Idx dest_node = dfa->nexts[node];
1292 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1293 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1294 		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1295 					       dest_node)))
1296 	    return -1;
1297 	  re_node_set_empty (eps_via_nodes);
1298 	  return dest_node;
1299 	}
1300     }
1301   return -1;
1302 }
1303 
1304 static reg_errcode_t
1305 __attribute_warn_unused_result__
1306 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1307 		 Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
1308 		 re_node_set *eps_via_nodes)
1309 {
1310   reg_errcode_t err;
1311   Idx num = fs->num++;
1312   if (fs->num == fs->alloc)
1313     {
1314       struct re_fail_stack_ent_t *new_array;
1315       new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1316                               fs->alloc * 2);
1317       if (new_array == NULL)
1318 	return REG_ESPACE;
1319       fs->alloc *= 2;
1320       fs->stack = new_array;
1321     }
1322   fs->stack[num].idx = str_idx;
1323   fs->stack[num].node = dest_node;
1324   fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1325   if (fs->stack[num].regs == NULL)
1326     return REG_ESPACE;
1327   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1328   memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1329   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1330   return err;
1331 }
1332 
1333 static Idx
1334 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1335 		regmatch_t *regs, regmatch_t *prevregs,
1336 		re_node_set *eps_via_nodes)
1337 {
1338   if (fs == NULL || fs->num == 0)
1339     return -1;
1340   Idx num = --fs->num;
1341   *pidx = fs->stack[num].idx;
1342   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1343   memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
1344   re_node_set_free (eps_via_nodes);
1345   re_free (fs->stack[num].regs);
1346   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1347   DEBUG_ASSERT (0 <= fs->stack[num].node);
1348   return fs->stack[num].node;
1349 }
1350 
1351 
1352 #define DYNARRAY_STRUCT  regmatch_list
1353 #define DYNARRAY_ELEMENT regmatch_t
1354 #define DYNARRAY_PREFIX  regmatch_list_
1355 #include <malloc/dynarray-skeleton.c>
1356 
1357 /* Store in registers PMATCH the positions where the subexpressions
1358    start/end.
1359    Note: We assume that pmatch[0] is already set, and
1360    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1361 
1362 static reg_errcode_t
1363 __attribute_warn_unused_result__
1364 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1365 	  regmatch_t *pmatch, bool fl_backtrack)
1366 {
1367   const re_dfa_t *dfa = preg->buffer;
1368   Idx idx, cur_node;
1369   re_node_set eps_via_nodes;
1370   struct re_fail_stack_t *fs;
1371   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1372   struct regmatch_list prev_match;
1373   regmatch_list_init (&prev_match);
1374 
1375   DEBUG_ASSERT (nmatch > 1);
1376   DEBUG_ASSERT (mctx->state_log != NULL);
1377   if (fl_backtrack)
1378     {
1379       fs = &fs_body;
1380       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1381       if (fs->stack == NULL)
1382 	return REG_ESPACE;
1383     }
1384   else
1385     fs = NULL;
1386 
1387   cur_node = dfa->init_node;
1388   re_node_set_init_empty (&eps_via_nodes);
1389 
1390   if (!regmatch_list_resize (&prev_match, nmatch))
1391     {
1392       regmatch_list_free (&prev_match);
1393       free_fail_stack_return (fs);
1394       return REG_ESPACE;
1395     }
1396   regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1397   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1398 
1399   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1400     {
1401       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1402 
1403       if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1404 	  || (fs && re_node_set_contains (&eps_via_nodes, cur_node)))
1405 	{
1406 	  Idx reg_idx;
1407 	  cur_node = -1;
1408 	  if (fs)
1409 	    {
1410 	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1411 		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1412 		  {
1413 		    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1414 					       prev_idx_match, &eps_via_nodes);
1415 		    break;
1416 		  }
1417 	    }
1418 	  if (cur_node < 0)
1419 	    {
1420 	      re_node_set_free (&eps_via_nodes);
1421 	      regmatch_list_free (&prev_match);
1422 	      return free_fail_stack_return (fs);
1423 	    }
1424 	}
1425 
1426       /* Proceed to next node.  */
1427       cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1428 				    &idx, cur_node,
1429 				    &eps_via_nodes, fs);
1430 
1431       if (__glibc_unlikely (cur_node < 0))
1432 	{
1433 	  if (__glibc_unlikely (cur_node == -2))
1434 	    {
1435 	      re_node_set_free (&eps_via_nodes);
1436 	      regmatch_list_free (&prev_match);
1437 	      free_fail_stack_return (fs);
1438 	      return REG_ESPACE;
1439 	    }
1440 	  cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1441 				     prev_idx_match, &eps_via_nodes);
1442 	  if (cur_node < 0)
1443 	    {
1444 	      re_node_set_free (&eps_via_nodes);
1445 	      regmatch_list_free (&prev_match);
1446 	      free_fail_stack_return (fs);
1447 	      return REG_NOMATCH;
1448 	    }
1449 	}
1450     }
1451   re_node_set_free (&eps_via_nodes);
1452   regmatch_list_free (&prev_match);
1453   return free_fail_stack_return (fs);
1454 }
1455 
1456 static reg_errcode_t
1457 free_fail_stack_return (struct re_fail_stack_t *fs)
1458 {
1459   if (fs)
1460     {
1461       Idx fs_idx;
1462       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1463 	{
1464 	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1465 	  re_free (fs->stack[fs_idx].regs);
1466 	}
1467       re_free (fs->stack);
1468     }
1469   return REG_NOERROR;
1470 }
1471 
1472 static void
1473 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1474 	     regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1475 {
1476   int type = dfa->nodes[cur_node].type;
1477   if (type == OP_OPEN_SUBEXP)
1478     {
1479       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1480 
1481       /* We are at the first node of this sub expression.  */
1482       if (reg_num < nmatch)
1483 	{
1484 	  pmatch[reg_num].rm_so = cur_idx;
1485 	  pmatch[reg_num].rm_eo = -1;
1486 	}
1487     }
1488   else if (type == OP_CLOSE_SUBEXP)
1489     {
1490       /* We are at the last node of this sub expression.  */
1491       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1492       if (reg_num < nmatch)
1493 	{
1494 	  if (pmatch[reg_num].rm_so < cur_idx)
1495 	    {
1496 	      pmatch[reg_num].rm_eo = cur_idx;
1497 	      /* This is a non-empty match or we are not inside an optional
1498 		 subexpression.  Accept this right away.  */
1499 	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1500 	    }
1501 	  else
1502 	    {
1503 	      if (dfa->nodes[cur_node].opt_subexp
1504 		  && prev_idx_match[reg_num].rm_so != -1)
1505 		/* We transited through an empty match for an optional
1506 		   subexpression, like (a?)*, and this is not the subexp's
1507 		   first match.  Copy back the old content of the registers
1508 		   so that matches of an inner subexpression are undone as
1509 		   well, like in ((a?))*.  */
1510 		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1511 	      else
1512 		/* We completed a subexpression, but it may be part of
1513 		   an optional one, so do not update PREV_IDX_MATCH.  */
1514 		pmatch[reg_num].rm_eo = cur_idx;
1515 	    }
1516 	}
1517     }
1518 }
1519 
1520 /* This function checks the STATE_LOG from SCTX->last_str_idx down to 0
1521    and sifts the nodes in each state according to the following rules.
1522    The updated state log will be written to STATE_LOG.
1523 
1524    Rules: We throw away the node 'a' in STATE_LOG[STR_IDX] if...
1525      1. When STR_IDX == MATCH_LAST (the last index in the state log):
1526 	If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1527 	the LAST_NODE, we throw away the node 'a'.
1528      2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1529 	string 's' and transit to 'b':
1530 	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1531 	   away the node 'a'.
1532 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1533 	    thrown away, we throw away the node 'a'.
1534      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1535 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1536 	   node 'a'.
1537 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1538 	    we throw away the node 'a'.  */
1539 
1540 #define STATE_NODE_CONTAINS(state,node) \
1541   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1542 
1543 static reg_errcode_t
1544 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1545 {
1546   reg_errcode_t err;
1547   int null_cnt = 0;
1548   Idx str_idx = sctx->last_str_idx;
1549   re_node_set cur_dest;
1550 
1551   DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1552 
1553   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1554      transit to the last_node and the last_node itself.  */
1555   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1556   if (__glibc_unlikely (err != REG_NOERROR))
1557     return err;
1558   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1559   if (__glibc_unlikely (err != REG_NOERROR))
1560     goto free_return;
1561 
1562   /* Then check each state in the state_log.  */
1563   while (str_idx > 0)
1564     {
1565       /* Update counters.  */
1566       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1567       if (null_cnt > mctx->max_mb_elem_len)
1568 	{
1569 	  memset (sctx->sifted_states, '\0',
1570 		  sizeof (re_dfastate_t *) * str_idx);
1571 	  re_node_set_free (&cur_dest);
1572 	  return REG_NOERROR;
1573 	}
1574       re_node_set_empty (&cur_dest);
1575       --str_idx;
1576 
1577       if (mctx->state_log[str_idx])
1578 	{
1579 	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1580 	  if (__glibc_unlikely (err != REG_NOERROR))
1581 	    goto free_return;
1582 	}
1583 
1584       /* Add all the nodes which satisfy the following conditions:
1585 	 - It can epsilon transit to a node in CUR_DEST.
1586 	 - It is in CUR_SRC.
1587 	 And update state_log.  */
1588       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1589       if (__glibc_unlikely (err != REG_NOERROR))
1590 	goto free_return;
1591     }
1592   err = REG_NOERROR;
1593  free_return:
1594   re_node_set_free (&cur_dest);
1595   return err;
1596 }
1597 
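/* Build in CUR_DEST the set of non-epsilon nodes of STATE_LOG[STR_IDX]
   which can accept part of the input at STR_IDX and whose destination node
   survives in the already-sifted state at the index where that input ends.  */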
1598 static reg_errcode_t
1599 __attribute_warn_unused_result__
1600 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1601 		     Idx str_idx, re_node_set *cur_dest)
1602 {
1603   const re_dfa_t *const dfa = mctx->dfa;
1604   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1605   Idx i;
1606 
1607   /* Then build the next sifted state.
1608      We build the next sifted state on 'cur_dest', and update
1609      'sifted_states[str_idx]' with 'cur_dest'.
1610      Note:
1611      'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1612      'cur_src' points to the node_set of the old 'state_log[str_idx]'
1613      (with the epsilon nodes pre-filtered out).  */
1614   for (i = 0; i < cur_src->nelem; i++)
1615     {
1616       Idx prev_node = cur_src->elems[i];
1617       int naccepted = 0;
1618       bool ok;
1619       DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1620 
1621       /* If the node may accept "multi byte".  */
1622       if (dfa->nodes[prev_node].accept_mb)
1623 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1624 					 str_idx, sctx->last_str_idx);
1625 
1626       /* We don't check backreferences here.
1627 	 See update_cur_sifted_state().  */
1628       if (!naccepted
1629 	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1630 	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1631 				  dfa->nexts[prev_node]))
1632 	naccepted = 1;
1633 
1634       if (naccepted == 0)
1635 	continue;
1636 
1637       if (sctx->limits.nelem)
1638 	{
1639 	  Idx to_idx = str_idx + naccepted;
1640 	  if (check_dst_limits (mctx, &sctx->limits,
1641 				dfa->nexts[prev_node], to_idx,
1642 				prev_node, str_idx))
1643 	    continue;
1644 	}
1645       ok = re_node_set_insert (cur_dest, prev_node);
1646       if (__glibc_unlikely (! ok))
1647 	return REG_ESPACE;
1648     }
1649 
1650   return REG_NOERROR;
1651 }
1652 
1653 /* Helper functions.  */
1654 
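/* Make sure the input buffers and the state log cover positions up to
   NEXT_STATE_LOG_IDX, extending the buffers and zero-filling the new
   state-log slots as necessary.  */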
1655 static reg_errcode_t
1656 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1657 {
1658   Idx top = mctx->state_log_top;
1659 
1660   if ((next_state_log_idx >= mctx->input.bufs_len
1661        && mctx->input.bufs_len < mctx->input.len)
1662       || (next_state_log_idx >= mctx->input.valid_len
1663 	  && mctx->input.valid_len < mctx->input.len))
1664     {
1665       reg_errcode_t err;
1666       err = extend_buffers (mctx, next_state_log_idx + 1);
1667       if (__glibc_unlikely (err != REG_NOERROR))
1668 	return err;
1669     }
1670 
1671   if (top < next_state_log_idx)
1672     {
1673       DEBUG_ASSERT (mctx->state_log != NULL);
1674       memset (mctx->state_log + top + 1, '\0',
1675 	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1676       mctx->state_log_top = next_state_log_idx;
1677     }
1678   return REG_NOERROR;
1679 }
1680 
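/* Merge NUM entries of SRC into DST: wherever both arrays hold a state,
   replace the DST entry with the state acquired from the union of the two
   node sets.  */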
1681 static reg_errcode_t
1682 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1683 		   re_dfastate_t **src, Idx num)
1684 {
1685   Idx st_idx;
1686   reg_errcode_t err;
1687   for (st_idx = 0; st_idx < num; ++st_idx)
1688     {
1689       if (dst[st_idx] == NULL)
1690 	dst[st_idx] = src[st_idx];
1691       else if (src[st_idx] != NULL)
1692 	{
1693 	  re_node_set merged_set;
1694 	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1695 					&src[st_idx]->nodes);
1696 	  if (__glibc_unlikely (err != REG_NOERROR))
1697 	    return err;
1698 	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1699 	  re_node_set_free (&merged_set);
1700 	  if (__glibc_unlikely (err != REG_NOERROR))
1701 	    return err;
1702 	}
1703     }
1704   return REG_NOERROR;
1705 }
1706 
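/* Store in SCTX->SIFTED_STATES[STR_IDX] the state built from DEST_NODES:
   first add the nodes of the old state-log entry that can epsilon transit
   into DEST_NODES, then apply the limits recorded in SCTX, and finally
   handle back references via sift_states_bkref.  */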
1707 static reg_errcode_t
1708 update_cur_sifted_state (const re_match_context_t *mctx,
1709 			 re_sift_context_t *sctx, Idx str_idx,
1710 			 re_node_set *dest_nodes)
1711 {
1712   const re_dfa_t *const dfa = mctx->dfa;
1713   reg_errcode_t err = REG_NOERROR;
1714   const re_node_set *candidates;
1715   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1716 		: &mctx->state_log[str_idx]->nodes);
1717 
1718   if (dest_nodes->nelem == 0)
1719     sctx->sifted_states[str_idx] = NULL;
1720   else
1721     {
1722       if (candidates)
1723 	{
1724 	  /* At first, add the nodes which can epsilon transit to a node in
1725 	     DEST_NODE.  */
1726 	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1727 	  if (__glibc_unlikely (err != REG_NOERROR))
1728 	    return err;
1729 
1730 	  /* Then, check the limitations in the current sift_context.  */
1731 	  if (sctx->limits.nelem)
1732 	    {
1733 	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1734 					 mctx->bkref_ents, str_idx);
1735 	      if (__glibc_unlikely (err != REG_NOERROR))
1736 		return err;
1737 	    }
1738 	}
1739 
1740       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1741       if (__glibc_unlikely (err != REG_NOERROR))
1742 	return err;
1743     }
1744 
1745   if (candidates && mctx->state_log[str_idx]->has_backref)
1746     {
1747       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1748       if (__glibc_unlikely (err != REG_NOERROR))
1749 	return err;
1750     }
1751   return REG_NOERROR;
1752 }
1753 
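/* Add to DEST_NODES those nodes of CANDIDATES that belong to the inverse
   epsilon closure of DEST_NODES, i.e. the nodes which can reach a node of
   DEST_NODES through epsilon transitions alone.  */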
1754 static reg_errcode_t
1755 __attribute_warn_unused_result__
1756 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1757 		       const re_node_set *candidates)
1758 {
1759   reg_errcode_t err = REG_NOERROR;
1760   Idx i;
1761 
1762   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1763   if (__glibc_unlikely (err != REG_NOERROR))
1764     return err;
1765 
1766   if (!state->inveclosure.alloc)
1767     {
1768       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1769       if (__glibc_unlikely (err != REG_NOERROR))
1770 	return REG_ESPACE;
1771       for (i = 0; i < dest_nodes->nelem; i++)
1772 	{
1773 	  err = re_node_set_merge (&state->inveclosure,
1774 				   dfa->inveclosures + dest_nodes->elems[i]);
1775 	  if (__glibc_unlikely (err != REG_NOERROR))
1776 	    return REG_ESPACE;
1777 	}
1778     }
1779   return re_node_set_add_intersect (dest_nodes, candidates,
1780 				    &state->inveclosure);
1781 }
1782 
1783 static reg_errcode_t
1784 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1785 		       const re_node_set *candidates)
1786 {
1787     Idx ecl_idx;
1788     reg_errcode_t err;
1789     re_node_set *inv_eclosure = dfa->inveclosures + node;
1790     re_node_set except_nodes;
1791     re_node_set_init_empty (&except_nodes);
1792     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1793       {
1794 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1795 	if (cur_node == node)
1796 	  continue;
1797 	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1798 	  {
1799 	    Idx edst1 = dfa->edests[cur_node].elems[0];
1800 	    Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1801 			 ? dfa->edests[cur_node].elems[1] : -1);
1802 	    if ((!re_node_set_contains (inv_eclosure, edst1)
1803 		 && re_node_set_contains (dest_nodes, edst1))
1804 		|| (edst2 > 0
1805 		    && !re_node_set_contains (inv_eclosure, edst2)
1806 		    && re_node_set_contains (dest_nodes, edst2)))
1807 	      {
1808 		err = re_node_set_add_intersect (&except_nodes, candidates,
1809 						 dfa->inveclosures + cur_node);
1810 		if (__glibc_unlikely (err != REG_NOERROR))
1811 		  {
1812 		    re_node_set_free (&except_nodes);
1813 		    return err;
1814 		  }
1815 	      }
1816 	  }
1817       }
1818     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1819       {
1820 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1821 	if (!re_node_set_contains (&except_nodes, cur_node))
1822 	  {
1823 	    Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1824 	    re_node_set_remove_at (dest_nodes, idx);
1825 	  }
1826       }
1827     re_node_set_free (&except_nodes);
1828     return REG_NOERROR;
1829 }
1830 
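/* Return true if a transition from SRC_NODE at SRC_IDX to DST_NODE at
   DST_IDX would cross the boundary of a subexpression recorded in one of
   the backreference cache entries listed in LIMITS, i.e. the transition is
   forbidden by those limits; return false otherwise.  */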
1831 static bool
1832 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1833 		  Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1834 {
1835   const re_dfa_t *const dfa = mctx->dfa;
1836   Idx lim_idx, src_pos, dst_pos;
1837 
1838   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1839   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1840   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1841     {
1842       Idx subexp_idx;
1843       struct re_backref_cache_entry *ent;
1844       ent = mctx->bkref_ents + limits->elems[lim_idx];
1845       subexp_idx = dfa->nodes[ent->node].opr.idx;
1846 
1847       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1848 					   subexp_idx, dst_node, dst_idx,
1849 					   dst_bkref_idx);
1850       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1851 					   subexp_idx, src_node, src_idx,
1852 					   src_bkref_idx);
1853 
1854       /* In case of:
1855 	 <src> <dst> ( <subexp> )
1856 	 ( <subexp> ) <src> <dst>
1857 	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1858       if (src_pos == dst_pos)
1859 	continue; /* This is an unrelated limitation.  */
1860       else
1861 	return true;
1862     }
1863   return false;
1864 }
1865 
1866 static int
1867 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1868 			     Idx subexp_idx, Idx from_node, Idx bkref_idx)
1869 {
1870   const re_dfa_t *const dfa = mctx->dfa;
1871   const re_node_set *eclosures = dfa->eclosures + from_node;
1872   Idx node_idx;
1873 
1874   /* Else, we are on the boundary: examine the nodes on the epsilon
1875      closure.  */
1876   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1877     {
1878       Idx node = eclosures->elems[node_idx];
1879       switch (dfa->nodes[node].type)
1880 	{
1881 	case OP_BACK_REF:
1882 	  if (bkref_idx != -1)
1883 	    {
1884 	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1885 	      do
1886 		{
1887 		  Idx dst;
1888 		  int cpos;
1889 
1890 		  if (ent->node != node)
1891 		    continue;
1892 
1893 		  if (subexp_idx < BITSET_WORD_BITS
1894 		      && !(ent->eps_reachable_subexps_map
1895 			   & ((bitset_word_t) 1 << subexp_idx)))
1896 		    continue;
1897 
1898 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1899 		     OP_CLOSE_SUBEXP cases below.  But, if the
1900 		     destination node is the same node as the source
1901 		     node, don't recurse because it would cause an
1902 		     infinite loop: a regex that exhibits this behavior
1903 		     is ()\1*\1*  */
1904 		  dst = dfa->edests[node].elems[0];
1905 		  if (dst == from_node)
1906 		    {
1907 		      if (boundaries & 1)
1908 			return -1;
1909 		      else /* if (boundaries & 2) */
1910 			return 0;
1911 		    }
1912 
1913 		  cpos =
1914 		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1915 						 dst, bkref_idx);
1916 		  if (cpos == -1 /* && (boundaries & 1) */)
1917 		    return -1;
1918 		  if (cpos == 0 && (boundaries & 2))
1919 		    return 0;
1920 
1921 		  if (subexp_idx < BITSET_WORD_BITS)
1922 		    ent->eps_reachable_subexps_map
1923 		      &= ~((bitset_word_t) 1 << subexp_idx);
1924 		}
1925 	      while (ent++->more);
1926 	    }
1927 	  break;
1928 
1929 	case OP_OPEN_SUBEXP:
1930 	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1931 	    return -1;
1932 	  break;
1933 
1934 	case OP_CLOSE_SUBEXP:
1935 	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1936 	    return 0;
1937 	  break;
1938 
1939 	default:
1940 	    break;
1941 	}
1942     }
1943 
1944   return (boundaries & 2) ? 1 : 0;
1945 }
1946 
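/* Return -1, 0 or 1 depending on whether the position STR_IDX, reached
   through FROM_NODE, lies before, inside or after the subexpression
   recorded in the backreference cache entry LIMIT.  On the boundaries the
   epsilon closure of FROM_NODE decides the answer.  */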
1947 static int
1948 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1949 			   Idx subexp_idx, Idx from_node, Idx str_idx,
1950 			   Idx bkref_idx)
1951 {
1952   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1953   int boundaries;
1954 
1955   /* If we are outside the range of the subexpression, return -1 or 1.  */
1956   if (str_idx < lim->subexp_from)
1957     return -1;
1958 
1959   if (lim->subexp_to < str_idx)
1960     return 1;
1961 
1962   /* If we are within the subexpression, return 0.  */
1963   boundaries = (str_idx == lim->subexp_from);
1964   boundaries |= (str_idx == lim->subexp_to) << 1;
1965   if (boundaries == 0)
1966     return 0;
1967 
1968   /* Else, examine epsilon closure.  */
1969   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1970 				      from_node, bkref_idx);
1971 }
1972 
1973 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1974    which are against limitations from DEST_NODES. */
1975 
1976 static reg_errcode_t
1977 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1978 		     const re_node_set *candidates, re_node_set *limits,
1979 		     struct re_backref_cache_entry *bkref_ents, Idx str_idx)
1980 {
1981   reg_errcode_t err;
1982   Idx node_idx, lim_idx;
1983 
1984   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1985     {
1986       Idx subexp_idx;
1987       struct re_backref_cache_entry *ent;
1988       ent = bkref_ents + limits->elems[lim_idx];
1989 
1990       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1991 	continue; /* This is an unrelated limitation.  */
1992 
1993       subexp_idx = dfa->nodes[ent->node].opr.idx;
1994       if (ent->subexp_to == str_idx)
1995 	{
1996 	  Idx ops_node = -1;
1997 	  Idx cls_node = -1;
1998 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1999 	    {
2000 	      Idx node = dest_nodes->elems[node_idx];
2001 	      re_token_type_t type = dfa->nodes[node].type;
2002 	      if (type == OP_OPEN_SUBEXP
2003 		  && subexp_idx == dfa->nodes[node].opr.idx)
2004 		ops_node = node;
2005 	      else if (type == OP_CLOSE_SUBEXP
2006 		       && subexp_idx == dfa->nodes[node].opr.idx)
2007 		cls_node = node;
2008 	    }
2009 
2010 	  /* Check the limitation of the open subexpression.  */
2011 	  /* Note that ent->subexp_to == str_idx and str_idx != ent->subexp_from.  */
2012 	  if (ops_node >= 0)
2013 	    {
2014 	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2015 					   candidates);
2016 	      if (__glibc_unlikely (err != REG_NOERROR))
2017 		return err;
2018 	    }
2019 
2020 	  /* Check the limitation of the close subexpression.  */
2021 	  if (cls_node >= 0)
2022 	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2023 	      {
2024 		Idx node = dest_nodes->elems[node_idx];
2025 		if (!re_node_set_contains (dfa->inveclosures + node,
2026 					   cls_node)
2027 		    && !re_node_set_contains (dfa->eclosures + node,
2028 					      cls_node))
2029 		  {
2030 		    /* It is against this limitation.
2031 		       Remove it from the current sifted state.  */
2032 		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2033 						 candidates);
2034 		    if (__glibc_unlikely (err != REG_NOERROR))
2035 		      return err;
2036 		    --node_idx;
2037 		  }
2038 	      }
2039 	}
2040       else /* (ent->subexp_to != str_idx)  */
2041 	{
2042 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2043 	    {
2044 	      Idx node = dest_nodes->elems[node_idx];
2045 	      re_token_type_t type = dfa->nodes[node].type;
2046 	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2047 		{
2048 		  if (subexp_idx != dfa->nodes[node].opr.idx)
2049 		    continue;
2050 		  /* It is against this limitation.
2051 		     Remove it from the current sifted state.  */
2052 		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2053 					       candidates);
2054 		  if (__glibc_unlikely (err != REG_NOERROR))
2055 		    return err;
2056 		}
2057 	    }
2058 	}
2059     }
2060   return REG_NOERROR;
2061 }
2062 
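/* For each back reference node of CANDIDATES matching a cache entry at
   STR_IDX, re-run the backward sift with the additional limit imposed by
   that entry and merge the result into SCTX->LIMITED_STATES (when present).  */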
2063 static reg_errcode_t
2064 __attribute_warn_unused_result__
2065 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2066 		   Idx str_idx, const re_node_set *candidates)
2067 {
2068   const re_dfa_t *const dfa = mctx->dfa;
2069   reg_errcode_t err;
2070   Idx node_idx, node;
2071   re_sift_context_t local_sctx;
2072   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2073 
2074   if (first_idx == -1)
2075     return REG_NOERROR;
2076 
2077   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2078 
2079   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2080     {
2081       Idx enabled_idx;
2082       re_token_type_t type;
2083       struct re_backref_cache_entry *entry;
2084       node = candidates->elems[node_idx];
2085       type = dfa->nodes[node].type;
2086       /* Avoid infinite loop for the REs like "()\1+".  */
2087       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2088 	continue;
2089       if (type != OP_BACK_REF)
2090 	continue;
2091 
2092       entry = mctx->bkref_ents + first_idx;
2093       enabled_idx = first_idx;
2094       do
2095 	{
2096 	  Idx subexp_len;
2097 	  Idx to_idx;
2098 	  Idx dst_node;
2099 	  bool ok;
2100 	  re_dfastate_t *cur_state;
2101 
2102 	  if (entry->node != node)
2103 	    continue;
2104 	  subexp_len = entry->subexp_to - entry->subexp_from;
2105 	  to_idx = str_idx + subexp_len;
2106 	  dst_node = (subexp_len ? dfa->nexts[node]
2107 		      : dfa->edests[node].elems[0]);
2108 
2109 	  if (to_idx > sctx->last_str_idx
2110 	      || sctx->sifted_states[to_idx] == NULL
2111 	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2112 	      || check_dst_limits (mctx, &sctx->limits, node,
2113 				   str_idx, dst_node, to_idx))
2114 	    continue;
2115 
2116 	  if (local_sctx.sifted_states == NULL)
2117 	    {
2118 	      local_sctx = *sctx;
2119 	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2120 	      if (__glibc_unlikely (err != REG_NOERROR))
2121 		goto free_return;
2122 	    }
2123 	  local_sctx.last_node = node;
2124 	  local_sctx.last_str_idx = str_idx;
2125 	  ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2126 	  if (__glibc_unlikely (! ok))
2127 	    {
2128 	      err = REG_ESPACE;
2129 	      goto free_return;
2130 	    }
2131 	  cur_state = local_sctx.sifted_states[str_idx];
2132 	  err = sift_states_backward (mctx, &local_sctx);
2133 	  if (__glibc_unlikely (err != REG_NOERROR))
2134 	    goto free_return;
2135 	  if (sctx->limited_states != NULL)
2136 	    {
2137 	      err = merge_state_array (dfa, sctx->limited_states,
2138 				       local_sctx.sifted_states,
2139 				       str_idx + 1);
2140 	      if (__glibc_unlikely (err != REG_NOERROR))
2141 		goto free_return;
2142 	    }
2143 	  local_sctx.sifted_states[str_idx] = cur_state;
2144 	  re_node_set_remove (&local_sctx.limits, enabled_idx);
2145 
2146 	  /* mctx->bkref_ents may have changed, reload the pointer.  */
2147 	  entry = mctx->bkref_ents + enabled_idx;
2148 	}
2149       while (enabled_idx++, entry++->more);
2150     }
2151   err = REG_NOERROR;
2152  free_return:
2153   if (local_sctx.sifted_states != NULL)
2154     {
2155       re_node_set_free (&local_sctx.limits);
2156     }
2157 
2158   return err;
2159 }
2160 
2161 
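/* Check whether the multibyte-accepting node NODE_IDX can consume input at
   STR_IDX and still lead to a surviving sifted state; return the number of
   bytes it accepts there, or 0 if it cannot.  */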
2162 static int
2163 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2164 		     Idx node_idx, Idx str_idx, Idx max_str_idx)
2165 {
2166   const re_dfa_t *const dfa = mctx->dfa;
2167   int naccepted;
2168   /* Check whether the node can accept "multi byte".  */
2169   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2170   if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2171       && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2172 			       dfa->nexts[node_idx]))
2173     /* The node can't accept the "multi byte", or the
2174        destination was already thrown away; either way, the node
2175        can't accept the current input "multi byte".  */
2176     naccepted = 0;
2177   /* Otherwise, it is sure that the node could accept
2178      'naccepted' bytes input.  */
2179   return naccepted;
2180 }
2181 
2182 /* Functions for state transition.  */
2183 
2184 /* Return the next state to which the current state STATE will transit by
2185    accepting the current input byte, and update STATE_LOG if necessary.
2186    Return NULL on failure.
2187    If STATE can accept a multibyte char/collating element/back reference
2188    update the destination of STATE_LOG.  */
2189 
2190 static re_dfastate_t *
2191 __attribute_warn_unused_result__
2192 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2193 	       re_dfastate_t *state)
2194 {
2195   re_dfastate_t **trtable;
2196   unsigned char ch;
2197 
2198   /* If the current state can accept multibyte.  */
2199   if (__glibc_unlikely (state->accept_mb))
2200     {
2201       *err = transit_state_mb (mctx, state);
2202       if (__glibc_unlikely (*err != REG_NOERROR))
2203 	return NULL;
2204     }
2205 
2206   /* Then decide the next state with the single byte.  */
2207 #if 0
2208   if (0)
2209     /* don't use transition table  */
2210     return transit_state_sb (err, mctx, state);
2211 #endif
2212 
2213   /* Use transition table  */
2214   ch = re_string_fetch_byte (&mctx->input);
2215   for (;;)
2216     {
2217       trtable = state->trtable;
2218       if (__glibc_likely (trtable != NULL))
2219 	return trtable[ch];
2220 
2221       trtable = state->word_trtable;
2222       if (__glibc_likely (trtable != NULL))
2223 	{
2224 	  unsigned int context;
2225 	  context
2226 	    = re_string_context_at (&mctx->input,
2227 				    re_string_cur_idx (&mctx->input) - 1,
2228 				    mctx->eflags);
2229 	  if (IS_WORD_CONTEXT (context))
2230 	    return trtable[ch + SBC_MAX];
2231 	  else
2232 	    return trtable[ch];
2233 	}
2234 
2235       if (!build_trtable (mctx->dfa, state))
2236 	{
2237 	  *err = REG_ESPACE;
2238 	  return NULL;
2239 	}
2240 
2241       /* Retry, we now have a transition table.  */
2242     }
2243 }
2244 
2245 /* Update the state_log if we need to.  */
2246 static re_dfastate_t *
2247 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2248 		      re_dfastate_t *next_state)
2249 {
2250   const re_dfa_t *const dfa = mctx->dfa;
2251   Idx cur_idx = re_string_cur_idx (&mctx->input);
2252 
2253   if (cur_idx > mctx->state_log_top)
2254     {
2255       mctx->state_log[cur_idx] = next_state;
2256       mctx->state_log_top = cur_idx;
2257     }
2258   else if (mctx->state_log[cur_idx] == 0)
2259     {
2260       mctx->state_log[cur_idx] = next_state;
2261     }
2262   else
2263     {
2264       re_dfastate_t *pstate;
2265       unsigned int context;
2266       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2267       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2268 	 the destination of a multibyte char/collating element/
2269 	 back reference.  Then the next state is the union set of
2270 	 these destinations and the results of the transition table.  */
2271       pstate = mctx->state_log[cur_idx];
2272       log_nodes = pstate->entrance_nodes;
2273       if (next_state != NULL)
2274 	{
2275 	  table_nodes = next_state->entrance_nodes;
2276 	  *err = re_node_set_init_union (&next_nodes, table_nodes,
2277 					     log_nodes);
2278 	  if (__glibc_unlikely (*err != REG_NOERROR))
2279 	    return NULL;
2280 	}
2281       else
2282 	next_nodes = *log_nodes;
2283       /* Note: We have already added the nodes of the initial state,
2284 	 so we don't need to add them here.  */
2285 
2286       context = re_string_context_at (&mctx->input,
2287 				      re_string_cur_idx (&mctx->input) - 1,
2288 				      mctx->eflags);
2289       next_state = mctx->state_log[cur_idx]
2290 	= re_acquire_state_context (err, dfa, &next_nodes, context);
2291       /* We don't need to check errors here, since the return value of
2292 	 this function is next_state and ERR is already set.  */
2293 
2294       if (table_nodes != NULL)
2295 	re_node_set_free (&next_nodes);
2296     }
2297 
2298   if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2299     {
2300       /* Check the OP_OPEN_SUBEXP nodes in the current state in case we
2301 	 need them later.  We must check them here, since the back references
2302 	 in the next state might use them.  */
2303       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2304 					cur_idx);
2305       if (__glibc_unlikely (*err != REG_NOERROR))
2306 	return NULL;
2307 
2308       /* If the next state has back references.  */
2309       if (next_state->has_backref)
2310 	{
2311 	  *err = transit_state_bkref (mctx, &next_state->nodes);
2312 	  if (__glibc_unlikely (*err != REG_NOERROR))
2313 	    return NULL;
2314 	  next_state = mctx->state_log[cur_idx];
2315 	}
2316     }
2317 
2318   return next_state;
2319 }
2320 
2321 /* Skip bytes in the input that correspond to part of a
2322    multi-byte match, then look in the log for a state
2323    from which to restart matching.  */
2324 static re_dfastate_t *
2325 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2326 {
2327   re_dfastate_t *cur_state;
2328   do
2329     {
2330       Idx max = mctx->state_log_top;
2331       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2332 
2333       do
2334 	{
2335 	  if (++cur_str_idx > max)
2336 	    return NULL;
2337 	  re_string_skip_bytes (&mctx->input, 1);
2338 	}
2339       while (mctx->state_log[cur_str_idx] == NULL);
2340 
2341       cur_state = merge_state_with_log (err, mctx, NULL);
2342     }
2343   while (*err == REG_NOERROR && cur_state == NULL);
2344   return cur_state;
2345 }
2346 
2347 /* Helper functions for transit_state.  */
2348 
2349 /* From the node set CUR_NODES, pick up the nodes whose types are
2350    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2351    expression.  Register them so that they can be used later to evaluate
2352    the corresponding back references.  */
2353 
2354 static reg_errcode_t
2355 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2356 			   Idx str_idx)
2357 {
2358   const re_dfa_t *const dfa = mctx->dfa;
2359   Idx node_idx;
2360   reg_errcode_t err;
2361 
2362   /* TODO: This isn't efficient.
2363 	   Because there might be more than one node whose type is
2364 	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2365 	   nodes.
2366 	   E.g. RE: (a){2}  */
2367   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2368     {
2369       Idx node = cur_nodes->elems[node_idx];
2370       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2371 	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2372 	  && (dfa->used_bkref_map
2373 	      & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2374 	{
2375 	  err = match_ctx_add_subtop (mctx, node, str_idx);
2376 	  if (__glibc_unlikely (err != REG_NOERROR))
2377 	    return err;
2378 	}
2379     }
2380   return REG_NOERROR;
2381 }
2382 
2383 #if 0
2384 /* Return the next state to which the current state STATE will transit by
2385    accepting the current input byte.  Return NULL on failure.  */
2386 
2387 static re_dfastate_t *
2388 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2389 		  re_dfastate_t *state)
2390 {
2391   const re_dfa_t *const dfa = mctx->dfa;
2392   re_node_set next_nodes;
2393   re_dfastate_t *next_state;
2394   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2395   unsigned int context;
2396 
2397   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2398   if (__glibc_unlikely (*err != REG_NOERROR))
2399     return NULL;
2400   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2401     {
2402       Idx cur_node = state->nodes.elems[node_cnt];
2403       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2404 	{
2405 	  *err = re_node_set_merge (&next_nodes,
2406 				    dfa->eclosures + dfa->nexts[cur_node]);
2407 	  if (__glibc_unlikely (*err != REG_NOERROR))
2408 	    {
2409 	      re_node_set_free (&next_nodes);
2410 	      return NULL;
2411 	    }
2412 	}
2413     }
2414   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2415   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2416   /* We don't need to check errors here, since the return value of
2417      this function is next_state and ERR is already set.  */
2418 
2419   re_node_set_free (&next_nodes);
2420   re_string_skip_bytes (&mctx->input, 1);
2421   return next_state;
2422 }
2423 #endif
2424 
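/* For every node of PSTATE which can accept a multibyte sequence at the
   current input position, register the destination state in STATE_LOG at
   the index where that sequence ends.  */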
2425 static reg_errcode_t
2426 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2427 {
2428   const re_dfa_t *const dfa = mctx->dfa;
2429   reg_errcode_t err;
2430   Idx i;
2431 
2432   for (i = 0; i < pstate->nodes.nelem; ++i)
2433     {
2434       re_node_set dest_nodes, *new_nodes;
2435       Idx cur_node_idx = pstate->nodes.elems[i];
2436       int naccepted;
2437       Idx dest_idx;
2438       unsigned int context;
2439       re_dfastate_t *dest_state;
2440 
2441       if (!dfa->nodes[cur_node_idx].accept_mb)
2442 	continue;
2443 
2444       if (dfa->nodes[cur_node_idx].constraint)
2445 	{
2446 	  context = re_string_context_at (&mctx->input,
2447 					  re_string_cur_idx (&mctx->input),
2448 					  mctx->eflags);
2449 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2450 					   context))
2451 	    continue;
2452 	}
2453 
2454       /* How many bytes can the node accept?  */
2455       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2456 					   re_string_cur_idx (&mctx->input));
2457       if (naccepted == 0)
2458 	continue;
2459 
2460       /* The node can accept 'naccepted' bytes.  */
2461       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2462       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2463 			       : mctx->max_mb_elem_len);
2464       err = clean_state_log_if_needed (mctx, dest_idx);
2465       if (__glibc_unlikely (err != REG_NOERROR))
2466 	return err;
2467       DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2468       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2469 
2470       dest_state = mctx->state_log[dest_idx];
2471       if (dest_state == NULL)
2472 	dest_nodes = *new_nodes;
2473       else
2474 	{
2475 	  err = re_node_set_init_union (&dest_nodes,
2476 					dest_state->entrance_nodes, new_nodes);
2477 	  if (__glibc_unlikely (err != REG_NOERROR))
2478 	    return err;
2479 	}
2480       context = re_string_context_at (&mctx->input, dest_idx - 1,
2481 				      mctx->eflags);
2482       mctx->state_log[dest_idx]
2483 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2484       if (dest_state != NULL)
2485 	re_node_set_free (&dest_nodes);
2486       if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2487 			    && err != REG_NOERROR))
2488 	return err;
2489     }
2490   return REG_NOERROR;
2491 }
2492 
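/* For every back reference node of NODES at the current input position,
   find (via get_subexp) the substrings its subexpression may have matched
   and register the corresponding destination states in STATE_LOG, recursing
   when the back reference itself matches the empty string.  */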
2493 static reg_errcode_t
2494 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2495 {
2496   const re_dfa_t *const dfa = mctx->dfa;
2497   reg_errcode_t err;
2498   Idx i;
2499   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2500 
2501   for (i = 0; i < nodes->nelem; ++i)
2502     {
2503       Idx dest_str_idx, prev_nelem, bkc_idx;
2504       Idx node_idx = nodes->elems[i];
2505       unsigned int context;
2506       const re_token_t *node = dfa->nodes + node_idx;
2507       re_node_set *new_dest_nodes;
2508 
2509       /* Check whether 'node' is a backreference or not.  */
2510       if (node->type != OP_BACK_REF)
2511 	continue;
2512 
2513       if (node->constraint)
2514 	{
2515 	  context = re_string_context_at (&mctx->input, cur_str_idx,
2516 					  mctx->eflags);
2517 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2518 	    continue;
2519 	}
2520 
2521       /* 'node' is a backreference.
2522 	 Check the substring which the referenced subexpression matched.  */
2523       bkc_idx = mctx->nbkref_ents;
2524       err = get_subexp (mctx, node_idx, cur_str_idx);
2525       if (__glibc_unlikely (err != REG_NOERROR))
2526 	goto free_return;
2527 
2528       /* And add the epsilon closures (which are 'new_dest_nodes') of
2529 	 the backreference to the appropriate state_log entries.  */
2530       DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2531       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2532 	{
2533 	  Idx subexp_len;
2534 	  re_dfastate_t *dest_state;
2535 	  struct re_backref_cache_entry *bkref_ent;
2536 	  bkref_ent = mctx->bkref_ents + bkc_idx;
2537 	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2538 	    continue;
2539 	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2540 	  new_dest_nodes = (subexp_len == 0
2541 			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2542 			    : dfa->eclosures + dfa->nexts[node_idx]);
2543 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2544 			  - bkref_ent->subexp_from);
2545 	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2546 					  mctx->eflags);
2547 	  dest_state = mctx->state_log[dest_str_idx];
2548 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2549 			: mctx->state_log[cur_str_idx]->nodes.nelem);
2550 	  /* Add 'new_dest_nodes' to the state_log.  */
2551 	  if (dest_state == NULL)
2552 	    {
2553 	      mctx->state_log[dest_str_idx]
2554 		= re_acquire_state_context (&err, dfa, new_dest_nodes,
2555 					    context);
2556 	      if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2557 				    && err != REG_NOERROR))
2558 		goto free_return;
2559 	    }
2560 	  else
2561 	    {
2562 	      re_node_set dest_nodes;
2563 	      err = re_node_set_init_union (&dest_nodes,
2564 					    dest_state->entrance_nodes,
2565 					    new_dest_nodes);
2566 	      if (__glibc_unlikely (err != REG_NOERROR))
2567 		{
2568 		  re_node_set_free (&dest_nodes);
2569 		  goto free_return;
2570 		}
2571 	      mctx->state_log[dest_str_idx]
2572 		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2573 	      re_node_set_free (&dest_nodes);
2574 	      if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2575 				    && err != REG_NOERROR))
2576 		goto free_return;
2577 	    }
2578 	  /* We need to check recursively if the backreference can epsilon
2579 	     transit.  */
2580 	  if (subexp_len == 0
2581 	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2582 	    {
2583 	      err = check_subexp_matching_top (mctx, new_dest_nodes,
2584 					       cur_str_idx);
2585 	      if (__glibc_unlikely (err != REG_NOERROR))
2586 		goto free_return;
2587 	      err = transit_state_bkref (mctx, new_dest_nodes);
2588 	      if (__glibc_unlikely (err != REG_NOERROR))
2589 		goto free_return;
2590 	    }
2591 	}
2592     }
2593   err = REG_NOERROR;
2594  free_return:
2595   return err;
2596 }
2597 
2598 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2599    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2600    Note that we might collect inappropriate candidates here.
2601    However, the cost of checking them strictly here is too high, so we
2602    delay this checking until prune_impossible_nodes().  */
2603 
2604 static reg_errcode_t
2605 __attribute_warn_unused_result__
2606 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2607 {
2608   const re_dfa_t *const dfa = mctx->dfa;
2609   Idx subexp_num, sub_top_idx;
2610   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2611   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2612   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2613   if (cache_idx != -1)
2614     {
2615       const struct re_backref_cache_entry *entry
2616 	= mctx->bkref_ents + cache_idx;
2617       do
2618 	if (entry->node == bkref_node)
2619 	  return REG_NOERROR; /* We already checked it.  */
2620       while (entry++->more);
2621     }
2622 
2623   subexp_num = dfa->nodes[bkref_node].opr.idx;
2624 
2625   /* For each sub expression  */
2626   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2627     {
2628       reg_errcode_t err;
2629       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2630       re_sub_match_last_t *sub_last;
2631       Idx sub_last_idx, sl_str, bkref_str_off;
2632 
2633       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2634 	continue; /* It isn't related.  */
2635 
2636       sl_str = sub_top->str_idx;
2637       bkref_str_off = bkref_str_idx;
2638       /* First, check the last nodes of the sub expressions we have already
2639 	 evaluated.  */
2640       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2641 	{
2642 	  regoff_t sl_str_diff;
2643 	  sub_last = sub_top->lasts[sub_last_idx];
2644 	  sl_str_diff = sub_last->str_idx - sl_str;
2645 	  /* Does the string matched by the sub expression match the substring
2646 	     at the back reference?  */
2647 	  if (sl_str_diff > 0)
2648 	    {
2649 	      if (__glibc_unlikely (bkref_str_off + sl_str_diff
2650 				    > mctx->input.valid_len))
2651 		{
2652 		  /* Not enough chars for a successful match.  */
2653 		  if (bkref_str_off + sl_str_diff > mctx->input.len)
2654 		    break;
2655 
2656 		  err = clean_state_log_if_needed (mctx,
2657 						   bkref_str_off
2658 						   + sl_str_diff);
2659 		  if (__glibc_unlikely (err != REG_NOERROR))
2660 		    return err;
2661 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2662 		}
2663 	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2664 		/* We don't need to search this sub expression any more.  */
2665 		break;
2666 	    }
2667 	  bkref_str_off += sl_str_diff;
2668 	  sl_str += sl_str_diff;
2669 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2670 				bkref_str_idx);
2671 
2672 	  /* Reload buf, since the preceding call might have reallocated
2673 	     the buffer.  */
2674 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2675 
2676 	  if (err == REG_NOMATCH)
2677 	    continue;
2678 	  if (__glibc_unlikely (err != REG_NOERROR))
2679 	    return err;
2680 	}
2681 
2682       if (sub_last_idx < sub_top->nlasts)
2683 	continue;
2684       if (sub_last_idx > 0)
2685 	++sl_str;
2686       /* Then, search for the other last nodes of the sub expression.  */
2687       for (; sl_str <= bkref_str_idx; ++sl_str)
2688 	{
2689 	  Idx cls_node;
2690 	  regoff_t sl_str_off;
2691 	  const re_node_set *nodes;
2692 	  sl_str_off = sl_str - sub_top->str_idx;
2693 	  /* Does the string matched by the sub expression match the substring
2694 	     at the back reference?  */
2695 	  if (sl_str_off > 0)
2696 	    {
2697 	      if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2698 		{
2699 		  /* If we are at the end of the input, we cannot match.  */
2700 		  if (bkref_str_off >= mctx->input.len)
2701 		    break;
2702 
2703 		  err = extend_buffers (mctx, bkref_str_off + 1);
2704 		  if (__glibc_unlikely (err != REG_NOERROR))
2705 		    return err;
2706 
2707 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2708 		}
2709 	      if (buf [bkref_str_off++] != buf[sl_str - 1])
2710 		break; /* We don't need to search this sub expression
2711 			  any more.  */
2712 	    }
2713 	  if (mctx->state_log[sl_str] == NULL)
2714 	    continue;
2715 	  /* Does this state have a ')' of the sub expression?  */
2716 	  nodes = &mctx->state_log[sl_str]->nodes;
2717 	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
2718 				       OP_CLOSE_SUBEXP);
2719 	  if (cls_node == -1)
2720 	    continue; /* No.  */
2721 	  if (sub_top->path == NULL)
2722 	    {
2723 	      sub_top->path = calloc (sizeof (state_array_t),
2724 				      sl_str - sub_top->str_idx + 1);
2725 	      if (sub_top->path == NULL)
2726 		return REG_ESPACE;
2727 	    }
2728 	  /* Can the OP_OPEN_SUBEXP node arrive at the OP_CLOSE_SUBEXP node
2729 	     in the current context?  */
2730 	  err = check_arrival (mctx, sub_top->path, sub_top->node,
2731 			       sub_top->str_idx, cls_node, sl_str,
2732 			       OP_CLOSE_SUBEXP);
2733 	  if (err == REG_NOMATCH)
2734 	      continue;
2735 	  if (__glibc_unlikely (err != REG_NOERROR))
2736 	      return err;
2737 	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2738 	  if (__glibc_unlikely (sub_last == NULL))
2739 	    return REG_ESPACE;
2740 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2741 				bkref_str_idx);
2742 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2743 	  if (err == REG_NOMATCH)
2744 	    continue;
2745 	  if (__glibc_unlikely (err != REG_NOERROR))
2746 	    return err;
2747 	}
2748     }
2749   return REG_NOERROR;
2750 }
2751 
2752 /* Helper functions for get_subexp().  */
2753 
2754 /* Check whether SUB_LAST can arrive at the back reference BKREF_NODE at
2755    BKREF_STR.  If it can, register the sub expression described by SUB_TOP
2756    and SUB_LAST.  */
2757 
2758 static reg_errcode_t
2759 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2760 		re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2761 {
2762   reg_errcode_t err;
2763   Idx to_idx;
2764   /* Can the subexpression arrive at the back reference?  */
2765   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2766 		       sub_last->str_idx, bkref_node, bkref_str,
2767 		       OP_OPEN_SUBEXP);
2768   if (err != REG_NOERROR)
2769     return err;
2770   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2771 			     sub_last->str_idx);
2772   if (__glibc_unlikely (err != REG_NOERROR))
2773     return err;
2774   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2775   return clean_state_log_if_needed (mctx, to_idx);
2776 }
2777 
2778 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2779    Search for '(' if TYPE is OP_OPEN_SUBEXP, or for ')' otherwise.
2780    TODO: This function isn't efficient...
2781 	 Because there might be more than one node whose type is
2782 	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2783 	 nodes.
2784 	 E.g. RE: (a){2}  */
2785 
2786 static Idx
2787 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2788 		  Idx subexp_idx, int type)
2789 {
2790   Idx cls_idx;
2791   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2792     {
2793       Idx cls_node = nodes->elems[cls_idx];
2794       const re_token_t *node = dfa->nodes + cls_node;
2795       if (node->type == type
2796 	  && node->opr.idx == subexp_idx)
2797 	return cls_node;
2798     }
2799   return -1;
2800 }
2801 
2802 /* Check whether the node TOP_NODE at TOP_STR can arrive at the node
2803    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2804    heavily reused.
2805    Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2806    REG_ESPACE if memory is exhausted.  */
2807 
2808 static reg_errcode_t
2809 __attribute_warn_unused_result__
2810 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2811 	       Idx top_str, Idx last_node, Idx last_str, int type)
2812 {
2813   const re_dfa_t *const dfa = mctx->dfa;
2814   reg_errcode_t err = REG_NOERROR;
2815   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2816   re_dfastate_t *cur_state = NULL;
2817   re_node_set *cur_nodes, next_nodes;
2818   re_dfastate_t **backup_state_log;
2819   unsigned int context;
2820 
2821   subexp_num = dfa->nodes[top_node].opr.idx;
2822   /* Extend the buffer if we need to.  */
2823   if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2824     {
2825       re_dfastate_t **new_array;
2826       Idx old_alloc = path->alloc;
2827       Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2828       Idx new_alloc;
2829       if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2830 	return REG_ESPACE;
2831       new_alloc = old_alloc + incr_alloc;
2832       if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2833 	return REG_ESPACE;
2834       new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2835       if (__glibc_unlikely (new_array == NULL))
2836 	return REG_ESPACE;
2837       path->array = new_array;
2838       path->alloc = new_alloc;
2839       memset (new_array + old_alloc, '\0',
2840 	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2841     }
2842 
2843   str_idx = path->next_idx ? path->next_idx : top_str;
2844 
2845   /* Temporarily modify MCTX.  */
2846   backup_state_log = mctx->state_log;
2847   backup_cur_idx = mctx->input.cur_idx;
2848   mctx->state_log = path->array;
2849   mctx->input.cur_idx = str_idx;
2850 
2851   /* Setup initial node set.  */
2852   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2853   if (str_idx == top_str)
2854     {
2855       err = re_node_set_init_1 (&next_nodes, top_node);
2856       if (__glibc_unlikely (err != REG_NOERROR))
2857 	return err;
2858       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2859       if (__glibc_unlikely (err != REG_NOERROR))
2860 	{
2861 	  re_node_set_free (&next_nodes);
2862 	  return err;
2863 	}
2864     }
2865   else
2866     {
2867       cur_state = mctx->state_log[str_idx];
2868       if (cur_state && cur_state->has_backref)
2869 	{
2870 	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2871 	  if (__glibc_unlikely (err != REG_NOERROR))
2872 	    return err;
2873 	}
2874       else
2875 	re_node_set_init_empty (&next_nodes);
2876     }
2877   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2878     {
2879       if (next_nodes.nelem)
2880 	{
2881 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2882 				    subexp_num, type);
2883 	  if (__glibc_unlikely (err != REG_NOERROR))
2884 	    {
2885 	      re_node_set_free (&next_nodes);
2886 	      return err;
2887 	    }
2888 	}
2889       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2890       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2891 	{
2892 	  re_node_set_free (&next_nodes);
2893 	  return err;
2894 	}
2895       mctx->state_log[str_idx] = cur_state;
2896     }
2897 
2898   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2899     {
2900       re_node_set_empty (&next_nodes);
2901       if (mctx->state_log[str_idx + 1])
2902 	{
2903 	  err = re_node_set_merge (&next_nodes,
2904 				   &mctx->state_log[str_idx + 1]->nodes);
2905 	  if (__glibc_unlikely (err != REG_NOERROR))
2906 	    {
2907 	      re_node_set_free (&next_nodes);
2908 	      return err;
2909 	    }
2910 	}
2911       if (cur_state)
2912 	{
2913 	  err = check_arrival_add_next_nodes (mctx, str_idx,
2914 					      &cur_state->non_eps_nodes,
2915 					      &next_nodes);
2916 	  if (__glibc_unlikely (err != REG_NOERROR))
2917 	    {
2918 	      re_node_set_free (&next_nodes);
2919 	      return err;
2920 	    }
2921 	}
2922       ++str_idx;
2923       if (next_nodes.nelem)
2924 	{
2925 	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2926 	  if (__glibc_unlikely (err != REG_NOERROR))
2927 	    {
2928 	      re_node_set_free (&next_nodes);
2929 	      return err;
2930 	    }
2931 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2932 				    subexp_num, type);
2933 	  if (__glibc_unlikely (err != REG_NOERROR))
2934 	    {
2935 	      re_node_set_free (&next_nodes);
2936 	      return err;
2937 	    }
2938 	}
2939       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2940       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2941       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2942 	{
2943 	  re_node_set_free (&next_nodes);
2944 	  return err;
2945 	}
2946       mctx->state_log[str_idx] = cur_state;
2947       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2948     }
2949   re_node_set_free (&next_nodes);
2950   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2951 	       : &mctx->state_log[last_str]->nodes);
2952   path->next_idx = str_idx;
2953 
2954   /* Restore MCTX.  */
2955   mctx->state_log = backup_state_log;
2956   mctx->input.cur_idx = backup_cur_idx;
2957 
2958   /* Then check whether the current node set has the node LAST_NODE.  */
2959   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2960     return REG_NOERROR;
2961 
2962   return REG_NOMATCH;
2963 }
2964 
2965 /* Helper functions for check_arrival.  */
2966 
2967 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2968    to NEXT_NODES.
2969    TODO: This function is similar to the functions transit_state*(),
2970 	 however this function does much additional work.
2971 	 Can't we unify them?  */
2972 
2973 static reg_errcode_t
2974 __attribute_warn_unused_result__
2975 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
2976 			      re_node_set *cur_nodes, re_node_set *next_nodes)
2977 {
2978   const re_dfa_t *const dfa = mctx->dfa;
2979   bool ok;
2980   Idx cur_idx;
2981   reg_errcode_t err = REG_NOERROR;
2982   re_node_set union_set;
2983   re_node_set_init_empty (&union_set);
2984   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2985     {
2986       int naccepted = 0;
2987       Idx cur_node = cur_nodes->elems[cur_idx];
2988       DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
2989 
2990       /* If the node may accept "multi byte".  */
2991       if (dfa->nodes[cur_node].accept_mb)
2992 	{
2993 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2994 					       str_idx);
2995 	  if (naccepted > 1)
2996 	    {
2997 	      re_dfastate_t *dest_state;
2998 	      Idx next_node = dfa->nexts[cur_node];
2999 	      Idx next_idx = str_idx + naccepted;
3000 	      dest_state = mctx->state_log[next_idx];
3001 	      re_node_set_empty (&union_set);
3002 	      if (dest_state)
3003 		{
3004 		  err = re_node_set_merge (&union_set, &dest_state->nodes);
3005 		  if (__glibc_unlikely (err != REG_NOERROR))
3006 		    {
3007 		      re_node_set_free (&union_set);
3008 		      return err;
3009 		    }
3010 		}
3011 	      ok = re_node_set_insert (&union_set, next_node);
3012 	      if (__glibc_unlikely (! ok))
3013 		{
3014 		  re_node_set_free (&union_set);
3015 		  return REG_ESPACE;
3016 		}
3017 	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3018 							    &union_set);
3019 	      if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3020 				    && err != REG_NOERROR))
3021 		{
3022 		  re_node_set_free (&union_set);
3023 		  return err;
3024 		}
3025 	    }
3026 	}
3027 
3028       if (naccepted
3029 	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3030 	{
3031 	  ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3032 	  if (__glibc_unlikely (! ok))
3033 	    {
3034 	      re_node_set_free (&union_set);
3035 	      return REG_ESPACE;
3036 	    }
3037 	}
3038     }
3039   re_node_set_free (&union_set);
3040   return REG_NOERROR;
3041 }
3042 
3043 /* For all the nodes in CUR_NODES, add their epsilon closures to
3044    CUR_NODES, excluding the nodes which are:
3045     - inside the sub expression whose number is EX_SUBEXP, if TYPE is OP_OPEN_SUBEXP.
3046     - outside the sub expression whose number is EX_SUBEXP, if TYPE is OP_CLOSE_SUBEXP.
3047 */
3048 
3049 static reg_errcode_t
3050 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3051 			  Idx ex_subexp, int type)
3052 {
3053   reg_errcode_t err;
3054   Idx idx, outside_node;
3055   re_node_set new_nodes;
3056   DEBUG_ASSERT (cur_nodes->nelem);
3057   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3058   if (__glibc_unlikely (err != REG_NOERROR))
3059     return err;
3060   /* Create a new node set NEW_NODES with the nodes which are epsilon
3061      closures of the nodes in CUR_NODES.  */
3062 
3063   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3064     {
3065       Idx cur_node = cur_nodes->elems[idx];
3066       const re_node_set *eclosure = dfa->eclosures + cur_node;
3067       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3068       if (outside_node == -1)
3069 	{
3070 	  /* There are no problematic nodes, just merge them.  */
3071 	  err = re_node_set_merge (&new_nodes, eclosure);
3072 	  if (__glibc_unlikely (err != REG_NOERROR))
3073 	    {
3074 	      re_node_set_free (&new_nodes);
3075 	      return err;
3076 	    }
3077 	}
3078       else
3079 	{
3080 	  /* There are problematic nodes, re-calculate incrementally.  */
3081 	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3082 					      ex_subexp, type);
3083 	  if (__glibc_unlikely (err != REG_NOERROR))
3084 	    {
3085 	      re_node_set_free (&new_nodes);
3086 	      return err;
3087 	    }
3088 	}
3089     }
3090   re_node_set_free (cur_nodes);
3091   *cur_nodes = new_nodes;
3092   return REG_NOERROR;
3093 }
3094 
3095 /* Helper function for check_arrival_expand_ecl.
3096    Check incrementally the epsilon closure of TARGET, and if it isn't
3097    problematic append it to DST_NODES.  */
3098 
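/* Note (added for clarity): the loop below walks epsilon edges iteratively
   through edests[...].elems[0], recursing only for the second edge of a
   two-way branch, and stops at a node already in DST_NODES or at the
   OP_OPEN_SUBEXP/OP_CLOSE_SUBEXP boundary of subexpression EX_SUBEXP.  */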
3099 static reg_errcode_t
3100 __attribute_warn_unused_result__
check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
			      Idx target, Idx ex_subexp, int type)
3103 {
3104   Idx cur_node;
3105   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3106     {
3107       bool ok;
3108 
3109       if (dfa->nodes[cur_node].type == type
3110 	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
3111 	{
3112 	  if (type == OP_CLOSE_SUBEXP)
3113 	    {
3114 	      ok = re_node_set_insert (dst_nodes, cur_node);
3115 	      if (__glibc_unlikely (! ok))
3116 		return REG_ESPACE;
3117 	    }
3118 	  break;
3119 	}
3120       ok = re_node_set_insert (dst_nodes, cur_node);
3121       if (__glibc_unlikely (! ok))
3122 	return REG_ESPACE;
3123       if (dfa->edests[cur_node].nelem == 0)
3124 	break;
3125       if (dfa->edests[cur_node].nelem == 2)
3126 	{
3127 	  reg_errcode_t err;
3128 	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3129 					      dfa->edests[cur_node].elems[1],
3130 					      ex_subexp, type);
3131 	  if (__glibc_unlikely (err != REG_NOERROR))
3132 	    return err;
3133 	}
3134       cur_node = dfa->edests[cur_node].elems[0];
3135     }
3136   return REG_NOERROR;
3137 }
3138 
3139 
/* For all the back references in the current state, calculate their
   destinations using the appropriate entries in MCTX->BKREF_ENTS.  */
3143 
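/* Illustration (added): a cache entry records that the back reference
   ENT->NODE refers to subexpression text spanning SUBEXP_FROM..SUBEXP_TO,
   so from CUR_STR the match advances to
   TO_IDX = CUR_STR + (SUBEXP_TO - SUBEXP_FROM).  When the referenced text
   is empty (TO_IDX == CUR_STR) the back reference acts as an epsilon
   transition and the current node set itself has to be re-expanded.  */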
3144 static reg_errcode_t
3145 __attribute_warn_unused_result__
expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
		    Idx cur_str, Idx subexp_num, int type)
3148 {
3149   const re_dfa_t *const dfa = mctx->dfa;
3150   reg_errcode_t err;
3151   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3152   struct re_backref_cache_entry *ent;
3153 
3154   if (cache_idx_start == -1)
3155     return REG_NOERROR;
3156 
3157  restart:
3158   ent = mctx->bkref_ents + cache_idx_start;
3159   do
3160     {
3161       Idx to_idx, next_node;
3162 
      /* Is this entry ENT appropriate?  */
3164       if (!re_node_set_contains (cur_nodes, ent->node))
3165 	continue; /* No.  */
3166 
3167       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3168       /* Calculate the destination of the back reference, and append it
3169 	 to MCTX->STATE_LOG.  */
3170       if (to_idx == cur_str)
3171 	{
	  /* The back reference made an epsilon transition; we must re-check
	     all the nodes in the current state.  */
3174 	  re_node_set new_dests;
3175 	  reg_errcode_t err2, err3;
3176 	  next_node = dfa->edests[ent->node].elems[0];
3177 	  if (re_node_set_contains (cur_nodes, next_node))
3178 	    continue;
3179 	  err = re_node_set_init_1 (&new_dests, next_node);
3180 	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3181 	  err3 = re_node_set_merge (cur_nodes, &new_dests);
3182 	  re_node_set_free (&new_dests);
3183 	  if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3184 				|| err3 != REG_NOERROR))
3185 	    {
3186 	      err = (err != REG_NOERROR ? err
3187 		     : (err2 != REG_NOERROR ? err2 : err3));
3188 	      return err;
3189 	    }
3190 	  /* TODO: It is still inefficient...  */
3191 	  goto restart;
3192 	}
3193       else
3194 	{
3195 	  re_node_set union_set;
3196 	  next_node = dfa->nexts[ent->node];
3197 	  if (mctx->state_log[to_idx])
3198 	    {
3199 	      bool ok;
3200 	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3201 					next_node))
3202 		continue;
3203 	      err = re_node_set_init_copy (&union_set,
3204 					   &mctx->state_log[to_idx]->nodes);
3205 	      ok = re_node_set_insert (&union_set, next_node);
3206 	      if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3207 		{
3208 		  re_node_set_free (&union_set);
3209 		  err = err != REG_NOERROR ? err : REG_ESPACE;
3210 		  return err;
3211 		}
3212 	    }
3213 	  else
3214 	    {
3215 	      err = re_node_set_init_1 (&union_set, next_node);
3216 	      if (__glibc_unlikely (err != REG_NOERROR))
3217 		return err;
3218 	    }
3219 	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3220 	  re_node_set_free (&union_set);
3221 	  if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3222 				&& err != REG_NOERROR))
3223 	    return err;
3224 	}
3225     }
3226   while (ent++->more);
3227   return REG_NOERROR;
3228 }
3229 
3230 /* Build transition table for the state.
3231    Return true if successful.  */
3232 
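/* Layout note (added): when no word/non-word distinction is needed,
   STATE->TRTABLE has SBC_MAX entries indexed by the byte value.  Otherwise
   STATE->WORD_TRTABLE has 2 * SBC_MAX entries; entry CH holds the
   destination for a non-word context and entry CH + SBC_MAX the destination
   for a word context, matching how the tables are filled below.  */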
3233 static bool __attribute_noinline__
build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3235 {
3236   reg_errcode_t err;
3237   Idx i, j;
3238   int ch;
3239   bool need_word_trtable = false;
3240   bitset_word_t elem, mask;
3241   Idx ndests; /* Number of the destination states from 'state'.  */
3242   re_dfastate_t **trtable;
3243   re_dfastate_t *dest_states[SBC_MAX];
3244   re_dfastate_t *dest_states_word[SBC_MAX];
3245   re_dfastate_t *dest_states_nl[SBC_MAX];
3246   re_node_set follows;
3247   bitset_t acceptable;
3248 
  /* We build DFA states which correspond to the destination nodes
     from 'state'.  'dests_node[i]' represents the nodes which the i-th
     destination state contains, and 'dests_ch[i]' represents the
     characters which the i-th destination state accepts.  */
3253   re_node_set dests_node[SBC_MAX];
3254   bitset_t dests_ch[SBC_MAX];
3255 
3256   /* Initialize transition table.  */
3257   state->word_trtable = state->trtable = NULL;
3258 
3259   /* At first, group all nodes belonging to 'state' into several
3260      destinations.  */
3261   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3262   if (__glibc_unlikely (ndests <= 0))
3263     {
3264       /* Return false in case of an error, true otherwise.  */
3265       if (ndests == 0)
3266 	{
3267 	  state->trtable = (re_dfastate_t **)
3268 	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
3269           if (__glibc_unlikely (state->trtable == NULL))
3270             return false;
3271 	  return true;
3272 	}
3273       return false;
3274     }
3275 
3276   err = re_node_set_alloc (&follows, ndests + 1);
3277   if (__glibc_unlikely (err != REG_NOERROR))
3278     {
3279     out_free:
3280       re_node_set_free (&follows);
3281       for (i = 0; i < ndests; ++i)
3282 	re_node_set_free (dests_node + i);
3283       return false;
3284     }
3285 
3286   bitset_empty (acceptable);
3287 
3288   /* Then build the states for all destinations.  */
3289   for (i = 0; i < ndests; ++i)
3290     {
3291       Idx next_node;
3292       re_node_set_empty (&follows);
      /* Merge the follows of the nodes in this destination state.  */
3294       for (j = 0; j < dests_node[i].nelem; ++j)
3295 	{
3296 	  next_node = dfa->nexts[dests_node[i].elems[j]];
3297 	  if (next_node != -1)
3298 	    {
3299 	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3300 	      if (__glibc_unlikely (err != REG_NOERROR))
3301 		goto out_free;
3302 	    }
3303 	}
3304       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3305       if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3306 	goto out_free;
      /* If the new state has a context constraint,
	 build the appropriate states for those contexts.  */
3309       if (dest_states[i]->has_constraint)
3310 	{
3311 	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3312 							  CONTEXT_WORD);
3313 	  if (__glibc_unlikely (dest_states_word[i] == NULL
3314 				&& err != REG_NOERROR))
3315 	    goto out_free;
3316 
3317 	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3318 	    need_word_trtable = true;
3319 
3320 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3321 							CONTEXT_NEWLINE);
3322 	  if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3323 	    goto out_free;
3324 	}
3325       else
3326 	{
3327 	  dest_states_word[i] = dest_states[i];
3328 	  dest_states_nl[i] = dest_states[i];
3329 	}
3330       bitset_merge (acceptable, dests_ch[i]);
3331     }
3332 
3333   if (!__glibc_unlikely (need_word_trtable))
3334     {
      /* Either we don't care whether the following character is a word
	 character, or we are in a single-byte character set so we can
	 tell by looking at the character code alone: allocate a single
	 256-entry transition table.  */
3339       trtable = state->trtable =
3340 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3341       if (__glibc_unlikely (trtable == NULL))
3342 	goto out_free;
3343 
3344       /* For all characters ch...:  */
3345       for (i = 0; i < BITSET_WORDS; ++i)
3346 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3347 	     elem;
3348 	     mask <<= 1, elem >>= 1, ++ch)
3349 	  if (__glibc_unlikely (elem & 1))
3350 	    {
3351 	      /* There must be exactly one destination which accepts
3352 		 character ch.  See group_nodes_into_DFAstates.  */
3353 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3354 		;
3355 
	      /* The j-th destination accepts character ch; use the
		 word-context state if ch is a word character.  */
3357 	      if (dfa->word_char[i] & mask)
3358 		trtable[ch] = dest_states_word[j];
3359 	      else
3360 		trtable[ch] = dest_states[j];
3361 	    }
3362     }
3363   else
3364     {
      /* We care about whether the following character is a word
	 character, and we are in a multi-byte character set, so the
	 character code alone cannot tell us: build two 256-entry
	 transition tables, one starting at trtable[0] (non-word
	 context) and one starting at trtable[SBC_MAX] (word context).  */
3370       trtable = state->word_trtable =
3371 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3372       if (__glibc_unlikely (trtable == NULL))
3373 	goto out_free;
3374 
3375       /* For all characters ch...:  */
3376       for (i = 0; i < BITSET_WORDS; ++i)
3377 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3378 	     elem;
3379 	     mask <<= 1, elem >>= 1, ++ch)
3380 	  if (__glibc_unlikely (elem & 1))
3381 	    {
3382 	      /* There must be exactly one destination which accepts
3383 		 character ch.  See group_nodes_into_DFAstates.  */
3384 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3385 		;
3386 
	      /* The j-th destination accepts character ch; fill both
		 the non-word and the word-context entries.  */
3388 	      trtable[ch] = dest_states[j];
3389 	      trtable[ch + SBC_MAX] = dest_states_word[j];
3390 	    }
3391     }
3392 
3393   /* new line */
3394   if (bitset_contain (acceptable, NEWLINE_CHAR))
3395     {
3396       /* The current state accepts newline character.  */
3397       for (j = 0; j < ndests; ++j)
3398 	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3399 	  {
	    /* The j-th destination accepts the newline character.  */
3401 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
3402 	    if (need_word_trtable)
3403 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3404 	    /* There must be only one destination which accepts
3405 	       newline.  See group_nodes_into_DFAstates.  */
3406 	    break;
3407 	  }
3408     }
3409 
3410   re_node_set_free (&follows);
3411   for (i = 0; i < ndests; ++i)
3412     re_node_set_free (dests_node + i);
3413   return true;
3414 }
3415 
/* Group all nodes belonging to STATE into several destinations.
   Then for all destinations, set the nodes belonging to the destination
   to DESTS_NODE[i] and set the characters accepted by the destination
   to DESTS_CH[i].  Return the number of destinations if successful,
   -1 on internal error.  */
3421 
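/* Worked example (illustrative only): if an existing destination accepts
   {a, b, c} with node set {n1} and the current node n2 accepts {b, c, d},
   the destination is split into {a} (still {n1}) and {b, c} ({n1, n2}),
   and the leftover {d} becomes a new destination with node set {n2}.  */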
3422 static Idx
group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
			    re_node_set *dests_node, bitset_t *dests_ch)
3425 {
3426   reg_errcode_t err;
3427   bool ok;
3428   Idx i, j, k;
3429   Idx ndests; /* Number of the destinations from 'state'.  */
3430   bitset_t accepts; /* Characters a node can accept.  */
3431   const re_node_set *cur_nodes = &state->nodes;
3432   bitset_empty (accepts);
3433   ndests = 0;
3434 
3435   /* For all the nodes belonging to 'state',  */
3436   for (i = 0; i < cur_nodes->nelem; ++i)
3437     {
3438       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3439       re_token_type_t type = node->type;
3440       unsigned int constraint = node->constraint;
3441 
      /* Enumerate all the single-byte characters this node can accept.  */
3443       if (type == CHARACTER)
3444 	bitset_set (accepts, node->opr.c);
3445       else if (type == SIMPLE_BRACKET)
3446 	{
3447 	  bitset_merge (accepts, node->opr.sbcset);
3448 	}
3449       else if (type == OP_PERIOD)
3450 	{
3451 	  if (dfa->mb_cur_max > 1)
3452 	    bitset_merge (accepts, dfa->sb_char);
3453 	  else
3454 	    bitset_set_all (accepts);
3455 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3456 	    bitset_clear (accepts, '\n');
3457 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3458 	    bitset_clear (accepts, '\0');
3459 	}
3460       else if (type == OP_UTF8_PERIOD)
3461 	{
3462 	  if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3463 	    memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3464 	  else
3465 	    bitset_merge (accepts, utf8_sb_map);
3466 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3467 	    bitset_clear (accepts, '\n');
3468 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3469 	    bitset_clear (accepts, '\0');
3470 	}
3471       else
3472 	continue;
3473 
      /* Check 'accepts' and sift out the characters which do not
	 match the context constraint.  */
3476       if (constraint)
3477 	{
3478 	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
3479 	    {
3480 	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3481 	      bitset_empty (accepts);
3482 	      if (accepts_newline)
3483 		bitset_set (accepts, NEWLINE_CHAR);
3484 	      else
3485 		continue;
3486 	    }
3487 	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
3488 	    {
3489 	      bitset_empty (accepts);
3490 	      continue;
3491 	    }
3492 
3493 	  if (constraint & NEXT_WORD_CONSTRAINT)
3494 	    {
3495 	      bitset_word_t any_set = 0;
3496 	      if (type == CHARACTER && !node->word_char)
3497 		{
3498 		  bitset_empty (accepts);
3499 		  continue;
3500 		}
3501 	      if (dfa->mb_cur_max > 1)
3502 		for (j = 0; j < BITSET_WORDS; ++j)
3503 		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3504 	      else
3505 		for (j = 0; j < BITSET_WORDS; ++j)
3506 		  any_set |= (accepts[j] &= dfa->word_char[j]);
3507 	      if (!any_set)
3508 		continue;
3509 	    }
3510 	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
3511 	    {
3512 	      bitset_word_t any_set = 0;
3513 	      if (type == CHARACTER && node->word_char)
3514 		{
3515 		  bitset_empty (accepts);
3516 		  continue;
3517 		}
3518 	      if (dfa->mb_cur_max > 1)
3519 		for (j = 0; j < BITSET_WORDS; ++j)
3520 		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3521 	      else
3522 		for (j = 0; j < BITSET_WORDS; ++j)
3523 		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3524 	      if (!any_set)
3525 		continue;
3526 	    }
3527 	}
3528 
3529       /* Then divide 'accepts' into DFA states, or create a new
3530 	 state.  Above, we make sure that accepts is not empty.  */
3531       for (j = 0; j < ndests; ++j)
3532 	{
3533 	  bitset_t intersec; /* Intersection sets, see below.  */
3534 	  bitset_t remains;
3535 	  /* Flags, see below.  */
3536 	  bitset_word_t has_intersec, not_subset, not_consumed;
3537 
3538 	  /* Optimization, skip if this state doesn't accept the character.  */
3539 	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3540 	    continue;
3541 
3542 	  /* Enumerate the intersection set of this state and 'accepts'.  */
3543 	  has_intersec = 0;
3544 	  for (k = 0; k < BITSET_WORDS; ++k)
3545 	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3546 	  /* And skip if the intersection set is empty.  */
3547 	  if (!has_intersec)
3548 	    continue;
3549 
3550 	  /* Then check if this state is a subset of 'accepts'.  */
3551 	  not_subset = not_consumed = 0;
3552 	  for (k = 0; k < BITSET_WORDS; ++k)
3553 	    {
3554 	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3555 	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3556 	    }
3557 
3558 	  /* If this state isn't a subset of 'accepts', create a
3559 	     new group state, which has the 'remains'. */
3560 	  if (not_subset)
3561 	    {
3562 	      bitset_copy (dests_ch[ndests], remains);
3563 	      bitset_copy (dests_ch[j], intersec);
3564 	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3565 	      if (__glibc_unlikely (err != REG_NOERROR))
3566 		goto error_return;
3567 	      ++ndests;
3568 	    }
3569 
3570 	  /* Put the position in the current group. */
3571 	  ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3572 	  if (__glibc_unlikely (! ok))
3573 	    goto error_return;
3574 
3575 	  /* If all characters are consumed, go to next node. */
3576 	  if (!not_consumed)
3577 	    break;
3578 	}
3579       /* Some characters remain, create a new group. */
3580       if (j == ndests)
3581 	{
3582 	  bitset_copy (dests_ch[ndests], accepts);
3583 	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3584 	  if (__glibc_unlikely (err != REG_NOERROR))
3585 	    goto error_return;
3586 	  ++ndests;
3587 	  bitset_empty (accepts);
3588 	}
3589     }
3590   assume (ndests <= SBC_MAX);
3591   return ndests;
3592  error_return:
3593   for (j = 0; j < ndests; ++j)
3594     re_node_set_free (dests_node + j);
3595   return -1;
3596 }
3597 
/* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
   Return the number of bytes the node accepts.
   STR_IDX is the current index of the input string.

   This function handles the nodes which can accept one character, or
   one collating element like '.' or '[a-z]', unlike the other nodes,
   which can only accept one byte.  */
3605 
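/* Sketch (added): for OP_UTF8_PERIOD the code below derives the sequence
   length from the lead byte, following the original up-to-6-byte UTF-8
   encoding: 0xc2..0xdf -> 2 bytes, 0xe0..0xef -> 3, 0xf0..0xf7 -> 4,
   0xf8..0xfb -> 5, 0xfc..0xfd -> 6, and every continuation byte must lie
   in 0x80..0xbf.  */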
3606 #ifdef _LIBC
3607 # include <locale/weight.h>
3608 #endif
3609 
3610 static int
check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
			 const re_string_t *input, Idx str_idx)
3613 {
3614   const re_token_t *node = dfa->nodes + node_idx;
3615   int char_len, elem_len;
3616   Idx i;
3617 
3618   if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3619     {
3620       unsigned char c = re_string_byte_at (input, str_idx), d;
3621       if (__glibc_likely (c < 0xc2))
3622 	return 0;
3623 
3624       if (str_idx + 2 > input->len)
3625 	return 0;
3626 
3627       d = re_string_byte_at (input, str_idx + 1);
3628       if (c < 0xe0)
3629 	return (d < 0x80 || d > 0xbf) ? 0 : 2;
3630       else if (c < 0xf0)
3631 	{
3632 	  char_len = 3;
3633 	  if (c == 0xe0 && d < 0xa0)
3634 	    return 0;
3635 	}
3636       else if (c < 0xf8)
3637 	{
3638 	  char_len = 4;
3639 	  if (c == 0xf0 && d < 0x90)
3640 	    return 0;
3641 	}
3642       else if (c < 0xfc)
3643 	{
3644 	  char_len = 5;
3645 	  if (c == 0xf8 && d < 0x88)
3646 	    return 0;
3647 	}
3648       else if (c < 0xfe)
3649 	{
3650 	  char_len = 6;
3651 	  if (c == 0xfc && d < 0x84)
3652 	    return 0;
3653 	}
3654       else
3655 	return 0;
3656 
3657       if (str_idx + char_len > input->len)
3658 	return 0;
3659 
3660       for (i = 1; i < char_len; ++i)
3661 	{
3662 	  d = re_string_byte_at (input, str_idx + i);
3663 	  if (d < 0x80 || d > 0xbf)
3664 	    return 0;
3665 	}
3666       return char_len;
3667     }
3668 
3669   char_len = re_string_char_size_at (input, str_idx);
3670   if (node->type == OP_PERIOD)
3671     {
3672       if (char_len <= 1)
3673 	return 0;
3674       /* FIXME: I don't think this if is needed, as both '\n'
3675 	 and '\0' are char_len == 1.  */
3676       /* '.' accepts any one character except the following two cases.  */
3677       if ((!(dfa->syntax & RE_DOT_NEWLINE)
3678 	   && re_string_byte_at (input, str_idx) == '\n')
3679 	  || ((dfa->syntax & RE_DOT_NOT_NULL)
3680 	      && re_string_byte_at (input, str_idx) == '\0'))
3681 	return 0;
3682       return char_len;
3683     }
3684 
3685   elem_len = re_string_elem_size_at (input, str_idx);
3686   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3687     return 0;
3688 
3689   if (node->type == COMPLEX_BRACKET)
3690     {
3691       const re_charset_t *cset = node->opr.mbcset;
3692 #ifdef _LIBC
3693       const unsigned char *pin
3694 	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3695       Idx j;
3696       uint32_t nrules;
3697 #endif
3698       int match_len = 0;
3699       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3700 		    ? re_string_wchar_at (input, str_idx) : 0);
3701 
3702       /* match with multibyte character?  */
3703       for (i = 0; i < cset->nmbchars; ++i)
3704 	if (wc == cset->mbchars[i])
3705 	  {
3706 	    match_len = char_len;
3707 	    goto check_node_accept_bytes_match;
3708 	  }
3709       /* match with character_class?  */
3710       for (i = 0; i < cset->nchar_classes; ++i)
3711 	{
3712 	  wctype_t wt = cset->char_classes[i];
3713 	  if (__iswctype (wc, wt))
3714 	    {
3715 	      match_len = char_len;
3716 	      goto check_node_accept_bytes_match;
3717 	    }
3718 	}
3719 
3720 #ifdef _LIBC
3721       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3722       if (nrules != 0)
3723 	{
3724 	  unsigned int in_collseq = 0;
3725 	  const int32_t *table, *indirect;
3726 	  const unsigned char *weights, *extra;
3727 	  const char *collseqwc;
3728 
3729 	  /* match with collating_symbol?  */
3730 	  if (cset->ncoll_syms)
3731 	    extra = (const unsigned char *)
3732 	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3733 	  for (i = 0; i < cset->ncoll_syms; ++i)
3734 	    {
3735 	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
	      /* Compare the length of the input collating element with
		 the length of the current collating element.  */
3738 	      if (*coll_sym != elem_len)
3739 		continue;
	      /* Compare each byte.  */
3741 	      for (j = 0; j < *coll_sym; j++)
3742 		if (pin[j] != coll_sym[1 + j])
3743 		  break;
3744 	      if (j == *coll_sym)
3745 		{
		  /* Match if every byte is equal.  */
3747 		  match_len = j;
3748 		  goto check_node_accept_bytes_match;
3749 		}
3750 	    }
3751 
3752 	  if (cset->nranges)
3753 	    {
3754 	      if (elem_len <= char_len)
3755 		{
3756 		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3757 		  in_collseq = __collseq_table_lookup (collseqwc, wc);
3758 		}
3759 	      else
3760 		in_collseq = find_collation_sequence_value (pin, elem_len);
3761 	    }
3762 	  /* match with range expression?  */
3763 	  /* FIXME: Implement rational ranges here, too.  */
3764 	  for (i = 0; i < cset->nranges; ++i)
3765 	    if (cset->range_starts[i] <= in_collseq
3766 		&& in_collseq <= cset->range_ends[i])
3767 	      {
3768 		match_len = elem_len;
3769 		goto check_node_accept_bytes_match;
3770 	      }
3771 
3772 	  /* match with equivalence_class?  */
3773 	  if (cset->nequiv_classes)
3774 	    {
3775 	      const unsigned char *cp = pin;
3776 	      table = (const int32_t *)
3777 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3778 	      weights = (const unsigned char *)
3779 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3780 	      extra = (const unsigned char *)
3781 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3782 	      indirect = (const int32_t *)
3783 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3784 	      int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3785 	      int32_t rule = idx >> 24;
3786 	      idx &= 0xffffff;
3787 	      if (idx > 0)
3788 		{
3789 		  size_t weight_len = weights[idx];
3790 		  for (i = 0; i < cset->nequiv_classes; ++i)
3791 		    {
3792 		      int32_t equiv_class_idx = cset->equiv_classes[i];
3793 		      int32_t equiv_class_rule = equiv_class_idx >> 24;
3794 		      equiv_class_idx &= 0xffffff;
3795 		      if (weights[equiv_class_idx] == weight_len
3796 			  && equiv_class_rule == rule
3797 			  && memcmp (weights + idx + 1,
3798 				     weights + equiv_class_idx + 1,
3799 				     weight_len) == 0)
3800 			{
3801 			  match_len = elem_len;
3802 			  goto check_node_accept_bytes_match;
3803 			}
3804 		    }
3805 		}
3806 	    }
3807 	}
3808       else
3809 #endif /* _LIBC */
3810 	{
3811 	  /* match with range expression?  */
3812 	  for (i = 0; i < cset->nranges; ++i)
3813 	    {
3814 	      if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3815 		{
3816 		  match_len = char_len;
3817 		  goto check_node_accept_bytes_match;
3818 		}
3819 	    }
3820 	}
3821     check_node_accept_bytes_match:
3822       if (!cset->non_match)
3823 	return match_len;
3824       else
3825 	{
3826 	  if (match_len > 0)
3827 	    return 0;
3828 	  else
3829 	    return (elem_len > char_len) ? elem_len : char_len;
3830 	}
3831     }
3832   return 0;
3833 }
3834 
3835 #ifdef _LIBC
3836 static unsigned int
find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3838 {
3839   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3840   if (nrules == 0)
3841     {
3842       if (mbs_len == 1)
3843 	{
3844 	  /* No valid character.  Match it as a single byte character.  */
3845 	  const unsigned char *collseq = (const unsigned char *)
3846 	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3847 	  return collseq[mbs[0]];
3848 	}
3849       return UINT_MAX;
3850     }
3851   else
3852     {
3853       int32_t idx;
3854       const unsigned char *extra = (const unsigned char *)
3855 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3856       int32_t extrasize = (const unsigned char *)
3857 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3858 
3859       for (idx = 0; idx < extrasize;)
3860 	{
3861 	  int mbs_cnt;
3862 	  bool found = false;
3863 	  int32_t elem_mbs_len;
	  /* Skip the name of the collating element.  */
3865 	  idx = idx + extra[idx] + 1;
3866 	  elem_mbs_len = extra[idx++];
3867 	  if (mbs_len == elem_mbs_len)
3868 	    {
3869 	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3870 		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3871 		  break;
3872 	      if (mbs_cnt == elem_mbs_len)
3873 		/* Found the entry.  */
3874 		found = true;
3875 	    }
3876 	  /* Skip the byte sequence of the collating element.  */
3877 	  idx += elem_mbs_len;
3878 	  /* Adjust for the alignment.  */
3879 	  idx = (idx + 3) & ~3;
3880 	  /* Skip the collation sequence value.  */
3881 	  idx += sizeof (uint32_t);
3882 	  /* Skip the wide char sequence of the collating element.  */
3883 	  idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3884 	  /* If we found the entry, return the sequence value.  */
3885 	  if (found)
3886 	    return *(uint32_t *) (extra + idx);
3887 	  /* Skip the collation sequence value.  */
3888 	  idx += sizeof (uint32_t);
3889 	}
3890       return UINT_MAX;
3891     }
3892 }
3893 #endif /* _LIBC */
3894 
/* Check whether the node accepts the byte which is the IDX-th
   byte of INPUT.  */
3897 
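/* Note (added): this checks single-byte acceptance only; for OP_UTF8_PERIOD
   it therefore accepts ASCII bytes alone, while multibyte matches are
   handled separately through check_node_accept_bytes.  */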
3898 static bool
check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
		   Idx idx)
3901 {
3902   unsigned char ch;
3903   ch = re_string_byte_at (&mctx->input, idx);
3904   switch (node->type)
3905     {
3906     case CHARACTER:
3907       if (node->opr.c != ch)
3908         return false;
3909       break;
3910 
3911     case SIMPLE_BRACKET:
3912       if (!bitset_contain (node->opr.sbcset, ch))
3913         return false;
3914       break;
3915 
3916     case OP_UTF8_PERIOD:
3917       if (ch >= ASCII_CHARS)
3918         return false;
3919       FALLTHROUGH;
3920     case OP_PERIOD:
3921       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3922 	  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3923 	return false;
3924       break;
3925 
3926     default:
3927       return false;
3928     }
3929 
3930   if (node->constraint)
3931     {
3932       /* The node has constraints.  Check whether the current context
3933 	 satisfies the constraints.  */
3934       unsigned int context = re_string_context_at (&mctx->input, idx,
3935 						   mctx->eflags);
3936       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3937 	return false;
3938     }
3939 
3940   return true;
3941 }
3942 
3943 /* Extend the buffers, if the buffers have run out.  */
3944 
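/* Growth policy sketch (added, assuming re_string_realloc_buffers updates
   PSTR->BUFS_LEN): the buffers grow to twice their current length, capped at
   the total input length and never below MIN_LEN; STATE_LOG is then resized
   to BUFS_LEN + 1 entries so every input index, including the end position,
   keeps a slot.  */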
3945 static reg_errcode_t
3946 __attribute_warn_unused_result__
extend_buffers (re_match_context_t *mctx, int min_len)
3948 {
3949   reg_errcode_t ret;
3950   re_string_t *pstr = &mctx->input;
3951 
3952   /* Avoid overflow.  */
3953   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3954 			<= pstr->bufs_len))
3955     return REG_ESPACE;
3956 
3957   /* Double the lengths of the buffers, but allocate at least MIN_LEN.  */
3958   ret = re_string_realloc_buffers (pstr,
3959 				   MAX (min_len,
3960 					MIN (pstr->len, pstr->bufs_len * 2)));
3961   if (__glibc_unlikely (ret != REG_NOERROR))
3962     return ret;
3963 
3964   if (mctx->state_log != NULL)
3965     {
3966       /* And double the length of state_log.  */
      /* XXX We have no indication of the size of this buffer.  If this
	 allocation fails we have no indication that the state_log array
	 does not have the right size.  */
3970       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3971 					      pstr->bufs_len + 1);
3972       if (__glibc_unlikely (new_array == NULL))
3973 	return REG_ESPACE;
3974       mctx->state_log = new_array;
3975     }
3976 
3977   /* Then reconstruct the buffers.  */
3978   if (pstr->icase)
3979     {
3980       if (pstr->mb_cur_max > 1)
3981 	{
3982 	  ret = build_wcs_upper_buffer (pstr);
3983 	  if (__glibc_unlikely (ret != REG_NOERROR))
3984 	    return ret;
3985 	}
3986       else
3987 	build_upper_buffer (pstr);
3988     }
3989   else
3990     {
3991       if (pstr->mb_cur_max > 1)
3992 	build_wcs_buffer (pstr);
3993       else
3994 	{
3995 	  if (pstr->trans != NULL)
3996 	    re_string_translate_buffer (pstr);
3997 	}
3998     }
3999   return REG_NOERROR;
4000 }
4001 
4002 
4003 /* Functions for matching context.  */
4004 
4005 /* Initialize MCTX.  */
4006 
4007 static reg_errcode_t
4008 __attribute_warn_unused_result__
match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4010 {
4011   mctx->eflags = eflags;
4012   mctx->match_last = -1;
4013   if (n > 0)
4014     {
4015       /* Avoid overflow.  */
4016       size_t max_object_size =
4017 	MAX (sizeof (struct re_backref_cache_entry),
4018 	     sizeof (re_sub_match_top_t *));
4019       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4020 	return REG_ESPACE;
4021 
4022       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4023       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4024       if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4025 	return REG_ESPACE;
4026     }
4027   /* Already zero-ed by the caller.
4028      else
4029        mctx->bkref_ents = NULL;
4030      mctx->nbkref_ents = 0;
4031      mctx->nsub_tops = 0;  */
4032   mctx->abkref_ents = n;
4033   mctx->max_mb_elem_len = 1;
4034   mctx->asub_tops = n;
4035   return REG_NOERROR;
4036 }
4037 
4038 /* Clean the entries which depend on the current input in MCTX.
4039    This function must be invoked when the matcher changes the start index
4040    of the input, or changes the input string.  */
4041 
4042 static void
match_ctx_clean (re_match_context_t *mctx)
4044 {
4045   Idx st_idx;
4046   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4047     {
4048       Idx sl_idx;
4049       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4050       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4051 	{
4052 	  re_sub_match_last_t *last = top->lasts[sl_idx];
4053 	  re_free (last->path.array);
4054 	  re_free (last);
4055 	}
4056       re_free (top->lasts);
4057       if (top->path)
4058 	{
4059 	  re_free (top->path->array);
4060 	  re_free (top->path);
4061 	}
4062       re_free (top);
4063     }
4064 
4065   mctx->nsub_tops = 0;
4066   mctx->nbkref_ents = 0;
4067 }
4068 
4069 /* Free all the memory associated with MCTX.  */
4070 
4071 static void
match_ctx_free (re_match_context_t *mctx)
4073 {
4074   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4075   match_ctx_clean (mctx);
4076   re_free (mctx->sub_tops);
4077   re_free (mctx->bkref_ents);
4078 }
4079 
/* Add a new backreference entry to MCTX.
   Note that we assume the caller never calls this function with a duplicate
   entry, and always calls with a STR_IDX which is not smaller than that of
   any existing entry.  */
4084 
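/* Note (added): entries sharing the same STR_IDX are chained by setting the
   'more' flag on the previous entry, so readers such as expand_bkref_cache
   can walk all entries for one string index with "do ... while
   (ent++->more)".  */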
4085 static reg_errcode_t
4086 __attribute_warn_unused_result__
match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
		     Idx to)
4089 {
4090   if (mctx->nbkref_ents >= mctx->abkref_ents)
4091     {
4092       struct re_backref_cache_entry* new_entry;
4093       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4094 			      mctx->abkref_ents * 2);
4095       if (__glibc_unlikely (new_entry == NULL))
4096 	{
4097 	  re_free (mctx->bkref_ents);
4098 	  return REG_ESPACE;
4099 	}
4100       mctx->bkref_ents = new_entry;
4101       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4102 	      sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4103       mctx->abkref_ents *= 2;
4104     }
4105   if (mctx->nbkref_ents > 0
4106       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4107     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4108 
4109   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4110   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4111   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4112   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4113 
  /* This is a cache that saves negative results of check_dst_limits_calc_pos.
     If bit N is clear, it means that this entry won't epsilon-transition to
     an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
     it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
     such node.

     A backreference does not epsilon-transition unless it is empty, so set
     to all zeros if FROM != TO.  */
4122   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4123     = (from == to ? -1 : 0);
4124 
4125   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4126   if (mctx->max_mb_elem_len < to - from)
4127     mctx->max_mb_elem_len = to - from;
4128   return REG_NOERROR;
4129 }
4130 
/* Return the first entry with the same str_idx, or -1 if none is
   found.  Note that MCTX->BKREF_ENTS is already sorted by str_idx.  */
4133 
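/* Example (illustrative): with cached str_idx values {2, 2, 5, 7}, searching
   for 5 returns index 2 (the lower bound), while searching for 3 returns -1
   because the entry located by the binary search has a different
   str_idx.  */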
4134 static Idx
search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4136 {
4137   Idx left, right, mid, last;
4138   last = right = mctx->nbkref_ents;
4139   for (left = 0; left < right;)
4140     {
4141       mid = (left + right) / 2;
4142       if (mctx->bkref_ents[mid].str_idx < str_idx)
4143 	left = mid + 1;
4144       else
4145 	right = mid;
4146     }
4147   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4148     return left;
4149   else
4150     return -1;
4151 }
4152 
4153 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4154    at STR_IDX.  */
4155 
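/* Note (added): each sub_top records one OP_OPEN_SUBEXP occurrence; the
   matching OP_CLOSE_SUBEXP positions are attached to it later as 'lasts'
   via match_ctx_add_sublast.  */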
4156 static reg_errcode_t
4157 __attribute_warn_unused_result__
match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4159 {
4160   DEBUG_ASSERT (mctx->sub_tops != NULL);
4161   DEBUG_ASSERT (mctx->asub_tops > 0);
4162   if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4163     {
4164       Idx new_asub_tops = mctx->asub_tops * 2;
4165       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4166 						   re_sub_match_top_t *,
4167 						   new_asub_tops);
4168       if (__glibc_unlikely (new_array == NULL))
4169 	return REG_ESPACE;
4170       mctx->sub_tops = new_array;
4171       mctx->asub_tops = new_asub_tops;
4172     }
4173   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4174   if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4175     return REG_ESPACE;
4176   mctx->sub_tops[mctx->nsub_tops]->node = node;
4177   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4178   return REG_NOERROR;
4179 }
4180 
4181 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4182    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4183    Return the new entry if successful, NULL if memory is exhausted.  */
4184 
4185 static re_sub_match_last_t *
match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4187 {
4188   re_sub_match_last_t *new_entry;
4189   if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4190     {
4191       Idx new_alasts = 2 * subtop->alasts + 1;
4192       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4193 						    re_sub_match_last_t *,
4194 						    new_alasts);
4195       if (__glibc_unlikely (new_array == NULL))
4196 	return NULL;
4197       subtop->lasts = new_array;
4198       subtop->alasts = new_alasts;
4199     }
4200   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4201   if (__glibc_likely (new_entry != NULL))
4202     {
4203       subtop->lasts[subtop->nlasts] = new_entry;
4204       new_entry->node = node;
4205       new_entry->str_idx = str_idx;
4206       ++subtop->nlasts;
4207     }
4208   return new_entry;
4209 }
4210 
4211 static void
sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
	       re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4214 {
4215   sctx->sifted_states = sifted_sts;
4216   sctx->limited_states = limited_sts;
4217   sctx->last_node = last_node;
4218   sctx->last_str_idx = last_str_idx;
4219   re_node_set_init_empty (&sctx->limits);
4220 }
4221