1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4    Copyright (C) 2002-2019 Free Software Foundation, Inc.
5    This file is part of the GNU C Library.
6    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7 
8    The GNU C Library is free software; you can redistribute it and/or
9    modify it under the terms of the GNU General Public
10    License as published by the Free Software Foundation; either
11    version 3 of the License, or (at your option) any later version.
12 
13    The GNU C Library is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16    General Public License for more details.
17 
18    You should have received a copy of the GNU General Public
19    License along with the GNU C Library; if not, see
20    <https://www.gnu.org/licenses/>.  */
21 
22 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
23 				     Idx n);
24 static void match_ctx_clean (re_match_context_t *mctx);
25 static void match_ctx_free (re_match_context_t *cache);
26 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
27 					  Idx str_idx, Idx from, Idx to);
28 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
30 					   Idx str_idx);
31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 						    Idx node, Idx str_idx);
33 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
34 			   re_dfastate_t **limited_sts, Idx last_node,
35 			   Idx last_str_idx);
36 static reg_errcode_t re_search_internal (const regex_t *preg,
37 					 const char *string, Idx length,
38 					 Idx start, Idx last_start, Idx stop,
39 					 size_t nmatch, regmatch_t pmatch[],
40 					 int eflags);
41 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
42 				  const char *string1, Idx length1,
43 				  const char *string2, Idx length2,
44 				  Idx start, regoff_t range,
45 				  struct re_registers *regs,
46 				  Idx stop, bool ret_len);
47 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
48 				const char *string, Idx length, Idx start,
49 				regoff_t range, Idx stop,
50 				struct re_registers *regs,
51 				bool ret_len);
52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
53                               Idx nregs, int regs_allocated);
54 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
55 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
56 			   Idx *p_match_first);
57 static Idx check_halt_state_context (const re_match_context_t *mctx,
58 				     const re_dfastate_t *state, Idx idx);
59 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
60 			 regmatch_t *prev_idx_match, Idx cur_node,
61 			 Idx cur_idx, Idx nmatch);
62 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
63 				      Idx str_idx, Idx dest_node, Idx nregs,
64 				      regmatch_t *regs,
65 				      re_node_set *eps_via_nodes);
66 static reg_errcode_t set_regs (const regex_t *preg,
67 			       const re_match_context_t *mctx,
68 			       size_t nmatch, regmatch_t *pmatch,
69 			       bool fl_backtrack);
70 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
71 
72 #ifdef RE_ENABLE_I18N
73 static int sift_states_iter_mb (const re_match_context_t *mctx,
74 				re_sift_context_t *sctx,
75 				Idx node_idx, Idx str_idx, Idx max_str_idx);
76 #endif /* RE_ENABLE_I18N */
77 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
78 					   re_sift_context_t *sctx);
79 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
80 					  re_sift_context_t *sctx, Idx str_idx,
81 					  re_node_set *cur_dest);
82 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
83 					      re_sift_context_t *sctx,
84 					      Idx str_idx,
85 					      re_node_set *dest_nodes);
86 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
87 					    re_node_set *dest_nodes,
88 					    const re_node_set *candidates);
89 static bool check_dst_limits (const re_match_context_t *mctx,
90 			      const re_node_set *limits,
91 			      Idx dst_node, Idx dst_idx, Idx src_node,
92 			      Idx src_idx);
93 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
94 					int boundaries, Idx subexp_idx,
95 					Idx from_node, Idx bkref_idx);
96 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
97 				      Idx limit, Idx subexp_idx,
98 				      Idx node, Idx str_idx,
99 				      Idx bkref_idx);
100 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
101 					  re_node_set *dest_nodes,
102 					  const re_node_set *candidates,
103 					  re_node_set *limits,
104 					  struct re_backref_cache_entry *bkref_ents,
105 					  Idx str_idx);
106 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
107 					re_sift_context_t *sctx,
108 					Idx str_idx, const re_node_set *candidates);
109 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
110 					re_dfastate_t **dst,
111 					re_dfastate_t **src, Idx num);
112 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
113 					 re_match_context_t *mctx);
114 static re_dfastate_t *transit_state (reg_errcode_t *err,
115 				     re_match_context_t *mctx,
116 				     re_dfastate_t *state);
117 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
118 					    re_match_context_t *mctx,
119 					    re_dfastate_t *next_state);
120 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
121 						re_node_set *cur_nodes,
122 						Idx str_idx);
123 #if 0
124 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
125 					re_match_context_t *mctx,
126 					re_dfastate_t *pstate);
127 #endif
128 #ifdef RE_ENABLE_I18N
129 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
130 				       re_dfastate_t *pstate);
131 #endif /* RE_ENABLE_I18N */
132 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
133 					  const re_node_set *nodes);
134 static reg_errcode_t get_subexp (re_match_context_t *mctx,
135 				 Idx bkref_node, Idx bkref_str_idx);
136 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
137 				     const re_sub_match_top_t *sub_top,
138 				     re_sub_match_last_t *sub_last,
139 				     Idx bkref_node, Idx bkref_str);
140 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
141 			     Idx subexp_idx, int type);
142 static reg_errcode_t check_arrival (re_match_context_t *mctx,
143 				    state_array_t *path, Idx top_node,
144 				    Idx top_str, Idx last_node, Idx last_str,
145 				    int type);
146 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
147 						   Idx str_idx,
148 						   re_node_set *cur_nodes,
149 						   re_node_set *next_nodes);
150 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
151 					       re_node_set *cur_nodes,
152 					       Idx ex_subexp, int type);
153 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
154 						   re_node_set *dst_nodes,
155 						   Idx target, Idx ex_subexp,
156 						   int type);
157 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
158 					 re_node_set *cur_nodes, Idx cur_str,
159 					 Idx subexp_num, int type);
160 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
161 #ifdef RE_ENABLE_I18N
162 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
163 				    const re_string_t *input, Idx idx);
164 # ifdef _LIBC
165 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
166 						   size_t name_len);
167 # endif /* _LIBC */
168 #endif /* RE_ENABLE_I18N */
169 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
170 				       const re_dfastate_t *state,
171 				       re_node_set *states_node,
172 				       bitset_t *states_ch);
173 static bool check_node_accept (const re_match_context_t *mctx,
174 			       const re_token_t *node, Idx idx);
175 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
176 
177 /* Entry point for POSIX code.  */
178 
179 /* regexec searches for a given pattern, specified by PREG, in the
180    string STRING.
181 
182    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
183    'regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
184    least NMATCH elements, and we set them to the offsets of the
185    corresponding matched substrings.
186 
187    EFLAGS specifies "execution flags" which affect matching: if
188    REG_NOTBOL is set, then ^ does not match at the beginning of the
189    string; if REG_NOTEOL is set, then $ does not match at the end.
190 
191    We return 0 if we find a match and REG_NOMATCH if not.  */
192 
193 int
regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)194 regexec (const regex_t *__restrict preg, const char *__restrict string,
195 	 size_t nmatch, regmatch_t pmatch[], int eflags)
196 {
197   reg_errcode_t err;
198   Idx start, length;
199   re_dfa_t *dfa = preg->buffer;
200 
201   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
202     return REG_BADPAT;
203 
204   if (eflags & REG_STARTEND)
205     {
206       start = pmatch[0].rm_so;
207       length = pmatch[0].rm_eo;
208     }
209   else
210     {
211       start = 0;
212       length = strlen (string);
213     }
214 
215   lock_lock (dfa->lock);
216   if (preg->no_sub)
217     err = re_search_internal (preg, string, length, start, length,
218 			      length, 0, NULL, eflags);
219   else
220     err = re_search_internal (preg, string, length, start, length,
221 			      length, nmatch, pmatch, eflags);
222   lock_unlock (dfa->lock);
223   return err != REG_NOERROR;
224 }
225 
226 #ifdef _LIBC
227 libc_hidden_def (__regexec)
228 
229 # include <shlib-compat.h>
230 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
231 
232 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
233 __typeof__ (__regexec) __compat_regexec;
234 
235 int
236 attribute_compat_text_section
__compat_regexec(const regex_t * __restrict preg,const char * __restrict string,size_t nmatch,regmatch_t pmatch[],int eflags)237 __compat_regexec (const regex_t *__restrict preg,
238 		  const char *__restrict string, size_t nmatch,
239 		  regmatch_t pmatch[], int eflags)
240 {
241   return regexec (preg, string, nmatch, pmatch,
242 		  eflags & (REG_NOTBOL | REG_NOTEOL));
243 }
244 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
245 # endif
246 #endif
247 
248 /* Entry points for GNU code.  */
249 
250 /* re_match, re_search, re_match_2, re_search_2
251 
252    The former two functions operate on STRING with length LENGTH,
253    while the later two operate on concatenation of STRING1 and STRING2
254    with lengths LENGTH1 and LENGTH2, respectively.
255 
256    re_match() matches the compiled pattern in BUFP against the string,
257    starting at index START.
258 
259    re_search() first tries matching at index START, then it tries to match
260    starting from index START + 1, and so on.  The last start position tried
261    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
262    way as re_match().)
263 
264    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
265    the first STOP characters of the concatenation of the strings should be
266    concerned.
267 
268    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
269    and all groups is stored in REGS.  (For the "_2" variants, the offsets are
270    computed relative to the concatenation, not relative to the individual
271    strings.)
272 
273    On success, re_match* functions return the length of the match, re_search*
274    return the position of the start of the match.  Return value -1 means no
275    match was found and -2 indicates an internal error.  */
276 
277 regoff_t
re_match(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,struct re_registers * regs)278 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
279 	  Idx start, struct re_registers *regs)
280 {
281   return re_search_stub (bufp, string, length, start, 0, length, regs, true);
282 }
283 #ifdef _LIBC
weak_alias(__re_match,re_match)284 weak_alias (__re_match, re_match)
285 #endif
286 
287 regoff_t
288 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
289 	   Idx start, regoff_t range, struct re_registers *regs)
290 {
291   return re_search_stub (bufp, string, length, start, range, length, regs,
292 			 false);
293 }
294 #ifdef _LIBC
weak_alias(__re_search,re_search)295 weak_alias (__re_search, re_search)
296 #endif
297 
298 regoff_t
299 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
300 	    const char *string2, Idx length2, Idx start,
301 	    struct re_registers *regs, Idx stop)
302 {
303   return re_search_2_stub (bufp, string1, length1, string2, length2,
304 			   start, 0, regs, stop, true);
305 }
306 #ifdef _LIBC
weak_alias(__re_match_2,re_match_2)307 weak_alias (__re_match_2, re_match_2)
308 #endif
309 
310 regoff_t
311 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
312 	     const char *string2, Idx length2, Idx start, regoff_t range,
313 	     struct re_registers *regs, Idx stop)
314 {
315   return re_search_2_stub (bufp, string1, length1, string2, length2,
316 			   start, range, regs, stop, false);
317 }
318 #ifdef _LIBC
weak_alias(__re_search_2,re_search_2)319 weak_alias (__re_search_2, re_search_2)
320 #endif
321 
322 static regoff_t
323 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
324 		  Idx length1, const char *string2, Idx length2, Idx start,
325 		  regoff_t range, struct re_registers *regs,
326 		  Idx stop, bool ret_len)
327 {
328   const char *str;
329   regoff_t rval;
330   Idx len;
331   char *s = NULL;
332 
333   if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
334 			 || INT_ADD_WRAPV (length1, length2, &len))))
335     return -2;
336 
337   /* Concatenate the strings.  */
338   if (length2 > 0)
339     if (length1 > 0)
340       {
341 	s = re_malloc (char, len);
342 
343 	if (__glibc_unlikely (s == NULL))
344 	  return -2;
345 #ifdef _LIBC
346 	memcpy (__mempcpy (s, string1, length1), string2, length2);
347 #else
348 	memcpy (s, string1, length1);
349 	memcpy (s + length1, string2, length2);
350 #endif
351 	str = s;
352       }
353     else
354       str = string2;
355   else
356     str = string1;
357 
358   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
359 			 ret_len);
360   re_free (s);
361   return rval;
362 }
363 
364 /* The parameters have the same meaning as those of re_search.
365    Additional parameters:
366    If RET_LEN is true the length of the match is returned (re_match style);
367    otherwise the position of the match is returned.  */
368 
369 static regoff_t
re_search_stub(struct re_pattern_buffer * bufp,const char * string,Idx length,Idx start,regoff_t range,Idx stop,struct re_registers * regs,bool ret_len)370 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
371 		Idx start, regoff_t range, Idx stop, struct re_registers *regs,
372 		bool ret_len)
373 {
374   reg_errcode_t result;
375   regmatch_t *pmatch;
376   Idx nregs;
377   regoff_t rval;
378   int eflags = 0;
379   re_dfa_t *dfa = bufp->buffer;
380   Idx last_start = start + range;
381 
382   /* Check for out-of-range.  */
383   if (__glibc_unlikely (start < 0 || start > length))
384     return -1;
385   if (__glibc_unlikely (length < last_start
386 			|| (0 <= range && last_start < start)))
387     last_start = length;
388   else if (__glibc_unlikely (last_start < 0
389 			     || (range < 0 && start <= last_start)))
390     last_start = 0;
391 
392   lock_lock (dfa->lock);
393 
394   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
395   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
396 
397   /* Compile fastmap if we haven't yet.  */
398   if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
399     re_compile_fastmap (bufp);
400 
401   if (__glibc_unlikely (bufp->no_sub))
402     regs = NULL;
403 
404   /* We need at least 1 register.  */
405   if (regs == NULL)
406     nregs = 1;
407   else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
408 			     && regs->num_regs <= bufp->re_nsub))
409     {
410       nregs = regs->num_regs;
411       if (__glibc_unlikely (nregs < 1))
412 	{
413 	  /* Nothing can be copied to regs.  */
414 	  regs = NULL;
415 	  nregs = 1;
416 	}
417     }
418   else
419     nregs = bufp->re_nsub + 1;
420   pmatch = re_malloc (regmatch_t, nregs);
421   if (__glibc_unlikely (pmatch == NULL))
422     {
423       rval = -2;
424       goto out;
425     }
426 
427   result = re_search_internal (bufp, string, length, start, last_start, stop,
428 			       nregs, pmatch, eflags);
429 
430   rval = 0;
431 
432   /* I hope we needn't fill their regs with -1's when no match was found.  */
433   if (result != REG_NOERROR)
434     rval = result == REG_NOMATCH ? -1 : -2;
435   else if (regs != NULL)
436     {
437       /* If caller wants register contents data back, copy them.  */
438       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
439 					   bufp->regs_allocated);
440       if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
441 	rval = -2;
442     }
443 
444   if (__glibc_likely (rval == 0))
445     {
446       if (ret_len)
447 	{
448 	  assert (pmatch[0].rm_so == start);
449 	  rval = pmatch[0].rm_eo - start;
450 	}
451       else
452 	rval = pmatch[0].rm_so;
453     }
454   re_free (pmatch);
455  out:
456   lock_unlock (dfa->lock);
457   return rval;
458 }
459 
460 static unsigned
re_copy_regs(struct re_registers * regs,regmatch_t * pmatch,Idx nregs,int regs_allocated)461 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
462 	      int regs_allocated)
463 {
464   int rval = REGS_REALLOCATE;
465   Idx i;
466   Idx need_regs = nregs + 1;
467   /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
468      uses.  */
469 
470   /* Have the register data arrays been allocated?  */
471   if (regs_allocated == REGS_UNALLOCATED)
472     { /* No.  So allocate them with malloc.  */
473       regs->start = re_malloc (regoff_t, need_regs);
474       if (__glibc_unlikely (regs->start == NULL))
475 	return REGS_UNALLOCATED;
476       regs->end = re_malloc (regoff_t, need_regs);
477       if (__glibc_unlikely (regs->end == NULL))
478 	{
479 	  re_free (regs->start);
480 	  return REGS_UNALLOCATED;
481 	}
482       regs->num_regs = need_regs;
483     }
484   else if (regs_allocated == REGS_REALLOCATE)
485     { /* Yes.  If we need more elements than were already
486 	 allocated, reallocate them.  If we need fewer, just
487 	 leave it alone.  */
488       if (__glibc_unlikely (need_regs > regs->num_regs))
489 	{
490 	  regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
491 	  regoff_t *new_end;
492 	  if (__glibc_unlikely (new_start == NULL))
493 	    return REGS_UNALLOCATED;
494 	  new_end = re_realloc (regs->end, regoff_t, need_regs);
495 	  if (__glibc_unlikely (new_end == NULL))
496 	    {
497 	      re_free (new_start);
498 	      return REGS_UNALLOCATED;
499 	    }
500 	  regs->start = new_start;
501 	  regs->end = new_end;
502 	  regs->num_regs = need_regs;
503 	}
504     }
505   else
506     {
507       assert (regs_allocated == REGS_FIXED);
508       /* This function may not be called with REGS_FIXED and nregs too big.  */
509       assert (regs->num_regs >= nregs);
510       rval = REGS_FIXED;
511     }
512 
513   /* Copy the regs.  */
514   for (i = 0; i < nregs; ++i)
515     {
516       regs->start[i] = pmatch[i].rm_so;
517       regs->end[i] = pmatch[i].rm_eo;
518     }
519   for ( ; i < regs->num_regs; ++i)
520     regs->start[i] = regs->end[i] = -1;
521 
522   return rval;
523 }
524 
525 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
526    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
527    this memory for recording register information.  STARTS and ENDS
528    must be allocated using the malloc library routine, and must each
529    be at least NUM_REGS * sizeof (regoff_t) bytes long.
530 
531    If NUM_REGS == 0, then subsequent matches should allocate their own
532    register data.
533 
534    Unless this function is called, the first search or match using
535    PATTERN_BUFFER will allocate its own register data, without
536    freeing the old data.  */
537 
538 void
re_set_registers(struct re_pattern_buffer * bufp,struct re_registers * regs,__re_size_t num_regs,regoff_t * starts,regoff_t * ends)539 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
540 		  __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
541 {
542   if (num_regs)
543     {
544       bufp->regs_allocated = REGS_REALLOCATE;
545       regs->num_regs = num_regs;
546       regs->start = starts;
547       regs->end = ends;
548     }
549   else
550     {
551       bufp->regs_allocated = REGS_UNALLOCATED;
552       regs->num_regs = 0;
553       regs->start = regs->end = NULL;
554     }
555 }
556 #ifdef _LIBC
weak_alias(__re_set_registers,re_set_registers)557 weak_alias (__re_set_registers, re_set_registers)
558 #endif
559 
560 /* Entry points compatible with 4.2 BSD regex library.  We don't define
561    them unless specifically requested.  */
562 
563 #if defined _REGEX_RE_COMP || defined _LIBC
564 int
565 # ifdef _LIBC
566 weak_function
567 # endif
568 re_exec (const char *s)
569 {
570   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
571 }
572 #endif /* _REGEX_RE_COMP */
573 
574 /* Internal entry point.  */
575 
576 /* Searches for a compiled pattern PREG in the string STRING, whose
577    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
578    meaning as with regexec.  LAST_START is START + RANGE, where
579    START and RANGE have the same meaning as with re_search.
580    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
581    otherwise return the error code.
582    Note: We assume front end functions already check ranges.
583    (0 <= LAST_START && LAST_START <= LENGTH)  */
584 
585 static reg_errcode_t
586 __attribute_warn_unused_result__
re_search_internal(const regex_t * preg,const char * string,Idx length,Idx start,Idx last_start,Idx stop,size_t nmatch,regmatch_t pmatch[],int eflags)587 re_search_internal (const regex_t *preg, const char *string, Idx length,
588 		    Idx start, Idx last_start, Idx stop, size_t nmatch,
589 		    regmatch_t pmatch[], int eflags)
590 {
591   reg_errcode_t err;
592   const re_dfa_t *dfa = preg->buffer;
593   Idx left_lim, right_lim;
594   int incr;
595   bool fl_longest_match;
596   int match_kind;
597   Idx match_first;
598   Idx match_last = -1;
599   Idx extra_nmatch;
600   bool sb;
601   int ch;
602 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
603   re_match_context_t mctx = { .dfa = dfa };
604 #else
605   re_match_context_t mctx;
606 #endif
607   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
608 		    && start != last_start && !preg->can_be_null)
609 		   ? preg->fastmap : NULL);
610   RE_TRANSLATE_TYPE t = preg->translate;
611 
612 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
613   memset (&mctx, '\0', sizeof (re_match_context_t));
614   mctx.dfa = dfa;
615 #endif
616 
617   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
618   nmatch -= extra_nmatch;
619 
620   /* Check if the DFA haven't been compiled.  */
621   if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
622 			|| dfa->init_state_word == NULL
623 			|| dfa->init_state_nl == NULL
624 			|| dfa->init_state_begbuf == NULL))
625     return REG_NOMATCH;
626 
627 #ifdef DEBUG
628   /* We assume front-end functions already check them.  */
629   assert (0 <= last_start && last_start <= length);
630 #endif
631 
632   /* If initial states with non-begbuf contexts have no elements,
633      the regex must be anchored.  If preg->newline_anchor is set,
634      we'll never use init_state_nl, so do not check it.  */
635   if (dfa->init_state->nodes.nelem == 0
636       && dfa->init_state_word->nodes.nelem == 0
637       && (dfa->init_state_nl->nodes.nelem == 0
638 	  || !preg->newline_anchor))
639     {
640       if (start != 0 && last_start != 0)
641         return REG_NOMATCH;
642       start = last_start = 0;
643     }
644 
645   /* We must check the longest matching, if nmatch > 0.  */
646   fl_longest_match = (nmatch != 0 || dfa->nbackref);
647 
648   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
649 			    preg->translate, (preg->syntax & RE_ICASE) != 0,
650 			    dfa);
651   if (__glibc_unlikely (err != REG_NOERROR))
652     goto free_return;
653   mctx.input.stop = stop;
654   mctx.input.raw_stop = stop;
655   mctx.input.newline_anchor = preg->newline_anchor;
656 
657   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
658   if (__glibc_unlikely (err != REG_NOERROR))
659     goto free_return;
660 
661   /* We will log all the DFA states through which the dfa pass,
662      if nmatch > 1, or this dfa has "multibyte node", which is a
663      back-reference or a node which can accept multibyte character or
664      multi character collating element.  */
665   if (nmatch > 1 || dfa->has_mb_node)
666     {
667       /* Avoid overflow.  */
668       if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
669 			     <= mctx.input.bufs_len)))
670 	{
671 	  err = REG_ESPACE;
672 	  goto free_return;
673 	}
674 
675       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
676       if (__glibc_unlikely (mctx.state_log == NULL))
677 	{
678 	  err = REG_ESPACE;
679 	  goto free_return;
680 	}
681     }
682   else
683     mctx.state_log = NULL;
684 
685   match_first = start;
686   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
687 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
688 
689   /* Check incrementally whether the input string matches.  */
690   incr = (last_start < start) ? -1 : 1;
691   left_lim = (last_start < start) ? last_start : start;
692   right_lim = (last_start < start) ? start : last_start;
693   sb = dfa->mb_cur_max == 1;
694   match_kind =
695     (fastmap
696      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
697 	| (start <= last_start ? 2 : 0)
698 	| (t != NULL ? 1 : 0))
699      : 8);
700 
701   for (;; match_first += incr)
702     {
703       err = REG_NOMATCH;
704       if (match_first < left_lim || right_lim < match_first)
705 	goto free_return;
706 
707       /* Advance as rapidly as possible through the string, until we
708 	 find a plausible place to start matching.  This may be done
709 	 with varying efficiency, so there are various possibilities:
710 	 only the most common of them are specialized, in order to
711 	 save on code size.  We use a switch statement for speed.  */
712       switch (match_kind)
713 	{
714 	case 8:
715 	  /* No fastmap.  */
716 	  break;
717 
718 	case 7:
719 	  /* Fastmap with single-byte translation, match forward.  */
720 	  while (__glibc_likely (match_first < right_lim)
721 		 && !fastmap[t[(unsigned char) string[match_first]]])
722 	    ++match_first;
723 	  goto forward_match_found_start_or_reached_end;
724 
725 	case 6:
726 	  /* Fastmap without translation, match forward.  */
727 	  while (__glibc_likely (match_first < right_lim)
728 		 && !fastmap[(unsigned char) string[match_first]])
729 	    ++match_first;
730 
731 	forward_match_found_start_or_reached_end:
732 	  if (__glibc_unlikely (match_first == right_lim))
733 	    {
734 	      ch = match_first >= length
735 		       ? 0 : (unsigned char) string[match_first];
736 	      if (!fastmap[t ? t[ch] : ch])
737 		goto free_return;
738 	    }
739 	  break;
740 
741 	case 4:
742 	case 5:
743 	  /* Fastmap without multi-byte translation, match backwards.  */
744 	  while (match_first >= left_lim)
745 	    {
746 	      ch = match_first >= length
747 		       ? 0 : (unsigned char) string[match_first];
748 	      if (fastmap[t ? t[ch] : ch])
749 		break;
750 	      --match_first;
751 	    }
752 	  if (match_first < left_lim)
753 	    goto free_return;
754 	  break;
755 
756 	default:
757 	  /* In this case, we can't determine easily the current byte,
758 	     since it might be a component byte of a multibyte
759 	     character.  Then we use the constructed buffer instead.  */
760 	  for (;;)
761 	    {
762 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
763 		 buffers.  */
764 	      __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
765 	      if (__glibc_unlikely (offset
766 				    >= (__re_size_t) mctx.input.valid_raw_len))
767 		{
768 		  err = re_string_reconstruct (&mctx.input, match_first,
769 					       eflags);
770 		  if (__glibc_unlikely (err != REG_NOERROR))
771 		    goto free_return;
772 
773 		  offset = match_first - mctx.input.raw_mbs_idx;
774 		}
775 	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
776 		 Note that MATCH_FIRST must not be smaller than 0.  */
777 	      ch = (match_first >= length
778 		    ? 0 : re_string_byte_at (&mctx.input, offset));
779 	      if (fastmap[ch])
780 		break;
781 	      match_first += incr;
782 	      if (match_first < left_lim || match_first > right_lim)
783 		{
784 		  err = REG_NOMATCH;
785 		  goto free_return;
786 		}
787 	    }
788 	  break;
789 	}
790 
791       /* Reconstruct the buffers so that the matcher can assume that
792 	 the matching starts from the beginning of the buffer.  */
793       err = re_string_reconstruct (&mctx.input, match_first, eflags);
794       if (__glibc_unlikely (err != REG_NOERROR))
795 	goto free_return;
796 
797 #ifdef RE_ENABLE_I18N
798      /* Don't consider this char as a possible match start if it part,
799 	yet isn't the head, of a multibyte character.  */
800       if (!sb && !re_string_first_byte (&mctx.input, 0))
801 	continue;
802 #endif
803 
804       /* It seems to be appropriate one, then use the matcher.  */
805       /* We assume that the matching starts from 0.  */
806       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
807       match_last = check_matching (&mctx, fl_longest_match,
808 				   start <= last_start ? &match_first : NULL);
809       if (match_last != -1)
810 	{
811 	  if (__glibc_unlikely (match_last == -2))
812 	    {
813 	      err = REG_ESPACE;
814 	      goto free_return;
815 	    }
816 	  else
817 	    {
818 	      mctx.match_last = match_last;
819 	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
820 		{
821 		  re_dfastate_t *pstate = mctx.state_log[match_last];
822 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
823 							     match_last);
824 		}
825 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
826 		  || dfa->nbackref)
827 		{
828 		  err = prune_impossible_nodes (&mctx);
829 		  if (err == REG_NOERROR)
830 		    break;
831 		  if (__glibc_unlikely (err != REG_NOMATCH))
832 		    goto free_return;
833 		  match_last = -1;
834 		}
835 	      else
836 		break; /* We found a match.  */
837 	    }
838 	}
839 
840       match_ctx_clean (&mctx);
841     }
842 
843 #ifdef DEBUG
844   assert (match_last != -1);
845   assert (err == REG_NOERROR);
846 #endif
847 
848   /* Set pmatch[] if we need.  */
849   if (nmatch > 0)
850     {
851       Idx reg_idx;
852 
853       /* Initialize registers.  */
854       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
855 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
856 
857       /* Set the points where matching start/end.  */
858       pmatch[0].rm_so = 0;
859       pmatch[0].rm_eo = mctx.match_last;
860       /* FIXME: This function should fail if mctx.match_last exceeds
861 	 the maximum possible regoff_t value.  We need a new error
862 	 code REG_OVERFLOW.  */
863 
864       if (!preg->no_sub && nmatch > 1)
865 	{
866 	  err = set_regs (preg, &mctx, nmatch, pmatch,
867 			  dfa->has_plural_match && dfa->nbackref > 0);
868 	  if (__glibc_unlikely (err != REG_NOERROR))
869 	    goto free_return;
870 	}
871 
872       /* At last, add the offset to each register, since we slid
873 	 the buffers so that we could assume that the matching starts
874 	 from 0.  */
875       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
876 	if (pmatch[reg_idx].rm_so != -1)
877 	  {
878 #ifdef RE_ENABLE_I18N
879 	    if (__glibc_unlikely (mctx.input.offsets_needed != 0))
880 	      {
881 		pmatch[reg_idx].rm_so =
882 		  (pmatch[reg_idx].rm_so == mctx.input.valid_len
883 		   ? mctx.input.valid_raw_len
884 		   : mctx.input.offsets[pmatch[reg_idx].rm_so]);
885 		pmatch[reg_idx].rm_eo =
886 		  (pmatch[reg_idx].rm_eo == mctx.input.valid_len
887 		   ? mctx.input.valid_raw_len
888 		   : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
889 	      }
890 #else
891 	    assert (mctx.input.offsets_needed == 0);
892 #endif
893 	    pmatch[reg_idx].rm_so += match_first;
894 	    pmatch[reg_idx].rm_eo += match_first;
895 	  }
896       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
897 	{
898 	  pmatch[nmatch + reg_idx].rm_so = -1;
899 	  pmatch[nmatch + reg_idx].rm_eo = -1;
900 	}
901 
902       if (dfa->subexp_map)
903 	for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
904 	  if (dfa->subexp_map[reg_idx] != reg_idx)
905 	    {
906 	      pmatch[reg_idx + 1].rm_so
907 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
908 	      pmatch[reg_idx + 1].rm_eo
909 		= pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
910 	    }
911     }
912 
913  free_return:
914   re_free (mctx.state_log);
915   if (dfa->nbackref)
916     match_ctx_free (&mctx);
917   re_string_destruct (&mctx.input);
918   return err;
919 }
920 
921 static reg_errcode_t
922 __attribute_warn_unused_result__
prune_impossible_nodes(re_match_context_t * mctx)923 prune_impossible_nodes (re_match_context_t *mctx)
924 {
925   const re_dfa_t *const dfa = mctx->dfa;
926   Idx halt_node, match_last;
927   reg_errcode_t ret;
928   re_dfastate_t **sifted_states;
929   re_dfastate_t **lim_states = NULL;
930   re_sift_context_t sctx;
931 #ifdef DEBUG
932   assert (mctx->state_log != NULL);
933 #endif
934   match_last = mctx->match_last;
935   halt_node = mctx->last_node;
936 
937   /* Avoid overflow.  */
938   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
939 			<= match_last))
940     return REG_ESPACE;
941 
942   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
943   if (__glibc_unlikely (sifted_states == NULL))
944     {
945       ret = REG_ESPACE;
946       goto free_return;
947     }
948   if (dfa->nbackref)
949     {
950       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
951       if (__glibc_unlikely (lim_states == NULL))
952 	{
953 	  ret = REG_ESPACE;
954 	  goto free_return;
955 	}
956       while (1)
957 	{
958 	  memset (lim_states, '\0',
959 		  sizeof (re_dfastate_t *) * (match_last + 1));
960 	  sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
961 			 match_last);
962 	  ret = sift_states_backward (mctx, &sctx);
963 	  re_node_set_free (&sctx.limits);
964 	  if (__glibc_unlikely (ret != REG_NOERROR))
965 	      goto free_return;
966 	  if (sifted_states[0] != NULL || lim_states[0] != NULL)
967 	    break;
968 	  do
969 	    {
970 	      --match_last;
971 	      if (match_last < 0)
972 		{
973 		  ret = REG_NOMATCH;
974 		  goto free_return;
975 		}
976 	    } while (mctx->state_log[match_last] == NULL
977 		     || !mctx->state_log[match_last]->halt);
978 	  halt_node = check_halt_state_context (mctx,
979 						mctx->state_log[match_last],
980 						match_last);
981 	}
982       ret = merge_state_array (dfa, sifted_states, lim_states,
983 			       match_last + 1);
984       re_free (lim_states);
985       lim_states = NULL;
986       if (__glibc_unlikely (ret != REG_NOERROR))
987 	goto free_return;
988     }
989   else
990     {
991       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
992       ret = sift_states_backward (mctx, &sctx);
993       re_node_set_free (&sctx.limits);
994       if (__glibc_unlikely (ret != REG_NOERROR))
995 	goto free_return;
996       if (sifted_states[0] == NULL)
997 	{
998 	  ret = REG_NOMATCH;
999 	  goto free_return;
1000 	}
1001     }
1002   re_free (mctx->state_log);
1003   mctx->state_log = sifted_states;
1004   sifted_states = NULL;
1005   mctx->last_node = halt_node;
1006   mctx->match_last = match_last;
1007   ret = REG_NOERROR;
1008  free_return:
1009   re_free (sifted_states);
1010   re_free (lim_states);
1011   return ret;
1012 }
1013 
1014 /* Acquire an initial state and return it.
1015    We must select appropriate initial state depending on the context,
1016    since initial states may have constraints like "\<", "^", etc..  */
1017 
1018 static inline re_dfastate_t *
1019 __attribute__ ((always_inline))
acquire_init_state_context(reg_errcode_t * err,const re_match_context_t * mctx,Idx idx)1020 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1021 			    Idx idx)
1022 {
1023   const re_dfa_t *const dfa = mctx->dfa;
1024   if (dfa->init_state->has_constraint)
1025     {
1026       unsigned int context;
1027       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1028       if (IS_WORD_CONTEXT (context))
1029 	return dfa->init_state_word;
1030       else if (IS_ORDINARY_CONTEXT (context))
1031 	return dfa->init_state;
1032       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1033 	return dfa->init_state_begbuf;
1034       else if (IS_NEWLINE_CONTEXT (context))
1035 	return dfa->init_state_nl;
1036       else if (IS_BEGBUF_CONTEXT (context))
1037 	{
1038 	  /* It is relatively rare case, then calculate on demand.  */
1039 	  return re_acquire_state_context (err, dfa,
1040 					   dfa->init_state->entrance_nodes,
1041 					   context);
1042 	}
1043       else
1044 	/* Must not happen?  */
1045 	return dfa->init_state;
1046     }
1047   else
1048     return dfa->init_state;
1049 }
1050 
1051 /* Check whether the regular expression match input string INPUT or not,
1052    and return the index where the matching end.  Return -1 if
1053    there is no match, and return -2 in case of an error.
1054    FL_LONGEST_MATCH means we want the POSIX longest matching.
1055    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1056    next place where we may want to try matching.
1057    Note that the matcher assumes that the matching starts from the current
1058    index of the buffer.  */
1059 
1060 static Idx
1061 __attribute_warn_unused_result__
check_matching(re_match_context_t * mctx,bool fl_longest_match,Idx * p_match_first)1062 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1063 		Idx *p_match_first)
1064 {
1065   const re_dfa_t *const dfa = mctx->dfa;
1066   reg_errcode_t err;
1067   Idx match = 0;
1068   Idx match_last = -1;
1069   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1070   re_dfastate_t *cur_state;
1071   bool at_init_state = p_match_first != NULL;
1072   Idx next_start_idx = cur_str_idx;
1073 
1074   err = REG_NOERROR;
1075   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1076   /* An initial state must not be NULL (invalid).  */
1077   if (__glibc_unlikely (cur_state == NULL))
1078     {
1079       assert (err == REG_ESPACE);
1080       return -2;
1081     }
1082 
1083   if (mctx->state_log != NULL)
1084     {
1085       mctx->state_log[cur_str_idx] = cur_state;
1086 
1087       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1088 	 later.  E.g. Processing back references.  */
1089       if (__glibc_unlikely (dfa->nbackref))
1090 	{
1091 	  at_init_state = false;
1092 	  err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1093 	  if (__glibc_unlikely (err != REG_NOERROR))
1094 	    return err;
1095 
1096 	  if (cur_state->has_backref)
1097 	    {
1098 	      err = transit_state_bkref (mctx, &cur_state->nodes);
1099 	      if (__glibc_unlikely (err != REG_NOERROR))
1100 		return err;
1101 	    }
1102 	}
1103     }
1104 
1105   /* If the RE accepts NULL string.  */
1106   if (__glibc_unlikely (cur_state->halt))
1107     {
1108       if (!cur_state->has_constraint
1109 	  || check_halt_state_context (mctx, cur_state, cur_str_idx))
1110 	{
1111 	  if (!fl_longest_match)
1112 	    return cur_str_idx;
1113 	  else
1114 	    {
1115 	      match_last = cur_str_idx;
1116 	      match = 1;
1117 	    }
1118 	}
1119     }
1120 
1121   while (!re_string_eoi (&mctx->input))
1122     {
1123       re_dfastate_t *old_state = cur_state;
1124       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1125 
1126       if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1127 	   && mctx->input.bufs_len < mctx->input.len)
1128 	  || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1129 	      && mctx->input.valid_len < mctx->input.len))
1130 	{
1131 	  err = extend_buffers (mctx, next_char_idx + 1);
1132 	  if (__glibc_unlikely (err != REG_NOERROR))
1133 	    {
1134 	      assert (err == REG_ESPACE);
1135 	      return -2;
1136 	    }
1137 	}
1138 
1139       cur_state = transit_state (&err, mctx, cur_state);
1140       if (mctx->state_log != NULL)
1141 	cur_state = merge_state_with_log (&err, mctx, cur_state);
1142 
1143       if (cur_state == NULL)
1144 	{
1145 	  /* Reached the invalid state or an error.  Try to recover a valid
1146 	     state using the state log, if available and if we have not
1147 	     already found a valid (even if not the longest) match.  */
1148 	  if (__glibc_unlikely (err != REG_NOERROR))
1149 	    return -2;
1150 
1151 	  if (mctx->state_log == NULL
1152 	      || (match && !fl_longest_match)
1153 	      || (cur_state = find_recover_state (&err, mctx)) == NULL)
1154 	    break;
1155 	}
1156 
1157       if (__glibc_unlikely (at_init_state))
1158 	{
1159 	  if (old_state == cur_state)
1160 	    next_start_idx = next_char_idx;
1161 	  else
1162 	    at_init_state = false;
1163 	}
1164 
1165       if (cur_state->halt)
1166 	{
1167 	  /* Reached a halt state.
1168 	     Check the halt state can satisfy the current context.  */
1169 	  if (!cur_state->has_constraint
1170 	      || check_halt_state_context (mctx, cur_state,
1171 					   re_string_cur_idx (&mctx->input)))
1172 	    {
1173 	      /* We found an appropriate halt state.  */
1174 	      match_last = re_string_cur_idx (&mctx->input);
1175 	      match = 1;
1176 
1177 	      /* We found a match, do not modify match_first below.  */
1178 	      p_match_first = NULL;
1179 	      if (!fl_longest_match)
1180 		break;
1181 	    }
1182 	}
1183     }
1184 
1185   if (p_match_first)
1186     *p_match_first += next_start_idx;
1187 
1188   return match_last;
1189 }
1190 
1191 /* Check NODE match the current context.  */
1192 
1193 static bool
check_halt_node_context(const re_dfa_t * dfa,Idx node,unsigned int context)1194 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1195 {
1196   re_token_type_t type = dfa->nodes[node].type;
1197   unsigned int constraint = dfa->nodes[node].constraint;
1198   if (type != END_OF_RE)
1199     return false;
1200   if (!constraint)
1201     return true;
1202   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1203     return false;
1204   return true;
1205 }
1206 
1207 /* Check the halt state STATE match the current context.
1208    Return 0 if not match, if the node, STATE has, is a halt node and
1209    match the context, return the node.  */
1210 
1211 static Idx
check_halt_state_context(const re_match_context_t * mctx,const re_dfastate_t * state,Idx idx)1212 check_halt_state_context (const re_match_context_t *mctx,
1213 			  const re_dfastate_t *state, Idx idx)
1214 {
1215   Idx i;
1216   unsigned int context;
1217 #ifdef DEBUG
1218   assert (state->halt);
1219 #endif
1220   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1221   for (i = 0; i < state->nodes.nelem; ++i)
1222     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1223       return state->nodes.elems[i];
1224   return 0;
1225 }
1226 
1227 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1228    corresponding to the DFA).
1229    Return the destination node, and update EPS_VIA_NODES;
1230    return -1 in case of errors.  */
1231 
1232 static Idx
proceed_next_node(const re_match_context_t * mctx,Idx nregs,regmatch_t * regs,Idx * pidx,Idx node,re_node_set * eps_via_nodes,struct re_fail_stack_t * fs)1233 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1234 		   Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1235 		   struct re_fail_stack_t *fs)
1236 {
1237   const re_dfa_t *const dfa = mctx->dfa;
1238   Idx i;
1239   bool ok;
1240   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1241     {
1242       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1243       re_node_set *edests = &dfa->edests[node];
1244       Idx dest_node;
1245       ok = re_node_set_insert (eps_via_nodes, node);
1246       if (__glibc_unlikely (! ok))
1247 	return -2;
1248       /* Pick up a valid destination, or return -1 if none
1249 	 is found.  */
1250       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1251 	{
1252 	  Idx candidate = edests->elems[i];
1253 	  if (!re_node_set_contains (cur_nodes, candidate))
1254 	    continue;
1255           if (dest_node == -1)
1256 	    dest_node = candidate;
1257 
1258 	  else
1259 	    {
1260 	      /* In order to avoid infinite loop like "(a*)*", return the second
1261 		 epsilon-transition if the first was already considered.  */
1262 	      if (re_node_set_contains (eps_via_nodes, dest_node))
1263 		return candidate;
1264 
1265 	      /* Otherwise, push the second epsilon-transition on the fail stack.  */
1266 	      else if (fs != NULL
1267 		       && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1268 					   eps_via_nodes))
1269 		return -2;
1270 
1271 	      /* We know we are going to exit.  */
1272 	      break;
1273 	    }
1274 	}
1275       return dest_node;
1276     }
1277   else
1278     {
1279       Idx naccepted = 0;
1280       re_token_type_t type = dfa->nodes[node].type;
1281 
1282 #ifdef RE_ENABLE_I18N
1283       if (dfa->nodes[node].accept_mb)
1284 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1285       else
1286 #endif /* RE_ENABLE_I18N */
1287       if (type == OP_BACK_REF)
1288 	{
1289 	  Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1290 	  naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1291 	  if (fs != NULL)
1292 	    {
1293 	      if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1294 		return -1;
1295 	      else if (naccepted)
1296 		{
1297 		  char *buf = (char *) re_string_get_buffer (&mctx->input);
1298 		  if (mctx->input.valid_len - *pidx < naccepted
1299 		      || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1300 				  naccepted)
1301 			  != 0))
1302 		    return -1;
1303 		}
1304 	    }
1305 
1306 	  if (naccepted == 0)
1307 	    {
1308 	      Idx dest_node;
1309 	      ok = re_node_set_insert (eps_via_nodes, node);
1310 	      if (__glibc_unlikely (! ok))
1311 		return -2;
1312 	      dest_node = dfa->edests[node].elems[0];
1313 	      if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1314 					dest_node))
1315 		return dest_node;
1316 	    }
1317 	}
1318 
1319       if (naccepted != 0
1320 	  || check_node_accept (mctx, dfa->nodes + node, *pidx))
1321 	{
1322 	  Idx dest_node = dfa->nexts[node];
1323 	  *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1324 	  if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1325 		     || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1326 					       dest_node)))
1327 	    return -1;
1328 	  re_node_set_empty (eps_via_nodes);
1329 	  return dest_node;
1330 	}
1331     }
1332   return -1;
1333 }
1334 
1335 static reg_errcode_t
1336 __attribute_warn_unused_result__
push_fail_stack(struct re_fail_stack_t * fs,Idx str_idx,Idx dest_node,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1337 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1338 		 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1339 {
1340   reg_errcode_t err;
1341   Idx num = fs->num++;
1342   if (fs->num == fs->alloc)
1343     {
1344       struct re_fail_stack_ent_t *new_array;
1345       new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1346                               fs->alloc * 2);
1347       if (new_array == NULL)
1348 	return REG_ESPACE;
1349       fs->alloc *= 2;
1350       fs->stack = new_array;
1351     }
1352   fs->stack[num].idx = str_idx;
1353   fs->stack[num].node = dest_node;
1354   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1355   if (fs->stack[num].regs == NULL)
1356     return REG_ESPACE;
1357   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1358   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1359   return err;
1360 }
1361 
1362 static Idx
pop_fail_stack(struct re_fail_stack_t * fs,Idx * pidx,Idx nregs,regmatch_t * regs,re_node_set * eps_via_nodes)1363 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1364 		regmatch_t *regs, re_node_set *eps_via_nodes)
1365 {
1366   Idx num = --fs->num;
1367   assert (num >= 0);
1368   *pidx = fs->stack[num].idx;
1369   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1370   re_node_set_free (eps_via_nodes);
1371   re_free (fs->stack[num].regs);
1372   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1373   return fs->stack[num].node;
1374 }
1375 
1376 /* Set the positions where the subexpressions are starts/ends to registers
1377    PMATCH.
1378    Note: We assume that pmatch[0] is already set, and
1379    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1380 
1381 static reg_errcode_t
1382 __attribute_warn_unused_result__
set_regs(const regex_t * preg,const re_match_context_t * mctx,size_t nmatch,regmatch_t * pmatch,bool fl_backtrack)1383 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1384 	  regmatch_t *pmatch, bool fl_backtrack)
1385 {
1386   const re_dfa_t *dfa = preg->buffer;
1387   Idx idx, cur_node;
1388   re_node_set eps_via_nodes;
1389   struct re_fail_stack_t *fs;
1390   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1391   regmatch_t *prev_idx_match;
1392   bool prev_idx_match_malloced = false;
1393 
1394 #ifdef DEBUG
1395   assert (nmatch > 1);
1396   assert (mctx->state_log != NULL);
1397 #endif
1398   if (fl_backtrack)
1399     {
1400       fs = &fs_body;
1401       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1402       if (fs->stack == NULL)
1403 	return REG_ESPACE;
1404     }
1405   else
1406     fs = NULL;
1407 
1408   cur_node = dfa->init_node;
1409   re_node_set_init_empty (&eps_via_nodes);
1410 
1411   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1412     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1413   else
1414     {
1415       prev_idx_match = re_malloc (regmatch_t, nmatch);
1416       if (prev_idx_match == NULL)
1417 	{
1418 	  free_fail_stack_return (fs);
1419 	  return REG_ESPACE;
1420 	}
1421       prev_idx_match_malloced = true;
1422     }
1423   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1424 
1425   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1426     {
1427       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1428 
1429       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1430 	{
1431 	  Idx reg_idx;
1432 	  if (fs)
1433 	    {
1434 	      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1435 		if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1436 		  break;
1437 	      if (reg_idx == nmatch)
1438 		{
1439 		  re_node_set_free (&eps_via_nodes);
1440 		  if (prev_idx_match_malloced)
1441 		    re_free (prev_idx_match);
1442 		  return free_fail_stack_return (fs);
1443 		}
1444 	      cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1445 					 &eps_via_nodes);
1446 	    }
1447 	  else
1448 	    {
1449 	      re_node_set_free (&eps_via_nodes);
1450 	      if (prev_idx_match_malloced)
1451 		re_free (prev_idx_match);
1452 	      return REG_NOERROR;
1453 	    }
1454 	}
1455 
1456       /* Proceed to next node.  */
1457       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1458 				    &eps_via_nodes, fs);
1459 
1460       if (__glibc_unlikely (cur_node < 0))
1461 	{
1462 	  if (__glibc_unlikely (cur_node == -2))
1463 	    {
1464 	      re_node_set_free (&eps_via_nodes);
1465 	      if (prev_idx_match_malloced)
1466 		re_free (prev_idx_match);
1467 	      free_fail_stack_return (fs);
1468 	      return REG_ESPACE;
1469 	    }
1470 	  if (fs)
1471 	    cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1472 				       &eps_via_nodes);
1473 	  else
1474 	    {
1475 	      re_node_set_free (&eps_via_nodes);
1476 	      if (prev_idx_match_malloced)
1477 		re_free (prev_idx_match);
1478 	      return REG_NOMATCH;
1479 	    }
1480 	}
1481     }
1482   re_node_set_free (&eps_via_nodes);
1483   if (prev_idx_match_malloced)
1484     re_free (prev_idx_match);
1485   return free_fail_stack_return (fs);
1486 }
1487 
1488 static reg_errcode_t
free_fail_stack_return(struct re_fail_stack_t * fs)1489 free_fail_stack_return (struct re_fail_stack_t *fs)
1490 {
1491   if (fs)
1492     {
1493       Idx fs_idx;
1494       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1495 	{
1496 	  re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1497 	  re_free (fs->stack[fs_idx].regs);
1498 	}
1499       re_free (fs->stack);
1500     }
1501   return REG_NOERROR;
1502 }
1503 
1504 static void
update_regs(const re_dfa_t * dfa,regmatch_t * pmatch,regmatch_t * prev_idx_match,Idx cur_node,Idx cur_idx,Idx nmatch)1505 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1506 	     regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1507 {
1508   int type = dfa->nodes[cur_node].type;
1509   if (type == OP_OPEN_SUBEXP)
1510     {
1511       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1512 
1513       /* We are at the first node of this sub expression.  */
1514       if (reg_num < nmatch)
1515 	{
1516 	  pmatch[reg_num].rm_so = cur_idx;
1517 	  pmatch[reg_num].rm_eo = -1;
1518 	}
1519     }
1520   else if (type == OP_CLOSE_SUBEXP)
1521     {
1522       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1523       if (reg_num < nmatch)
1524 	{
1525 	  /* We are at the last node of this sub expression.  */
1526 	  if (pmatch[reg_num].rm_so < cur_idx)
1527 	    {
1528 	      pmatch[reg_num].rm_eo = cur_idx;
1529 	      /* This is a non-empty match or we are not inside an optional
1530 		 subexpression.  Accept this right away.  */
1531 	      memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1532 	    }
1533 	  else
1534 	    {
1535 	      if (dfa->nodes[cur_node].opt_subexp
1536 		  && prev_idx_match[reg_num].rm_so != -1)
1537 		/* We transited through an empty match for an optional
1538 		   subexpression, like (a?)*, and this is not the subexp's
1539 		   first match.  Copy back the old content of the registers
1540 		   so that matches of an inner subexpression are undone as
1541 		   well, like in ((a?))*.  */
1542 		memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1543 	      else
1544 		/* We completed a subexpression, but it may be part of
1545 		   an optional one, so do not update PREV_IDX_MATCH.  */
1546 		pmatch[reg_num].rm_eo = cur_idx;
1547 	    }
1548 	}
1549     }
1550 }
1551 
1552 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1553    and sift the nodes in each states according to the following rules.
1554    Updated state_log will be wrote to STATE_LOG.
1555 
1556    Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1557      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1558 	If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1559 	the LAST_NODE, we throw away the node 'a'.
1560      2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1561 	string 's' and transit to 'b':
1562 	i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1563 	   away the node 'a'.
1564 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1565 	    thrown away, we throw away the node 'a'.
1566      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1567 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1568 	   node 'a'.
1569 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1570 	    we throw away the node 'a'.  */
1571 
1572 #define STATE_NODE_CONTAINS(state,node) \
1573   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1574 
1575 static reg_errcode_t
sift_states_backward(const re_match_context_t * mctx,re_sift_context_t * sctx)1576 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1577 {
1578   reg_errcode_t err;
1579   int null_cnt = 0;
1580   Idx str_idx = sctx->last_str_idx;
1581   re_node_set cur_dest;
1582 
1583 #ifdef DEBUG
1584   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1585 #endif
1586 
1587   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1588      transit to the last_node and the last_node itself.  */
1589   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1590   if (__glibc_unlikely (err != REG_NOERROR))
1591     return err;
1592   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1593   if (__glibc_unlikely (err != REG_NOERROR))
1594     goto free_return;
1595 
1596   /* Then check each states in the state_log.  */
1597   while (str_idx > 0)
1598     {
1599       /* Update counters.  */
1600       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1601       if (null_cnt > mctx->max_mb_elem_len)
1602 	{
1603 	  memset (sctx->sifted_states, '\0',
1604 		  sizeof (re_dfastate_t *) * str_idx);
1605 	  re_node_set_free (&cur_dest);
1606 	  return REG_NOERROR;
1607 	}
1608       re_node_set_empty (&cur_dest);
1609       --str_idx;
1610 
1611       if (mctx->state_log[str_idx])
1612 	{
1613 	  err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1614 	  if (__glibc_unlikely (err != REG_NOERROR))
1615 	    goto free_return;
1616 	}
1617 
1618       /* Add all the nodes which satisfy the following conditions:
1619 	 - It can epsilon transit to a node in CUR_DEST.
1620 	 - It is in CUR_SRC.
1621 	 And update state_log.  */
1622       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1623       if (__glibc_unlikely (err != REG_NOERROR))
1624 	goto free_return;
1625     }
1626   err = REG_NOERROR;
1627  free_return:
1628   re_node_set_free (&cur_dest);
1629   return err;
1630 }
1631 
1632 static reg_errcode_t
1633 __attribute_warn_unused_result__
build_sifted_states(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * cur_dest)1634 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1635 		     Idx str_idx, re_node_set *cur_dest)
1636 {
1637   const re_dfa_t *const dfa = mctx->dfa;
1638   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1639   Idx i;
1640 
1641   /* Then build the next sifted state.
1642      We build the next sifted state on 'cur_dest', and update
1643      'sifted_states[str_idx]' with 'cur_dest'.
1644      Note:
1645      'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1646      'cur_src' points the node_set of the old 'state_log[str_idx]'
1647      (with the epsilon nodes pre-filtered out).  */
1648   for (i = 0; i < cur_src->nelem; i++)
1649     {
1650       Idx prev_node = cur_src->elems[i];
1651       int naccepted = 0;
1652       bool ok;
1653 
1654 #ifdef DEBUG
1655       re_token_type_t type = dfa->nodes[prev_node].type;
1656       assert (!IS_EPSILON_NODE (type));
1657 #endif
1658 #ifdef RE_ENABLE_I18N
1659       /* If the node may accept "multi byte".  */
1660       if (dfa->nodes[prev_node].accept_mb)
1661 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1662 					 str_idx, sctx->last_str_idx);
1663 #endif /* RE_ENABLE_I18N */
1664 
1665       /* We don't check backreferences here.
1666 	 See update_cur_sifted_state().  */
1667       if (!naccepted
1668 	  && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1669 	  && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1670 				  dfa->nexts[prev_node]))
1671 	naccepted = 1;
1672 
1673       if (naccepted == 0)
1674 	continue;
1675 
1676       if (sctx->limits.nelem)
1677 	{
1678 	  Idx to_idx = str_idx + naccepted;
1679 	  if (check_dst_limits (mctx, &sctx->limits,
1680 				dfa->nexts[prev_node], to_idx,
1681 				prev_node, str_idx))
1682 	    continue;
1683 	}
1684       ok = re_node_set_insert (cur_dest, prev_node);
1685       if (__glibc_unlikely (! ok))
1686 	return REG_ESPACE;
1687     }
1688 
1689   return REG_NOERROR;
1690 }
1691 
1692 /* Helper functions.  */
1693 
1694 static reg_errcode_t
clean_state_log_if_needed(re_match_context_t * mctx,Idx next_state_log_idx)1695 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1696 {
1697   Idx top = mctx->state_log_top;
1698 
1699   if ((next_state_log_idx >= mctx->input.bufs_len
1700        && mctx->input.bufs_len < mctx->input.len)
1701       || (next_state_log_idx >= mctx->input.valid_len
1702 	  && mctx->input.valid_len < mctx->input.len))
1703     {
1704       reg_errcode_t err;
1705       err = extend_buffers (mctx, next_state_log_idx + 1);
1706       if (__glibc_unlikely (err != REG_NOERROR))
1707 	return err;
1708     }
1709 
1710   if (top < next_state_log_idx)
1711     {
1712       memset (mctx->state_log + top + 1, '\0',
1713 	      sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1714       mctx->state_log_top = next_state_log_idx;
1715     }
1716   return REG_NOERROR;
1717 }
1718 
1719 static reg_errcode_t
merge_state_array(const re_dfa_t * dfa,re_dfastate_t ** dst,re_dfastate_t ** src,Idx num)1720 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1721 		   re_dfastate_t **src, Idx num)
1722 {
1723   Idx st_idx;
1724   reg_errcode_t err;
1725   for (st_idx = 0; st_idx < num; ++st_idx)
1726     {
1727       if (dst[st_idx] == NULL)
1728 	dst[st_idx] = src[st_idx];
1729       else if (src[st_idx] != NULL)
1730 	{
1731 	  re_node_set merged_set;
1732 	  err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1733 					&src[st_idx]->nodes);
1734 	  if (__glibc_unlikely (err != REG_NOERROR))
1735 	    return err;
1736 	  dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1737 	  re_node_set_free (&merged_set);
1738 	  if (__glibc_unlikely (err != REG_NOERROR))
1739 	    return err;
1740 	}
1741     }
1742   return REG_NOERROR;
1743 }
1744 
1745 static reg_errcode_t
update_cur_sifted_state(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,re_node_set * dest_nodes)1746 update_cur_sifted_state (const re_match_context_t *mctx,
1747 			 re_sift_context_t *sctx, Idx str_idx,
1748 			 re_node_set *dest_nodes)
1749 {
1750   const re_dfa_t *const dfa = mctx->dfa;
1751   reg_errcode_t err = REG_NOERROR;
1752   const re_node_set *candidates;
1753   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1754 		: &mctx->state_log[str_idx]->nodes);
1755 
1756   if (dest_nodes->nelem == 0)
1757     sctx->sifted_states[str_idx] = NULL;
1758   else
1759     {
1760       if (candidates)
1761 	{
1762 	  /* At first, add the nodes which can epsilon transit to a node in
1763 	     DEST_NODE.  */
1764 	  err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1765 	  if (__glibc_unlikely (err != REG_NOERROR))
1766 	    return err;
1767 
1768 	  /* Then, check the limitations in the current sift_context.  */
1769 	  if (sctx->limits.nelem)
1770 	    {
1771 	      err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1772 					 mctx->bkref_ents, str_idx);
1773 	      if (__glibc_unlikely (err != REG_NOERROR))
1774 		return err;
1775 	    }
1776 	}
1777 
1778       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1779       if (__glibc_unlikely (err != REG_NOERROR))
1780 	return err;
1781     }
1782 
1783   if (candidates && mctx->state_log[str_idx]->has_backref)
1784     {
1785       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1786       if (__glibc_unlikely (err != REG_NOERROR))
1787 	return err;
1788     }
1789   return REG_NOERROR;
1790 }
1791 
1792 static reg_errcode_t
1793 __attribute_warn_unused_result__
add_epsilon_src_nodes(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates)1794 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1795 		       const re_node_set *candidates)
1796 {
1797   reg_errcode_t err = REG_NOERROR;
1798   Idx i;
1799 
1800   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1801   if (__glibc_unlikely (err != REG_NOERROR))
1802     return err;
1803 
1804   if (!state->inveclosure.alloc)
1805     {
1806       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1807       if (__glibc_unlikely (err != REG_NOERROR))
1808 	return REG_ESPACE;
1809       for (i = 0; i < dest_nodes->nelem; i++)
1810 	{
1811 	  err = re_node_set_merge (&state->inveclosure,
1812 				   dfa->inveclosures + dest_nodes->elems[i]);
1813 	  if (__glibc_unlikely (err != REG_NOERROR))
1814 	    return REG_ESPACE;
1815 	}
1816     }
1817   return re_node_set_add_intersect (dest_nodes, candidates,
1818 				    &state->inveclosure);
1819 }
1820 
1821 static reg_errcode_t
sub_epsilon_src_nodes(const re_dfa_t * dfa,Idx node,re_node_set * dest_nodes,const re_node_set * candidates)1822 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1823 		       const re_node_set *candidates)
1824 {
1825     Idx ecl_idx;
1826     reg_errcode_t err;
1827     re_node_set *inv_eclosure = dfa->inveclosures + node;
1828     re_node_set except_nodes;
1829     re_node_set_init_empty (&except_nodes);
1830     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1831       {
1832 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1833 	if (cur_node == node)
1834 	  continue;
1835 	if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1836 	  {
1837 	    Idx edst1 = dfa->edests[cur_node].elems[0];
1838 	    Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1839 			 ? dfa->edests[cur_node].elems[1] : -1);
1840 	    if ((!re_node_set_contains (inv_eclosure, edst1)
1841 		 && re_node_set_contains (dest_nodes, edst1))
1842 		|| (edst2 > 0
1843 		    && !re_node_set_contains (inv_eclosure, edst2)
1844 		    && re_node_set_contains (dest_nodes, edst2)))
1845 	      {
1846 		err = re_node_set_add_intersect (&except_nodes, candidates,
1847 						 dfa->inveclosures + cur_node);
1848 		if (__glibc_unlikely (err != REG_NOERROR))
1849 		  {
1850 		    re_node_set_free (&except_nodes);
1851 		    return err;
1852 		  }
1853 	      }
1854 	  }
1855       }
1856     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1857       {
1858 	Idx cur_node = inv_eclosure->elems[ecl_idx];
1859 	if (!re_node_set_contains (&except_nodes, cur_node))
1860 	  {
1861 	    Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1862 	    re_node_set_remove_at (dest_nodes, idx);
1863 	  }
1864       }
1865     re_node_set_free (&except_nodes);
1866     return REG_NOERROR;
1867 }
1868 
1869 static bool
check_dst_limits(const re_match_context_t * mctx,const re_node_set * limits,Idx dst_node,Idx dst_idx,Idx src_node,Idx src_idx)1870 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1871 		  Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1872 {
1873   const re_dfa_t *const dfa = mctx->dfa;
1874   Idx lim_idx, src_pos, dst_pos;
1875 
1876   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1877   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1878   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1879     {
1880       Idx subexp_idx;
1881       struct re_backref_cache_entry *ent;
1882       ent = mctx->bkref_ents + limits->elems[lim_idx];
1883       subexp_idx = dfa->nodes[ent->node].opr.idx;
1884 
1885       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1886 					   subexp_idx, dst_node, dst_idx,
1887 					   dst_bkref_idx);
1888       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1889 					   subexp_idx, src_node, src_idx,
1890 					   src_bkref_idx);
1891 
1892       /* In case of:
1893 	 <src> <dst> ( <subexp> )
1894 	 ( <subexp> ) <src> <dst>
1895 	 ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1896       if (src_pos == dst_pos)
1897 	continue; /* This is unrelated limitation.  */
1898       else
1899 	return true;
1900     }
1901   return false;
1902 }
1903 
1904 static int
check_dst_limits_calc_pos_1(const re_match_context_t * mctx,int boundaries,Idx subexp_idx,Idx from_node,Idx bkref_idx)1905 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1906 			     Idx subexp_idx, Idx from_node, Idx bkref_idx)
1907 {
1908   const re_dfa_t *const dfa = mctx->dfa;
1909   const re_node_set *eclosures = dfa->eclosures + from_node;
1910   Idx node_idx;
1911 
1912   /* Else, we are on the boundary: examine the nodes on the epsilon
1913      closure.  */
1914   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1915     {
1916       Idx node = eclosures->elems[node_idx];
1917       switch (dfa->nodes[node].type)
1918 	{
1919 	case OP_BACK_REF:
1920 	  if (bkref_idx != -1)
1921 	    {
1922 	      struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1923 	      do
1924 		{
1925 		  Idx dst;
1926 		  int cpos;
1927 
1928 		  if (ent->node != node)
1929 		    continue;
1930 
1931 		  if (subexp_idx < BITSET_WORD_BITS
1932 		      && !(ent->eps_reachable_subexps_map
1933 			   & ((bitset_word_t) 1 << subexp_idx)))
1934 		    continue;
1935 
1936 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
1937 		     OP_CLOSE_SUBEXP cases below.  But, if the
1938 		     destination node is the same node as the source
1939 		     node, don't recurse because it would cause an
1940 		     infinite loop: a regex that exhibits this behavior
1941 		     is ()\1*\1*  */
1942 		  dst = dfa->edests[node].elems[0];
1943 		  if (dst == from_node)
1944 		    {
1945 		      if (boundaries & 1)
1946 			return -1;
1947 		      else /* if (boundaries & 2) */
1948 			return 0;
1949 		    }
1950 
1951 		  cpos =
1952 		    check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1953 						 dst, bkref_idx);
1954 		  if (cpos == -1 /* && (boundaries & 1) */)
1955 		    return -1;
1956 		  if (cpos == 0 && (boundaries & 2))
1957 		    return 0;
1958 
1959 		  if (subexp_idx < BITSET_WORD_BITS)
1960 		    ent->eps_reachable_subexps_map
1961 		      &= ~((bitset_word_t) 1 << subexp_idx);
1962 		}
1963 	      while (ent++->more);
1964 	    }
1965 	  break;
1966 
1967 	case OP_OPEN_SUBEXP:
1968 	  if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1969 	    return -1;
1970 	  break;
1971 
1972 	case OP_CLOSE_SUBEXP:
1973 	  if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1974 	    return 0;
1975 	  break;
1976 
1977 	default:
1978 	    break;
1979 	}
1980     }
1981 
1982   return (boundaries & 2) ? 1 : 0;
1983 }
1984 
1985 static int
check_dst_limits_calc_pos(const re_match_context_t * mctx,Idx limit,Idx subexp_idx,Idx from_node,Idx str_idx,Idx bkref_idx)1986 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1987 			   Idx subexp_idx, Idx from_node, Idx str_idx,
1988 			   Idx bkref_idx)
1989 {
1990   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1991   int boundaries;
1992 
1993   /* If we are outside the range of the subexpression, return -1 or 1.  */
1994   if (str_idx < lim->subexp_from)
1995     return -1;
1996 
1997   if (lim->subexp_to < str_idx)
1998     return 1;
1999 
2000   /* If we are within the subexpression, return 0.  */
2001   boundaries = (str_idx == lim->subexp_from);
2002   boundaries |= (str_idx == lim->subexp_to) << 1;
2003   if (boundaries == 0)
2004     return 0;
2005 
2006   /* Else, examine epsilon closure.  */
2007   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2008 				      from_node, bkref_idx);
2009 }
2010 
2011 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2012    which are against limitations from DEST_NODES. */
2013 
2014 static reg_errcode_t
check_subexp_limits(const re_dfa_t * dfa,re_node_set * dest_nodes,const re_node_set * candidates,re_node_set * limits,struct re_backref_cache_entry * bkref_ents,Idx str_idx)2015 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2016 		     const re_node_set *candidates, re_node_set *limits,
2017 		     struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2018 {
2019   reg_errcode_t err;
2020   Idx node_idx, lim_idx;
2021 
2022   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2023     {
2024       Idx subexp_idx;
2025       struct re_backref_cache_entry *ent;
2026       ent = bkref_ents + limits->elems[lim_idx];
2027 
2028       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2029 	continue; /* This is unrelated limitation.  */
2030 
2031       subexp_idx = dfa->nodes[ent->node].opr.idx;
2032       if (ent->subexp_to == str_idx)
2033 	{
2034 	  Idx ops_node = -1;
2035 	  Idx cls_node = -1;
2036 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2037 	    {
2038 	      Idx node = dest_nodes->elems[node_idx];
2039 	      re_token_type_t type = dfa->nodes[node].type;
2040 	      if (type == OP_OPEN_SUBEXP
2041 		  && subexp_idx == dfa->nodes[node].opr.idx)
2042 		ops_node = node;
2043 	      else if (type == OP_CLOSE_SUBEXP
2044 		       && subexp_idx == dfa->nodes[node].opr.idx)
2045 		cls_node = node;
2046 	    }
2047 
2048 	  /* Check the limitation of the open subexpression.  */
2049 	  /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2050 	  if (ops_node >= 0)
2051 	    {
2052 	      err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2053 					   candidates);
2054 	      if (__glibc_unlikely (err != REG_NOERROR))
2055 		return err;
2056 	    }
2057 
2058 	  /* Check the limitation of the close subexpression.  */
2059 	  if (cls_node >= 0)
2060 	    for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2061 	      {
2062 		Idx node = dest_nodes->elems[node_idx];
2063 		if (!re_node_set_contains (dfa->inveclosures + node,
2064 					   cls_node)
2065 		    && !re_node_set_contains (dfa->eclosures + node,
2066 					      cls_node))
2067 		  {
2068 		    /* It is against this limitation.
2069 		       Remove it form the current sifted state.  */
2070 		    err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2071 						 candidates);
2072 		    if (__glibc_unlikely (err != REG_NOERROR))
2073 		      return err;
2074 		    --node_idx;
2075 		  }
2076 	      }
2077 	}
2078       else /* (ent->subexp_to != str_idx)  */
2079 	{
2080 	  for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2081 	    {
2082 	      Idx node = dest_nodes->elems[node_idx];
2083 	      re_token_type_t type = dfa->nodes[node].type;
2084 	      if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2085 		{
2086 		  if (subexp_idx != dfa->nodes[node].opr.idx)
2087 		    continue;
2088 		  /* It is against this limitation.
2089 		     Remove it form the current sifted state.  */
2090 		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2091 					       candidates);
2092 		  if (__glibc_unlikely (err != REG_NOERROR))
2093 		    return err;
2094 		}
2095 	    }
2096 	}
2097     }
2098   return REG_NOERROR;
2099 }
2100 
2101 static reg_errcode_t
2102 __attribute_warn_unused_result__
sift_states_bkref(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx str_idx,const re_node_set * candidates)2103 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2104 		   Idx str_idx, const re_node_set *candidates)
2105 {
2106   const re_dfa_t *const dfa = mctx->dfa;
2107   reg_errcode_t err;
2108   Idx node_idx, node;
2109   re_sift_context_t local_sctx;
2110   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2111 
2112   if (first_idx == -1)
2113     return REG_NOERROR;
2114 
2115   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2116 
2117   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2118     {
2119       Idx enabled_idx;
2120       re_token_type_t type;
2121       struct re_backref_cache_entry *entry;
2122       node = candidates->elems[node_idx];
2123       type = dfa->nodes[node].type;
2124       /* Avoid infinite loop for the REs like "()\1+".  */
2125       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2126 	continue;
2127       if (type != OP_BACK_REF)
2128 	continue;
2129 
2130       entry = mctx->bkref_ents + first_idx;
2131       enabled_idx = first_idx;
2132       do
2133 	{
2134 	  Idx subexp_len;
2135 	  Idx to_idx;
2136 	  Idx dst_node;
2137 	  bool ok;
2138 	  re_dfastate_t *cur_state;
2139 
2140 	  if (entry->node != node)
2141 	    continue;
2142 	  subexp_len = entry->subexp_to - entry->subexp_from;
2143 	  to_idx = str_idx + subexp_len;
2144 	  dst_node = (subexp_len ? dfa->nexts[node]
2145 		      : dfa->edests[node].elems[0]);
2146 
2147 	  if (to_idx > sctx->last_str_idx
2148 	      || sctx->sifted_states[to_idx] == NULL
2149 	      || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2150 	      || check_dst_limits (mctx, &sctx->limits, node,
2151 				   str_idx, dst_node, to_idx))
2152 	    continue;
2153 
2154 	  if (local_sctx.sifted_states == NULL)
2155 	    {
2156 	      local_sctx = *sctx;
2157 	      err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2158 	      if (__glibc_unlikely (err != REG_NOERROR))
2159 		goto free_return;
2160 	    }
2161 	  local_sctx.last_node = node;
2162 	  local_sctx.last_str_idx = str_idx;
2163 	  ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2164 	  if (__glibc_unlikely (! ok))
2165 	    {
2166 	      err = REG_ESPACE;
2167 	      goto free_return;
2168 	    }
2169 	  cur_state = local_sctx.sifted_states[str_idx];
2170 	  err = sift_states_backward (mctx, &local_sctx);
2171 	  if (__glibc_unlikely (err != REG_NOERROR))
2172 	    goto free_return;
2173 	  if (sctx->limited_states != NULL)
2174 	    {
2175 	      err = merge_state_array (dfa, sctx->limited_states,
2176 				       local_sctx.sifted_states,
2177 				       str_idx + 1);
2178 	      if (__glibc_unlikely (err != REG_NOERROR))
2179 		goto free_return;
2180 	    }
2181 	  local_sctx.sifted_states[str_idx] = cur_state;
2182 	  re_node_set_remove (&local_sctx.limits, enabled_idx);
2183 
2184 	  /* mctx->bkref_ents may have changed, reload the pointer.  */
2185 	  entry = mctx->bkref_ents + enabled_idx;
2186 	}
2187       while (enabled_idx++, entry++->more);
2188     }
2189   err = REG_NOERROR;
2190  free_return:
2191   if (local_sctx.sifted_states != NULL)
2192     {
2193       re_node_set_free (&local_sctx.limits);
2194     }
2195 
2196   return err;
2197 }
2198 
2199 
2200 #ifdef RE_ENABLE_I18N
2201 static int
sift_states_iter_mb(const re_match_context_t * mctx,re_sift_context_t * sctx,Idx node_idx,Idx str_idx,Idx max_str_idx)2202 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2203 		     Idx node_idx, Idx str_idx, Idx max_str_idx)
2204 {
2205   const re_dfa_t *const dfa = mctx->dfa;
2206   int naccepted;
2207   /* Check the node can accept "multi byte".  */
2208   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2209   if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2210       && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2211 			       dfa->nexts[node_idx]))
2212     /* The node can't accept the "multi byte", or the
2213        destination was already thrown away, then the node
2214        couldn't accept the current input "multi byte".   */
2215     naccepted = 0;
2216   /* Otherwise, it is sure that the node could accept
2217      'naccepted' bytes input.  */
2218   return naccepted;
2219 }
2220 #endif /* RE_ENABLE_I18N */
2221 
2222 
2223 /* Functions for state transition.  */
2224 
2225 /* Return the next state to which the current state STATE will transit by
2226    accepting the current input byte, and update STATE_LOG if necessary.
2227    If STATE can accept a multibyte char/collating element/back reference
2228    update the destination of STATE_LOG.  */
2229 
2230 static re_dfastate_t *
2231 __attribute_warn_unused_result__
transit_state(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * state)2232 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2233 	       re_dfastate_t *state)
2234 {
2235   re_dfastate_t **trtable;
2236   unsigned char ch;
2237 
2238 #ifdef RE_ENABLE_I18N
2239   /* If the current state can accept multibyte.  */
2240   if (__glibc_unlikely (state->accept_mb))
2241     {
2242       *err = transit_state_mb (mctx, state);
2243       if (__glibc_unlikely (*err != REG_NOERROR))
2244 	return NULL;
2245     }
2246 #endif /* RE_ENABLE_I18N */
2247 
2248   /* Then decide the next state with the single byte.  */
2249 #if 0
2250   if (0)
2251     /* don't use transition table  */
2252     return transit_state_sb (err, mctx, state);
2253 #endif
2254 
2255   /* Use transition table  */
2256   ch = re_string_fetch_byte (&mctx->input);
2257   for (;;)
2258     {
2259       trtable = state->trtable;
2260       if (__glibc_likely (trtable != NULL))
2261 	return trtable[ch];
2262 
2263       trtable = state->word_trtable;
2264       if (__glibc_likely (trtable != NULL))
2265 	{
2266 	  unsigned int context;
2267 	  context
2268 	    = re_string_context_at (&mctx->input,
2269 				    re_string_cur_idx (&mctx->input) - 1,
2270 				    mctx->eflags);
2271 	  if (IS_WORD_CONTEXT (context))
2272 	    return trtable[ch + SBC_MAX];
2273 	  else
2274 	    return trtable[ch];
2275 	}
2276 
2277       if (!build_trtable (mctx->dfa, state))
2278 	{
2279 	  *err = REG_ESPACE;
2280 	  return NULL;
2281 	}
2282 
2283       /* Retry, we now have a transition table.  */
2284     }
2285 }
2286 
2287 /* Update the state_log if we need */
2288 static re_dfastate_t *
merge_state_with_log(reg_errcode_t * err,re_match_context_t * mctx,re_dfastate_t * next_state)2289 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2290 		      re_dfastate_t *next_state)
2291 {
2292   const re_dfa_t *const dfa = mctx->dfa;
2293   Idx cur_idx = re_string_cur_idx (&mctx->input);
2294 
2295   if (cur_idx > mctx->state_log_top)
2296     {
2297       mctx->state_log[cur_idx] = next_state;
2298       mctx->state_log_top = cur_idx;
2299     }
2300   else if (mctx->state_log[cur_idx] == 0)
2301     {
2302       mctx->state_log[cur_idx] = next_state;
2303     }
2304   else
2305     {
2306       re_dfastate_t *pstate;
2307       unsigned int context;
2308       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2309       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2310 	 the destination of a multibyte char/collating element/
2311 	 back reference.  Then the next state is the union set of
2312 	 these destinations and the results of the transition table.  */
2313       pstate = mctx->state_log[cur_idx];
2314       log_nodes = pstate->entrance_nodes;
2315       if (next_state != NULL)
2316 	{
2317 	  table_nodes = next_state->entrance_nodes;
2318 	  *err = re_node_set_init_union (&next_nodes, table_nodes,
2319 					     log_nodes);
2320 	  if (__glibc_unlikely (*err != REG_NOERROR))
2321 	    return NULL;
2322 	}
2323       else
2324 	next_nodes = *log_nodes;
2325       /* Note: We already add the nodes of the initial state,
2326 	 then we don't need to add them here.  */
2327 
2328       context = re_string_context_at (&mctx->input,
2329 				      re_string_cur_idx (&mctx->input) - 1,
2330 				      mctx->eflags);
2331       next_state = mctx->state_log[cur_idx]
2332 	= re_acquire_state_context (err, dfa, &next_nodes, context);
2333       /* We don't need to check errors here, since the return value of
2334 	 this function is next_state and ERR is already set.  */
2335 
2336       if (table_nodes != NULL)
2337 	re_node_set_free (&next_nodes);
2338     }
2339 
2340   if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2341     {
2342       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2343 	 later.  We must check them here, since the back references in the
2344 	 next state might use them.  */
2345       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2346 					cur_idx);
2347       if (__glibc_unlikely (*err != REG_NOERROR))
2348 	return NULL;
2349 
2350       /* If the next state has back references.  */
2351       if (next_state->has_backref)
2352 	{
2353 	  *err = transit_state_bkref (mctx, &next_state->nodes);
2354 	  if (__glibc_unlikely (*err != REG_NOERROR))
2355 	    return NULL;
2356 	  next_state = mctx->state_log[cur_idx];
2357 	}
2358     }
2359 
2360   return next_state;
2361 }
2362 
2363 /* Skip bytes in the input that correspond to part of a
2364    multi-byte match, then look in the log for a state
2365    from which to restart matching.  */
2366 static re_dfastate_t *
find_recover_state(reg_errcode_t * err,re_match_context_t * mctx)2367 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2368 {
2369   re_dfastate_t *cur_state;
2370   do
2371     {
2372       Idx max = mctx->state_log_top;
2373       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2374 
2375       do
2376 	{
2377 	  if (++cur_str_idx > max)
2378 	    return NULL;
2379 	  re_string_skip_bytes (&mctx->input, 1);
2380 	}
2381       while (mctx->state_log[cur_str_idx] == NULL);
2382 
2383       cur_state = merge_state_with_log (err, mctx, NULL);
2384     }
2385   while (*err == REG_NOERROR && cur_state == NULL);
2386   return cur_state;
2387 }
2388 
2389 /* Helper functions for transit_state.  */
2390 
2391 /* From the node set CUR_NODES, pick up the nodes whose types are
2392    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2393    expression. And register them to use them later for evaluating the
2394    corresponding back references.  */
2395 
2396 static reg_errcode_t
check_subexp_matching_top(re_match_context_t * mctx,re_node_set * cur_nodes,Idx str_idx)2397 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2398 			   Idx str_idx)
2399 {
2400   const re_dfa_t *const dfa = mctx->dfa;
2401   Idx node_idx;
2402   reg_errcode_t err;
2403 
2404   /* TODO: This isn't efficient.
2405 	   Because there might be more than one nodes whose types are
2406 	   OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2407 	   nodes.
2408 	   E.g. RE: (a){2}  */
2409   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2410     {
2411       Idx node = cur_nodes->elems[node_idx];
2412       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2413 	  && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2414 	  && (dfa->used_bkref_map
2415 	      & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2416 	{
2417 	  err = match_ctx_add_subtop (mctx, node, str_idx);
2418 	  if (__glibc_unlikely (err != REG_NOERROR))
2419 	    return err;
2420 	}
2421     }
2422   return REG_NOERROR;
2423 }
2424 
2425 #if 0
2426 /* Return the next state to which the current state STATE will transit by
2427    accepting the current input byte.  */
2428 
2429 static re_dfastate_t *
2430 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2431 		  re_dfastate_t *state)
2432 {
2433   const re_dfa_t *const dfa = mctx->dfa;
2434   re_node_set next_nodes;
2435   re_dfastate_t *next_state;
2436   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2437   unsigned int context;
2438 
2439   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2440   if (__glibc_unlikely (*err != REG_NOERROR))
2441     return NULL;
2442   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2443     {
2444       Idx cur_node = state->nodes.elems[node_cnt];
2445       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2446 	{
2447 	  *err = re_node_set_merge (&next_nodes,
2448 				    dfa->eclosures + dfa->nexts[cur_node]);
2449 	  if (__glibc_unlikely (*err != REG_NOERROR))
2450 	    {
2451 	      re_node_set_free (&next_nodes);
2452 	      return NULL;
2453 	    }
2454 	}
2455     }
2456   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2457   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2458   /* We don't need to check errors here, since the return value of
2459      this function is next_state and ERR is already set.  */
2460 
2461   re_node_set_free (&next_nodes);
2462   re_string_skip_bytes (&mctx->input, 1);
2463   return next_state;
2464 }
2465 #endif
2466 
2467 #ifdef RE_ENABLE_I18N
2468 static reg_errcode_t
transit_state_mb(re_match_context_t * mctx,re_dfastate_t * pstate)2469 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2470 {
2471   const re_dfa_t *const dfa = mctx->dfa;
2472   reg_errcode_t err;
2473   Idx i;
2474 
2475   for (i = 0; i < pstate->nodes.nelem; ++i)
2476     {
2477       re_node_set dest_nodes, *new_nodes;
2478       Idx cur_node_idx = pstate->nodes.elems[i];
2479       int naccepted;
2480       Idx dest_idx;
2481       unsigned int context;
2482       re_dfastate_t *dest_state;
2483 
2484       if (!dfa->nodes[cur_node_idx].accept_mb)
2485 	continue;
2486 
2487       if (dfa->nodes[cur_node_idx].constraint)
2488 	{
2489 	  context = re_string_context_at (&mctx->input,
2490 					  re_string_cur_idx (&mctx->input),
2491 					  mctx->eflags);
2492 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2493 					   context))
2494 	    continue;
2495 	}
2496 
2497       /* How many bytes the node can accept?  */
2498       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2499 					   re_string_cur_idx (&mctx->input));
2500       if (naccepted == 0)
2501 	continue;
2502 
2503       /* The node can accepts 'naccepted' bytes.  */
2504       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2505       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2506 			       : mctx->max_mb_elem_len);
2507       err = clean_state_log_if_needed (mctx, dest_idx);
2508       if (__glibc_unlikely (err != REG_NOERROR))
2509 	return err;
2510 #ifdef DEBUG
2511       assert (dfa->nexts[cur_node_idx] != -1);
2512 #endif
2513       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2514 
2515       dest_state = mctx->state_log[dest_idx];
2516       if (dest_state == NULL)
2517 	dest_nodes = *new_nodes;
2518       else
2519 	{
2520 	  err = re_node_set_init_union (&dest_nodes,
2521 					dest_state->entrance_nodes, new_nodes);
2522 	  if (__glibc_unlikely (err != REG_NOERROR))
2523 	    return err;
2524 	}
2525       context = re_string_context_at (&mctx->input, dest_idx - 1,
2526 				      mctx->eflags);
2527       mctx->state_log[dest_idx]
2528 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2529       if (dest_state != NULL)
2530 	re_node_set_free (&dest_nodes);
2531       if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2532 			    && err != REG_NOERROR))
2533 	return err;
2534     }
2535   return REG_NOERROR;
2536 }
2537 #endif /* RE_ENABLE_I18N */
2538 
2539 static reg_errcode_t
transit_state_bkref(re_match_context_t * mctx,const re_node_set * nodes)2540 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2541 {
2542   const re_dfa_t *const dfa = mctx->dfa;
2543   reg_errcode_t err;
2544   Idx i;
2545   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2546 
2547   for (i = 0; i < nodes->nelem; ++i)
2548     {
2549       Idx dest_str_idx, prev_nelem, bkc_idx;
2550       Idx node_idx = nodes->elems[i];
2551       unsigned int context;
2552       const re_token_t *node = dfa->nodes + node_idx;
2553       re_node_set *new_dest_nodes;
2554 
2555       /* Check whether 'node' is a backreference or not.  */
2556       if (node->type != OP_BACK_REF)
2557 	continue;
2558 
2559       if (node->constraint)
2560 	{
2561 	  context = re_string_context_at (&mctx->input, cur_str_idx,
2562 					  mctx->eflags);
2563 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2564 	    continue;
2565 	}
2566 
2567       /* 'node' is a backreference.
2568 	 Check the substring which the substring matched.  */
2569       bkc_idx = mctx->nbkref_ents;
2570       err = get_subexp (mctx, node_idx, cur_str_idx);
2571       if (__glibc_unlikely (err != REG_NOERROR))
2572 	goto free_return;
2573 
2574       /* And add the epsilon closures (which is 'new_dest_nodes') of
2575 	 the backreference to appropriate state_log.  */
2576 #ifdef DEBUG
2577       assert (dfa->nexts[node_idx] != -1);
2578 #endif
2579       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2580 	{
2581 	  Idx subexp_len;
2582 	  re_dfastate_t *dest_state;
2583 	  struct re_backref_cache_entry *bkref_ent;
2584 	  bkref_ent = mctx->bkref_ents + bkc_idx;
2585 	  if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2586 	    continue;
2587 	  subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2588 	  new_dest_nodes = (subexp_len == 0
2589 			    ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2590 			    : dfa->eclosures + dfa->nexts[node_idx]);
2591 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2592 			  - bkref_ent->subexp_from);
2593 	  context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2594 					  mctx->eflags);
2595 	  dest_state = mctx->state_log[dest_str_idx];
2596 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2597 			: mctx->state_log[cur_str_idx]->nodes.nelem);
2598 	  /* Add 'new_dest_node' to state_log.  */
2599 	  if (dest_state == NULL)
2600 	    {
2601 	      mctx->state_log[dest_str_idx]
2602 		= re_acquire_state_context (&err, dfa, new_dest_nodes,
2603 					    context);
2604 	      if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2605 				    && err != REG_NOERROR))
2606 		goto free_return;
2607 	    }
2608 	  else
2609 	    {
2610 	      re_node_set dest_nodes;
2611 	      err = re_node_set_init_union (&dest_nodes,
2612 					    dest_state->entrance_nodes,
2613 					    new_dest_nodes);
2614 	      if (__glibc_unlikely (err != REG_NOERROR))
2615 		{
2616 		  re_node_set_free (&dest_nodes);
2617 		  goto free_return;
2618 		}
2619 	      mctx->state_log[dest_str_idx]
2620 		= re_acquire_state_context (&err, dfa, &dest_nodes, context);
2621 	      re_node_set_free (&dest_nodes);
2622 	      if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2623 				    && err != REG_NOERROR))
2624 		goto free_return;
2625 	    }
2626 	  /* We need to check recursively if the backreference can epsilon
2627 	     transit.  */
2628 	  if (subexp_len == 0
2629 	      && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2630 	    {
2631 	      err = check_subexp_matching_top (mctx, new_dest_nodes,
2632 					       cur_str_idx);
2633 	      if (__glibc_unlikely (err != REG_NOERROR))
2634 		goto free_return;
2635 	      err = transit_state_bkref (mctx, new_dest_nodes);
2636 	      if (__glibc_unlikely (err != REG_NOERROR))
2637 		goto free_return;
2638 	    }
2639 	}
2640     }
2641   err = REG_NOERROR;
2642  free_return:
2643   return err;
2644 }
2645 
2646 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2647    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2648    Note that we might collect inappropriate candidates here.
2649    However, the cost of checking them strictly here is too high, then we
2650    delay these checking for prune_impossible_nodes().  */
2651 
2652 static reg_errcode_t
2653 __attribute_warn_unused_result__
get_subexp(re_match_context_t * mctx,Idx bkref_node,Idx bkref_str_idx)2654 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2655 {
2656   const re_dfa_t *const dfa = mctx->dfa;
2657   Idx subexp_num, sub_top_idx;
2658   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2659   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2660   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2661   if (cache_idx != -1)
2662     {
2663       const struct re_backref_cache_entry *entry
2664 	= mctx->bkref_ents + cache_idx;
2665       do
2666 	if (entry->node == bkref_node)
2667 	  return REG_NOERROR; /* We already checked it.  */
2668       while (entry++->more);
2669     }
2670 
2671   subexp_num = dfa->nodes[bkref_node].opr.idx;
2672 
2673   /* For each sub expression  */
2674   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2675     {
2676       reg_errcode_t err;
2677       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2678       re_sub_match_last_t *sub_last;
2679       Idx sub_last_idx, sl_str, bkref_str_off;
2680 
2681       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2682 	continue; /* It isn't related.  */
2683 
2684       sl_str = sub_top->str_idx;
2685       bkref_str_off = bkref_str_idx;
2686       /* At first, check the last node of sub expressions we already
2687 	 evaluated.  */
2688       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2689 	{
2690 	  regoff_t sl_str_diff;
2691 	  sub_last = sub_top->lasts[sub_last_idx];
2692 	  sl_str_diff = sub_last->str_idx - sl_str;
2693 	  /* The matched string by the sub expression match with the substring
2694 	     at the back reference?  */
2695 	  if (sl_str_diff > 0)
2696 	    {
2697 	      if (__glibc_unlikely (bkref_str_off + sl_str_diff
2698 				    > mctx->input.valid_len))
2699 		{
2700 		  /* Not enough chars for a successful match.  */
2701 		  if (bkref_str_off + sl_str_diff > mctx->input.len)
2702 		    break;
2703 
2704 		  err = clean_state_log_if_needed (mctx,
2705 						   bkref_str_off
2706 						   + sl_str_diff);
2707 		  if (__glibc_unlikely (err != REG_NOERROR))
2708 		    return err;
2709 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2710 		}
2711 	      if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2712 		/* We don't need to search this sub expression any more.  */
2713 		break;
2714 	    }
2715 	  bkref_str_off += sl_str_diff;
2716 	  sl_str += sl_str_diff;
2717 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2718 				bkref_str_idx);
2719 
2720 	  /* Reload buf, since the preceding call might have reallocated
2721 	     the buffer.  */
2722 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2723 
2724 	  if (err == REG_NOMATCH)
2725 	    continue;
2726 	  if (__glibc_unlikely (err != REG_NOERROR))
2727 	    return err;
2728 	}
2729 
2730       if (sub_last_idx < sub_top->nlasts)
2731 	continue;
2732       if (sub_last_idx > 0)
2733 	++sl_str;
2734       /* Then, search for the other last nodes of the sub expression.  */
2735       for (; sl_str <= bkref_str_idx; ++sl_str)
2736 	{
2737 	  Idx cls_node;
2738 	  regoff_t sl_str_off;
2739 	  const re_node_set *nodes;
2740 	  sl_str_off = sl_str - sub_top->str_idx;
2741 	  /* The matched string by the sub expression match with the substring
2742 	     at the back reference?  */
2743 	  if (sl_str_off > 0)
2744 	    {
2745 	      if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2746 		{
2747 		  /* If we are at the end of the input, we cannot match.  */
2748 		  if (bkref_str_off >= mctx->input.len)
2749 		    break;
2750 
2751 		  err = extend_buffers (mctx, bkref_str_off + 1);
2752 		  if (__glibc_unlikely (err != REG_NOERROR))
2753 		    return err;
2754 
2755 		  buf = (const char *) re_string_get_buffer (&mctx->input);
2756 		}
2757 	      if (buf [bkref_str_off++] != buf[sl_str - 1])
2758 		break; /* We don't need to search this sub expression
2759 			  any more.  */
2760 	    }
2761 	  if (mctx->state_log[sl_str] == NULL)
2762 	    continue;
2763 	  /* Does this state have a ')' of the sub expression?  */
2764 	  nodes = &mctx->state_log[sl_str]->nodes;
2765 	  cls_node = find_subexp_node (dfa, nodes, subexp_num,
2766 				       OP_CLOSE_SUBEXP);
2767 	  if (cls_node == -1)
2768 	    continue; /* No.  */
2769 	  if (sub_top->path == NULL)
2770 	    {
2771 	      sub_top->path = calloc (sizeof (state_array_t),
2772 				      sl_str - sub_top->str_idx + 1);
2773 	      if (sub_top->path == NULL)
2774 		return REG_ESPACE;
2775 	    }
2776 	  /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2777 	     in the current context?  */
2778 	  err = check_arrival (mctx, sub_top->path, sub_top->node,
2779 			       sub_top->str_idx, cls_node, sl_str,
2780 			       OP_CLOSE_SUBEXP);
2781 	  if (err == REG_NOMATCH)
2782 	      continue;
2783 	  if (__glibc_unlikely (err != REG_NOERROR))
2784 	      return err;
2785 	  sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2786 	  if (__glibc_unlikely (sub_last == NULL))
2787 	    return REG_ESPACE;
2788 	  err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2789 				bkref_str_idx);
2790 	  buf = (const char *) re_string_get_buffer (&mctx->input);
2791 	  if (err == REG_NOMATCH)
2792 	    continue;
2793 	  if (__glibc_unlikely (err != REG_NOERROR))
2794 	    return err;
2795 	}
2796     }
2797   return REG_NOERROR;
2798 }
2799 
2800 /* Helper functions for get_subexp().  */
2801 
2802 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2803    If it can arrive, register the sub expression expressed with SUB_TOP
2804    and SUB_LAST.  */
2805 
2806 static reg_errcode_t
get_subexp_sub(re_match_context_t * mctx,const re_sub_match_top_t * sub_top,re_sub_match_last_t * sub_last,Idx bkref_node,Idx bkref_str)2807 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2808 		re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2809 {
2810   reg_errcode_t err;
2811   Idx to_idx;
2812   /* Can the subexpression arrive the back reference?  */
2813   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2814 		       sub_last->str_idx, bkref_node, bkref_str,
2815 		       OP_OPEN_SUBEXP);
2816   if (err != REG_NOERROR)
2817     return err;
2818   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2819 			     sub_last->str_idx);
2820   if (__glibc_unlikely (err != REG_NOERROR))
2821     return err;
2822   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2823   return clean_state_log_if_needed (mctx, to_idx);
2824 }
2825 
2826 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2827    Search '(' if FL_OPEN, or search ')' otherwise.
2828    TODO: This function isn't efficient...
2829 	 Because there might be more than one nodes whose types are
2830 	 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2831 	 nodes.
2832 	 E.g. RE: (a){2}  */
2833 
2834 static Idx
find_subexp_node(const re_dfa_t * dfa,const re_node_set * nodes,Idx subexp_idx,int type)2835 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2836 		  Idx subexp_idx, int type)
2837 {
2838   Idx cls_idx;
2839   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2840     {
2841       Idx cls_node = nodes->elems[cls_idx];
2842       const re_token_t *node = dfa->nodes + cls_node;
2843       if (node->type == type
2844 	  && node->opr.idx == subexp_idx)
2845 	return cls_node;
2846     }
2847   return -1;
2848 }
2849 
2850 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2851    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2852    heavily reused.
2853    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2854 
2855 static reg_errcode_t
2856 __attribute_warn_unused_result__
check_arrival(re_match_context_t * mctx,state_array_t * path,Idx top_node,Idx top_str,Idx last_node,Idx last_str,int type)2857 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2858 	       Idx top_str, Idx last_node, Idx last_str, int type)
2859 {
2860   const re_dfa_t *const dfa = mctx->dfa;
2861   reg_errcode_t err = REG_NOERROR;
2862   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2863   re_dfastate_t *cur_state = NULL;
2864   re_node_set *cur_nodes, next_nodes;
2865   re_dfastate_t **backup_state_log;
2866   unsigned int context;
2867 
2868   subexp_num = dfa->nodes[top_node].opr.idx;
2869   /* Extend the buffer if we need.  */
2870   if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2871     {
2872       re_dfastate_t **new_array;
2873       Idx old_alloc = path->alloc;
2874       Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2875       Idx new_alloc;
2876       if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2877 	return REG_ESPACE;
2878       new_alloc = old_alloc + incr_alloc;
2879       if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2880 	return REG_ESPACE;
2881       new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2882       if (__glibc_unlikely (new_array == NULL))
2883 	return REG_ESPACE;
2884       path->array = new_array;
2885       path->alloc = new_alloc;
2886       memset (new_array + old_alloc, '\0',
2887 	      sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2888     }
2889 
2890   str_idx = path->next_idx ? path->next_idx : top_str;
2891 
2892   /* Temporary modify MCTX.  */
2893   backup_state_log = mctx->state_log;
2894   backup_cur_idx = mctx->input.cur_idx;
2895   mctx->state_log = path->array;
2896   mctx->input.cur_idx = str_idx;
2897 
2898   /* Setup initial node set.  */
2899   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2900   if (str_idx == top_str)
2901     {
2902       err = re_node_set_init_1 (&next_nodes, top_node);
2903       if (__glibc_unlikely (err != REG_NOERROR))
2904 	return err;
2905       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2906       if (__glibc_unlikely (err != REG_NOERROR))
2907 	{
2908 	  re_node_set_free (&next_nodes);
2909 	  return err;
2910 	}
2911     }
2912   else
2913     {
2914       cur_state = mctx->state_log[str_idx];
2915       if (cur_state && cur_state->has_backref)
2916 	{
2917 	  err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2918 	  if (__glibc_unlikely (err != REG_NOERROR))
2919 	    return err;
2920 	}
2921       else
2922 	re_node_set_init_empty (&next_nodes);
2923     }
2924   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2925     {
2926       if (next_nodes.nelem)
2927 	{
2928 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2929 				    subexp_num, type);
2930 	  if (__glibc_unlikely (err != REG_NOERROR))
2931 	    {
2932 	      re_node_set_free (&next_nodes);
2933 	      return err;
2934 	    }
2935 	}
2936       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2937       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2938 	{
2939 	  re_node_set_free (&next_nodes);
2940 	  return err;
2941 	}
2942       mctx->state_log[str_idx] = cur_state;
2943     }
2944 
2945   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2946     {
2947       re_node_set_empty (&next_nodes);
2948       if (mctx->state_log[str_idx + 1])
2949 	{
2950 	  err = re_node_set_merge (&next_nodes,
2951 				   &mctx->state_log[str_idx + 1]->nodes);
2952 	  if (__glibc_unlikely (err != REG_NOERROR))
2953 	    {
2954 	      re_node_set_free (&next_nodes);
2955 	      return err;
2956 	    }
2957 	}
2958       if (cur_state)
2959 	{
2960 	  err = check_arrival_add_next_nodes (mctx, str_idx,
2961 					      &cur_state->non_eps_nodes,
2962 					      &next_nodes);
2963 	  if (__glibc_unlikely (err != REG_NOERROR))
2964 	    {
2965 	      re_node_set_free (&next_nodes);
2966 	      return err;
2967 	    }
2968 	}
2969       ++str_idx;
2970       if (next_nodes.nelem)
2971 	{
2972 	  err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2973 	  if (__glibc_unlikely (err != REG_NOERROR))
2974 	    {
2975 	      re_node_set_free (&next_nodes);
2976 	      return err;
2977 	    }
2978 	  err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2979 				    subexp_num, type);
2980 	  if (__glibc_unlikely (err != REG_NOERROR))
2981 	    {
2982 	      re_node_set_free (&next_nodes);
2983 	      return err;
2984 	    }
2985 	}
2986       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2987       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2988       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2989 	{
2990 	  re_node_set_free (&next_nodes);
2991 	  return err;
2992 	}
2993       mctx->state_log[str_idx] = cur_state;
2994       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2995     }
2996   re_node_set_free (&next_nodes);
2997   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2998 	       : &mctx->state_log[last_str]->nodes);
2999   path->next_idx = str_idx;
3000 
3001   /* Fix MCTX.  */
3002   mctx->state_log = backup_state_log;
3003   mctx->input.cur_idx = backup_cur_idx;
3004 
3005   /* Then check the current node set has the node LAST_NODE.  */
3006   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3007     return REG_NOERROR;
3008 
3009   return REG_NOMATCH;
3010 }
3011 
3012 /* Helper functions for check_arrival.  */
3013 
3014 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3015    to NEXT_NODES.
3016    TODO: This function is similar to the functions transit_state*(),
3017 	 however this function has many additional works.
3018 	 Can't we unify them?  */
3019 
3020 static reg_errcode_t
3021 __attribute_warn_unused_result__
check_arrival_add_next_nodes(re_match_context_t * mctx,Idx str_idx,re_node_set * cur_nodes,re_node_set * next_nodes)3022 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3023 			      re_node_set *cur_nodes, re_node_set *next_nodes)
3024 {
3025   const re_dfa_t *const dfa = mctx->dfa;
3026   bool ok;
3027   Idx cur_idx;
3028 #ifdef RE_ENABLE_I18N
3029   reg_errcode_t err = REG_NOERROR;
3030 #endif
3031   re_node_set union_set;
3032   re_node_set_init_empty (&union_set);
3033   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3034     {
3035       int naccepted = 0;
3036       Idx cur_node = cur_nodes->elems[cur_idx];
3037 #ifdef DEBUG
3038       re_token_type_t type = dfa->nodes[cur_node].type;
3039       assert (!IS_EPSILON_NODE (type));
3040 #endif
3041 #ifdef RE_ENABLE_I18N
3042       /* If the node may accept "multi byte".  */
3043       if (dfa->nodes[cur_node].accept_mb)
3044 	{
3045 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3046 					       str_idx);
3047 	  if (naccepted > 1)
3048 	    {
3049 	      re_dfastate_t *dest_state;
3050 	      Idx next_node = dfa->nexts[cur_node];
3051 	      Idx next_idx = str_idx + naccepted;
3052 	      dest_state = mctx->state_log[next_idx];
3053 	      re_node_set_empty (&union_set);
3054 	      if (dest_state)
3055 		{
3056 		  err = re_node_set_merge (&union_set, &dest_state->nodes);
3057 		  if (__glibc_unlikely (err != REG_NOERROR))
3058 		    {
3059 		      re_node_set_free (&union_set);
3060 		      return err;
3061 		    }
3062 		}
3063 	      ok = re_node_set_insert (&union_set, next_node);
3064 	      if (__glibc_unlikely (! ok))
3065 		{
3066 		  re_node_set_free (&union_set);
3067 		  return REG_ESPACE;
3068 		}
3069 	      mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3070 							    &union_set);
3071 	      if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3072 				    && err != REG_NOERROR))
3073 		{
3074 		  re_node_set_free (&union_set);
3075 		  return err;
3076 		}
3077 	    }
3078 	}
3079 #endif /* RE_ENABLE_I18N */
3080       if (naccepted
3081 	  || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3082 	{
3083 	  ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3084 	  if (__glibc_unlikely (! ok))
3085 	    {
3086 	      re_node_set_free (&union_set);
3087 	      return REG_ESPACE;
3088 	    }
3089 	}
3090     }
3091   re_node_set_free (&union_set);
3092   return REG_NOERROR;
3093 }
3094 
3095 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3096    CUR_NODES, however exclude the nodes which are:
3097     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3098     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3099 */
3100 
3101 static reg_errcode_t
check_arrival_expand_ecl(const re_dfa_t * dfa,re_node_set * cur_nodes,Idx ex_subexp,int type)3102 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3103 			  Idx ex_subexp, int type)
3104 {
3105   reg_errcode_t err;
3106   Idx idx, outside_node;
3107   re_node_set new_nodes;
3108 #ifdef DEBUG
3109   assert (cur_nodes->nelem);
3110 #endif
3111   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3112   if (__glibc_unlikely (err != REG_NOERROR))
3113     return err;
3114   /* Create a new node set NEW_NODES with the nodes which are epsilon
3115      closures of the node in CUR_NODES.  */
3116 
3117   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3118     {
3119       Idx cur_node = cur_nodes->elems[idx];
3120       const re_node_set *eclosure = dfa->eclosures + cur_node;
3121       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3122       if (outside_node == -1)
3123 	{
3124 	  /* There are no problematic nodes, just merge them.  */
3125 	  err = re_node_set_merge (&new_nodes, eclosure);
3126 	  if (__glibc_unlikely (err != REG_NOERROR))
3127 	    {
3128 	      re_node_set_free (&new_nodes);
3129 	      return err;
3130 	    }
3131 	}
3132       else
3133 	{
3134 	  /* There are problematic nodes, re-calculate incrementally.  */
3135 	  err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3136 					      ex_subexp, type);
3137 	  if (__glibc_unlikely (err != REG_NOERROR))
3138 	    {
3139 	      re_node_set_free (&new_nodes);
3140 	      return err;
3141 	    }
3142 	}
3143     }
3144   re_node_set_free (cur_nodes);
3145   *cur_nodes = new_nodes;
3146   return REG_NOERROR;
3147 }
3148 
3149 /* Helper function for check_arrival_expand_ecl.
3150    Check incrementally the epsilon closure of TARGET, and if it isn't
3151    problematic append it to DST_NODES.  */
3152 
3153 static reg_errcode_t
3154 __attribute_warn_unused_result__
check_arrival_expand_ecl_sub(const re_dfa_t * dfa,re_node_set * dst_nodes,Idx target,Idx ex_subexp,int type)3155 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3156 			      Idx target, Idx ex_subexp, int type)
3157 {
3158   Idx cur_node;
3159   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3160     {
3161       bool ok;
3162 
3163       if (dfa->nodes[cur_node].type == type
3164 	  && dfa->nodes[cur_node].opr.idx == ex_subexp)
3165 	{
3166 	  if (type == OP_CLOSE_SUBEXP)
3167 	    {
3168 	      ok = re_node_set_insert (dst_nodes, cur_node);
3169 	      if (__glibc_unlikely (! ok))
3170 		return REG_ESPACE;
3171 	    }
3172 	  break;
3173 	}
3174       ok = re_node_set_insert (dst_nodes, cur_node);
3175       if (__glibc_unlikely (! ok))
3176 	return REG_ESPACE;
3177       if (dfa->edests[cur_node].nelem == 0)
3178 	break;
3179       if (dfa->edests[cur_node].nelem == 2)
3180 	{
3181 	  reg_errcode_t err;
3182 	  err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3183 					      dfa->edests[cur_node].elems[1],
3184 					      ex_subexp, type);
3185 	  if (__glibc_unlikely (err != REG_NOERROR))
3186 	    return err;
3187 	}
3188       cur_node = dfa->edests[cur_node].elems[0];
3189     }
3190   return REG_NOERROR;
3191 }
3192 
3193 
3194 /* For all the back references in the current state, calculate the
3195    destination of the back references by the appropriate entry
3196    in MCTX->BKREF_ENTS.  */
3197 
3198 static reg_errcode_t
3199 __attribute_warn_unused_result__
expand_bkref_cache(re_match_context_t * mctx,re_node_set * cur_nodes,Idx cur_str,Idx subexp_num,int type)3200 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3201 		    Idx cur_str, Idx subexp_num, int type)
3202 {
3203   const re_dfa_t *const dfa = mctx->dfa;
3204   reg_errcode_t err;
3205   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3206   struct re_backref_cache_entry *ent;
3207 
3208   if (cache_idx_start == -1)
3209     return REG_NOERROR;
3210 
3211  restart:
3212   ent = mctx->bkref_ents + cache_idx_start;
3213   do
3214     {
3215       Idx to_idx, next_node;
3216 
3217       /* Is this entry ENT is appropriate?  */
3218       if (!re_node_set_contains (cur_nodes, ent->node))
3219 	continue; /* No.  */
3220 
3221       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3222       /* Calculate the destination of the back reference, and append it
3223 	 to MCTX->STATE_LOG.  */
3224       if (to_idx == cur_str)
3225 	{
3226 	  /* The backreference did epsilon transit, we must re-check all the
3227 	     node in the current state.  */
3228 	  re_node_set new_dests;
3229 	  reg_errcode_t err2, err3;
3230 	  next_node = dfa->edests[ent->node].elems[0];
3231 	  if (re_node_set_contains (cur_nodes, next_node))
3232 	    continue;
3233 	  err = re_node_set_init_1 (&new_dests, next_node);
3234 	  err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3235 	  err3 = re_node_set_merge (cur_nodes, &new_dests);
3236 	  re_node_set_free (&new_dests);
3237 	  if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3238 				|| err3 != REG_NOERROR))
3239 	    {
3240 	      err = (err != REG_NOERROR ? err
3241 		     : (err2 != REG_NOERROR ? err2 : err3));
3242 	      return err;
3243 	    }
3244 	  /* TODO: It is still inefficient...  */
3245 	  goto restart;
3246 	}
3247       else
3248 	{
3249 	  re_node_set union_set;
3250 	  next_node = dfa->nexts[ent->node];
3251 	  if (mctx->state_log[to_idx])
3252 	    {
3253 	      bool ok;
3254 	      if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3255 					next_node))
3256 		continue;
3257 	      err = re_node_set_init_copy (&union_set,
3258 					   &mctx->state_log[to_idx]->nodes);
3259 	      ok = re_node_set_insert (&union_set, next_node);
3260 	      if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3261 		{
3262 		  re_node_set_free (&union_set);
3263 		  err = err != REG_NOERROR ? err : REG_ESPACE;
3264 		  return err;
3265 		}
3266 	    }
3267 	  else
3268 	    {
3269 	      err = re_node_set_init_1 (&union_set, next_node);
3270 	      if (__glibc_unlikely (err != REG_NOERROR))
3271 		return err;
3272 	    }
3273 	  mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3274 	  re_node_set_free (&union_set);
3275 	  if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3276 				&& err != REG_NOERROR))
3277 	    return err;
3278 	}
3279     }
3280   while (ent++->more);
3281   return REG_NOERROR;
3282 }
3283 
3284 /* Build transition table for the state.
3285    Return true if successful.  */
3286 
3287 static bool
build_trtable(const re_dfa_t * dfa,re_dfastate_t * state)3288 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3289 {
3290   reg_errcode_t err;
3291   Idx i, j;
3292   int ch;
3293   bool need_word_trtable = false;
3294   bitset_word_t elem, mask;
3295   bool dests_node_malloced = false;
3296   bool dest_states_malloced = false;
3297   Idx ndests; /* Number of the destination states from 'state'.  */
3298   re_dfastate_t **trtable;
3299   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3300   re_node_set follows, *dests_node;
3301   bitset_t *dests_ch;
3302   bitset_t acceptable;
3303 
3304   struct dests_alloc
3305   {
3306     re_node_set dests_node[SBC_MAX];
3307     bitset_t dests_ch[SBC_MAX];
3308   } *dests_alloc;
3309 
3310   /* We build DFA states which corresponds to the destination nodes
3311      from 'state'.  'dests_node[i]' represents the nodes which i-th
3312      destination state contains, and 'dests_ch[i]' represents the
3313      characters which i-th destination state accepts.  */
3314   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3315     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3316   else
3317     {
3318       dests_alloc = re_malloc (struct dests_alloc, 1);
3319       if (__glibc_unlikely (dests_alloc == NULL))
3320 	return false;
3321       dests_node_malloced = true;
3322     }
3323   dests_node = dests_alloc->dests_node;
3324   dests_ch = dests_alloc->dests_ch;
3325 
3326   /* Initialize transition table.  */
3327   state->word_trtable = state->trtable = NULL;
3328 
3329   /* At first, group all nodes belonging to 'state' into several
3330      destinations.  */
3331   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3332   if (__glibc_unlikely (ndests <= 0))
3333     {
3334       if (dests_node_malloced)
3335 	re_free (dests_alloc);
3336       /* Return false in case of an error, true otherwise.  */
3337       if (ndests == 0)
3338 	{
3339 	  state->trtable = (re_dfastate_t **)
3340 	    calloc (sizeof (re_dfastate_t *), SBC_MAX);
3341           if (__glibc_unlikely (state->trtable == NULL))
3342             return false;
3343 	  return true;
3344 	}
3345       return false;
3346     }
3347 
3348   err = re_node_set_alloc (&follows, ndests + 1);
3349   if (__glibc_unlikely (err != REG_NOERROR))
3350     goto out_free;
3351 
3352   /* Avoid arithmetic overflow in size calculation.  */
3353   size_t ndests_max
3354     = ((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3355        / (3 * sizeof (re_dfastate_t *)));
3356   if (__glibc_unlikely (ndests_max < ndests))
3357     goto out_free;
3358 
3359   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3360 			 + ndests * 3 * sizeof (re_dfastate_t *)))
3361     dest_states = (re_dfastate_t **)
3362       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3363   else
3364     {
3365       dest_states = re_malloc (re_dfastate_t *, ndests * 3);
3366       if (__glibc_unlikely (dest_states == NULL))
3367 	{
3368 out_free:
3369 	  if (dest_states_malloced)
3370 	    re_free (dest_states);
3371 	  re_node_set_free (&follows);
3372 	  for (i = 0; i < ndests; ++i)
3373 	    re_node_set_free (dests_node + i);
3374 	  if (dests_node_malloced)
3375 	    re_free (dests_alloc);
3376 	  return false;
3377 	}
3378       dest_states_malloced = true;
3379     }
3380   dest_states_word = dest_states + ndests;
3381   dest_states_nl = dest_states_word + ndests;
3382   bitset_empty (acceptable);
3383 
3384   /* Then build the states for all destinations.  */
3385   for (i = 0; i < ndests; ++i)
3386     {
3387       Idx next_node;
3388       re_node_set_empty (&follows);
3389       /* Merge the follows of this destination states.  */
3390       for (j = 0; j < dests_node[i].nelem; ++j)
3391 	{
3392 	  next_node = dfa->nexts[dests_node[i].elems[j]];
3393 	  if (next_node != -1)
3394 	    {
3395 	      err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3396 	      if (__glibc_unlikely (err != REG_NOERROR))
3397 		goto out_free;
3398 	    }
3399 	}
3400       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3401       if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3402 	goto out_free;
3403       /* If the new state has context constraint,
3404 	 build appropriate states for these contexts.  */
3405       if (dest_states[i]->has_constraint)
3406 	{
3407 	  dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3408 							  CONTEXT_WORD);
3409 	  if (__glibc_unlikely (dest_states_word[i] == NULL
3410 				&& err != REG_NOERROR))
3411 	    goto out_free;
3412 
3413 	  if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3414 	    need_word_trtable = true;
3415 
3416 	  dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3417 							CONTEXT_NEWLINE);
3418 	  if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3419 	    goto out_free;
3420 	}
3421       else
3422 	{
3423 	  dest_states_word[i] = dest_states[i];
3424 	  dest_states_nl[i] = dest_states[i];
3425 	}
3426       bitset_merge (acceptable, dests_ch[i]);
3427     }
3428 
3429   if (!__glibc_unlikely (need_word_trtable))
3430     {
3431       /* We don't care about whether the following character is a word
3432 	 character, or we are in a single-byte character set so we can
3433 	 discern by looking at the character code: allocate a
3434 	 256-entry transition table.  */
3435       trtable = state->trtable =
3436 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3437       if (__glibc_unlikely (trtable == NULL))
3438 	goto out_free;
3439 
3440       /* For all characters ch...:  */
3441       for (i = 0; i < BITSET_WORDS; ++i)
3442 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3443 	     elem;
3444 	     mask <<= 1, elem >>= 1, ++ch)
3445 	  if (__glibc_unlikely (elem & 1))
3446 	    {
3447 	      /* There must be exactly one destination which accepts
3448 		 character ch.  See group_nodes_into_DFAstates.  */
3449 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3450 		;
3451 
3452 	      /* j-th destination accepts the word character ch.  */
3453 	      if (dfa->word_char[i] & mask)
3454 		trtable[ch] = dest_states_word[j];
3455 	      else
3456 		trtable[ch] = dest_states[j];
3457 	    }
3458     }
3459   else
3460     {
3461       /* We care about whether the following character is a word
3462 	 character, and we are in a multi-byte character set: discern
3463 	 by looking at the character code: build two 256-entry
3464 	 transition tables, one starting at trtable[0] and one
3465 	 starting at trtable[SBC_MAX].  */
3466       trtable = state->word_trtable =
3467 	(re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3468       if (__glibc_unlikely (trtable == NULL))
3469 	goto out_free;
3470 
3471       /* For all characters ch...:  */
3472       for (i = 0; i < BITSET_WORDS; ++i)
3473 	for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3474 	     elem;
3475 	     mask <<= 1, elem >>= 1, ++ch)
3476 	  if (__glibc_unlikely (elem & 1))
3477 	    {
3478 	      /* There must be exactly one destination which accepts
3479 		 character ch.  See group_nodes_into_DFAstates.  */
3480 	      for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3481 		;
3482 
3483 	      /* j-th destination accepts the word character ch.  */
3484 	      trtable[ch] = dest_states[j];
3485 	      trtable[ch + SBC_MAX] = dest_states_word[j];
3486 	    }
3487     }
3488 
3489   /* new line */
3490   if (bitset_contain (acceptable, NEWLINE_CHAR))
3491     {
3492       /* The current state accepts newline character.  */
3493       for (j = 0; j < ndests; ++j)
3494 	if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3495 	  {
3496 	    /* k-th destination accepts newline character.  */
3497 	    trtable[NEWLINE_CHAR] = dest_states_nl[j];
3498 	    if (need_word_trtable)
3499 	      trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3500 	    /* There must be only one destination which accepts
3501 	       newline.  See group_nodes_into_DFAstates.  */
3502 	    break;
3503 	  }
3504     }
3505 
3506   if (dest_states_malloced)
3507     re_free (dest_states);
3508 
3509   re_node_set_free (&follows);
3510   for (i = 0; i < ndests; ++i)
3511     re_node_set_free (dests_node + i);
3512 
3513   if (dests_node_malloced)
3514     re_free (dests_alloc);
3515 
3516   return true;
3517 }
3518 
3519 /* Group all nodes belonging to STATE into several destinations.
3520    Then for all destinations, set the nodes belonging to the destination
3521    to DESTS_NODE[i] and set the characters accepted by the destination
3522    to DEST_CH[i].  This function return the number of destinations.  */
3523 
3524 static Idx
group_nodes_into_DFAstates(const re_dfa_t * dfa,const re_dfastate_t * state,re_node_set * dests_node,bitset_t * dests_ch)3525 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3526 			    re_node_set *dests_node, bitset_t *dests_ch)
3527 {
3528   reg_errcode_t err;
3529   bool ok;
3530   Idx i, j, k;
3531   Idx ndests; /* Number of the destinations from 'state'.  */
3532   bitset_t accepts; /* Characters a node can accept.  */
3533   const re_node_set *cur_nodes = &state->nodes;
3534   bitset_empty (accepts);
3535   ndests = 0;
3536 
3537   /* For all the nodes belonging to 'state',  */
3538   for (i = 0; i < cur_nodes->nelem; ++i)
3539     {
3540       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3541       re_token_type_t type = node->type;
3542       unsigned int constraint = node->constraint;
3543 
3544       /* Enumerate all single byte character this node can accept.  */
3545       if (type == CHARACTER)
3546 	bitset_set (accepts, node->opr.c);
3547       else if (type == SIMPLE_BRACKET)
3548 	{
3549 	  bitset_merge (accepts, node->opr.sbcset);
3550 	}
3551       else if (type == OP_PERIOD)
3552 	{
3553 #ifdef RE_ENABLE_I18N
3554 	  if (dfa->mb_cur_max > 1)
3555 	    bitset_merge (accepts, dfa->sb_char);
3556 	  else
3557 #endif
3558 	    bitset_set_all (accepts);
3559 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3560 	    bitset_clear (accepts, '\n');
3561 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3562 	    bitset_clear (accepts, '\0');
3563 	}
3564 #ifdef RE_ENABLE_I18N
3565       else if (type == OP_UTF8_PERIOD)
3566 	{
3567 	  if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3568 	    memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3569 	  else
3570 	    bitset_merge (accepts, utf8_sb_map);
3571 	  if (!(dfa->syntax & RE_DOT_NEWLINE))
3572 	    bitset_clear (accepts, '\n');
3573 	  if (dfa->syntax & RE_DOT_NOT_NULL)
3574 	    bitset_clear (accepts, '\0');
3575 	}
3576 #endif
3577       else
3578 	continue;
3579 
3580       /* Check the 'accepts' and sift the characters which are not
3581 	 match it the context.  */
3582       if (constraint)
3583 	{
3584 	  if (constraint & NEXT_NEWLINE_CONSTRAINT)
3585 	    {
3586 	      bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3587 	      bitset_empty (accepts);
3588 	      if (accepts_newline)
3589 		bitset_set (accepts, NEWLINE_CHAR);
3590 	      else
3591 		continue;
3592 	    }
3593 	  if (constraint & NEXT_ENDBUF_CONSTRAINT)
3594 	    {
3595 	      bitset_empty (accepts);
3596 	      continue;
3597 	    }
3598 
3599 	  if (constraint & NEXT_WORD_CONSTRAINT)
3600 	    {
3601 	      bitset_word_t any_set = 0;
3602 	      if (type == CHARACTER && !node->word_char)
3603 		{
3604 		  bitset_empty (accepts);
3605 		  continue;
3606 		}
3607 #ifdef RE_ENABLE_I18N
3608 	      if (dfa->mb_cur_max > 1)
3609 		for (j = 0; j < BITSET_WORDS; ++j)
3610 		  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3611 	      else
3612 #endif
3613 		for (j = 0; j < BITSET_WORDS; ++j)
3614 		  any_set |= (accepts[j] &= dfa->word_char[j]);
3615 	      if (!any_set)
3616 		continue;
3617 	    }
3618 	  if (constraint & NEXT_NOTWORD_CONSTRAINT)
3619 	    {
3620 	      bitset_word_t any_set = 0;
3621 	      if (type == CHARACTER && node->word_char)
3622 		{
3623 		  bitset_empty (accepts);
3624 		  continue;
3625 		}
3626 #ifdef RE_ENABLE_I18N
3627 	      if (dfa->mb_cur_max > 1)
3628 		for (j = 0; j < BITSET_WORDS; ++j)
3629 		  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3630 	      else
3631 #endif
3632 		for (j = 0; j < BITSET_WORDS; ++j)
3633 		  any_set |= (accepts[j] &= ~dfa->word_char[j]);
3634 	      if (!any_set)
3635 		continue;
3636 	    }
3637 	}
3638 
3639       /* Then divide 'accepts' into DFA states, or create a new
3640 	 state.  Above, we make sure that accepts is not empty.  */
3641       for (j = 0; j < ndests; ++j)
3642 	{
3643 	  bitset_t intersec; /* Intersection sets, see below.  */
3644 	  bitset_t remains;
3645 	  /* Flags, see below.  */
3646 	  bitset_word_t has_intersec, not_subset, not_consumed;
3647 
3648 	  /* Optimization, skip if this state doesn't accept the character.  */
3649 	  if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3650 	    continue;
3651 
3652 	  /* Enumerate the intersection set of this state and 'accepts'.  */
3653 	  has_intersec = 0;
3654 	  for (k = 0; k < BITSET_WORDS; ++k)
3655 	    has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3656 	  /* And skip if the intersection set is empty.  */
3657 	  if (!has_intersec)
3658 	    continue;
3659 
3660 	  /* Then check if this state is a subset of 'accepts'.  */
3661 	  not_subset = not_consumed = 0;
3662 	  for (k = 0; k < BITSET_WORDS; ++k)
3663 	    {
3664 	      not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3665 	      not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3666 	    }
3667 
3668 	  /* If this state isn't a subset of 'accepts', create a
3669 	     new group state, which has the 'remains'. */
3670 	  if (not_subset)
3671 	    {
3672 	      bitset_copy (dests_ch[ndests], remains);
3673 	      bitset_copy (dests_ch[j], intersec);
3674 	      err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3675 	      if (__glibc_unlikely (err != REG_NOERROR))
3676 		goto error_return;
3677 	      ++ndests;
3678 	    }
3679 
3680 	  /* Put the position in the current group. */
3681 	  ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3682 	  if (__glibc_unlikely (! ok))
3683 	    goto error_return;
3684 
3685 	  /* If all characters are consumed, go to next node. */
3686 	  if (!not_consumed)
3687 	    break;
3688 	}
3689       /* Some characters remain, create a new group. */
3690       if (j == ndests)
3691 	{
3692 	  bitset_copy (dests_ch[ndests], accepts);
3693 	  err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3694 	  if (__glibc_unlikely (err != REG_NOERROR))
3695 	    goto error_return;
3696 	  ++ndests;
3697 	  bitset_empty (accepts);
3698 	}
3699     }
3700   return ndests;
3701  error_return:
3702   for (j = 0; j < ndests; ++j)
3703     re_node_set_free (dests_node + j);
3704   return -1;
3705 }
3706 
3707 #ifdef RE_ENABLE_I18N
3708 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3709    Return the number of the bytes the node accepts.
3710    STR_IDX is the current index of the input string.
3711 
3712    This function handles the nodes which can accept one character, or
3713    one collating element like '.', '[a-z]', opposite to the other nodes
3714    can only accept one byte.  */
3715 
3716 # ifdef _LIBC
3717 #  include <locale/weight.h>
3718 # endif
3719 
3720 static int
check_node_accept_bytes(const re_dfa_t * dfa,Idx node_idx,const re_string_t * input,Idx str_idx)3721 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3722 			 const re_string_t *input, Idx str_idx)
3723 {
3724   const re_token_t *node = dfa->nodes + node_idx;
3725   int char_len, elem_len;
3726   Idx i;
3727 
3728   if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3729     {
3730       unsigned char c = re_string_byte_at (input, str_idx), d;
3731       if (__glibc_likely (c < 0xc2))
3732 	return 0;
3733 
3734       if (str_idx + 2 > input->len)
3735 	return 0;
3736 
3737       d = re_string_byte_at (input, str_idx + 1);
3738       if (c < 0xe0)
3739 	return (d < 0x80 || d > 0xbf) ? 0 : 2;
3740       else if (c < 0xf0)
3741 	{
3742 	  char_len = 3;
3743 	  if (c == 0xe0 && d < 0xa0)
3744 	    return 0;
3745 	}
3746       else if (c < 0xf8)
3747 	{
3748 	  char_len = 4;
3749 	  if (c == 0xf0 && d < 0x90)
3750 	    return 0;
3751 	}
3752       else if (c < 0xfc)
3753 	{
3754 	  char_len = 5;
3755 	  if (c == 0xf8 && d < 0x88)
3756 	    return 0;
3757 	}
3758       else if (c < 0xfe)
3759 	{
3760 	  char_len = 6;
3761 	  if (c == 0xfc && d < 0x84)
3762 	    return 0;
3763 	}
3764       else
3765 	return 0;
3766 
3767       if (str_idx + char_len > input->len)
3768 	return 0;
3769 
3770       for (i = 1; i < char_len; ++i)
3771 	{
3772 	  d = re_string_byte_at (input, str_idx + i);
3773 	  if (d < 0x80 || d > 0xbf)
3774 	    return 0;
3775 	}
3776       return char_len;
3777     }
3778 
3779   char_len = re_string_char_size_at (input, str_idx);
3780   if (node->type == OP_PERIOD)
3781     {
3782       if (char_len <= 1)
3783 	return 0;
3784       /* FIXME: I don't think this if is needed, as both '\n'
3785 	 and '\0' are char_len == 1.  */
3786       /* '.' accepts any one character except the following two cases.  */
3787       if ((!(dfa->syntax & RE_DOT_NEWLINE)
3788 	   && re_string_byte_at (input, str_idx) == '\n')
3789 	  || ((dfa->syntax & RE_DOT_NOT_NULL)
3790 	      && re_string_byte_at (input, str_idx) == '\0'))
3791 	return 0;
3792       return char_len;
3793     }
3794 
3795   elem_len = re_string_elem_size_at (input, str_idx);
3796   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3797     return 0;
3798 
3799   if (node->type == COMPLEX_BRACKET)
3800     {
3801       const re_charset_t *cset = node->opr.mbcset;
3802 # ifdef _LIBC
3803       const unsigned char *pin
3804 	= ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3805       Idx j;
3806       uint32_t nrules;
3807 # endif /* _LIBC */
3808       int match_len = 0;
3809       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3810 		    ? re_string_wchar_at (input, str_idx) : 0);
3811 
3812       /* match with multibyte character?  */
3813       for (i = 0; i < cset->nmbchars; ++i)
3814 	if (wc == cset->mbchars[i])
3815 	  {
3816 	    match_len = char_len;
3817 	    goto check_node_accept_bytes_match;
3818 	  }
3819       /* match with character_class?  */
3820       for (i = 0; i < cset->nchar_classes; ++i)
3821 	{
3822 	  wctype_t wt = cset->char_classes[i];
3823 	  if (__iswctype (wc, wt))
3824 	    {
3825 	      match_len = char_len;
3826 	      goto check_node_accept_bytes_match;
3827 	    }
3828 	}
3829 
3830 # ifdef _LIBC
3831       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3832       if (nrules != 0)
3833 	{
3834 	  unsigned int in_collseq = 0;
3835 	  const int32_t *table, *indirect;
3836 	  const unsigned char *weights, *extra;
3837 	  const char *collseqwc;
3838 
3839 	  /* match with collating_symbol?  */
3840 	  if (cset->ncoll_syms)
3841 	    extra = (const unsigned char *)
3842 	      _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3843 	  for (i = 0; i < cset->ncoll_syms; ++i)
3844 	    {
3845 	      const unsigned char *coll_sym = extra + cset->coll_syms[i];
3846 	      /* Compare the length of input collating element and
3847 		 the length of current collating element.  */
3848 	      if (*coll_sym != elem_len)
3849 		continue;
3850 	      /* Compare each bytes.  */
3851 	      for (j = 0; j < *coll_sym; j++)
3852 		if (pin[j] != coll_sym[1 + j])
3853 		  break;
3854 	      if (j == *coll_sym)
3855 		{
3856 		  /* Match if every bytes is equal.  */
3857 		  match_len = j;
3858 		  goto check_node_accept_bytes_match;
3859 		}
3860 	    }
3861 
3862 	  if (cset->nranges)
3863 	    {
3864 	      if (elem_len <= char_len)
3865 		{
3866 		  collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3867 		  in_collseq = __collseq_table_lookup (collseqwc, wc);
3868 		}
3869 	      else
3870 		in_collseq = find_collation_sequence_value (pin, elem_len);
3871 	    }
3872 	  /* match with range expression?  */
3873 	  /* FIXME: Implement rational ranges here, too.  */
3874 	  for (i = 0; i < cset->nranges; ++i)
3875 	    if (cset->range_starts[i] <= in_collseq
3876 		&& in_collseq <= cset->range_ends[i])
3877 	      {
3878 		match_len = elem_len;
3879 		goto check_node_accept_bytes_match;
3880 	      }
3881 
3882 	  /* match with equivalence_class?  */
3883 	  if (cset->nequiv_classes)
3884 	    {
3885 	      const unsigned char *cp = pin;
3886 	      table = (const int32_t *)
3887 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3888 	      weights = (const unsigned char *)
3889 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3890 	      extra = (const unsigned char *)
3891 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3892 	      indirect = (const int32_t *)
3893 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3894 	      int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3895 	      int32_t rule = idx >> 24;
3896 	      idx &= 0xffffff;
3897 	      if (idx > 0)
3898 		{
3899 		  size_t weight_len = weights[idx];
3900 		  for (i = 0; i < cset->nequiv_classes; ++i)
3901 		    {
3902 		      int32_t equiv_class_idx = cset->equiv_classes[i];
3903 		      int32_t equiv_class_rule = equiv_class_idx >> 24;
3904 		      equiv_class_idx &= 0xffffff;
3905 		      if (weights[equiv_class_idx] == weight_len
3906 			  && equiv_class_rule == rule
3907 			  && memcmp (weights + idx + 1,
3908 				     weights + equiv_class_idx + 1,
3909 				     weight_len) == 0)
3910 			{
3911 			  match_len = elem_len;
3912 			  goto check_node_accept_bytes_match;
3913 			}
3914 		    }
3915 		}
3916 	    }
3917 	}
3918       else
3919 # endif /* _LIBC */
3920 	{
3921 	  /* match with range expression?  */
3922 	  for (i = 0; i < cset->nranges; ++i)
3923 	    {
3924 	      if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3925 		{
3926 		  match_len = char_len;
3927 		  goto check_node_accept_bytes_match;
3928 		}
3929 	    }
3930 	}
3931     check_node_accept_bytes_match:
3932       if (!cset->non_match)
3933 	return match_len;
3934       else
3935 	{
3936 	  if (match_len > 0)
3937 	    return 0;
3938 	  else
3939 	    return (elem_len > char_len) ? elem_len : char_len;
3940 	}
3941     }
3942   return 0;
3943 }
3944 
3945 # ifdef _LIBC
3946 static unsigned int
find_collation_sequence_value(const unsigned char * mbs,size_t mbs_len)3947 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3948 {
3949   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3950   if (nrules == 0)
3951     {
3952       if (mbs_len == 1)
3953 	{
3954 	  /* No valid character.  Match it as a single byte character.  */
3955 	  const unsigned char *collseq = (const unsigned char *)
3956 	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3957 	  return collseq[mbs[0]];
3958 	}
3959       return UINT_MAX;
3960     }
3961   else
3962     {
3963       int32_t idx;
3964       const unsigned char *extra = (const unsigned char *)
3965 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3966       int32_t extrasize = (const unsigned char *)
3967 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3968 
3969       for (idx = 0; idx < extrasize;)
3970 	{
3971 	  int mbs_cnt;
3972 	  bool found = false;
3973 	  int32_t elem_mbs_len;
3974 	  /* Skip the name of collating element name.  */
3975 	  idx = idx + extra[idx] + 1;
3976 	  elem_mbs_len = extra[idx++];
3977 	  if (mbs_len == elem_mbs_len)
3978 	    {
3979 	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3980 		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3981 		  break;
3982 	      if (mbs_cnt == elem_mbs_len)
3983 		/* Found the entry.  */
3984 		found = true;
3985 	    }
3986 	  /* Skip the byte sequence of the collating element.  */
3987 	  idx += elem_mbs_len;
3988 	  /* Adjust for the alignment.  */
3989 	  idx = (idx + 3) & ~3;
3990 	  /* Skip the collation sequence value.  */
3991 	  idx += sizeof (uint32_t);
3992 	  /* Skip the wide char sequence of the collating element.  */
3993 	  idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3994 	  /* If we found the entry, return the sequence value.  */
3995 	  if (found)
3996 	    return *(uint32_t *) (extra + idx);
3997 	  /* Skip the collation sequence value.  */
3998 	  idx += sizeof (uint32_t);
3999 	}
4000       return UINT_MAX;
4001     }
4002 }
4003 # endif /* _LIBC */
4004 #endif /* RE_ENABLE_I18N */
4005 
4006 /* Check whether the node accepts the byte which is IDX-th
4007    byte of the INPUT.  */
4008 
4009 static bool
check_node_accept(const re_match_context_t * mctx,const re_token_t * node,Idx idx)4010 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4011 		   Idx idx)
4012 {
4013   unsigned char ch;
4014   ch = re_string_byte_at (&mctx->input, idx);
4015   switch (node->type)
4016     {
4017     case CHARACTER:
4018       if (node->opr.c != ch)
4019         return false;
4020       break;
4021 
4022     case SIMPLE_BRACKET:
4023       if (!bitset_contain (node->opr.sbcset, ch))
4024         return false;
4025       break;
4026 
4027 #ifdef RE_ENABLE_I18N
4028     case OP_UTF8_PERIOD:
4029       if (ch >= ASCII_CHARS)
4030         return false;
4031       FALLTHROUGH;
4032 #endif
4033     case OP_PERIOD:
4034       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4035 	  || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4036 	return false;
4037       break;
4038 
4039     default:
4040       return false;
4041     }
4042 
4043   if (node->constraint)
4044     {
4045       /* The node has constraints.  Check whether the current context
4046 	 satisfies the constraints.  */
4047       unsigned int context = re_string_context_at (&mctx->input, idx,
4048 						   mctx->eflags);
4049       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4050 	return false;
4051     }
4052 
4053   return true;
4054 }
4055 
4056 /* Extend the buffers, if the buffers have run out.  */
4057 
4058 static reg_errcode_t
4059 __attribute_warn_unused_result__
extend_buffers(re_match_context_t * mctx,int min_len)4060 extend_buffers (re_match_context_t *mctx, int min_len)
4061 {
4062   reg_errcode_t ret;
4063   re_string_t *pstr = &mctx->input;
4064 
4065   /* Avoid overflow.  */
4066   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4067 			<= pstr->bufs_len))
4068     return REG_ESPACE;
4069 
4070   /* Double the lengths of the buffers, but allocate at least MIN_LEN.  */
4071   ret = re_string_realloc_buffers (pstr,
4072 				   MAX (min_len,
4073 					MIN (pstr->len, pstr->bufs_len * 2)));
4074   if (__glibc_unlikely (ret != REG_NOERROR))
4075     return ret;
4076 
4077   if (mctx->state_log != NULL)
4078     {
4079       /* And double the length of state_log.  */
4080       /* XXX We have no indication of the size of this buffer.  If this
4081 	 allocation fail we have no indication that the state_log array
4082 	 does not have the right size.  */
4083       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4084 					      pstr->bufs_len + 1);
4085       if (__glibc_unlikely (new_array == NULL))
4086 	return REG_ESPACE;
4087       mctx->state_log = new_array;
4088     }
4089 
4090   /* Then reconstruct the buffers.  */
4091   if (pstr->icase)
4092     {
4093 #ifdef RE_ENABLE_I18N
4094       if (pstr->mb_cur_max > 1)
4095 	{
4096 	  ret = build_wcs_upper_buffer (pstr);
4097 	  if (__glibc_unlikely (ret != REG_NOERROR))
4098 	    return ret;
4099 	}
4100       else
4101 #endif /* RE_ENABLE_I18N  */
4102 	build_upper_buffer (pstr);
4103     }
4104   else
4105     {
4106 #ifdef RE_ENABLE_I18N
4107       if (pstr->mb_cur_max > 1)
4108 	build_wcs_buffer (pstr);
4109       else
4110 #endif /* RE_ENABLE_I18N  */
4111 	{
4112 	  if (pstr->trans != NULL)
4113 	    re_string_translate_buffer (pstr);
4114 	}
4115     }
4116   return REG_NOERROR;
4117 }
4118 
4119 
4120 /* Functions for matching context.  */
4121 
4122 /* Initialize MCTX.  */
4123 
4124 static reg_errcode_t
4125 __attribute_warn_unused_result__
match_ctx_init(re_match_context_t * mctx,int eflags,Idx n)4126 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4127 {
4128   mctx->eflags = eflags;
4129   mctx->match_last = -1;
4130   if (n > 0)
4131     {
4132       /* Avoid overflow.  */
4133       size_t max_object_size =
4134 	MAX (sizeof (struct re_backref_cache_entry),
4135 	     sizeof (re_sub_match_top_t *));
4136       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4137 	return REG_ESPACE;
4138 
4139       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4140       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4141       if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4142 	return REG_ESPACE;
4143     }
4144   /* Already zero-ed by the caller.
4145      else
4146        mctx->bkref_ents = NULL;
4147      mctx->nbkref_ents = 0;
4148      mctx->nsub_tops = 0;  */
4149   mctx->abkref_ents = n;
4150   mctx->max_mb_elem_len = 1;
4151   mctx->asub_tops = n;
4152   return REG_NOERROR;
4153 }
4154 
4155 /* Clean the entries which depend on the current input in MCTX.
4156    This function must be invoked when the matcher changes the start index
4157    of the input, or changes the input string.  */
4158 
4159 static void
match_ctx_clean(re_match_context_t * mctx)4160 match_ctx_clean (re_match_context_t *mctx)
4161 {
4162   Idx st_idx;
4163   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4164     {
4165       Idx sl_idx;
4166       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4167       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4168 	{
4169 	  re_sub_match_last_t *last = top->lasts[sl_idx];
4170 	  re_free (last->path.array);
4171 	  re_free (last);
4172 	}
4173       re_free (top->lasts);
4174       if (top->path)
4175 	{
4176 	  re_free (top->path->array);
4177 	  re_free (top->path);
4178 	}
4179       re_free (top);
4180     }
4181 
4182   mctx->nsub_tops = 0;
4183   mctx->nbkref_ents = 0;
4184 }
4185 
4186 /* Free all the memory associated with MCTX.  */
4187 
4188 static void
match_ctx_free(re_match_context_t * mctx)4189 match_ctx_free (re_match_context_t *mctx)
4190 {
4191   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4192   match_ctx_clean (mctx);
4193   re_free (mctx->sub_tops);
4194   re_free (mctx->bkref_ents);
4195 }
4196 
4197 /* Add a new backreference entry to MCTX.
4198    Note that we assume that caller never call this function with duplicate
4199    entry, and call with STR_IDX which isn't smaller than any existing entry.
4200 */
4201 
4202 static reg_errcode_t
4203 __attribute_warn_unused_result__
match_ctx_add_entry(re_match_context_t * mctx,Idx node,Idx str_idx,Idx from,Idx to)4204 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4205 		     Idx to)
4206 {
4207   if (mctx->nbkref_ents >= mctx->abkref_ents)
4208     {
4209       struct re_backref_cache_entry* new_entry;
4210       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4211 			      mctx->abkref_ents * 2);
4212       if (__glibc_unlikely (new_entry == NULL))
4213 	{
4214 	  re_free (mctx->bkref_ents);
4215 	  return REG_ESPACE;
4216 	}
4217       mctx->bkref_ents = new_entry;
4218       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4219 	      sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4220       mctx->abkref_ents *= 2;
4221     }
4222   if (mctx->nbkref_ents > 0
4223       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4224     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4225 
4226   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4227   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4228   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4229   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4230 
4231   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4232      If bit N is clear, means that this entry won't epsilon-transition to
4233      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4234      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4235      such node.
4236 
4237      A backreference does not epsilon-transition unless it is empty, so set
4238      to all zeros if FROM != TO.  */
4239   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4240     = (from == to ? -1 : 0);
4241 
4242   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4243   if (mctx->max_mb_elem_len < to - from)
4244     mctx->max_mb_elem_len = to - from;
4245   return REG_NOERROR;
4246 }
4247 
4248 /* Return the first entry with the same str_idx, or -1 if none is
4249    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4250 
4251 static Idx
search_cur_bkref_entry(const re_match_context_t * mctx,Idx str_idx)4252 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4253 {
4254   Idx left, right, mid, last;
4255   last = right = mctx->nbkref_ents;
4256   for (left = 0; left < right;)
4257     {
4258       mid = (left + right) / 2;
4259       if (mctx->bkref_ents[mid].str_idx < str_idx)
4260 	left = mid + 1;
4261       else
4262 	right = mid;
4263     }
4264   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4265     return left;
4266   else
4267     return -1;
4268 }
4269 
4270 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4271    at STR_IDX.  */
4272 
4273 static reg_errcode_t
4274 __attribute_warn_unused_result__
match_ctx_add_subtop(re_match_context_t * mctx,Idx node,Idx str_idx)4275 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4276 {
4277 #ifdef DEBUG
4278   assert (mctx->sub_tops != NULL);
4279   assert (mctx->asub_tops > 0);
4280 #endif
4281   if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4282     {
4283       Idx new_asub_tops = mctx->asub_tops * 2;
4284       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4285 						   re_sub_match_top_t *,
4286 						   new_asub_tops);
4287       if (__glibc_unlikely (new_array == NULL))
4288 	return REG_ESPACE;
4289       mctx->sub_tops = new_array;
4290       mctx->asub_tops = new_asub_tops;
4291     }
4292   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4293   if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4294     return REG_ESPACE;
4295   mctx->sub_tops[mctx->nsub_tops]->node = node;
4296   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4297   return REG_NOERROR;
4298 }
4299 
4300 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4301    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4302 
4303 static re_sub_match_last_t *
match_ctx_add_sublast(re_sub_match_top_t * subtop,Idx node,Idx str_idx)4304 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4305 {
4306   re_sub_match_last_t *new_entry;
4307   if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4308     {
4309       Idx new_alasts = 2 * subtop->alasts + 1;
4310       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4311 						    re_sub_match_last_t *,
4312 						    new_alasts);
4313       if (__glibc_unlikely (new_array == NULL))
4314 	return NULL;
4315       subtop->lasts = new_array;
4316       subtop->alasts = new_alasts;
4317     }
4318   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4319   if (__glibc_likely (new_entry != NULL))
4320     {
4321       subtop->lasts[subtop->nlasts] = new_entry;
4322       new_entry->node = node;
4323       new_entry->str_idx = str_idx;
4324       ++subtop->nlasts;
4325     }
4326   return new_entry;
4327 }
4328 
4329 static void
sift_ctx_init(re_sift_context_t * sctx,re_dfastate_t ** sifted_sts,re_dfastate_t ** limited_sts,Idx last_node,Idx last_str_idx)4330 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4331 	       re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4332 {
4333   sctx->sifted_states = sifted_sts;
4334   sctx->limited_states = limited_sts;
4335   sctx->last_node = last_node;
4336   sctx->last_str_idx = last_str_idx;
4337   re_node_set_init_empty (&sctx->limits);
4338 }
4339