1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2021 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <https://www.gnu.org/licenses/>.  */
19 
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
23 
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 					  size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 				     const re_dfastate_t *init_state,
28 				     char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 static void free_charset (re_charset_t *cset);
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 			       reg_errcode_t (fn (void *, bin_tree_t *)),
37 			       void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 				reg_errcode_t (fn (void *, bin_tree_t *)),
40 				void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 				 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 				   unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 					 Idx node, bool root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 			 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 			reg_syntax_t syntax);
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 			  reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 				  re_token_t *token, reg_syntax_t syntax,
63 				  Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 				 re_token_t *token, reg_syntax_t syntax,
66 				 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 				     re_token_t *token, reg_syntax_t syntax,
69 				     Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 				  re_token_t *token, reg_syntax_t syntax,
72 				  Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 				 re_dfa_t *dfa, re_token_t *token,
75 				 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 				      re_token_t *token, reg_syntax_t syntax,
78 				      reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 					    re_string_t *regexp,
81 					    re_token_t *token, int token_len,
82 					    re_dfa_t *dfa,
83 					    reg_syntax_t syntax,
84 					    bool accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 					  re_string_t *regexp,
87 					  re_token_t *token);
88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
89 					re_charset_t *mbcset,
90 					Idx *equiv_class_alloc,
91 					const unsigned char *name);
92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
93 				      bitset_t sbcset,
94 				      re_charset_t *mbcset,
95 				      Idx *char_class_alloc,
96 				      const char *class_name,
97 				      reg_syntax_t syntax);
98 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
99 				       RE_TRANSLATE_TYPE trans,
100 				       const char *class_name,
101 				       const char *extra,
102 				       bool non_match, reg_errcode_t *err);
103 static bin_tree_t *create_tree (re_dfa_t *dfa,
104 				bin_tree_t *left, bin_tree_t *right,
105 				re_token_type_t type);
106 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
107 				      bin_tree_t *left, bin_tree_t *right,
108 				      const re_token_t *token);
109 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
110 static void free_token (re_token_t *node);
111 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
112 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
113 
114 /* This table gives an error message for each of the error codes listed
115    in regex.h.  Obviously the order here has to be same as there.
116    POSIX doesn't require that we do anything for REG_NOERROR,
117    but why not be nice?  */
118 
119 static const char __re_error_msgid[] =
120   {
121 #define REG_NOERROR_IDX	0
122     gettext_noop ("Success")	/* REG_NOERROR */
123     "\0"
124 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
125     gettext_noop ("No match")	/* REG_NOMATCH */
126     "\0"
127 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
128     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
129     "\0"
130 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
131     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
132     "\0"
133 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
134     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
135     "\0"
136 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
137     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
138     "\0"
139 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
140     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
141     "\0"
142 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
143     gettext_noop ("Unmatched [, [^, [:, [., or [=")	/* REG_EBRACK */
144     "\0"
145 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
146     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
147     "\0"
148 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
149     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
150     "\0"
151 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
152     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
153     "\0"
154 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
155     gettext_noop ("Invalid range end")	/* REG_ERANGE */
156     "\0"
157 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
158     gettext_noop ("Memory exhausted") /* REG_ESPACE */
159     "\0"
160 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
161     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
162     "\0"
163 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
164     gettext_noop ("Premature end of regular expression") /* REG_EEND */
165     "\0"
166 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
167     gettext_noop ("Regular expression too big") /* REG_ESIZE */
168     "\0"
169 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
170     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
171   };
172 
173 static const size_t __re_error_msgid_idx[] =
174   {
175     REG_NOERROR_IDX,
176     REG_NOMATCH_IDX,
177     REG_BADPAT_IDX,
178     REG_ECOLLATE_IDX,
179     REG_ECTYPE_IDX,
180     REG_EESCAPE_IDX,
181     REG_ESUBREG_IDX,
182     REG_EBRACK_IDX,
183     REG_EPAREN_IDX,
184     REG_EBRACE_IDX,
185     REG_BADBR_IDX,
186     REG_ERANGE_IDX,
187     REG_ESPACE_IDX,
188     REG_BADRPT_IDX,
189     REG_EEND_IDX,
190     REG_ESIZE_IDX,
191     REG_ERPAREN_IDX
192   };
193 
194 /* Entry points for GNU code.  */
195 
196 /* re_compile_pattern is the GNU regular expression compiler: it
197    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
198    Returns 0 if the pattern was valid, otherwise an error string.
199 
200    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
201    are set in BUFP on entry.  */
202 
203 const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)204 re_compile_pattern (const char *pattern, size_t length,
205 		    struct re_pattern_buffer *bufp)
206 {
207   reg_errcode_t ret;
208 
209   /* And GNU code determines whether or not to get register information
210      by passing null for the REGS argument to re_match, etc., not by
211      setting no_sub, unless RE_NO_SUB is set.  */
212   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
213 
214   /* Match anchors at newline.  */
215   bufp->newline_anchor = 1;
216 
217   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
218 
219   if (!ret)
220     return NULL;
221   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
222 }
weak_alias(__re_compile_pattern,re_compile_pattern)223 weak_alias (__re_compile_pattern, re_compile_pattern)
224 
225 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
226    also be assigned to arbitrarily: each pattern buffer stores its own
227    syntax, so it can be changed between regex compilations.  */
228 /* This has no initializer because initialized variables in Emacs
229    become read-only after dumping.  */
230 reg_syntax_t re_syntax_options;
231 
232 
233 /* Specify the precise syntax of regexps for compilation.  This provides
234    for compatibility for various utilities which historically have
235    different, incompatible syntaxes.
236 
237    The argument SYNTAX is a bit mask comprised of the various bits
238    defined in regex.h.  We return the old syntax.  */
239 
240 reg_syntax_t
241 re_set_syntax (reg_syntax_t syntax)
242 {
243   reg_syntax_t ret = re_syntax_options;
244 
245   re_syntax_options = syntax;
246   return ret;
247 }
weak_alias(__re_set_syntax,re_set_syntax)248 weak_alias (__re_set_syntax, re_set_syntax)
249 
250 int
251 re_compile_fastmap (struct re_pattern_buffer *bufp)
252 {
253   re_dfa_t *dfa = bufp->buffer;
254   char *fastmap = bufp->fastmap;
255 
256   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
257   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
258   if (dfa->init_state != dfa->init_state_word)
259     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
260   if (dfa->init_state != dfa->init_state_nl)
261     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
262   if (dfa->init_state != dfa->init_state_begbuf)
263     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
264   bufp->fastmap_accurate = 1;
265   return 0;
266 }
weak_alias(__re_compile_fastmap,re_compile_fastmap)267 weak_alias (__re_compile_fastmap, re_compile_fastmap)
268 
269 static __always_inline void
270 re_set_fastmap (char *fastmap, bool icase, int ch)
271 {
272   fastmap[ch] = 1;
273   if (icase)
274     fastmap[tolower (ch)] = 1;
275 }
276 
277 /* Helper function for re_compile_fastmap.
278    Compile fastmap for the initial_state INIT_STATE.  */
279 
280 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)281 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
282 			 char *fastmap)
283 {
284   re_dfa_t *dfa = bufp->buffer;
285   Idx node_cnt;
286   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
287   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
288     {
289       Idx node = init_state->nodes.elems[node_cnt];
290       re_token_type_t type = dfa->nodes[node].type;
291 
292       if (type == CHARACTER)
293 	{
294 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
295 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
296 	    {
297 	      unsigned char buf[MB_LEN_MAX];
298 	      unsigned char *p;
299 	      wchar_t wc;
300 	      mbstate_t state;
301 
302 	      p = buf;
303 	      *p++ = dfa->nodes[node].opr.c;
304 	      while (++node < dfa->nodes_len
305 		     &&	dfa->nodes[node].type == CHARACTER
306 		     && dfa->nodes[node].mb_partial)
307 		*p++ = dfa->nodes[node].opr.c;
308 	      memset (&state, '\0', sizeof (state));
309 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
310 			     &state) == p - buf
311 		  && (__wcrtomb ((char *) buf, __towlower (wc), &state)
312 		      != (size_t) -1))
313 		re_set_fastmap (fastmap, false, buf[0]);
314 	    }
315 	}
316       else if (type == SIMPLE_BRACKET)
317 	{
318 	  int i, ch;
319 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
320 	    {
321 	      int j;
322 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
323 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
324 		if (w & ((bitset_word_t) 1 << j))
325 		  re_set_fastmap (fastmap, icase, ch);
326 	    }
327 	}
328       else if (type == COMPLEX_BRACKET)
329 	{
330 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
331 	  Idx i;
332 
333 #ifdef _LIBC
334 	  /* See if we have to try all bytes which start multiple collation
335 	     elements.
336 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
337 		  collation element, and don't catch 'b' since 'b' is
338 		  the only collation element which starts from 'b' (and
339 		  it is caught by SIMPLE_BRACKET).  */
340 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
341 		  && (cset->ncoll_syms || cset->nranges))
342 		{
343 		  const int32_t *table = (const int32_t *)
344 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
345 		  for (i = 0; i < SBC_MAX; ++i)
346 		    if (table[i] < 0)
347 		      re_set_fastmap (fastmap, icase, i);
348 		}
349 #endif /* _LIBC */
350 
351 	  /* See if we have to start the match at all multibyte characters,
352 	     i.e. where we would not find an invalid sequence.  This only
353 	     applies to multibyte character sets; for single byte character
354 	     sets, the SIMPLE_BRACKET again suffices.  */
355 	  if (dfa->mb_cur_max > 1
356 	      && (cset->nchar_classes || cset->non_match || cset->nranges
357 #ifdef _LIBC
358 		  || cset->nequiv_classes
359 #endif /* _LIBC */
360 		 ))
361 	    {
362 	      unsigned char c = 0;
363 	      do
364 		{
365 		  mbstate_t mbs;
366 		  memset (&mbs, 0, sizeof (mbs));
367 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
368 		    re_set_fastmap (fastmap, false, (int) c);
369 		}
370 	      while (++c != 0);
371 	    }
372 
373 	  else
374 	    {
375 	      /* ... Else catch all bytes which can start the mbchars.  */
376 	      for (i = 0; i < cset->nmbchars; ++i)
377 		{
378 		  char buf[256];
379 		  mbstate_t state;
380 		  memset (&state, '\0', sizeof (state));
381 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
382 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
383 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
384 		    {
385 		      if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
386 			  != (size_t) -1)
387 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
388 		    }
389 		}
390 	    }
391 	}
392       else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE)
393 	{
394 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
395 	  if (type == END_OF_RE)
396 	    bufp->can_be_null = 1;
397 	  return;
398 	}
399     }
400 }
401 
402 /* Entry point for POSIX code.  */
403 /* regcomp takes a regular expression as a string and compiles it.
404 
405    PREG is a regex_t *.  We do not expect any fields to be initialized,
406    since POSIX says we shouldn't.  Thus, we set
407 
408      'buffer' to the compiled pattern;
409      'used' to the length of the compiled pattern;
410      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
411        REG_EXTENDED bit in CFLAGS is set; otherwise, to
412        RE_SYNTAX_POSIX_BASIC;
413      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
414      'fastmap' to an allocated space for the fastmap;
415      'fastmap_accurate' to zero;
416      're_nsub' to the number of subexpressions in PATTERN.
417 
418    PATTERN is the address of the pattern string.
419 
420    CFLAGS is a series of bits which affect compilation.
421 
422      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
423      use POSIX basic syntax.
424 
425      If REG_NEWLINE is set, then . and [^...] don't match newline.
426      Also, regexec will try a match beginning after every newline.
427 
428      If REG_ICASE is set, then we considers upper- and lowercase
429      versions of letters to be equivalent when matching.
430 
431      If REG_NOSUB is set, then when PREG is passed to regexec, that
432      routine will report only success or failure, and nothing about the
433      registers.
434 
435    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
436    the return codes and their meanings.)  */
437 
438 int
regcomp(regex_t * __restrict preg,const char * __restrict pattern,int cflags)439 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
440 {
441   reg_errcode_t ret;
442   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
443 			 : RE_SYNTAX_POSIX_BASIC);
444 
445   preg->buffer = NULL;
446   preg->allocated = 0;
447   preg->used = 0;
448 
449   /* Try to allocate space for the fastmap.  */
450   preg->fastmap = re_malloc (char, SBC_MAX);
451   if (__glibc_unlikely (preg->fastmap == NULL))
452     return REG_ESPACE;
453 
454   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
455 
456   /* If REG_NEWLINE is set, newlines are treated differently.  */
457   if (cflags & REG_NEWLINE)
458     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
459       syntax &= ~RE_DOT_NEWLINE;
460       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
461       /* It also changes the matching behavior.  */
462       preg->newline_anchor = 1;
463     }
464   else
465     preg->newline_anchor = 0;
466   preg->no_sub = !!(cflags & REG_NOSUB);
467   preg->translate = NULL;
468 
469   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
470 
471   /* POSIX doesn't distinguish between an unmatched open-group and an
472      unmatched close-group: both are REG_EPAREN.  */
473   if (ret == REG_ERPAREN)
474     ret = REG_EPAREN;
475 
476   /* We have already checked preg->fastmap != NULL.  */
477   if (__glibc_likely (ret == REG_NOERROR))
478     /* Compute the fastmap now, since regexec cannot modify the pattern
479        buffer.  This function never fails in this implementation.  */
480     (void) re_compile_fastmap (preg);
481   else
482     {
483       /* Some error occurred while compiling the expression.  */
484       re_free (preg->fastmap);
485       preg->fastmap = NULL;
486     }
487 
488   return (int) ret;
489 }
490 libc_hidden_def (__regcomp)
weak_alias(__regcomp,regcomp)491 weak_alias (__regcomp, regcomp)
492 
493 /* Returns a message corresponding to an error code, ERRCODE, returned
494    from either regcomp or regexec.   We don't use PREG here.  */
495 
496 size_t
497 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
498 	  size_t errbuf_size)
499 {
500   const char *msg;
501   size_t msg_size;
502   int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
503 
504   if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
505     /* Only error codes returned by the rest of the code should be passed
506        to this routine.  If we are given anything else, or if other regex
507        code generates an invalid error code, then the program has a bug.
508        Dump core so we can fix it.  */
509     abort ();
510 
511   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
512 
513   msg_size = strlen (msg) + 1; /* Includes the null.  */
514 
515   if (__glibc_likely (errbuf_size != 0))
516     {
517       size_t cpy_size = msg_size;
518       if (__glibc_unlikely (msg_size > errbuf_size))
519 	{
520 	  cpy_size = errbuf_size - 1;
521 	  errbuf[cpy_size] = '\0';
522 	}
523       memcpy (errbuf, msg, cpy_size);
524     }
525 
526   return msg_size;
527 }
528 weak_alias (__regerror, regerror)
529 
530 
531 /* This static array is used for the map to single-byte characters when
532    UTF-8 is used.  Otherwise we would allocate memory just to initialize
533    it the same all the time.  UTF-8 is the preferred encoding so this is
534    a worthwhile optimization.  */
535 static const bitset_t utf8_sb_map =
536 {
537   /* Set the first 128 bits.  */
538 #if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
539   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
540 #else
541 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
542 #  error "bitset_word_t is narrower than 32 bits"
543 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
544   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
545 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
546   BITSET_WORD_MAX, BITSET_WORD_MAX,
547 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
548   BITSET_WORD_MAX,
549 # endif
550   (BITSET_WORD_MAX
551    >> (SBC_MAX % BITSET_WORD_BITS == 0
552        ? 0
553        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
554 #endif
555 };
556 
557 
558 static void
free_dfa_content(re_dfa_t * dfa)559 free_dfa_content (re_dfa_t *dfa)
560 {
561   Idx i, j;
562 
563   if (dfa->nodes)
564     for (i = 0; i < dfa->nodes_len; ++i)
565       free_token (dfa->nodes + i);
566   re_free (dfa->nexts);
567   for (i = 0; i < dfa->nodes_len; ++i)
568     {
569       if (dfa->eclosures != NULL)
570 	re_node_set_free (dfa->eclosures + i);
571       if (dfa->inveclosures != NULL)
572 	re_node_set_free (dfa->inveclosures + i);
573       if (dfa->edests != NULL)
574 	re_node_set_free (dfa->edests + i);
575     }
576   re_free (dfa->edests);
577   re_free (dfa->eclosures);
578   re_free (dfa->inveclosures);
579   re_free (dfa->nodes);
580 
581   if (dfa->state_table)
582     for (i = 0; i <= dfa->state_hash_mask; ++i)
583       {
584 	struct re_state_table_entry *entry = dfa->state_table + i;
585 	for (j = 0; j < entry->num; ++j)
586 	  {
587 	    re_dfastate_t *state = entry->array[j];
588 	    free_state (state);
589 	  }
590 	re_free (entry->array);
591       }
592   re_free (dfa->state_table);
593   if (dfa->sb_char != utf8_sb_map)
594     re_free (dfa->sb_char);
595   re_free (dfa->subexp_map);
596 #ifdef DEBUG
597   re_free (dfa->re_str);
598 #endif
599 
600   re_free (dfa);
601 }
602 
603 
604 /* Free dynamically allocated space used by PREG.  */
605 
606 void
regfree(regex_t * preg)607 regfree (regex_t *preg)
608 {
609   re_dfa_t *dfa = preg->buffer;
610   if (__glibc_likely (dfa != NULL))
611     {
612       lock_fini (dfa->lock);
613       free_dfa_content (dfa);
614     }
615   preg->buffer = NULL;
616   preg->allocated = 0;
617 
618   re_free (preg->fastmap);
619   preg->fastmap = NULL;
620 
621   re_free (preg->translate);
622   preg->translate = NULL;
623 }
624 libc_hidden_def (__regfree)
625 weak_alias (__regfree, regfree)
626 
627 /* Entry points compatible with 4.2 BSD regex library.  We don't define
628    them unless specifically requested.  */
629 
630 #if defined _REGEX_RE_COMP || defined _LIBC
631 
632 /* BSD has one and only one pattern buffer.  */
633 static struct re_pattern_buffer re_comp_buf;
634 
635 char *
636 # ifdef _LIBC
637 /* Make these definitions weak in libc, so POSIX programs can redefine
638    these names if they don't use our functions, and still use
639    regcomp/regexec above without link errors.  */
640 weak_function
641 # endif
re_comp(const char * s)642 re_comp (const char *s)
643 {
644   reg_errcode_t ret;
645   char *fastmap;
646 
647   if (!s)
648     {
649       if (!re_comp_buf.buffer)
650 	return gettext ("No previous regular expression");
651       return 0;
652     }
653 
654   if (re_comp_buf.buffer)
655     {
656       fastmap = re_comp_buf.fastmap;
657       re_comp_buf.fastmap = NULL;
658       __regfree (&re_comp_buf);
659       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
660       re_comp_buf.fastmap = fastmap;
661     }
662 
663   if (re_comp_buf.fastmap == NULL)
664     {
665       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
666       if (re_comp_buf.fastmap == NULL)
667 	return (char *) gettext (__re_error_msgid
668 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
669     }
670 
671   /* Since 're_exec' always passes NULL for the 'regs' argument, we
672      don't need to initialize the pattern buffer fields which affect it.  */
673 
674   /* Match anchors at newlines.  */
675   re_comp_buf.newline_anchor = 1;
676 
677   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
678 
679   if (!ret)
680     return NULL;
681 
682   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
683   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
684 }
685 
686 #ifdef _LIBC
libc_freeres_fn(free_mem)687 libc_freeres_fn (free_mem)
688 {
689   __regfree (&re_comp_buf);
690 }
691 #endif
692 
693 #endif /* _REGEX_RE_COMP */
694 
695 /* Internal entry point.
696    Compile the regular expression PATTERN, whose length is LENGTH.
697    SYNTAX indicate regular expression's syntax.  */
698 
699 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)700 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
701 		     reg_syntax_t syntax)
702 {
703   reg_errcode_t err = REG_NOERROR;
704   re_dfa_t *dfa;
705   re_string_t regexp;
706 
707   /* Initialize the pattern buffer.  */
708   preg->fastmap_accurate = 0;
709   preg->syntax = syntax;
710   preg->not_bol = preg->not_eol = 0;
711   preg->used = 0;
712   preg->re_nsub = 0;
713   preg->can_be_null = 0;
714   preg->regs_allocated = REGS_UNALLOCATED;
715 
716   /* Initialize the dfa.  */
717   dfa = preg->buffer;
718   if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
719     {
720       /* If zero allocated, but buffer is non-null, try to realloc
721 	 enough space.  This loses if buffer's address is bogus, but
722 	 that is the user's responsibility.  If ->buffer is NULL this
723 	 is a simple allocation.  */
724       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
725       if (dfa == NULL)
726 	return REG_ESPACE;
727       preg->allocated = sizeof (re_dfa_t);
728       preg->buffer = dfa;
729     }
730   preg->used = sizeof (re_dfa_t);
731 
732   err = init_dfa (dfa, length);
733   if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
734     err = REG_ESPACE;
735   if (__glibc_unlikely (err != REG_NOERROR))
736     {
737       free_dfa_content (dfa);
738       preg->buffer = NULL;
739       preg->allocated = 0;
740       return err;
741     }
742 #ifdef DEBUG
743   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
744   dfa->re_str = re_malloc (char, length + 1);
745   strncpy (dfa->re_str, pattern, length + 1);
746 #endif
747 
748   err = re_string_construct (&regexp, pattern, length, preg->translate,
749 			     (syntax & RE_ICASE) != 0, dfa);
750   if (__glibc_unlikely (err != REG_NOERROR))
751     {
752     re_compile_internal_free_return:
753       free_workarea_compile (preg);
754       re_string_destruct (&regexp);
755       lock_fini (dfa->lock);
756       free_dfa_content (dfa);
757       preg->buffer = NULL;
758       preg->allocated = 0;
759       return err;
760     }
761 
762   /* Parse the regular expression, and build a structure tree.  */
763   preg->re_nsub = 0;
764   dfa->str_tree = parse (&regexp, preg, syntax, &err);
765   if (__glibc_unlikely (dfa->str_tree == NULL))
766     goto re_compile_internal_free_return;
767 
768   /* Analyze the tree and create the nfa.  */
769   err = analyze (preg);
770   if (__glibc_unlikely (err != REG_NOERROR))
771     goto re_compile_internal_free_return;
772 
773   /* If possible, do searching in single byte encoding to speed things up.  */
774   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
775     optimize_utf8 (dfa);
776 
777   /* Then create the initial state of the dfa.  */
778   err = create_initial_state (dfa);
779 
780   /* Release work areas.  */
781   free_workarea_compile (preg);
782   re_string_destruct (&regexp);
783 
784   if (__glibc_unlikely (err != REG_NOERROR))
785     {
786       lock_fini (dfa->lock);
787       free_dfa_content (dfa);
788       preg->buffer = NULL;
789       preg->allocated = 0;
790     }
791 
792   return err;
793 }
794 
795 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
796    as the initial length of some arrays.  */
797 
798 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)799 init_dfa (re_dfa_t *dfa, size_t pat_len)
800 {
801   __re_size_t table_size;
802 #ifndef _LIBC
803   const char *codeset_name;
804 #endif
805   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
806   size_t max_object_size =
807     MAX (sizeof (struct re_state_table_entry),
808 	 MAX (sizeof (re_token_t),
809 	      MAX (sizeof (re_node_set),
810 		   MAX (sizeof (regmatch_t),
811 			max_i18n_object_size))));
812 
813   memset (dfa, '\0', sizeof (re_dfa_t));
814 
815   /* Force allocation of str_tree_storage the first time.  */
816   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
817 
818   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
819      calculation below, and for similar doubling calculations
820      elsewhere.  And it's <= rather than <, because some of the
821      doubling calculations add 1 afterwards.  */
822   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
823 			<= pat_len))
824     return REG_ESPACE;
825 
826   dfa->nodes_alloc = pat_len + 1;
827   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
828 
829   /*  table_size = 2 ^ ceil(log pat_len) */
830   for (table_size = 1; ; table_size <<= 1)
831     if (table_size > pat_len)
832       break;
833 
834   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
835   dfa->state_hash_mask = table_size - 1;
836 
837   dfa->mb_cur_max = MB_CUR_MAX;
838 #ifdef _LIBC
839   if (dfa->mb_cur_max == 6
840       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
841     dfa->is_utf8 = 1;
842   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
843 		       != 0);
844 #else
845   codeset_name = nl_langinfo (CODESET);
846   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
847       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
848       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
849       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
850     dfa->is_utf8 = 1;
851 
852   /* We check exhaustively in the loop below if this charset is a
853      superset of ASCII.  */
854   dfa->map_notascii = 0;
855 #endif
856 
857   if (dfa->mb_cur_max > 1)
858     {
859       if (dfa->is_utf8)
860 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
861       else
862 	{
863 	  int i, j, ch;
864 
865 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
866 	  if (__glibc_unlikely (dfa->sb_char == NULL))
867 	    return REG_ESPACE;
868 
869 	  /* Set the bits corresponding to single byte chars.  */
870 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
871 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
872 	      {
873 		wint_t wch = __btowc (ch);
874 		if (wch != WEOF)
875 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
876 #ifndef _LIBC
877 		if (isascii (ch) && wch != ch)
878 		  dfa->map_notascii = 1;
879 #endif
880 	      }
881 	}
882     }
883 
884   if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
885     return REG_ESPACE;
886   return REG_NOERROR;
887 }
888 
889 /* Initialize WORD_CHAR table, which indicate which character is
890    "word".  In this case "word" means that it is the word construction
891    character used by some operators like "\<", "\>", etc.  */
892 
893 static void
init_word_char(re_dfa_t * dfa)894 init_word_char (re_dfa_t *dfa)
895 {
896   int i = 0;
897   int j;
898   int ch = 0;
899   dfa->word_ops_used = 1;
900   if (__glibc_likely (dfa->map_notascii == 0))
901     {
902       bitset_word_t bits0 = 0x00000000;
903       bitset_word_t bits1 = 0x03ff0000;
904       bitset_word_t bits2 = 0x87fffffe;
905       bitset_word_t bits3 = 0x07fffffe;
906       if (BITSET_WORD_BITS == 64)
907 	{
908 	  /* Pacify gcc -Woverflow on 32-bit platformns.  */
909 	  dfa->word_char[0] = bits1 << 31 << 1 | bits0;
910 	  dfa->word_char[1] = bits3 << 31 << 1 | bits2;
911 	  i = 2;
912 	}
913       else if (BITSET_WORD_BITS == 32)
914 	{
915 	  dfa->word_char[0] = bits0;
916 	  dfa->word_char[1] = bits1;
917 	  dfa->word_char[2] = bits2;
918 	  dfa->word_char[3] = bits3;
919 	  i = 4;
920 	}
921       else
922         goto general_case;
923       ch = 128;
924 
925       if (__glibc_likely (dfa->is_utf8))
926 	{
927 	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
928 	  return;
929 	}
930     }
931 
932  general_case:
933   for (; i < BITSET_WORDS; ++i)
934     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
935       if (isalnum (ch) || ch == '_')
936 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
937 }
938 
939 /* Free the work area which are only used while compiling.  */
940 
941 static void
free_workarea_compile(regex_t * preg)942 free_workarea_compile (regex_t *preg)
943 {
944   re_dfa_t *dfa = preg->buffer;
945   bin_tree_storage_t *storage, *next;
946   for (storage = dfa->str_tree_storage; storage; storage = next)
947     {
948       next = storage->next;
949       re_free (storage);
950     }
951   dfa->str_tree_storage = NULL;
952   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
953   dfa->str_tree = NULL;
954   re_free (dfa->org_indices);
955   dfa->org_indices = NULL;
956 }
957 
958 /* Create initial states for all contexts.  */
959 
960 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)961 create_initial_state (re_dfa_t *dfa)
962 {
963   Idx first, i;
964   reg_errcode_t err;
965   re_node_set init_nodes;
966 
967   /* Initial states have the epsilon closure of the node which is
968      the first node of the regular expression.  */
969   first = dfa->str_tree->first->node_idx;
970   dfa->init_node = first;
971   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
972   if (__glibc_unlikely (err != REG_NOERROR))
973     return err;
974 
975   /* The back-references which are in initial states can epsilon transit,
976      since in this case all of the subexpressions can be null.
977      Then we add epsilon closures of the nodes which are the next nodes of
978      the back-references.  */
979   if (dfa->nbackref > 0)
980     for (i = 0; i < init_nodes.nelem; ++i)
981       {
982 	Idx node_idx = init_nodes.elems[i];
983 	re_token_type_t type = dfa->nodes[node_idx].type;
984 
985 	Idx clexp_idx;
986 	if (type != OP_BACK_REF)
987 	  continue;
988 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
989 	  {
990 	    re_token_t *clexp_node;
991 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
992 	    if (clexp_node->type == OP_CLOSE_SUBEXP
993 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
994 	      break;
995 	  }
996 	if (clexp_idx == init_nodes.nelem)
997 	  continue;
998 
999 	if (type == OP_BACK_REF)
1000 	  {
1001 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1002 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1003 	      {
1004 		reg_errcode_t merge_err
1005                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1006 		if (merge_err != REG_NOERROR)
1007 		  return merge_err;
1008 		i = 0;
1009 	      }
1010 	  }
1011       }
1012 
1013   /* It must be the first time to invoke acquire_state.  */
1014   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1015   /* We don't check ERR here, since the initial state must not be NULL.  */
1016   if (__glibc_unlikely (dfa->init_state == NULL))
1017     return err;
1018   if (dfa->init_state->has_constraint)
1019     {
1020       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1021 						       CONTEXT_WORD);
1022       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1023 						     CONTEXT_NEWLINE);
1024       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1025 							 &init_nodes,
1026 							 CONTEXT_NEWLINE
1027 							 | CONTEXT_BEGBUF);
1028       if (__glibc_unlikely (dfa->init_state_word == NULL
1029 			    || dfa->init_state_nl == NULL
1030 			    || dfa->init_state_begbuf == NULL))
1031 	return err;
1032     }
1033   else
1034     dfa->init_state_word = dfa->init_state_nl
1035       = dfa->init_state_begbuf = dfa->init_state;
1036 
1037   re_node_set_free (&init_nodes);
1038   return REG_NOERROR;
1039 }
1040 
1041 /* If it is possible to do searching in single byte encoding instead of UTF-8
1042    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1043    DFA nodes where needed.  */
1044 
1045 static void
optimize_utf8(re_dfa_t * dfa)1046 optimize_utf8 (re_dfa_t *dfa)
1047 {
1048   Idx node;
1049   int i;
1050   bool mb_chars = false;
1051   bool has_period = false;
1052 
1053   for (node = 0; node < dfa->nodes_len; ++node)
1054     switch (dfa->nodes[node].type)
1055       {
1056       case CHARACTER:
1057 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1058 	  mb_chars = true;
1059 	break;
1060       case ANCHOR:
1061 	switch (dfa->nodes[node].opr.ctx_type)
1062 	  {
1063 	  case LINE_FIRST:
1064 	  case LINE_LAST:
1065 	  case BUF_FIRST:
1066 	  case BUF_LAST:
1067 	    break;
1068 	  default:
1069 	    /* Word anchors etc. cannot be handled.  It's okay to test
1070 	       opr.ctx_type since constraints (for all DFA nodes) are
1071 	       created by ORing one or more opr.ctx_type values.  */
1072 	    return;
1073 	  }
1074 	break;
1075       case OP_PERIOD:
1076 	has_period = true;
1077 	break;
1078       case OP_BACK_REF:
1079       case OP_ALT:
1080       case END_OF_RE:
1081       case OP_DUP_ASTERISK:
1082       case OP_OPEN_SUBEXP:
1083       case OP_CLOSE_SUBEXP:
1084 	break;
1085       case COMPLEX_BRACKET:
1086 	return;
1087       case SIMPLE_BRACKET:
1088 	/* Just double check.  */
1089 	{
1090 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1091 			? 0
1092 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1093 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1094 	    {
1095 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1096 		return;
1097 	      rshift = 0;
1098 	    }
1099 	}
1100 	break;
1101       default:
1102 	abort ();
1103       }
1104 
1105   if (mb_chars || has_period)
1106     for (node = 0; node < dfa->nodes_len; ++node)
1107       {
1108 	if (dfa->nodes[node].type == CHARACTER
1109 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1110 	  dfa->nodes[node].mb_partial = 0;
1111 	else if (dfa->nodes[node].type == OP_PERIOD)
1112 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1113       }
1114 
1115   /* The search can be in single byte locale.  */
1116   dfa->mb_cur_max = 1;
1117   dfa->is_utf8 = 0;
1118   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1119 }
1120 
1121 /* Analyze the structure tree, and calculate "first", "next", "edest",
1122    "eclosure", and "inveclosure".  */
1123 
1124 static reg_errcode_t
analyze(regex_t * preg)1125 analyze (regex_t *preg)
1126 {
1127   re_dfa_t *dfa = preg->buffer;
1128   reg_errcode_t ret;
1129 
1130   /* Allocate arrays.  */
1131   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1132   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1133   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135   if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1136 			|| dfa->edests == NULL || dfa->eclosures == NULL))
1137     return REG_ESPACE;
1138 
1139   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1140   if (dfa->subexp_map != NULL)
1141     {
1142       Idx i;
1143       for (i = 0; i < preg->re_nsub; i++)
1144 	dfa->subexp_map[i] = i;
1145       preorder (dfa->str_tree, optimize_subexps, dfa);
1146       for (i = 0; i < preg->re_nsub; i++)
1147 	if (dfa->subexp_map[i] != i)
1148 	  break;
1149       if (i == preg->re_nsub)
1150 	{
1151 	  re_free (dfa->subexp_map);
1152 	  dfa->subexp_map = NULL;
1153 	}
1154     }
1155 
1156   ret = postorder (dfa->str_tree, lower_subexps, preg);
1157   if (__glibc_unlikely (ret != REG_NOERROR))
1158     return ret;
1159   ret = postorder (dfa->str_tree, calc_first, dfa);
1160   if (__glibc_unlikely (ret != REG_NOERROR))
1161     return ret;
1162   preorder (dfa->str_tree, calc_next, dfa);
1163   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1164   if (__glibc_unlikely (ret != REG_NOERROR))
1165     return ret;
1166   ret = calc_eclosure (dfa);
1167   if (__glibc_unlikely (ret != REG_NOERROR))
1168     return ret;
1169 
1170   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1171      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1172   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1173       || dfa->nbackref)
1174     {
1175       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1176       if (__glibc_unlikely (dfa->inveclosures == NULL))
1177 	return REG_ESPACE;
1178       ret = calc_inveclosure (dfa);
1179     }
1180 
1181   return ret;
1182 }
1183 
1184 /* Our parse trees are very unbalanced, so we cannot use a stack to
1185    implement parse tree visits.  Instead, we use parent pointers and
1186    some hairy code in these two functions.  */
1187 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1188 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1189 	   void *extra)
1190 {
1191   bin_tree_t *node, *prev;
1192 
1193   for (node = root; ; )
1194     {
1195       /* Descend down the tree, preferably to the left (or to the right
1196 	 if that's the only child).  */
1197       while (node->left || node->right)
1198 	if (node->left)
1199 	  node = node->left;
1200 	else
1201 	  node = node->right;
1202 
1203       do
1204 	{
1205 	  reg_errcode_t err = fn (extra, node);
1206 	  if (__glibc_unlikely (err != REG_NOERROR))
1207 	    return err;
1208 	  if (node->parent == NULL)
1209 	    return REG_NOERROR;
1210 	  prev = node;
1211 	  node = node->parent;
1212 	}
1213       /* Go up while we have a node that is reached from the right.  */
1214       while (node->right == prev || node->right == NULL);
1215       node = node->right;
1216     }
1217 }
1218 
1219 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1220 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1221 	  void *extra)
1222 {
1223   bin_tree_t *node;
1224 
1225   for (node = root; ; )
1226     {
1227       reg_errcode_t err = fn (extra, node);
1228       if (__glibc_unlikely (err != REG_NOERROR))
1229 	return err;
1230 
1231       /* Go to the left node, or up and to the right.  */
1232       if (node->left)
1233 	node = node->left;
1234       else
1235 	{
1236 	  bin_tree_t *prev = NULL;
1237 	  while (node->right == prev || node->right == NULL)
1238 	    {
1239 	      prev = node;
1240 	      node = node->parent;
1241 	      if (!node)
1242 		return REG_NOERROR;
1243 	    }
1244 	  node = node->right;
1245 	}
1246     }
1247 }
1248 
1249 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1250    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1251    backreferences as well.  Requires a preorder visit.  */
1252 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1253 optimize_subexps (void *extra, bin_tree_t *node)
1254 {
1255   re_dfa_t *dfa = (re_dfa_t *) extra;
1256 
1257   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1258     {
1259       int idx = node->token.opr.idx;
1260       node->token.opr.idx = dfa->subexp_map[idx];
1261       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1262     }
1263 
1264   else if (node->token.type == SUBEXP
1265 	   && node->left && node->left->token.type == SUBEXP)
1266     {
1267       Idx other_idx = node->left->token.opr.idx;
1268 
1269       node->left = node->left->left;
1270       if (node->left)
1271 	node->left->parent = node;
1272 
1273       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1274       if (other_idx < BITSET_WORD_BITS)
1275 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1276     }
1277 
1278   return REG_NOERROR;
1279 }
1280 
1281 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1282    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1283 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1284 lower_subexps (void *extra, bin_tree_t *node)
1285 {
1286   regex_t *preg = (regex_t *) extra;
1287   reg_errcode_t err = REG_NOERROR;
1288 
1289   if (node->left && node->left->token.type == SUBEXP)
1290     {
1291       node->left = lower_subexp (&err, preg, node->left);
1292       if (node->left)
1293 	node->left->parent = node;
1294     }
1295   if (node->right && node->right->token.type == SUBEXP)
1296     {
1297       node->right = lower_subexp (&err, preg, node->right);
1298       if (node->right)
1299 	node->right->parent = node;
1300     }
1301 
1302   return err;
1303 }
1304 
1305 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1306 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1307 {
1308   re_dfa_t *dfa = preg->buffer;
1309   bin_tree_t *body = node->left;
1310   bin_tree_t *op, *cls, *tree1, *tree;
1311 
1312   if (preg->no_sub
1313       /* We do not optimize empty subexpressions, because otherwise we may
1314 	 have bad CONCAT nodes with NULL children.  This is obviously not
1315 	 very common, so we do not lose much.  An example that triggers
1316 	 this case is the sed "script" /\(\)/x.  */
1317       && node->left != NULL
1318       && (node->token.opr.idx >= BITSET_WORD_BITS
1319 	  || !(dfa->used_bkref_map
1320 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1321     return node->left;
1322 
1323   /* Convert the SUBEXP node to the concatenation of an
1324      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1325   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1326   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1327   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1328   tree = create_tree (dfa, op, tree1, CONCAT);
1329   if (__glibc_unlikely (tree == NULL || tree1 == NULL
1330 			|| op == NULL || cls == NULL))
1331     {
1332       *err = REG_ESPACE;
1333       return NULL;
1334     }
1335 
1336   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1337   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1338   return tree;
1339 }
1340 
1341 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1342    nodes.  Requires a postorder visit.  */
1343 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1344 calc_first (void *extra, bin_tree_t *node)
1345 {
1346   re_dfa_t *dfa = (re_dfa_t *) extra;
1347   if (node->token.type == CONCAT)
1348     {
1349       node->first = node->left->first;
1350       node->node_idx = node->left->node_idx;
1351     }
1352   else
1353     {
1354       node->first = node;
1355       node->node_idx = re_dfa_add_node (dfa, node->token);
1356       if (__glibc_unlikely (node->node_idx == -1))
1357 	return REG_ESPACE;
1358       if (node->token.type == ANCHOR)
1359 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1360     }
1361   return REG_NOERROR;
1362 }
1363 
1364 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1365 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1366 calc_next (void *extra, bin_tree_t *node)
1367 {
1368   switch (node->token.type)
1369     {
1370     case OP_DUP_ASTERISK:
1371       node->left->next = node;
1372       break;
1373     case CONCAT:
1374       node->left->next = node->right->first;
1375       node->right->next = node->next;
1376       break;
1377     default:
1378       if (node->left)
1379 	node->left->next = node->next;
1380       if (node->right)
1381 	node->right->next = node->next;
1382       break;
1383     }
1384   return REG_NOERROR;
1385 }
1386 
1387 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1388 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1389 link_nfa_nodes (void *extra, bin_tree_t *node)
1390 {
1391   re_dfa_t *dfa = (re_dfa_t *) extra;
1392   Idx idx = node->node_idx;
1393   reg_errcode_t err = REG_NOERROR;
1394 
1395   switch (node->token.type)
1396     {
1397     case CONCAT:
1398       break;
1399 
1400     case END_OF_RE:
1401       DEBUG_ASSERT (node->next == NULL);
1402       break;
1403 
1404     case OP_DUP_ASTERISK:
1405     case OP_ALT:
1406       {
1407 	Idx left, right;
1408 	dfa->has_plural_match = 1;
1409 	if (node->left != NULL)
1410 	  left = node->left->first->node_idx;
1411 	else
1412 	  left = node->next->node_idx;
1413 	if (node->right != NULL)
1414 	  right = node->right->first->node_idx;
1415 	else
1416 	  right = node->next->node_idx;
1417 	DEBUG_ASSERT (left > -1);
1418 	DEBUG_ASSERT (right > -1);
1419 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1420       }
1421       break;
1422 
1423     case ANCHOR:
1424     case OP_OPEN_SUBEXP:
1425     case OP_CLOSE_SUBEXP:
1426       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1427       break;
1428 
1429     case OP_BACK_REF:
1430       dfa->nexts[idx] = node->next->node_idx;
1431       if (node->token.type == OP_BACK_REF)
1432 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1433       break;
1434 
1435     default:
1436       DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1437       dfa->nexts[idx] = node->next->node_idx;
1438       break;
1439     }
1440 
1441   return err;
1442 }
1443 
1444 /* Duplicate the epsilon closure of the node ROOT_NODE.
1445    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1446    to their own constraint.  */
1447 
1448 static reg_errcode_t
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1449 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1450 			Idx root_node, unsigned int init_constraint)
1451 {
1452   Idx org_node, clone_node;
1453   bool ok;
1454   unsigned int constraint = init_constraint;
1455   for (org_node = top_org_node, clone_node = top_clone_node;;)
1456     {
1457       Idx org_dest, clone_dest;
1458       if (dfa->nodes[org_node].type == OP_BACK_REF)
1459 	{
1460 	  /* If the back reference epsilon-transit, its destination must
1461 	     also have the constraint.  Then duplicate the epsilon closure
1462 	     of the destination of the back reference, and store it in
1463 	     edests of the back reference.  */
1464 	  org_dest = dfa->nexts[org_node];
1465 	  re_node_set_empty (dfa->edests + clone_node);
1466 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1467 	  if (__glibc_unlikely (clone_dest == -1))
1468 	    return REG_ESPACE;
1469 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1470 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1471 	  if (__glibc_unlikely (! ok))
1472 	    return REG_ESPACE;
1473 	}
1474       else if (dfa->edests[org_node].nelem == 0)
1475 	{
1476 	  /* In case of the node can't epsilon-transit, don't duplicate the
1477 	     destination and store the original destination as the
1478 	     destination of the node.  */
1479 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1480 	  break;
1481 	}
1482       else if (dfa->edests[org_node].nelem == 1)
1483 	{
1484 	  /* In case of the node can epsilon-transit, and it has only one
1485 	     destination.  */
1486 	  org_dest = dfa->edests[org_node].elems[0];
1487 	  re_node_set_empty (dfa->edests + clone_node);
1488 	  /* If the node is root_node itself, it means the epsilon closure
1489 	     has a loop.  Then tie it to the destination of the root_node.  */
1490 	  if (org_node == root_node && clone_node != org_node)
1491 	    {
1492 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1493 	      if (__glibc_unlikely (! ok))
1494 	        return REG_ESPACE;
1495 	      break;
1496 	    }
1497 	  /* In case the node has another constraint, append it.  */
1498 	  constraint |= dfa->nodes[org_node].constraint;
1499 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1500 	  if (__glibc_unlikely (clone_dest == -1))
1501 	    return REG_ESPACE;
1502 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1503 	  if (__glibc_unlikely (! ok))
1504 	    return REG_ESPACE;
1505 	}
1506       else /* dfa->edests[org_node].nelem == 2 */
1507 	{
1508 	  /* In case of the node can epsilon-transit, and it has two
1509 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1510 	  org_dest = dfa->edests[org_node].elems[0];
1511 	  re_node_set_empty (dfa->edests + clone_node);
1512 	  /* Search for a duplicated node which satisfies the constraint.  */
1513 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1514 	  if (clone_dest == -1)
1515 	    {
1516 	      /* There is no such duplicated node, create a new one.  */
1517 	      reg_errcode_t err;
1518 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1519 	      if (__glibc_unlikely (clone_dest == -1))
1520 		return REG_ESPACE;
1521 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1522 	      if (__glibc_unlikely (! ok))
1523 		return REG_ESPACE;
1524 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1525 					    root_node, constraint);
1526 	      if (__glibc_unlikely (err != REG_NOERROR))
1527 		return err;
1528 	    }
1529 	  else
1530 	    {
1531 	      /* There is a duplicated node which satisfies the constraint,
1532 		 use it to avoid infinite loop.  */
1533 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534 	      if (__glibc_unlikely (! ok))
1535 		return REG_ESPACE;
1536 	    }
1537 
1538 	  org_dest = dfa->edests[org_node].elems[1];
1539 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1540 	  if (__glibc_unlikely (clone_dest == -1))
1541 	    return REG_ESPACE;
1542 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1543 	  if (__glibc_unlikely (! ok))
1544 	    return REG_ESPACE;
1545 	}
1546       org_node = org_dest;
1547       clone_node = clone_dest;
1548     }
1549   return REG_NOERROR;
1550 }
1551 
1552 /* Search for a node which is duplicated from the node ORG_NODE, and
1553    satisfies the constraint CONSTRAINT.  */
1554 
1555 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1556 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1557 			unsigned int constraint)
1558 {
1559   Idx idx;
1560   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1561     {
1562       if (org_node == dfa->org_indices[idx]
1563 	  && constraint == dfa->nodes[idx].constraint)
1564 	return idx; /* Found.  */
1565     }
1566   return -1; /* Not found.  */
1567 }
1568 
1569 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1570    Return the index of the new node, or -1 if insufficient storage is
1571    available.  */
1572 
1573 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1574 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1575 {
1576   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1577   if (__glibc_likely (dup_idx != -1))
1578     {
1579       dfa->nodes[dup_idx].constraint = constraint;
1580       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1581       dfa->nodes[dup_idx].duplicated = 1;
1582 
1583       /* Store the index of the original node.  */
1584       dfa->org_indices[dup_idx] = org_idx;
1585     }
1586   return dup_idx;
1587 }
1588 
1589 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1590 calc_inveclosure (re_dfa_t *dfa)
1591 {
1592   Idx src, idx;
1593   bool ok;
1594   for (idx = 0; idx < dfa->nodes_len; ++idx)
1595     re_node_set_init_empty (dfa->inveclosures + idx);
1596 
1597   for (src = 0; src < dfa->nodes_len; ++src)
1598     {
1599       Idx *elems = dfa->eclosures[src].elems;
1600       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1601 	{
1602 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1603 	  if (__glibc_unlikely (! ok))
1604 	    return REG_ESPACE;
1605 	}
1606     }
1607 
1608   return REG_NOERROR;
1609 }
1610 
1611 /* Calculate "eclosure" for all the node in DFA.  */
1612 
1613 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1614 calc_eclosure (re_dfa_t *dfa)
1615 {
1616   Idx node_idx;
1617   bool incomplete;
1618   DEBUG_ASSERT (dfa->nodes_len > 0);
1619   incomplete = false;
1620   /* For each nodes, calculate epsilon closure.  */
1621   for (node_idx = 0; ; ++node_idx)
1622     {
1623       reg_errcode_t err;
1624       re_node_set eclosure_elem;
1625       if (node_idx == dfa->nodes_len)
1626 	{
1627 	  if (!incomplete)
1628 	    break;
1629 	  incomplete = false;
1630 	  node_idx = 0;
1631 	}
1632 
1633       DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1634 
1635       /* If we have already calculated, skip it.  */
1636       if (dfa->eclosures[node_idx].nelem != 0)
1637 	continue;
1638       /* Calculate epsilon closure of 'node_idx'.  */
1639       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1640       if (__glibc_unlikely (err != REG_NOERROR))
1641 	return err;
1642 
1643       if (dfa->eclosures[node_idx].nelem == 0)
1644 	{
1645 	  incomplete = true;
1646 	  re_node_set_free (&eclosure_elem);
1647 	}
1648     }
1649   return REG_NOERROR;
1650 }
1651 
1652 /* Calculate epsilon closure of NODE.  */
1653 
1654 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1655 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1656 {
1657   reg_errcode_t err;
1658   Idx i;
1659   re_node_set eclosure;
1660   bool incomplete = false;
1661   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1662   if (__glibc_unlikely (err != REG_NOERROR))
1663     return err;
1664 
1665   /* An epsilon closure includes itself.  */
1666   eclosure.elems[eclosure.nelem++] = node;
1667 
1668   /* This indicates that we are calculating this node now.
1669      We reference this value to avoid infinite loop.  */
1670   dfa->eclosures[node].nelem = -1;
1671 
1672   /* If the current node has constraints, duplicate all nodes
1673      since they must inherit the constraints.  */
1674   if (dfa->nodes[node].constraint
1675       && dfa->edests[node].nelem
1676       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1677     {
1678       err = duplicate_node_closure (dfa, node, node, node,
1679 				    dfa->nodes[node].constraint);
1680       if (__glibc_unlikely (err != REG_NOERROR))
1681 	return err;
1682     }
1683 
1684   /* Expand each epsilon destination nodes.  */
1685   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1686     for (i = 0; i < dfa->edests[node].nelem; ++i)
1687       {
1688 	re_node_set eclosure_elem;
1689 	Idx edest = dfa->edests[node].elems[i];
1690 	/* If calculating the epsilon closure of 'edest' is in progress,
1691 	   return intermediate result.  */
1692 	if (dfa->eclosures[edest].nelem == -1)
1693 	  {
1694 	    incomplete = true;
1695 	    continue;
1696 	  }
1697 	/* If we haven't calculated the epsilon closure of 'edest' yet,
1698 	   calculate now. Otherwise use calculated epsilon closure.  */
1699 	if (dfa->eclosures[edest].nelem == 0)
1700 	  {
1701 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1702 	    if (__glibc_unlikely (err != REG_NOERROR))
1703 	      return err;
1704 	  }
1705 	else
1706 	  eclosure_elem = dfa->eclosures[edest];
1707 	/* Merge the epsilon closure of 'edest'.  */
1708 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1709 	if (__glibc_unlikely (err != REG_NOERROR))
1710 	  return err;
1711 	/* If the epsilon closure of 'edest' is incomplete,
1712 	   the epsilon closure of this node is also incomplete.  */
1713 	if (dfa->eclosures[edest].nelem == 0)
1714 	  {
1715 	    incomplete = true;
1716 	    re_node_set_free (&eclosure_elem);
1717 	  }
1718       }
1719 
1720   if (incomplete && !root)
1721     dfa->eclosures[node].nelem = 0;
1722   else
1723     dfa->eclosures[node] = eclosure;
1724   *new_set = eclosure;
1725   return REG_NOERROR;
1726 }
1727 
1728 /* Functions for token which are used in the parser.  */
1729 
1730 /* Fetch a token from INPUT.
1731    We must not use this function inside bracket expressions.  */
1732 
1733 static void
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1734 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1735 {
1736   re_string_skip_bytes (input, peek_token (result, input, syntax));
1737 }
1738 
1739 /* Peek a token from INPUT, and return the length of the token.
1740    We must not use this function inside bracket expressions.  */
1741 
1742 static int
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1743 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1744 {
1745   unsigned char c;
1746 
1747   if (re_string_eoi (input))
1748     {
1749       token->type = END_OF_RE;
1750       return 0;
1751     }
1752 
1753   c = re_string_peek_byte (input, 0);
1754   token->opr.c = c;
1755 
1756   token->word_char = 0;
1757   token->mb_partial = 0;
1758   if (input->mb_cur_max > 1
1759       && !re_string_first_byte (input, re_string_cur_idx (input)))
1760     {
1761       token->type = CHARACTER;
1762       token->mb_partial = 1;
1763       return 1;
1764     }
1765   if (c == '\\')
1766     {
1767       unsigned char c2;
1768       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1769 	{
1770 	  token->type = BACK_SLASH;
1771 	  return 1;
1772 	}
1773 
1774       c2 = re_string_peek_byte_case (input, 1);
1775       token->opr.c = c2;
1776       token->type = CHARACTER;
1777       if (input->mb_cur_max > 1)
1778 	{
1779 	  wint_t wc = re_string_wchar_at (input,
1780 					  re_string_cur_idx (input) + 1);
1781 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1782 	}
1783       else
1784 	token->word_char = IS_WORD_CHAR (c2) != 0;
1785 
1786       switch (c2)
1787 	{
1788 	case '|':
1789 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1790 	    token->type = OP_ALT;
1791 	  break;
1792 	case '1': case '2': case '3': case '4': case '5':
1793 	case '6': case '7': case '8': case '9':
1794 	  if (!(syntax & RE_NO_BK_REFS))
1795 	    {
1796 	      token->type = OP_BACK_REF;
1797 	      token->opr.idx = c2 - '1';
1798 	    }
1799 	  break;
1800 	case '<':
1801 	  if (!(syntax & RE_NO_GNU_OPS))
1802 	    {
1803 	      token->type = ANCHOR;
1804 	      token->opr.ctx_type = WORD_FIRST;
1805 	    }
1806 	  break;
1807 	case '>':
1808 	  if (!(syntax & RE_NO_GNU_OPS))
1809 	    {
1810 	      token->type = ANCHOR;
1811 	      token->opr.ctx_type = WORD_LAST;
1812 	    }
1813 	  break;
1814 	case 'b':
1815 	  if (!(syntax & RE_NO_GNU_OPS))
1816 	    {
1817 	      token->type = ANCHOR;
1818 	      token->opr.ctx_type = WORD_DELIM;
1819 	    }
1820 	  break;
1821 	case 'B':
1822 	  if (!(syntax & RE_NO_GNU_OPS))
1823 	    {
1824 	      token->type = ANCHOR;
1825 	      token->opr.ctx_type = NOT_WORD_DELIM;
1826 	    }
1827 	  break;
1828 	case 'w':
1829 	  if (!(syntax & RE_NO_GNU_OPS))
1830 	    token->type = OP_WORD;
1831 	  break;
1832 	case 'W':
1833 	  if (!(syntax & RE_NO_GNU_OPS))
1834 	    token->type = OP_NOTWORD;
1835 	  break;
1836 	case 's':
1837 	  if (!(syntax & RE_NO_GNU_OPS))
1838 	    token->type = OP_SPACE;
1839 	  break;
1840 	case 'S':
1841 	  if (!(syntax & RE_NO_GNU_OPS))
1842 	    token->type = OP_NOTSPACE;
1843 	  break;
1844 	case '`':
1845 	  if (!(syntax & RE_NO_GNU_OPS))
1846 	    {
1847 	      token->type = ANCHOR;
1848 	      token->opr.ctx_type = BUF_FIRST;
1849 	    }
1850 	  break;
1851 	case '\'':
1852 	  if (!(syntax & RE_NO_GNU_OPS))
1853 	    {
1854 	      token->type = ANCHOR;
1855 	      token->opr.ctx_type = BUF_LAST;
1856 	    }
1857 	  break;
1858 	case '(':
1859 	  if (!(syntax & RE_NO_BK_PARENS))
1860 	    token->type = OP_OPEN_SUBEXP;
1861 	  break;
1862 	case ')':
1863 	  if (!(syntax & RE_NO_BK_PARENS))
1864 	    token->type = OP_CLOSE_SUBEXP;
1865 	  break;
1866 	case '+':
1867 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1868 	    token->type = OP_DUP_PLUS;
1869 	  break;
1870 	case '?':
1871 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1872 	    token->type = OP_DUP_QUESTION;
1873 	  break;
1874 	case '{':
1875 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1876 	    token->type = OP_OPEN_DUP_NUM;
1877 	  break;
1878 	case '}':
1879 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1880 	    token->type = OP_CLOSE_DUP_NUM;
1881 	  break;
1882 	default:
1883 	  break;
1884 	}
1885       return 2;
1886     }
1887 
1888   token->type = CHARACTER;
1889   if (input->mb_cur_max > 1)
1890     {
1891       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1892       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1893     }
1894   else
1895     token->word_char = IS_WORD_CHAR (token->opr.c);
1896 
1897   switch (c)
1898     {
1899     case '\n':
1900       if (syntax & RE_NEWLINE_ALT)
1901 	token->type = OP_ALT;
1902       break;
1903     case '|':
1904       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1905 	token->type = OP_ALT;
1906       break;
1907     case '*':
1908       token->type = OP_DUP_ASTERISK;
1909       break;
1910     case '+':
1911       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1912 	token->type = OP_DUP_PLUS;
1913       break;
1914     case '?':
1915       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1916 	token->type = OP_DUP_QUESTION;
1917       break;
1918     case '{':
1919       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1920 	token->type = OP_OPEN_DUP_NUM;
1921       break;
1922     case '}':
1923       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1924 	token->type = OP_CLOSE_DUP_NUM;
1925       break;
1926     case '(':
1927       if (syntax & RE_NO_BK_PARENS)
1928 	token->type = OP_OPEN_SUBEXP;
1929       break;
1930     case ')':
1931       if (syntax & RE_NO_BK_PARENS)
1932 	token->type = OP_CLOSE_SUBEXP;
1933       break;
1934     case '[':
1935       token->type = OP_OPEN_BRACKET;
1936       break;
1937     case '.':
1938       token->type = OP_PERIOD;
1939       break;
1940     case '^':
1941       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1942 	  && re_string_cur_idx (input) != 0)
1943 	{
1944 	  char prev = re_string_peek_byte (input, -1);
1945 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1946 	    break;
1947 	}
1948       token->type = ANCHOR;
1949       token->opr.ctx_type = LINE_FIRST;
1950       break;
1951     case '$':
1952       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1953 	  && re_string_cur_idx (input) + 1 != re_string_length (input))
1954 	{
1955 	  re_token_t next;
1956 	  re_string_skip_bytes (input, 1);
1957 	  peek_token (&next, input, syntax);
1958 	  re_string_skip_bytes (input, -1);
1959 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1960 	    break;
1961 	}
1962       token->type = ANCHOR;
1963       token->opr.ctx_type = LINE_LAST;
1964       break;
1965     default:
1966       break;
1967     }
1968   return 1;
1969 }
1970 
1971 /* Peek a token from INPUT, and return the length of the token.
1972    We must not use this function out of bracket expressions.  */
1973 
1974 static int
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1975 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1976 {
1977   unsigned char c;
1978   if (re_string_eoi (input))
1979     {
1980       token->type = END_OF_RE;
1981       return 0;
1982     }
1983   c = re_string_peek_byte (input, 0);
1984   token->opr.c = c;
1985 
1986   if (input->mb_cur_max > 1
1987       && !re_string_first_byte (input, re_string_cur_idx (input)))
1988     {
1989       token->type = CHARACTER;
1990       return 1;
1991     }
1992 
1993   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1994       && re_string_cur_idx (input) + 1 < re_string_length (input))
1995     {
1996       /* In this case, '\' escape a character.  */
1997       unsigned char c2;
1998       re_string_skip_bytes (input, 1);
1999       c2 = re_string_peek_byte (input, 0);
2000       token->opr.c = c2;
2001       token->type = CHARACTER;
2002       return 1;
2003     }
2004   if (c == '[') /* '[' is a special char in a bracket exps.  */
2005     {
2006       unsigned char c2;
2007       int token_len;
2008       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2009 	c2 = re_string_peek_byte (input, 1);
2010       else
2011 	c2 = 0;
2012       token->opr.c = c2;
2013       token_len = 2;
2014       switch (c2)
2015 	{
2016 	case '.':
2017 	  token->type = OP_OPEN_COLL_ELEM;
2018 	  break;
2019 
2020 	case '=':
2021 	  token->type = OP_OPEN_EQUIV_CLASS;
2022 	  break;
2023 
2024 	case ':':
2025 	  if (syntax & RE_CHAR_CLASSES)
2026 	    {
2027 	      token->type = OP_OPEN_CHAR_CLASS;
2028 	      break;
2029 	    }
2030 	  FALLTHROUGH;
2031 	default:
2032 	  token->type = CHARACTER;
2033 	  token->opr.c = c;
2034 	  token_len = 1;
2035 	  break;
2036 	}
2037       return token_len;
2038     }
2039   switch (c)
2040     {
2041     case '-':
2042       token->type = OP_CHARSET_RANGE;
2043       break;
2044     case ']':
2045       token->type = OP_CLOSE_BRACKET;
2046       break;
2047     case '^':
2048       token->type = OP_NON_MATCH_LIST;
2049       break;
2050     default:
2051       token->type = CHARACTER;
2052     }
2053   return 1;
2054 }
2055 
2056 /* Functions for parser.  */
2057 
2058 /* Entry point of the parser.
2059    Parse the regular expression REGEXP and return the structure tree.
2060    If an error occurs, ERR is set by error code, and return NULL.
2061    This function build the following tree, from regular expression <reg_exp>:
2062 	   CAT
2063 	   / \
2064 	  /   \
2065    <reg_exp>  EOR
2066 
2067    CAT means concatenation.
2068    EOR means end of regular expression.  */
2069 
2070 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2071 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2072        reg_errcode_t *err)
2073 {
2074   re_dfa_t *dfa = preg->buffer;
2075   bin_tree_t *tree, *eor, *root;
2076   re_token_t current_token;
2077   dfa->syntax = syntax;
2078   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2079   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2080   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2081     return NULL;
2082   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2083   if (tree != NULL)
2084     root = create_tree (dfa, tree, eor, CONCAT);
2085   else
2086     root = eor;
2087   if (__glibc_unlikely (eor == NULL || root == NULL))
2088     {
2089       *err = REG_ESPACE;
2090       return NULL;
2091     }
2092   return root;
2093 }
2094 
2095 /* This function build the following tree, from regular expression
2096    <branch1>|<branch2>:
2097 	   ALT
2098 	   / \
2099 	  /   \
2100    <branch1> <branch2>
2101 
2102    ALT means alternative, which represents the operator '|'.  */
2103 
2104 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2105 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2106 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2107 {
2108   re_dfa_t *dfa = preg->buffer;
2109   bin_tree_t *tree, *branch = NULL;
2110   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2111   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2112   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2113     return NULL;
2114 
2115   while (token->type == OP_ALT)
2116     {
2117       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2118       if (token->type != OP_ALT && token->type != END_OF_RE
2119 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2120 	{
2121 	  bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2122 	  dfa->completed_bkref_map = initial_bkref_map;
2123 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2124 	  if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2125 	    {
2126 	      if (tree != NULL)
2127 		postorder (tree, free_tree, NULL);
2128 	      return NULL;
2129 	    }
2130 	  dfa->completed_bkref_map |= accumulated_bkref_map;
2131 	}
2132       else
2133 	branch = NULL;
2134       tree = create_tree (dfa, tree, branch, OP_ALT);
2135       if (__glibc_unlikely (tree == NULL))
2136 	{
2137 	  *err = REG_ESPACE;
2138 	  return NULL;
2139 	}
2140     }
2141   return tree;
2142 }
2143 
2144 /* This function build the following tree, from regular expression
2145    <exp1><exp2>:
2146 	CAT
2147 	/ \
2148        /   \
2149    <exp1> <exp2>
2150 
2151    CAT means concatenation.  */
2152 
2153 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2154 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2155 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2156 {
2157   bin_tree_t *tree, *expr;
2158   re_dfa_t *dfa = preg->buffer;
2159   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2160   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2161     return NULL;
2162 
2163   while (token->type != OP_ALT && token->type != END_OF_RE
2164 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2165     {
2166       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2167       if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2168 	{
2169 	  if (tree != NULL)
2170 	    postorder (tree, free_tree, NULL);
2171 	  return NULL;
2172 	}
2173       if (tree != NULL && expr != NULL)
2174 	{
2175 	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2176 	  if (newtree == NULL)
2177 	    {
2178 	      postorder (expr, free_tree, NULL);
2179 	      postorder (tree, free_tree, NULL);
2180 	      *err = REG_ESPACE;
2181 	      return NULL;
2182 	    }
2183 	  tree = newtree;
2184 	}
2185       else if (tree == NULL)
2186 	tree = expr;
2187       /* Otherwise expr == NULL, we don't need to create new tree.  */
2188     }
2189   return tree;
2190 }
2191 
2192 /* This function build the following tree, from regular expression a*:
2193 	 *
2194 	 |
2195 	 a
2196 */
2197 
2198 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2199 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2200 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2201 {
2202   re_dfa_t *dfa = preg->buffer;
2203   bin_tree_t *tree;
2204   switch (token->type)
2205     {
2206     case CHARACTER:
2207       tree = create_token_tree (dfa, NULL, NULL, token);
2208       if (__glibc_unlikely (tree == NULL))
2209 	{
2210 	  *err = REG_ESPACE;
2211 	  return NULL;
2212 	}
2213       if (dfa->mb_cur_max > 1)
2214 	{
2215 	  while (!re_string_eoi (regexp)
2216 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2217 	    {
2218 	      bin_tree_t *mbc_remain;
2219 	      fetch_token (token, regexp, syntax);
2220 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2221 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2222 	      if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2223 		{
2224 		  *err = REG_ESPACE;
2225 		  return NULL;
2226 		}
2227 	    }
2228 	}
2229       break;
2230 
2231     case OP_OPEN_SUBEXP:
2232       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2233       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2234 	return NULL;
2235       break;
2236 
2237     case OP_OPEN_BRACKET:
2238       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2239       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2240 	return NULL;
2241       break;
2242 
2243     case OP_BACK_REF:
2244       if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2245 	{
2246 	  *err = REG_ESUBREG;
2247 	  return NULL;
2248 	}
2249       dfa->used_bkref_map |= 1 << token->opr.idx;
2250       tree = create_token_tree (dfa, NULL, NULL, token);
2251       if (__glibc_unlikely (tree == NULL))
2252 	{
2253 	  *err = REG_ESPACE;
2254 	  return NULL;
2255 	}
2256       ++dfa->nbackref;
2257       dfa->has_mb_node = 1;
2258       break;
2259 
2260     case OP_OPEN_DUP_NUM:
2261       if (syntax & RE_CONTEXT_INVALID_DUP)
2262 	{
2263 	  *err = REG_BADRPT;
2264 	  return NULL;
2265 	}
2266       FALLTHROUGH;
2267     case OP_DUP_ASTERISK:
2268     case OP_DUP_PLUS:
2269     case OP_DUP_QUESTION:
2270       if (syntax & RE_CONTEXT_INVALID_OPS)
2271 	{
2272 	  *err = REG_BADRPT;
2273 	  return NULL;
2274 	}
2275       else if (syntax & RE_CONTEXT_INDEP_OPS)
2276 	{
2277 	  fetch_token (token, regexp, syntax);
2278 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2279 	}
2280       FALLTHROUGH;
2281     case OP_CLOSE_SUBEXP:
2282       if ((token->type == OP_CLOSE_SUBEXP)
2283 	  && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2284 	{
2285 	  *err = REG_ERPAREN;
2286 	  return NULL;
2287 	}
2288       FALLTHROUGH;
2289     case OP_CLOSE_DUP_NUM:
2290       /* We treat it as a normal character.  */
2291 
2292       /* Then we can these characters as normal characters.  */
2293       token->type = CHARACTER;
2294       /* mb_partial and word_char bits should be initialized already
2295 	 by peek_token.  */
2296       tree = create_token_tree (dfa, NULL, NULL, token);
2297       if (__glibc_unlikely (tree == NULL))
2298 	{
2299 	  *err = REG_ESPACE;
2300 	  return NULL;
2301 	}
2302       break;
2303 
2304     case ANCHOR:
2305       if ((token->opr.ctx_type
2306 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2307 	  && dfa->word_ops_used == 0)
2308 	init_word_char (dfa);
2309       if (token->opr.ctx_type == WORD_DELIM
2310 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2311 	{
2312 	  bin_tree_t *tree_first, *tree_last;
2313 	  if (token->opr.ctx_type == WORD_DELIM)
2314 	    {
2315 	      token->opr.ctx_type = WORD_FIRST;
2316 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2317 	      token->opr.ctx_type = WORD_LAST;
2318 	    }
2319 	  else
2320 	    {
2321 	      token->opr.ctx_type = INSIDE_WORD;
2322 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2323 	      token->opr.ctx_type = INSIDE_NOTWORD;
2324 	    }
2325 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2326 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2327 	  if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2328 				|| tree == NULL))
2329 	    {
2330 	      *err = REG_ESPACE;
2331 	      return NULL;
2332 	    }
2333 	}
2334       else
2335 	{
2336 	  tree = create_token_tree (dfa, NULL, NULL, token);
2337 	  if (__glibc_unlikely (tree == NULL))
2338 	    {
2339 	      *err = REG_ESPACE;
2340 	      return NULL;
2341 	    }
2342 	}
2343       /* We must return here, since ANCHORs can't be followed
2344 	 by repetition operators.
2345 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2346 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2347       fetch_token (token, regexp, syntax);
2348       return tree;
2349 
2350     case OP_PERIOD:
2351       tree = create_token_tree (dfa, NULL, NULL, token);
2352       if (__glibc_unlikely (tree == NULL))
2353 	{
2354 	  *err = REG_ESPACE;
2355 	  return NULL;
2356 	}
2357       if (dfa->mb_cur_max > 1)
2358 	dfa->has_mb_node = 1;
2359       break;
2360 
2361     case OP_WORD:
2362     case OP_NOTWORD:
2363       tree = build_charclass_op (dfa, regexp->trans,
2364 				 "alnum",
2365 				 "_",
2366 				 token->type == OP_NOTWORD, err);
2367       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2368 	return NULL;
2369       break;
2370 
2371     case OP_SPACE:
2372     case OP_NOTSPACE:
2373       tree = build_charclass_op (dfa, regexp->trans,
2374 				 "space",
2375 				 "",
2376 				 token->type == OP_NOTSPACE, err);
2377       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2378 	return NULL;
2379       break;
2380 
2381     case OP_ALT:
2382     case END_OF_RE:
2383       return NULL;
2384 
2385     case BACK_SLASH:
2386       *err = REG_EESCAPE;
2387       return NULL;
2388 
2389     default:
2390       /* Must not happen?  */
2391       DEBUG_ASSERT (false);
2392       return NULL;
2393     }
2394   fetch_token (token, regexp, syntax);
2395 
2396   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2397 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2398     {
2399       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2400 					   syntax, err);
2401       if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2402 	{
2403 	  if (tree != NULL)
2404 	    postorder (tree, free_tree, NULL);
2405 	  return NULL;
2406 	}
2407       tree = dup_tree;
2408       /* In BRE consecutive duplications are not allowed.  */
2409       if ((syntax & RE_CONTEXT_INVALID_DUP)
2410 	  && (token->type == OP_DUP_ASTERISK
2411 	      || token->type == OP_OPEN_DUP_NUM))
2412 	{
2413 	  if (tree != NULL)
2414 	    postorder (tree, free_tree, NULL);
2415 	  *err = REG_BADRPT;
2416 	  return NULL;
2417 	}
2418     }
2419 
2420   return tree;
2421 }
2422 
2423 /* This function build the following tree, from regular expression
2424    (<reg_exp>):
2425 	 SUBEXP
2426 	    |
2427 	<reg_exp>
2428 */
2429 
2430 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2431 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2432 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2433 {
2434   re_dfa_t *dfa = preg->buffer;
2435   bin_tree_t *tree;
2436   size_t cur_nsub;
2437   cur_nsub = preg->re_nsub++;
2438 
2439   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2440 
2441   /* The subexpression may be a null string.  */
2442   if (token->type == OP_CLOSE_SUBEXP)
2443     tree = NULL;
2444   else
2445     {
2446       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2447       if (__glibc_unlikely (*err == REG_NOERROR
2448 			    && token->type != OP_CLOSE_SUBEXP))
2449 	{
2450 	  if (tree != NULL)
2451 	    postorder (tree, free_tree, NULL);
2452 	  *err = REG_EPAREN;
2453 	}
2454       if (__glibc_unlikely (*err != REG_NOERROR))
2455 	return NULL;
2456     }
2457 
2458   if (cur_nsub <= '9' - '1')
2459     dfa->completed_bkref_map |= 1 << cur_nsub;
2460 
2461   tree = create_tree (dfa, tree, NULL, SUBEXP);
2462   if (__glibc_unlikely (tree == NULL))
2463     {
2464       *err = REG_ESPACE;
2465       return NULL;
2466     }
2467   tree->token.opr.idx = cur_nsub;
2468   return tree;
2469 }
2470 
2471 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2472 
2473 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2474 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2475 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2476 {
2477   bin_tree_t *tree = NULL, *old_tree = NULL;
2478   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2479   re_token_t start_token = *token;
2480 
2481   if (token->type == OP_OPEN_DUP_NUM)
2482     {
2483       end = 0;
2484       start = fetch_number (regexp, token, syntax);
2485       if (start == -1)
2486 	{
2487 	  if (token->type == CHARACTER && token->opr.c == ',')
2488 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2489 	  else
2490 	    {
2491 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2492 	      return NULL;
2493 	    }
2494 	}
2495       if (__glibc_likely (start != -2))
2496 	{
2497 	  /* We treat "{n}" as "{n,n}".  */
2498 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2499 		 : ((token->type == CHARACTER && token->opr.c == ',')
2500 		    ? fetch_number (regexp, token, syntax) : -2));
2501 	}
2502       if (__glibc_unlikely (start == -2 || end == -2))
2503 	{
2504 	  /* Invalid sequence.  */
2505 	  if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2506 	    {
2507 	      if (token->type == END_OF_RE)
2508 		*err = REG_EBRACE;
2509 	      else
2510 		*err = REG_BADBR;
2511 
2512 	      return NULL;
2513 	    }
2514 
2515 	  /* If the syntax bit is set, rollback.  */
2516 	  re_string_set_index (regexp, start_idx);
2517 	  *token = start_token;
2518 	  token->type = CHARACTER;
2519 	  /* mb_partial and word_char bits should be already initialized by
2520 	     peek_token.  */
2521 	  return elem;
2522 	}
2523 
2524       if (__glibc_unlikely ((end != -1 && start > end)
2525 			    || token->type != OP_CLOSE_DUP_NUM))
2526 	{
2527 	  /* First number greater than second.  */
2528 	  *err = REG_BADBR;
2529 	  return NULL;
2530 	}
2531 
2532       if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2533 	{
2534 	  *err = REG_ESIZE;
2535 	  return NULL;
2536 	}
2537     }
2538   else
2539     {
2540       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2541       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2542     }
2543 
2544   fetch_token (token, regexp, syntax);
2545 
2546   if (__glibc_unlikely (elem == NULL))
2547     return NULL;
2548   if (__glibc_unlikely (start == 0 && end == 0))
2549     {
2550       postorder (elem, free_tree, NULL);
2551       return NULL;
2552     }
2553 
2554   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2555   if (__glibc_unlikely (start > 0))
2556     {
2557       tree = elem;
2558       for (i = 2; i <= start; ++i)
2559 	{
2560 	  elem = duplicate_tree (elem, dfa);
2561 	  tree = create_tree (dfa, tree, elem, CONCAT);
2562 	  if (__glibc_unlikely (elem == NULL || tree == NULL))
2563 	    goto parse_dup_op_espace;
2564 	}
2565 
2566       if (start == end)
2567 	return tree;
2568 
2569       /* Duplicate ELEM before it is marked optional.  */
2570       elem = duplicate_tree (elem, dfa);
2571       if (__glibc_unlikely (elem == NULL))
2572         goto parse_dup_op_espace;
2573       old_tree = tree;
2574     }
2575   else
2576     old_tree = NULL;
2577 
2578   if (elem->token.type == SUBEXP)
2579     {
2580       uintptr_t subidx = elem->token.opr.idx;
2581       postorder (elem, mark_opt_subexp, (void *) subidx);
2582     }
2583 
2584   tree = create_tree (dfa, elem, NULL,
2585 		      (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2586   if (__glibc_unlikely (tree == NULL))
2587     goto parse_dup_op_espace;
2588 
2589   /* This loop is actually executed only when end != -1,
2590      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2591      already created the start+1-th copy.  */
2592   if (TYPE_SIGNED (Idx) || end != -1)
2593     for (i = start + 2; i <= end; ++i)
2594       {
2595 	elem = duplicate_tree (elem, dfa);
2596 	tree = create_tree (dfa, tree, elem, CONCAT);
2597 	if (__glibc_unlikely (elem == NULL || tree == NULL))
2598 	  goto parse_dup_op_espace;
2599 
2600 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2601 	if (__glibc_unlikely (tree == NULL))
2602 	  goto parse_dup_op_espace;
2603       }
2604 
2605   if (old_tree)
2606     tree = create_tree (dfa, old_tree, tree, CONCAT);
2607 
2608   return tree;
2609 
2610  parse_dup_op_espace:
2611   *err = REG_ESPACE;
2612   return NULL;
2613 }
2614 
2615 /* Size of the names for collating symbol/equivalence_class/character_class.
2616    I'm not sure, but maybe enough.  */
2617 #define BRACKET_NAME_BUF_SIZE 32
2618 
2619 #ifndef _LIBC
2620 
2621 /* Convert the byte B to the corresponding wide character.  In a
2622    unibyte locale, treat B as itself.  In a multibyte locale, return
2623    WEOF if B is an encoding error.  */
2624 static wint_t
parse_byte(unsigned char b,re_dfa_t const * dfa)2625 parse_byte (unsigned char b, re_dfa_t const *dfa)
2626 {
2627   return dfa->mb_cur_max > 1 ? __btowc (b) : b;
2628 }
2629 
2630 /* Local function for parse_bracket_exp used in _LIBC environment.
2631    Build the range expression which starts from START_ELEM, and ends
2632    at END_ELEM.  The result are written to MBCSET and SBCSET.
2633    RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2634    mbcset->range_ends, is a pointer argument since we may
2635    update it.  */
2636 
2637 static reg_errcode_t
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,Idx * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem,re_dfa_t * dfa,reg_syntax_t syntax,uint_fast32_t nrules,const unsigned char * collseqmb,const char * collseqwc,int_fast32_t table_size,const void * symb_table,const unsigned char * extra)2638 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2639 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2640 		 re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules,
2641 		 const unsigned char *collseqmb, const char *collseqwc,
2642 		 int_fast32_t table_size, const void *symb_table,
2643 		 const unsigned char *extra)
2644 {
2645   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2646   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2647 			|| start_elem->type == CHAR_CLASS
2648 			|| end_elem->type == EQUIV_CLASS
2649 			|| end_elem->type == CHAR_CLASS))
2650     return REG_ERANGE;
2651 
2652   /* We can handle no multi character collating elements without libc
2653      support.  */
2654   if (__glibc_unlikely ((start_elem->type == COLL_SYM
2655 			 && strlen ((char *) start_elem->opr.name) > 1)
2656 			|| (end_elem->type == COLL_SYM
2657 			    && strlen ((char *) end_elem->opr.name) > 1)))
2658     return REG_ECOLLATE;
2659 
2660   unsigned int
2661     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2662 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2663 		   : 0)),
2664     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2665 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2666 		 : 0));
2667   wint_t
2668     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2669 		? parse_byte (start_ch, dfa) : start_elem->opr.wch),
2670     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2671 	      ? parse_byte (end_ch, dfa) : end_elem->opr.wch);
2672 
2673   if (start_wc == WEOF || end_wc == WEOF)
2674     return REG_ECOLLATE;
2675   else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2676                              && start_wc > end_wc))
2677     return REG_ERANGE;
2678 
2679   /* Got valid collation sequence values, add them as a new entry.
2680      However, for !_LIBC we have no collation elements: if the
2681      character set is single byte, the single byte character set
2682      that we build below suffices.  parse_bracket_exp passes
2683      no MBCSET if dfa->mb_cur_max == 1.  */
2684   if (dfa->mb_cur_max > 1)
2685     {
2686       /* Check the space of the arrays.  */
2687       if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2688         {
2689           /* There is not enough space, need realloc.  */
2690           wchar_t *new_array_start, *new_array_end;
2691           Idx new_nranges;
2692 
2693           /* +1 in case of mbcset->nranges is 0.  */
2694           new_nranges = 2 * mbcset->nranges + 1;
2695           /* Use realloc since mbcset->range_starts and mbcset->range_ends
2696              are NULL if *range_alloc == 0.  */
2697           new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2698                                         new_nranges);
2699           new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2700                                       new_nranges);
2701 
2702           if (__glibc_unlikely (new_array_start == NULL
2703                                 || new_array_end == NULL))
2704             {
2705               re_free (new_array_start);
2706               re_free (new_array_end);
2707               return REG_ESPACE;
2708             }
2709 
2710           mbcset->range_starts = new_array_start;
2711           mbcset->range_ends = new_array_end;
2712           *range_alloc = new_nranges;
2713         }
2714 
2715       mbcset->range_starts[mbcset->nranges] = start_wc;
2716       mbcset->range_ends[mbcset->nranges++] = end_wc;
2717     }
2718 
2719   /* Build the table for single byte characters.  */
2720   for (wchar_t wc = 0; wc < SBC_MAX; ++wc)
2721     {
2722       if (start_wc <= wc && wc <= end_wc)
2723         bitset_set (sbcset, wc);
2724     }
2725 
2726   return REG_NOERROR;
2727 }
2728 #endif /* not _LIBC */
2729 
2730 #ifndef _LIBC
2731 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
2732    Build the collating element which is represented by NAME.
2733    The result are written to MBCSET and SBCSET.
2734    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2735    pointer argument since we may update it.  */
2736 
2737 static reg_errcode_t
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name,uint_fast32_t nrules,int_fast32_t table_size,const void * symb_table,const unsigned char * extra)2738 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2739 			Idx *coll_sym_alloc, const unsigned char *name,
2740 			uint_fast32_t nrules, int_fast32_t table_size,
2741 			const void *symb_table, const unsigned char *extra)
2742 {
2743   size_t name_len = strlen ((const char *) name);
2744   if (__glibc_unlikely (name_len != 1))
2745     return REG_ECOLLATE;
2746   else
2747     {
2748       bitset_set (sbcset, name[0]);
2749       return REG_NOERROR;
2750     }
2751 }
2752 #endif /* not _LIBC */
2753 
2754 #ifdef _LIBC
2755 /* Local function for parse_bracket_exp used in _LIBC environment.
2756    Seek the collating symbol entry corresponding to NAME.
2757    Return the index of the symbol in the SYMB_TABLE,
2758    or -1 if not found.  */
2759 
2760 static __always_inline int32_t
seek_collating_symbol_entry(const unsigned char * name,size_t name_len,const int32_t * symb_table,int_fast32_t table_size,const unsigned char * extra)2761 seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
2762 			     const int32_t *symb_table,
2763 			     int_fast32_t table_size,
2764 			     const unsigned char *extra)
2765 {
2766   int_fast32_t elem;
2767 
2768   for (elem = 0; elem < table_size; elem++)
2769     if (symb_table[2 * elem] != 0)
2770       {
2771 	int32_t idx = symb_table[2 * elem + 1];
2772 	/* Skip the name of collating element name.  */
2773 	idx += 1 + extra[idx];
2774 	if (/* Compare the length of the name.  */
2775 	    name_len == extra[idx]
2776 	    /* Compare the name.  */
2777 	    && memcmp (name, &extra[idx + 1], name_len) == 0)
2778 	  /* Yep, this is the entry.  */
2779 	  return elem;
2780       }
2781   return -1;
2782 }
2783 
2784 /* Local function for parse_bracket_exp used in _LIBC environment.
2785    Look up the collation sequence value of BR_ELEM.
2786    Return the value if succeeded, UINT_MAX otherwise.  */
2787 
2788 static __always_inline unsigned int
lookup_collation_sequence_value(bracket_elem_t * br_elem,uint32_t nrules,const unsigned char * collseqmb,const char * collseqwc,int_fast32_t table_size,const int32_t * symb_table,const unsigned char * extra)2789 lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2790 				 const unsigned char *collseqmb,
2791 				 const char *collseqwc,
2792 				 int_fast32_t table_size,
2793 				 const int32_t *symb_table,
2794 				 const unsigned char *extra)
2795 {
2796   if (br_elem->type == SB_CHAR)
2797     {
2798       /* if (MB_CUR_MAX == 1) */
2799       if (nrules == 0)
2800 	return collseqmb[br_elem->opr.ch];
2801       else
2802 	{
2803 	  wint_t wc = __btowc (br_elem->opr.ch);
2804 	  return __collseq_table_lookup (collseqwc, wc);
2805 	}
2806     }
2807   else if (br_elem->type == MB_CHAR)
2808     {
2809       if (nrules != 0)
2810 	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2811     }
2812   else if (br_elem->type == COLL_SYM)
2813     {
2814       size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2815       if (nrules != 0)
2816 	{
2817 	  int32_t elem, idx;
2818 	  elem = seek_collating_symbol_entry (br_elem->opr.name,
2819 					      sym_name_len,
2820 					      symb_table, table_size,
2821 					      extra);
2822 	  if (elem != -1)
2823 	    {
2824 	      /* We found the entry.  */
2825 	      idx = symb_table[2 * elem + 1];
2826 	      /* Skip the name of collating element name.  */
2827 	      idx += 1 + extra[idx];
2828 	      /* Skip the byte sequence of the collating element.  */
2829 	      idx += 1 + extra[idx];
2830 	      /* Adjust for the alignment.  */
2831 	      idx = (idx + 3) & ~3;
2832 	      /* Skip the multibyte collation sequence value.  */
2833 	      idx += sizeof (unsigned int);
2834 	      /* Skip the wide char sequence of the collating element.  */
2835 	      idx += sizeof (unsigned int) *
2836 		(1 + *(unsigned int *) (extra + idx));
2837 	      /* Return the collation sequence value.  */
2838 	      return *(unsigned int *) (extra + idx);
2839 	    }
2840 	  else if (sym_name_len == 1)
2841 	    {
2842 	      /* No valid character.  Match it as a single byte
2843 		 character.  */
2844 	      return collseqmb[br_elem->opr.name[0]];
2845 	    }
2846 	}
2847       else if (sym_name_len == 1)
2848 	return collseqmb[br_elem->opr.name[0]];
2849     }
2850   return UINT_MAX;
2851 }
2852 
2853 /* Local function for parse_bracket_exp used in _LIBC environment.
2854    Build the range expression which starts from START_ELEM, and ends
2855    at END_ELEM.  The result are written to MBCSET and SBCSET.
2856    RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2857    mbcset->range_ends, is a pointer argument since we may
2858    update it.  */
2859 
2860 static __always_inline reg_errcode_t
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,Idx * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem,re_dfa_t * dfa,reg_syntax_t syntax,uint32_t nrules,const unsigned char * collseqmb,const char * collseqwc,int_fast32_t table_size,const int32_t * symb_table,const unsigned char * extra)2861 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2862 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2863 		 re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2864 		 const unsigned char *collseqmb, const char *collseqwc,
2865 		 int_fast32_t table_size, const int32_t *symb_table,
2866 		 const unsigned char *extra)
2867 {
2868   unsigned int ch;
2869   uint32_t start_collseq;
2870   uint32_t end_collseq;
2871 
2872   /* Equivalence Classes and Character Classes can't be a range
2873      start/end.  */
2874   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2875                         || start_elem->type == CHAR_CLASS
2876                         || end_elem->type == EQUIV_CLASS
2877                         || end_elem->type == CHAR_CLASS))
2878     return REG_ERANGE;
2879 
2880   /* FIXME: Implement rational ranges here, too.  */
2881   start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2882 						   table_size, symb_table, extra);
2883   end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2884 						 table_size, symb_table, extra);
2885   /* Check start/end collation sequence values.  */
2886   if (__glibc_unlikely (start_collseq == UINT_MAX
2887                         || end_collseq == UINT_MAX))
2888     return REG_ECOLLATE;
2889   if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2890                         && start_collseq > end_collseq))
2891     return REG_ERANGE;
2892 
2893   /* Got valid collation sequence values, add them as a new entry.
2894      However, if we have no collation elements, and the character set
2895      is single byte, the single byte character set that we
2896      build below suffices. */
2897   if (nrules > 0 || dfa->mb_cur_max > 1)
2898     {
2899       /* Check the space of the arrays.  */
2900       if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2901 	{
2902 	  /* There is not enough space, need realloc.  */
2903 	  uint32_t *new_array_start;
2904 	  uint32_t *new_array_end;
2905 	  int new_nranges;
2906 
2907 	  /* +1 in case of mbcset->nranges is 0.  */
2908 	  new_nranges = 2 * mbcset->nranges + 1;
2909 	  new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2910 					new_nranges);
2911 	  new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2912 				      new_nranges);
2913 
2914           if (__glibc_unlikely (new_array_start == NULL
2915                                 || new_array_end == NULL))
2916 	    return REG_ESPACE;
2917 
2918 	  mbcset->range_starts = new_array_start;
2919 	  mbcset->range_ends = new_array_end;
2920 	  *range_alloc = new_nranges;
2921 	}
2922 
2923       mbcset->range_starts[mbcset->nranges] = start_collseq;
2924       mbcset->range_ends[mbcset->nranges++] = end_collseq;
2925     }
2926 
2927   /* Build the table for single byte characters.  */
2928   for (ch = 0; ch < SBC_MAX; ch++)
2929     {
2930       uint32_t ch_collseq;
2931       /* if (MB_CUR_MAX == 1) */
2932       if (nrules == 0)
2933 	ch_collseq = collseqmb[ch];
2934       else
2935 	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2936       if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2937 	bitset_set (sbcset, ch);
2938     }
2939   return REG_NOERROR;
2940 }
2941 
2942 /* Local function for parse_bracket_exp used in _LIBC environment.
2943    Build the collating element which is represented by NAME.
2944    The result are written to MBCSET and SBCSET.
2945    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2946    pointer argument since we may update it.  */
2947 
2948 static __always_inline reg_errcode_t
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name,uint_fast32_t nrules,int_fast32_t table_size,const int32_t * symb_table,const unsigned char * extra)2949 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2950 			Idx *coll_sym_alloc, const unsigned char *name,
2951 			uint_fast32_t nrules, int_fast32_t table_size,
2952 			const int32_t *symb_table, const unsigned char *extra)
2953 {
2954   int32_t elem, idx;
2955   size_t name_len = strlen ((const char *) name);
2956   if (nrules != 0)
2957     {
2958       elem = seek_collating_symbol_entry (name, name_len, symb_table,
2959 					  table_size, extra);
2960       if (elem != -1)
2961 	{
2962 	  /* We found the entry.  */
2963 	  idx = symb_table[2 * elem + 1];
2964 	  /* Skip the name of collating element name.  */
2965 	  idx += 1 + extra[idx];
2966 	}
2967       else if (name_len == 1)
2968 	{
2969 	  /* No valid character, treat it as a normal
2970 	     character.  */
2971 	  bitset_set (sbcset, name[0]);
2972 	  return REG_NOERROR;
2973 	}
2974       else
2975 	return REG_ECOLLATE;
2976 
2977       /* Got valid collation sequence, add it as a new entry.  */
2978       /* Check the space of the arrays.  */
2979       if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
2980 	{
2981 	  /* Not enough, realloc it.  */
2982 	  /* +1 in case of mbcset->ncoll_syms is 0.  */
2983 	  int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2984 	  /* Use realloc since mbcset->coll_syms is NULL
2985 	     if *alloc == 0.  */
2986 	  int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2987 					       new_coll_sym_alloc);
2988           if (__glibc_unlikely (new_coll_syms == NULL))
2989 	    return REG_ESPACE;
2990 	  mbcset->coll_syms = new_coll_syms;
2991 	  *coll_sym_alloc = new_coll_sym_alloc;
2992 	}
2993       mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2994       return REG_NOERROR;
2995     }
2996   else
2997     {
2998       if (__glibc_unlikely (name_len != 1))
2999 	return REG_ECOLLATE;
3000       else
3001 	{
3002 	  bitset_set (sbcset, name[0]);
3003 	  return REG_NOERROR;
3004 	}
3005     }
3006 }
3007 #endif /* _LIBC */
3008 
3009 /* This function parse bracket expression like "[abc]", "[a-c]",
3010    "[[.a-a.]]" etc.  */
3011 
3012 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)3013 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3014 		   reg_syntax_t syntax, reg_errcode_t *err)
3015 {
3016   const unsigned char *collseqmb = NULL;
3017   const char *collseqwc = NULL;
3018   uint_fast32_t nrules = 0;
3019   int_fast32_t table_size = 0;
3020   const void *symb_table = NULL;
3021   const unsigned char *extra = NULL;
3022 
3023   re_token_t br_token;
3024   re_bitset_ptr_t sbcset;
3025   re_charset_t *mbcset;
3026   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3027   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3028   bool non_match = false;
3029   bin_tree_t *work_tree;
3030   int token_len;
3031   bool first_round = true;
3032 #ifdef _LIBC
3033   collseqmb = (const unsigned char *)
3034     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3035   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3036   if (nrules)
3037     {
3038       /*
3039       if (MB_CUR_MAX > 1)
3040       */
3041       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3042       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3043       symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB);
3044       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3045 						   _NL_COLLATE_SYMB_EXTRAMB);
3046     }
3047 #endif
3048   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3049   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3050   if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3051     {
3052       re_free (sbcset);
3053       re_free (mbcset);
3054       *err = REG_ESPACE;
3055       return NULL;
3056     }
3057 
3058   token_len = peek_token_bracket (token, regexp, syntax);
3059   if (__glibc_unlikely (token->type == END_OF_RE))
3060     {
3061       *err = REG_BADPAT;
3062       goto parse_bracket_exp_free_return;
3063     }
3064   if (token->type == OP_NON_MATCH_LIST)
3065     {
3066       mbcset->non_match = 1;
3067       non_match = true;
3068       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3069 	bitset_set (sbcset, '\n');
3070       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3071       token_len = peek_token_bracket (token, regexp, syntax);
3072       if (__glibc_unlikely (token->type == END_OF_RE))
3073 	{
3074 	  *err = REG_BADPAT;
3075 	  goto parse_bracket_exp_free_return;
3076 	}
3077     }
3078 
3079   /* We treat the first ']' as a normal character.  */
3080   if (token->type == OP_CLOSE_BRACKET)
3081     token->type = CHARACTER;
3082 
3083   while (1)
3084     {
3085       bracket_elem_t start_elem, end_elem;
3086       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3087       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3088       reg_errcode_t ret;
3089       int token_len2 = 0;
3090       bool is_range_exp = false;
3091       re_token_t token2;
3092 
3093       start_elem.opr.name = start_name_buf;
3094       start_elem.type = COLL_SYM;
3095       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3096 				   syntax, first_round);
3097       if (__glibc_unlikely (ret != REG_NOERROR))
3098 	{
3099 	  *err = ret;
3100 	  goto parse_bracket_exp_free_return;
3101 	}
3102       first_round = false;
3103 
3104       /* Get information about the next token.  We need it in any case.  */
3105       token_len = peek_token_bracket (token, regexp, syntax);
3106 
3107       /* Do not check for ranges if we know they are not allowed.  */
3108       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3109 	{
3110 	  if (__glibc_unlikely (token->type == END_OF_RE))
3111 	    {
3112 	      *err = REG_EBRACK;
3113 	      goto parse_bracket_exp_free_return;
3114 	    }
3115 	  if (token->type == OP_CHARSET_RANGE)
3116 	    {
3117 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3118 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3119 	      if (__glibc_unlikely (token2.type == END_OF_RE))
3120 		{
3121 		  *err = REG_EBRACK;
3122 		  goto parse_bracket_exp_free_return;
3123 		}
3124 	      if (token2.type == OP_CLOSE_BRACKET)
3125 		{
3126 		  /* We treat the last '-' as a normal character.  */
3127 		  re_string_skip_bytes (regexp, -token_len);
3128 		  token->type = CHARACTER;
3129 		}
3130 	      else
3131 		is_range_exp = true;
3132 	    }
3133 	}
3134 
3135       if (is_range_exp == true)
3136 	{
3137 	  end_elem.opr.name = end_name_buf;
3138 	  end_elem.type = COLL_SYM;
3139 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3140 				       dfa, syntax, true);
3141 	  if (__glibc_unlikely (ret != REG_NOERROR))
3142 	    {
3143 	      *err = ret;
3144 	      goto parse_bracket_exp_free_return;
3145 	    }
3146 
3147 	  token_len = peek_token_bracket (token, regexp, syntax);
3148 
3149 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3150 				  &start_elem, &end_elem,
3151 				  dfa, syntax, nrules, collseqmb, collseqwc,
3152 				  table_size, symb_table, extra);
3153 	  if (__glibc_unlikely (*err != REG_NOERROR))
3154 	    goto parse_bracket_exp_free_return;
3155 	}
3156       else
3157 	{
3158 	  switch (start_elem.type)
3159 	    {
3160 	    case SB_CHAR:
3161 	      bitset_set (sbcset, start_elem.opr.ch);
3162 	      break;
3163 	    case MB_CHAR:
3164 	      /* Check whether the array has enough space.  */
3165 	      if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3166 		{
3167 		  wchar_t *new_mbchars;
3168 		  /* Not enough, realloc it.  */
3169 		  /* +1 in case of mbcset->nmbchars is 0.  */
3170 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3171 		  /* Use realloc since array is NULL if *alloc == 0.  */
3172 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3173 					    mbchar_alloc);
3174 		  if (__glibc_unlikely (new_mbchars == NULL))
3175 		    goto parse_bracket_exp_espace;
3176 		  mbcset->mbchars = new_mbchars;
3177 		}
3178 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3179 	      break;
3180 	    case EQUIV_CLASS:
3181 	      *err = build_equiv_class (sbcset,
3182 					mbcset, &equiv_class_alloc,
3183 					start_elem.opr.name);
3184 	      if (__glibc_unlikely (*err != REG_NOERROR))
3185 		goto parse_bracket_exp_free_return;
3186 	      break;
3187 	    case COLL_SYM:
3188 	      *err = build_collating_symbol (sbcset,
3189 					     mbcset, &coll_sym_alloc,
3190 					     start_elem.opr.name,
3191 					     nrules, table_size, symb_table, extra);
3192 	      if (__glibc_unlikely (*err != REG_NOERROR))
3193 		goto parse_bracket_exp_free_return;
3194 	      break;
3195 	    case CHAR_CLASS:
3196 	      *err = build_charclass (regexp->trans, sbcset,
3197 				      mbcset, &char_class_alloc,
3198 				      (const char *) start_elem.opr.name,
3199 				      syntax);
3200 	      if (__glibc_unlikely (*err != REG_NOERROR))
3201 	       goto parse_bracket_exp_free_return;
3202 	      break;
3203 	    default:
3204 	      DEBUG_ASSERT (false);
3205 	      break;
3206 	    }
3207 	}
3208       if (__glibc_unlikely (token->type == END_OF_RE))
3209 	{
3210 	  *err = REG_EBRACK;
3211 	  goto parse_bracket_exp_free_return;
3212 	}
3213       if (token->type == OP_CLOSE_BRACKET)
3214 	break;
3215     }
3216 
3217   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3218 
3219   /* If it is non-matching list.  */
3220   if (non_match)
3221     bitset_not (sbcset);
3222 
3223   /* Ensure only single byte characters are set.  */
3224   if (dfa->mb_cur_max > 1)
3225     bitset_mask (sbcset, dfa->sb_char);
3226 
3227   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3228       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3229 						     || mbcset->non_match)))
3230     {
3231       bin_tree_t *mbc_tree;
3232       int sbc_idx;
3233       /* Build a tree for complex bracket.  */
3234       dfa->has_mb_node = 1;
3235       br_token.type = COMPLEX_BRACKET;
3236       br_token.opr.mbcset = mbcset;
3237       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3238       if (__glibc_unlikely (mbc_tree == NULL))
3239 	goto parse_bracket_exp_espace;
3240       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3241 	if (sbcset[sbc_idx])
3242 	  break;
3243       /* If there are no bits set in sbcset, there is no point
3244 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3245       if (sbc_idx < BITSET_WORDS)
3246 	{
3247 	  /* Build a tree for simple bracket.  */
3248 	  br_token.type = SIMPLE_BRACKET;
3249 	  br_token.opr.sbcset = sbcset;
3250 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3251 	  if (__glibc_unlikely (work_tree == NULL))
3252 	    goto parse_bracket_exp_espace;
3253 
3254 	  /* Then join them by ALT node.  */
3255 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3256 	  if (__glibc_unlikely (work_tree == NULL))
3257 	    goto parse_bracket_exp_espace;
3258 	}
3259       else
3260 	{
3261 	  re_free (sbcset);
3262 	  work_tree = mbc_tree;
3263 	}
3264     }
3265   else
3266     {
3267       free_charset (mbcset);
3268       /* Build a tree for simple bracket.  */
3269       br_token.type = SIMPLE_BRACKET;
3270       br_token.opr.sbcset = sbcset;
3271       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3272       if (__glibc_unlikely (work_tree == NULL))
3273 	goto parse_bracket_exp_espace;
3274     }
3275   return work_tree;
3276 
3277  parse_bracket_exp_espace:
3278   *err = REG_ESPACE;
3279  parse_bracket_exp_free_return:
3280   re_free (sbcset);
3281   free_charset (mbcset);
3282   return NULL;
3283 }
3284 
3285 /* Parse an element in the bracket expression.  */
3286 
3287 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3288 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3289 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3290 		       reg_syntax_t syntax, bool accept_hyphen)
3291 {
3292   int cur_char_size;
3293   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3294   if (cur_char_size > 1)
3295     {
3296       elem->type = MB_CHAR;
3297       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3298       re_string_skip_bytes (regexp, cur_char_size);
3299       return REG_NOERROR;
3300     }
3301   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3302   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3303       || token->type == OP_OPEN_EQUIV_CLASS)
3304     return parse_bracket_symbol (elem, regexp, token);
3305   if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3306     {
3307       /* A '-' must only appear as anything but a range indicator before
3308 	 the closing bracket.  Everything else is an error.  */
3309       re_token_t token2;
3310       (void) peek_token_bracket (&token2, regexp, syntax);
3311       if (token2.type != OP_CLOSE_BRACKET)
3312 	/* The actual error value is not standardized since this whole
3313 	   case is undefined.  But ERANGE makes good sense.  */
3314 	return REG_ERANGE;
3315     }
3316   elem->type = SB_CHAR;
3317   elem->opr.ch = token->opr.c;
3318   return REG_NOERROR;
3319 }
3320 
3321 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3322    such as [:<character_class>:], [.<collating_element>.], and
3323    [=<equivalent_class>=].  */
3324 
3325 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3326 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3327 		      re_token_t *token)
3328 {
3329   unsigned char ch, delim = token->opr.c;
3330   int i = 0;
3331   if (re_string_eoi(regexp))
3332     return REG_EBRACK;
3333   for (;; ++i)
3334     {
3335       if (i >= BRACKET_NAME_BUF_SIZE)
3336 	return REG_EBRACK;
3337       if (token->type == OP_OPEN_CHAR_CLASS)
3338 	ch = re_string_fetch_byte_case (regexp);
3339       else
3340 	ch = re_string_fetch_byte (regexp);
3341       if (re_string_eoi(regexp))
3342 	return REG_EBRACK;
3343       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3344 	break;
3345       elem->opr.name[i] = ch;
3346     }
3347   re_string_skip_bytes (regexp, 1);
3348   elem->opr.name[i] = '\0';
3349   switch (token->type)
3350     {
3351     case OP_OPEN_COLL_ELEM:
3352       elem->type = COLL_SYM;
3353       break;
3354     case OP_OPEN_EQUIV_CLASS:
3355       elem->type = EQUIV_CLASS;
3356       break;
3357     case OP_OPEN_CHAR_CLASS:
3358       elem->type = CHAR_CLASS;
3359       break;
3360     default:
3361       break;
3362     }
3363   return REG_NOERROR;
3364 }
3365 
3366   /* Helper function for parse_bracket_exp.
3367      Build the equivalence class which is represented by NAME.
3368      The result are written to MBCSET and SBCSET.
3369      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3370      is a pointer argument since we may update it.  */
3371 
3372 static reg_errcode_t
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3373 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3374 		   Idx *equiv_class_alloc, const unsigned char *name)
3375 {
3376 #ifdef _LIBC
3377   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3378   if (nrules != 0)
3379     {
3380       const int32_t *table, *indirect;
3381       const unsigned char *weights, *extra, *cp;
3382       unsigned char char_buf[2];
3383       int32_t idx1, idx2;
3384       unsigned int ch;
3385       size_t len;
3386       /* Calculate the index for equivalence class.  */
3387       cp = name;
3388       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3389       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3390 					       _NL_COLLATE_WEIGHTMB);
3391       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3392 						   _NL_COLLATE_EXTRAMB);
3393       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3394 						_NL_COLLATE_INDIRECTMB);
3395       idx1 = findidx (table, indirect, extra, &cp, -1);
3396       if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3397 	/* This isn't a valid character.  */
3398 	return REG_ECOLLATE;
3399 
3400       /* Build single byte matching table for this equivalence class.  */
3401       len = weights[idx1 & 0xffffff];
3402       for (ch = 0; ch < SBC_MAX; ++ch)
3403 	{
3404 	  char_buf[0] = ch;
3405 	  cp = char_buf;
3406 	  idx2 = findidx (table, indirect, extra, &cp, 1);
3407 /*
3408 	  idx2 = table[ch];
3409 */
3410 	  if (idx2 == 0)
3411 	    /* This isn't a valid character.  */
3412 	    continue;
3413 	  /* Compare only if the length matches and the collation rule
3414 	     index is the same.  */
3415 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3416 	      && memcmp (weights + (idx1 & 0xffffff) + 1,
3417 			 weights + (idx2 & 0xffffff) + 1, len) == 0)
3418 	    bitset_set (sbcset, ch);
3419 	}
3420       /* Check whether the array has enough space.  */
3421       if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3422 	{
3423 	  /* Not enough, realloc it.  */
3424 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3425 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3426 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3427 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3428 						   int32_t,
3429 						   new_equiv_class_alloc);
3430 	  if (__glibc_unlikely (new_equiv_classes == NULL))
3431 	    return REG_ESPACE;
3432 	  mbcset->equiv_classes = new_equiv_classes;
3433 	  *equiv_class_alloc = new_equiv_class_alloc;
3434 	}
3435       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3436     }
3437   else
3438 #endif /* _LIBC */
3439     {
3440       if (__glibc_unlikely (strlen ((const char *) name) != 1))
3441 	return REG_ECOLLATE;
3442       bitset_set (sbcset, *name);
3443     }
3444   return REG_NOERROR;
3445 }
3446 
3447   /* Helper function for parse_bracket_exp.
3448      Build the character class which is represented by NAME.
3449      The result are written to MBCSET and SBCSET.
3450      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3451      is a pointer argument since we may update it.  */
3452 
3453 static reg_errcode_t
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const char * class_name,reg_syntax_t syntax)3454 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3455 		 re_charset_t *mbcset, Idx *char_class_alloc,
3456 		 const char *class_name, reg_syntax_t syntax)
3457 {
3458   int i;
3459   const char *name = class_name;
3460 
3461   /* In case of REG_ICASE "upper" and "lower" match the both of
3462      upper and lower cases.  */
3463   if ((syntax & RE_ICASE)
3464       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3465     name = "alpha";
3466 
3467   /* Check the space of the arrays.  */
3468   if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3469     {
3470       /* Not enough, realloc it.  */
3471       /* +1 in case of mbcset->nchar_classes is 0.  */
3472       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3473       /* Use realloc since array is NULL if *alloc == 0.  */
3474       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3475 					       new_char_class_alloc);
3476       if (__glibc_unlikely (new_char_classes == NULL))
3477 	return REG_ESPACE;
3478       mbcset->char_classes = new_char_classes;
3479       *char_class_alloc = new_char_class_alloc;
3480     }
3481   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3482 
3483 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3484   do {						\
3485     if (__glibc_unlikely (trans != NULL))			\
3486       {						\
3487 	for (i = 0; i < SBC_MAX; ++i)		\
3488 	  if (ctype_func (i))			\
3489 	    bitset_set (sbcset, trans[i]);	\
3490       }						\
3491     else					\
3492       {						\
3493 	for (i = 0; i < SBC_MAX; ++i)		\
3494 	  if (ctype_func (i))			\
3495 	    bitset_set (sbcset, i);		\
3496       }						\
3497   } while (0)
3498 
3499   if (strcmp (name, "alnum") == 0)
3500     BUILD_CHARCLASS_LOOP (isalnum);
3501   else if (strcmp (name, "cntrl") == 0)
3502     BUILD_CHARCLASS_LOOP (iscntrl);
3503   else if (strcmp (name, "lower") == 0)
3504     BUILD_CHARCLASS_LOOP (islower);
3505   else if (strcmp (name, "space") == 0)
3506     BUILD_CHARCLASS_LOOP (isspace);
3507   else if (strcmp (name, "alpha") == 0)
3508     BUILD_CHARCLASS_LOOP (isalpha);
3509   else if (strcmp (name, "digit") == 0)
3510     BUILD_CHARCLASS_LOOP (isdigit);
3511   else if (strcmp (name, "print") == 0)
3512     BUILD_CHARCLASS_LOOP (isprint);
3513   else if (strcmp (name, "upper") == 0)
3514     BUILD_CHARCLASS_LOOP (isupper);
3515   else if (strcmp (name, "blank") == 0)
3516     BUILD_CHARCLASS_LOOP (isblank);
3517   else if (strcmp (name, "graph") == 0)
3518     BUILD_CHARCLASS_LOOP (isgraph);
3519   else if (strcmp (name, "punct") == 0)
3520     BUILD_CHARCLASS_LOOP (ispunct);
3521   else if (strcmp (name, "xdigit") == 0)
3522     BUILD_CHARCLASS_LOOP (isxdigit);
3523   else
3524     return REG_ECTYPE;
3525 
3526   return REG_NOERROR;
3527 }
3528 
3529 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const char * class_name,const char * extra,bool non_match,reg_errcode_t * err)3530 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3531 		    const char *class_name,
3532 		    const char *extra, bool non_match,
3533 		    reg_errcode_t *err)
3534 {
3535   re_bitset_ptr_t sbcset;
3536   re_charset_t *mbcset;
3537   Idx alloc = 0;
3538   reg_errcode_t ret;
3539   bin_tree_t *tree;
3540 
3541   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3542   if (__glibc_unlikely (sbcset == NULL))
3543     {
3544       *err = REG_ESPACE;
3545       return NULL;
3546     }
3547   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3548   if (__glibc_unlikely (mbcset == NULL))
3549     {
3550       re_free (sbcset);
3551       *err = REG_ESPACE;
3552       return NULL;
3553     }
3554   mbcset->non_match = non_match;
3555 
3556   /* We don't care the syntax in this case.  */
3557   ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0);
3558 
3559   if (__glibc_unlikely (ret != REG_NOERROR))
3560     {
3561       re_free (sbcset);
3562       free_charset (mbcset);
3563       *err = ret;
3564       return NULL;
3565     }
3566   /* \w match '_' also.  */
3567   for (; *extra; extra++)
3568     bitset_set (sbcset, *extra);
3569 
3570   /* If it is non-matching list.  */
3571   if (non_match)
3572     bitset_not (sbcset);
3573 
3574   /* Ensure only single byte characters are set.  */
3575   if (dfa->mb_cur_max > 1)
3576     bitset_mask (sbcset, dfa->sb_char);
3577 
3578   /* Build a tree for simple bracket.  */
3579   re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3580   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3581   if (__glibc_unlikely (tree == NULL))
3582     goto build_word_op_espace;
3583 
3584   if (dfa->mb_cur_max > 1)
3585     {
3586       bin_tree_t *mbc_tree;
3587       /* Build a tree for complex bracket.  */
3588       br_token.type = COMPLEX_BRACKET;
3589       br_token.opr.mbcset = mbcset;
3590       dfa->has_mb_node = 1;
3591       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3592       if (__glibc_unlikely (mbc_tree == NULL))
3593 	goto build_word_op_espace;
3594       /* Then join them by ALT node.  */
3595       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3596       if (__glibc_likely (mbc_tree != NULL))
3597 	return tree;
3598     }
3599   else
3600     {
3601       free_charset (mbcset);
3602       return tree;
3603     }
3604 
3605  build_word_op_espace:
3606   re_free (sbcset);
3607   free_charset (mbcset);
3608   *err = REG_ESPACE;
3609   return NULL;
3610 }
3611 
3612 /* This is intended for the expressions like "a{1,3}".
3613    Fetch a number from 'input', and return the number.
3614    Return -1 if the number field is empty like "{,1}".
3615    Return RE_DUP_MAX + 1 if the number field is too large.
3616    Return -2 if an error occurred.  */
3617 
3618 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3619 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3620 {
3621   Idx num = -1;
3622   unsigned char c;
3623   while (1)
3624     {
3625       fetch_token (token, input, syntax);
3626       c = token->opr.c;
3627       if (__glibc_unlikely (token->type == END_OF_RE))
3628 	return -2;
3629       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3630 	break;
3631       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3632 	     ? -2
3633 	     : num == -1
3634 	     ? c - '0'
3635 	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3636     }
3637   return num;
3638 }
3639 
3640 static void
free_charset(re_charset_t * cset)3641 free_charset (re_charset_t *cset)
3642 {
3643   re_free (cset->mbchars);
3644 #ifdef _LIBC
3645   re_free (cset->coll_syms);
3646   re_free (cset->equiv_classes);
3647 #endif
3648   re_free (cset->range_starts);
3649   re_free (cset->range_ends);
3650   re_free (cset->char_classes);
3651   re_free (cset);
3652 }
3653 
3654 /* Functions for binary tree operation.  */
3655 
3656 /* Create a tree node.  */
3657 
3658 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3659 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3660 	     re_token_type_t type)
3661 {
3662   re_token_t t = { .type = type };
3663   return create_token_tree (dfa, left, right, &t);
3664 }
3665 
3666 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3667 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3668 		   const re_token_t *token)
3669 {
3670   bin_tree_t *tree;
3671   if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3672     {
3673       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3674 
3675       if (storage == NULL)
3676 	return NULL;
3677       storage->next = dfa->str_tree_storage;
3678       dfa->str_tree_storage = storage;
3679       dfa->str_tree_storage_idx = 0;
3680     }
3681   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3682 
3683   tree->parent = NULL;
3684   tree->left = left;
3685   tree->right = right;
3686   tree->token = *token;
3687   tree->token.duplicated = 0;
3688   tree->token.opt_subexp = 0;
3689   tree->first = NULL;
3690   tree->next = NULL;
3691   tree->node_idx = -1;
3692 
3693   if (left != NULL)
3694     left->parent = tree;
3695   if (right != NULL)
3696     right->parent = tree;
3697   return tree;
3698 }
3699 
3700 /* Mark the tree SRC as an optional subexpression.
3701    To be called from preorder or postorder.  */
3702 
3703 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3704 mark_opt_subexp (void *extra, bin_tree_t *node)
3705 {
3706   Idx idx = (uintptr_t) extra;
3707   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3708     node->token.opt_subexp = 1;
3709 
3710   return REG_NOERROR;
3711 }
3712 
3713 /* Free the allocated memory inside NODE. */
3714 
3715 static void
free_token(re_token_t * node)3716 free_token (re_token_t *node)
3717 {
3718   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3719     free_charset (node->opr.mbcset);
3720   else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3721     re_free (node->opr.sbcset);
3722 }
3723 
3724 /* Worker function for tree walking.  Free the allocated memory inside NODE
3725    and its children. */
3726 
3727 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3728 free_tree (void *extra, bin_tree_t *node)
3729 {
3730   free_token (&node->token);
3731   return REG_NOERROR;
3732 }
3733 
3734 
3735 /* Duplicate the node SRC, and return new node.  This is a preorder
3736    visit similar to the one implemented by the generic visitor, but
3737    we need more infrastructure to maintain two parallel trees --- so,
3738    it's easier to duplicate.  */
3739 
3740 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3741 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3742 {
3743   const bin_tree_t *node;
3744   bin_tree_t *dup_root;
3745   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3746 
3747   for (node = root; ; )
3748     {
3749       /* Create a new tree and link it back to the current parent.  */
3750       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3751       if (*p_new == NULL)
3752 	return NULL;
3753       (*p_new)->parent = dup_node;
3754       (*p_new)->token.duplicated = 1;
3755       dup_node = *p_new;
3756 
3757       /* Go to the left node, or up and to the right.  */
3758       if (node->left)
3759 	{
3760 	  node = node->left;
3761 	  p_new = &dup_node->left;
3762 	}
3763       else
3764 	{
3765 	  const bin_tree_t *prev = NULL;
3766 	  while (node->right == prev || node->right == NULL)
3767 	    {
3768 	      prev = node;
3769 	      node = node->parent;
3770 	      dup_node = dup_node->parent;
3771 	      if (!node)
3772 		return dup_root;
3773 	    }
3774 	  node = node->right;
3775 	  p_new = &dup_node->right;
3776 	}
3777     }
3778 }
3779