xref: /dragonfly/contrib/diffutils/lib/regcomp.c (revision 1975d09e)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2018 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU General Public
8    License as published by the Free Software Foundation; either
9    version 3 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    General Public License for more details.
15 
16    You should have received a copy of the GNU General Public
17    License along with the GNU C Library; if not, see
18    <https://www.gnu.org/licenses/>.  */
19 
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
23 
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 					  size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 				     const re_dfastate_t *init_state,
28 				     char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 			       reg_errcode_t (fn (void *, bin_tree_t *)),
41 			       void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 				reg_errcode_t (fn (void *, bin_tree_t *)),
44 				void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 				 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 				   unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 					 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 			 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 			reg_syntax_t syntax);
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 			  reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 				  re_token_t *token, reg_syntax_t syntax,
67 				  Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 				 re_token_t *token, reg_syntax_t syntax,
70 				 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 				     re_token_t *token, reg_syntax_t syntax,
73 				     Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 				  re_token_t *token, reg_syntax_t syntax,
76 				  Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 				 re_dfa_t *dfa, re_token_t *token,
79 				 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 				      re_token_t *token, reg_syntax_t syntax,
82 				      reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 					    re_string_t *regexp,
85 					    re_token_t *token, int token_len,
86 					    re_dfa_t *dfa,
87 					    reg_syntax_t syntax,
88 					    bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 					  re_string_t *regexp,
91 					  re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 					re_charset_t *mbcset,
95 					Idx *equiv_class_alloc,
96 					const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 				      bitset_t sbcset,
99 				      re_charset_t *mbcset,
100 				      Idx *char_class_alloc,
101 				      const char *class_name,
102 				      reg_syntax_t syntax);
103 #else  /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 					const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 				      bitset_t sbcset,
108 				      const char *class_name,
109 				      reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 				       RE_TRANSLATE_TYPE trans,
113 				       const char *class_name,
114 				       const char *extra,
115 				       bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 				bin_tree_t *left, bin_tree_t *right,
118 				re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 				      bin_tree_t *left, bin_tree_t *right,
121 				      const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126 
127 /* This table gives an error message for each of the error codes listed
128    in regex.h.  Obviously the order here has to be same as there.
129    POSIX doesn't require that we do anything for REG_NOERROR,
130    but why not be nice?  */
131 
132 static const char __re_error_msgid[] =
133   {
134 #define REG_NOERROR_IDX	0
135     gettext_noop ("Success")	/* REG_NOERROR */
136     "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138     gettext_noop ("No match")	/* REG_NOMATCH */
139     "\0"
140 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142     "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145     "\0"
146 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148     "\0"
149 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151     "\0"
152 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154     "\0"
155 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
156     gettext_noop ("Unmatched [, [^, [:, [., or [=")	/* REG_EBRACK */
157     "\0"
158 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160     "\0"
161 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163     "\0"
164 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166     "\0"
167 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168     gettext_noop ("Invalid range end")	/* REG_ERANGE */
169     "\0"
170 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
172     "\0"
173 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175     "\0"
176 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
178     "\0"
179 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
181     "\0"
182 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184   };
185 
186 static const size_t __re_error_msgid_idx[] =
187   {
188     REG_NOERROR_IDX,
189     REG_NOMATCH_IDX,
190     REG_BADPAT_IDX,
191     REG_ECOLLATE_IDX,
192     REG_ECTYPE_IDX,
193     REG_EESCAPE_IDX,
194     REG_ESUBREG_IDX,
195     REG_EBRACK_IDX,
196     REG_EPAREN_IDX,
197     REG_EBRACE_IDX,
198     REG_BADBR_IDX,
199     REG_ERANGE_IDX,
200     REG_ESPACE_IDX,
201     REG_BADRPT_IDX,
202     REG_EEND_IDX,
203     REG_ESIZE_IDX,
204     REG_ERPAREN_IDX
205   };
206 
207 /* Entry points for GNU code.  */
208 
209 /* re_compile_pattern is the GNU regular expression compiler: it
210    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211    Returns 0 if the pattern was valid, otherwise an error string.
212 
213    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214    are set in BUFP on entry.  */
215 
216 const char *
217 re_compile_pattern (const char *pattern, size_t length,
218 		    struct re_pattern_buffer *bufp)
219 {
220   reg_errcode_t ret;
221 
222   /* And GNU code determines whether or not to get register information
223      by passing null for the REGS argument to re_match, etc., not by
224      setting no_sub, unless RE_NO_SUB is set.  */
225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226 
227   /* Match anchors at newline.  */
228   bufp->newline_anchor = 1;
229 
230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231 
232   if (!ret)
233     return NULL;
234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235 }
236 #ifdef _LIBC
237 weak_alias (__re_compile_pattern, re_compile_pattern)
238 #endif
239 
240 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
241    also be assigned to arbitrarily: each pattern buffer stores its own
242    syntax, so it can be changed between regex compilations.  */
243 /* This has no initializer because initialized variables in Emacs
244    become read-only after dumping.  */
245 reg_syntax_t re_syntax_options;
246 
247 
248 /* Specify the precise syntax of regexps for compilation.  This provides
249    for compatibility for various utilities which historically have
250    different, incompatible syntaxes.
251 
252    The argument SYNTAX is a bit mask comprised of the various bits
253    defined in regex.h.  We return the old syntax.  */
254 
255 reg_syntax_t
256 re_set_syntax (reg_syntax_t syntax)
257 {
258   reg_syntax_t ret = re_syntax_options;
259 
260   re_syntax_options = syntax;
261   return ret;
262 }
263 #ifdef _LIBC
264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
266 
267 int
268 re_compile_fastmap (struct re_pattern_buffer *bufp)
269 {
270   re_dfa_t *dfa = bufp->buffer;
271   char *fastmap = bufp->fastmap;
272 
273   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275   if (dfa->init_state != dfa->init_state_word)
276     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277   if (dfa->init_state != dfa->init_state_nl)
278     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279   if (dfa->init_state != dfa->init_state_begbuf)
280     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281   bufp->fastmap_accurate = 1;
282   return 0;
283 }
284 #ifdef _LIBC
285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
286 #endif
287 
288 static inline void
289 __attribute__ ((always_inline))
290 re_set_fastmap (char *fastmap, bool icase, int ch)
291 {
292   fastmap[ch] = 1;
293   if (icase)
294     fastmap[tolower (ch)] = 1;
295 }
296 
297 /* Helper function for re_compile_fastmap.
298    Compile fastmap for the initial_state INIT_STATE.  */
299 
300 static void
301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 			 char *fastmap)
303 {
304   re_dfa_t *dfa = bufp->buffer;
305   Idx node_cnt;
306   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
308     {
309       Idx node = init_state->nodes.elems[node_cnt];
310       re_token_type_t type = dfa->nodes[node].type;
311 
312       if (type == CHARACTER)
313 	{
314 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315 #ifdef RE_ENABLE_I18N
316 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
317 	    {
318 	      unsigned char buf[MB_LEN_MAX];
319 	      unsigned char *p;
320 	      wchar_t wc;
321 	      mbstate_t state;
322 
323 	      p = buf;
324 	      *p++ = dfa->nodes[node].opr.c;
325 	      while (++node < dfa->nodes_len
326 		     &&	dfa->nodes[node].type == CHARACTER
327 		     && dfa->nodes[node].mb_partial)
328 		*p++ = dfa->nodes[node].opr.c;
329 	      memset (&state, '\0', sizeof (state));
330 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 			     &state) == p - buf
332 		  && (__wcrtomb ((char *) buf, __towlower (wc), &state)
333 		      != (size_t) -1))
334 		re_set_fastmap (fastmap, false, buf[0]);
335 	    }
336 #endif
337 	}
338       else if (type == SIMPLE_BRACKET)
339 	{
340 	  int i, ch;
341 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 	    {
343 	      int j;
344 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 		if (w & ((bitset_word_t) 1 << j))
347 		  re_set_fastmap (fastmap, icase, ch);
348 	    }
349 	}
350 #ifdef RE_ENABLE_I18N
351       else if (type == COMPLEX_BRACKET)
352 	{
353 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
354 	  Idx i;
355 
356 # ifdef _LIBC
357 	  /* See if we have to try all bytes which start multiple collation
358 	     elements.
359 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 		  collation element, and don't catch 'b' since 'b' is
361 		  the only collation element which starts from 'b' (and
362 		  it is caught by SIMPLE_BRACKET).  */
363 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364 		  && (cset->ncoll_syms || cset->nranges))
365 		{
366 		  const int32_t *table = (const int32_t *)
367 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
368 		  for (i = 0; i < SBC_MAX; ++i)
369 		    if (table[i] < 0)
370 		      re_set_fastmap (fastmap, icase, i);
371 		}
372 # endif /* _LIBC */
373 
374 	  /* See if we have to start the match at all multibyte characters,
375 	     i.e. where we would not find an invalid sequence.  This only
376 	     applies to multibyte character sets; for single byte character
377 	     sets, the SIMPLE_BRACKET again suffices.  */
378 	  if (dfa->mb_cur_max > 1
379 	      && (cset->nchar_classes || cset->non_match || cset->nranges
380 # ifdef _LIBC
381 		  || cset->nequiv_classes
382 # endif /* _LIBC */
383 		 ))
384 	    {
385 	      unsigned char c = 0;
386 	      do
387 		{
388 		  mbstate_t mbs;
389 		  memset (&mbs, 0, sizeof (mbs));
390 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391 		    re_set_fastmap (fastmap, false, (int) c);
392 		}
393 	      while (++c != 0);
394 	    }
395 
396 	  else
397 	    {
398 	      /* ... Else catch all bytes which can start the mbchars.  */
399 	      for (i = 0; i < cset->nmbchars; ++i)
400 		{
401 		  char buf[256];
402 		  mbstate_t state;
403 		  memset (&state, '\0', sizeof (state));
404 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
407 		    {
408 		      if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
409 			  != (size_t) -1)
410 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
411 		    }
412 		}
413 	    }
414 	}
415 #endif /* RE_ENABLE_I18N */
416       else if (type == OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 	       || type == OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 	       || type == END_OF_RE)
421 	{
422 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 	  if (type == END_OF_RE)
424 	    bufp->can_be_null = 1;
425 	  return;
426 	}
427     }
428 }
429 
430 /* Entry point for POSIX code.  */
431 /* regcomp takes a regular expression as a string and compiles it.
432 
433    PREG is a regex_t *.  We do not expect any fields to be initialized,
434    since POSIX says we shouldn't.  Thus, we set
435 
436      'buffer' to the compiled pattern;
437      'used' to the length of the compiled pattern;
438      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439        REG_EXTENDED bit in CFLAGS is set; otherwise, to
440        RE_SYNTAX_POSIX_BASIC;
441      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
442      'fastmap' to an allocated space for the fastmap;
443      'fastmap_accurate' to zero;
444      're_nsub' to the number of subexpressions in PATTERN.
445 
446    PATTERN is the address of the pattern string.
447 
448    CFLAGS is a series of bits which affect compilation.
449 
450      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451      use POSIX basic syntax.
452 
453      If REG_NEWLINE is set, then . and [^...] don't match newline.
454      Also, regexec will try a match beginning after every newline.
455 
456      If REG_ICASE is set, then we considers upper- and lowercase
457      versions of letters to be equivalent when matching.
458 
459      If REG_NOSUB is set, then when PREG is passed to regexec, that
460      routine will report only success or failure, and nothing about the
461      registers.
462 
463    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
464    the return codes and their meanings.)  */
465 
466 int
467 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
468 {
469   reg_errcode_t ret;
470   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
471 			 : RE_SYNTAX_POSIX_BASIC);
472 
473   preg->buffer = NULL;
474   preg->allocated = 0;
475   preg->used = 0;
476 
477   /* Try to allocate space for the fastmap.  */
478   preg->fastmap = re_malloc (char, SBC_MAX);
479   if (__glibc_unlikely (preg->fastmap == NULL))
480     return REG_ESPACE;
481 
482   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
483 
484   /* If REG_NEWLINE is set, newlines are treated differently.  */
485   if (cflags & REG_NEWLINE)
486     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
487       syntax &= ~RE_DOT_NEWLINE;
488       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489       /* It also changes the matching behavior.  */
490       preg->newline_anchor = 1;
491     }
492   else
493     preg->newline_anchor = 0;
494   preg->no_sub = !!(cflags & REG_NOSUB);
495   preg->translate = NULL;
496 
497   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
498 
499   /* POSIX doesn't distinguish between an unmatched open-group and an
500      unmatched close-group: both are REG_EPAREN.  */
501   if (ret == REG_ERPAREN)
502     ret = REG_EPAREN;
503 
504   /* We have already checked preg->fastmap != NULL.  */
505   if (__glibc_likely (ret == REG_NOERROR))
506     /* Compute the fastmap now, since regexec cannot modify the pattern
507        buffer.  This function never fails in this implementation.  */
508     (void) re_compile_fastmap (preg);
509   else
510     {
511       /* Some error occurred while compiling the expression.  */
512       re_free (preg->fastmap);
513       preg->fastmap = NULL;
514     }
515 
516   return (int) ret;
517 }
518 #ifdef _LIBC
519 libc_hidden_def (__regcomp)
520 weak_alias (__regcomp, regcomp)
521 #endif
522 
523 /* Returns a message corresponding to an error code, ERRCODE, returned
524    from either regcomp or regexec.   We don't use PREG here.  */
525 
526 size_t
527 regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
528 	  size_t errbuf_size)
529 {
530   const char *msg;
531   size_t msg_size;
532   int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
533 
534   if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
535     /* Only error codes returned by the rest of the code should be passed
536        to this routine.  If we are given anything else, or if other regex
537        code generates an invalid error code, then the program has a bug.
538        Dump core so we can fix it.  */
539     abort ();
540 
541   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
542 
543   msg_size = strlen (msg) + 1; /* Includes the null.  */
544 
545   if (__glibc_likely (errbuf_size != 0))
546     {
547       size_t cpy_size = msg_size;
548       if (__glibc_unlikely (msg_size > errbuf_size))
549 	{
550 	  cpy_size = errbuf_size - 1;
551 	  errbuf[cpy_size] = '\0';
552 	}
553       memcpy (errbuf, msg, cpy_size);
554     }
555 
556   return msg_size;
557 }
558 #ifdef _LIBC
559 weak_alias (__regerror, regerror)
560 #endif
561 
562 
563 #ifdef RE_ENABLE_I18N
564 /* This static array is used for the map to single-byte characters when
565    UTF-8 is used.  Otherwise we would allocate memory just to initialize
566    it the same all the time.  UTF-8 is the preferred encoding so this is
567    a worthwhile optimization.  */
568 static const bitset_t utf8_sb_map =
569 {
570   /* Set the first 128 bits.  */
571 # if defined __GNUC__ && !defined __STRICT_ANSI__
572   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
573 # else
574 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
575 #   error "bitset_word_t is narrower than 32 bits"
576 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
577   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
578 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
579   BITSET_WORD_MAX, BITSET_WORD_MAX,
580 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
581   BITSET_WORD_MAX,
582 #  endif
583   (BITSET_WORD_MAX
584    >> (SBC_MAX % BITSET_WORD_BITS == 0
585        ? 0
586        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
587 # endif
588 };
589 #endif
590 
591 
592 static void
593 free_dfa_content (re_dfa_t *dfa)
594 {
595   Idx i, j;
596 
597   if (dfa->nodes)
598     for (i = 0; i < dfa->nodes_len; ++i)
599       free_token (dfa->nodes + i);
600   re_free (dfa->nexts);
601   for (i = 0; i < dfa->nodes_len; ++i)
602     {
603       if (dfa->eclosures != NULL)
604 	re_node_set_free (dfa->eclosures + i);
605       if (dfa->inveclosures != NULL)
606 	re_node_set_free (dfa->inveclosures + i);
607       if (dfa->edests != NULL)
608 	re_node_set_free (dfa->edests + i);
609     }
610   re_free (dfa->edests);
611   re_free (dfa->eclosures);
612   re_free (dfa->inveclosures);
613   re_free (dfa->nodes);
614 
615   if (dfa->state_table)
616     for (i = 0; i <= dfa->state_hash_mask; ++i)
617       {
618 	struct re_state_table_entry *entry = dfa->state_table + i;
619 	for (j = 0; j < entry->num; ++j)
620 	  {
621 	    re_dfastate_t *state = entry->array[j];
622 	    free_state (state);
623 	  }
624 	re_free (entry->array);
625       }
626   re_free (dfa->state_table);
627 #ifdef RE_ENABLE_I18N
628   if (dfa->sb_char != utf8_sb_map)
629     re_free (dfa->sb_char);
630 #endif
631   re_free (dfa->subexp_map);
632 #ifdef DEBUG
633   re_free (dfa->re_str);
634 #endif
635 
636   re_free (dfa);
637 }
638 
639 
640 /* Free dynamically allocated space used by PREG.  */
641 
642 void
643 regfree (regex_t *preg)
644 {
645   re_dfa_t *dfa = preg->buffer;
646   if (__glibc_likely (dfa != NULL))
647     {
648       lock_fini (dfa->lock);
649       free_dfa_content (dfa);
650     }
651   preg->buffer = NULL;
652   preg->allocated = 0;
653 
654   re_free (preg->fastmap);
655   preg->fastmap = NULL;
656 
657   re_free (preg->translate);
658   preg->translate = NULL;
659 }
660 #ifdef _LIBC
661 libc_hidden_def (__regfree)
662 weak_alias (__regfree, regfree)
663 #endif
664 
665 /* Entry points compatible with 4.2 BSD regex library.  We don't define
666    them unless specifically requested.  */
667 
668 #if defined _REGEX_RE_COMP || defined _LIBC
669 
670 /* BSD has one and only one pattern buffer.  */
671 static struct re_pattern_buffer re_comp_buf;
672 
673 char *
674 # ifdef _LIBC
675 /* Make these definitions weak in libc, so POSIX programs can redefine
676    these names if they don't use our functions, and still use
677    regcomp/regexec above without link errors.  */
678 weak_function
679 # endif
680 re_comp (const char *s)
681 {
682   reg_errcode_t ret;
683   char *fastmap;
684 
685   if (!s)
686     {
687       if (!re_comp_buf.buffer)
688 	return gettext ("No previous regular expression");
689       return 0;
690     }
691 
692   if (re_comp_buf.buffer)
693     {
694       fastmap = re_comp_buf.fastmap;
695       re_comp_buf.fastmap = NULL;
696       __regfree (&re_comp_buf);
697       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
698       re_comp_buf.fastmap = fastmap;
699     }
700 
701   if (re_comp_buf.fastmap == NULL)
702     {
703       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
704       if (re_comp_buf.fastmap == NULL)
705 	return (char *) gettext (__re_error_msgid
706 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
707     }
708 
709   /* Since 're_exec' always passes NULL for the 'regs' argument, we
710      don't need to initialize the pattern buffer fields which affect it.  */
711 
712   /* Match anchors at newlines.  */
713   re_comp_buf.newline_anchor = 1;
714 
715   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
716 
717   if (!ret)
718     return NULL;
719 
720   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
721   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
722 }
723 
724 #ifdef _LIBC
725 libc_freeres_fn (free_mem)
726 {
727   __regfree (&re_comp_buf);
728 }
729 #endif
730 
731 #endif /* _REGEX_RE_COMP */
732 
733 /* Internal entry point.
734    Compile the regular expression PATTERN, whose length is LENGTH.
735    SYNTAX indicate regular expression's syntax.  */
736 
737 static reg_errcode_t
738 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
739 		     reg_syntax_t syntax)
740 {
741   reg_errcode_t err = REG_NOERROR;
742   re_dfa_t *dfa;
743   re_string_t regexp;
744 
745   /* Initialize the pattern buffer.  */
746   preg->fastmap_accurate = 0;
747   preg->syntax = syntax;
748   preg->not_bol = preg->not_eol = 0;
749   preg->used = 0;
750   preg->re_nsub = 0;
751   preg->can_be_null = 0;
752   preg->regs_allocated = REGS_UNALLOCATED;
753 
754   /* Initialize the dfa.  */
755   dfa = preg->buffer;
756   if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
757     {
758       /* If zero allocated, but buffer is non-null, try to realloc
759 	 enough space.  This loses if buffer's address is bogus, but
760 	 that is the user's responsibility.  If ->buffer is NULL this
761 	 is a simple allocation.  */
762       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
763       if (dfa == NULL)
764 	return REG_ESPACE;
765       preg->allocated = sizeof (re_dfa_t);
766       preg->buffer = dfa;
767     }
768   preg->used = sizeof (re_dfa_t);
769 
770   err = init_dfa (dfa, length);
771   if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
772     err = REG_ESPACE;
773   if (__glibc_unlikely (err != REG_NOERROR))
774     {
775       free_dfa_content (dfa);
776       preg->buffer = NULL;
777       preg->allocated = 0;
778       return err;
779     }
780 #ifdef DEBUG
781   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
782   dfa->re_str = re_malloc (char, length + 1);
783   strncpy (dfa->re_str, pattern, length + 1);
784 #endif
785 
786   err = re_string_construct (&regexp, pattern, length, preg->translate,
787 			     (syntax & RE_ICASE) != 0, dfa);
788   if (__glibc_unlikely (err != REG_NOERROR))
789     {
790     re_compile_internal_free_return:
791       free_workarea_compile (preg);
792       re_string_destruct (&regexp);
793       lock_fini (dfa->lock);
794       free_dfa_content (dfa);
795       preg->buffer = NULL;
796       preg->allocated = 0;
797       return err;
798     }
799 
800   /* Parse the regular expression, and build a structure tree.  */
801   preg->re_nsub = 0;
802   dfa->str_tree = parse (&regexp, preg, syntax, &err);
803   if (__glibc_unlikely (dfa->str_tree == NULL))
804     goto re_compile_internal_free_return;
805 
806   /* Analyze the tree and create the nfa.  */
807   err = analyze (preg);
808   if (__glibc_unlikely (err != REG_NOERROR))
809     goto re_compile_internal_free_return;
810 
811 #ifdef RE_ENABLE_I18N
812   /* If possible, do searching in single byte encoding to speed things up.  */
813   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
814     optimize_utf8 (dfa);
815 #endif
816 
817   /* Then create the initial state of the dfa.  */
818   err = create_initial_state (dfa);
819 
820   /* Release work areas.  */
821   free_workarea_compile (preg);
822   re_string_destruct (&regexp);
823 
824   if (__glibc_unlikely (err != REG_NOERROR))
825     {
826       lock_fini (dfa->lock);
827       free_dfa_content (dfa);
828       preg->buffer = NULL;
829       preg->allocated = 0;
830     }
831 
832   return err;
833 }
834 
835 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
836    as the initial length of some arrays.  */
837 
838 static reg_errcode_t
839 init_dfa (re_dfa_t *dfa, size_t pat_len)
840 {
841   __re_size_t table_size;
842 #ifndef _LIBC
843   const char *codeset_name;
844 #endif
845 #ifdef RE_ENABLE_I18N
846   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
847 #else
848   size_t max_i18n_object_size = 0;
849 #endif
850   size_t max_object_size =
851     MAX (sizeof (struct re_state_table_entry),
852 	 MAX (sizeof (re_token_t),
853 	      MAX (sizeof (re_node_set),
854 		   MAX (sizeof (regmatch_t),
855 			max_i18n_object_size))));
856 
857   memset (dfa, '\0', sizeof (re_dfa_t));
858 
859   /* Force allocation of str_tree_storage the first time.  */
860   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
861 
862   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
863      calculation below, and for similar doubling calculations
864      elsewhere.  And it's <= rather than <, because some of the
865      doubling calculations add 1 afterwards.  */
866   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
867 			<= pat_len))
868     return REG_ESPACE;
869 
870   dfa->nodes_alloc = pat_len + 1;
871   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
872 
873   /*  table_size = 2 ^ ceil(log pat_len) */
874   for (table_size = 1; ; table_size <<= 1)
875     if (table_size > pat_len)
876       break;
877 
878   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
879   dfa->state_hash_mask = table_size - 1;
880 
881   dfa->mb_cur_max = MB_CUR_MAX;
882 #ifdef _LIBC
883   if (dfa->mb_cur_max == 6
884       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
885     dfa->is_utf8 = 1;
886   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
887 		       != 0);
888 #else
889   codeset_name = nl_langinfo (CODESET);
890   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
891       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
892       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
893       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
894     dfa->is_utf8 = 1;
895 
896   /* We check exhaustively in the loop below if this charset is a
897      superset of ASCII.  */
898   dfa->map_notascii = 0;
899 #endif
900 
901 #ifdef RE_ENABLE_I18N
902   if (dfa->mb_cur_max > 1)
903     {
904       if (dfa->is_utf8)
905 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
906       else
907 	{
908 	  int i, j, ch;
909 
910 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
911 	  if (__glibc_unlikely (dfa->sb_char == NULL))
912 	    return REG_ESPACE;
913 
914 	  /* Set the bits corresponding to single byte chars.  */
915 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
916 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
917 	      {
918 		wint_t wch = __btowc (ch);
919 		if (wch != WEOF)
920 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
921 # ifndef _LIBC
922 		if (isascii (ch) && wch != ch)
923 		  dfa->map_notascii = 1;
924 # endif
925 	      }
926 	}
927     }
928 #endif
929 
930   if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
931     return REG_ESPACE;
932   return REG_NOERROR;
933 }
934 
935 /* Initialize WORD_CHAR table, which indicate which character is
936    "word".  In this case "word" means that it is the word construction
937    character used by some operators like "\<", "\>", etc.  */
938 
939 static void
940 init_word_char (re_dfa_t *dfa)
941 {
942   int i = 0;
943   int j;
944   int ch = 0;
945   dfa->word_ops_used = 1;
946   if (__glibc_likely (dfa->map_notascii == 0))
947     {
948       /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
949 	 them, an issue when this code is used in Gnulib.  */
950       bitset_word_t bits0 = 0x00000000;
951       bitset_word_t bits1 = 0x03ff0000;
952       bitset_word_t bits2 = 0x87fffffe;
953       bitset_word_t bits3 = 0x07fffffe;
954       if (BITSET_WORD_BITS == 64)
955 	{
956 	  /* Pacify gcc -Woverflow on 32-bit platformns.  */
957 	  dfa->word_char[0] = bits1 << 31 << 1 | bits0;
958 	  dfa->word_char[1] = bits3 << 31 << 1 | bits2;
959 	  i = 2;
960 	}
961       else if (BITSET_WORD_BITS == 32)
962 	{
963 	  dfa->word_char[0] = bits0;
964 	  dfa->word_char[1] = bits1;
965 	  dfa->word_char[2] = bits2;
966 	  dfa->word_char[3] = bits3;
967 	  i = 4;
968 	}
969       else
970         goto general_case;
971       ch = 128;
972 
973       if (__glibc_likely (dfa->is_utf8))
974 	{
975 	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
976 	  return;
977 	}
978     }
979 
980  general_case:
981   for (; i < BITSET_WORDS; ++i)
982     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
983       if (isalnum (ch) || ch == '_')
984 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
985 }
986 
987 /* Free the work area which are only used while compiling.  */
988 
989 static void
990 free_workarea_compile (regex_t *preg)
991 {
992   re_dfa_t *dfa = preg->buffer;
993   bin_tree_storage_t *storage, *next;
994   for (storage = dfa->str_tree_storage; storage; storage = next)
995     {
996       next = storage->next;
997       re_free (storage);
998     }
999   dfa->str_tree_storage = NULL;
1000   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1001   dfa->str_tree = NULL;
1002   re_free (dfa->org_indices);
1003   dfa->org_indices = NULL;
1004 }
1005 
1006 /* Create initial states for all contexts.  */
1007 
1008 static reg_errcode_t
1009 create_initial_state (re_dfa_t *dfa)
1010 {
1011   Idx first, i;
1012   reg_errcode_t err;
1013   re_node_set init_nodes;
1014 
1015   /* Initial states have the epsilon closure of the node which is
1016      the first node of the regular expression.  */
1017   first = dfa->str_tree->first->node_idx;
1018   dfa->init_node = first;
1019   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1020   if (__glibc_unlikely (err != REG_NOERROR))
1021     return err;
1022 
1023   /* The back-references which are in initial states can epsilon transit,
1024      since in this case all of the subexpressions can be null.
1025      Then we add epsilon closures of the nodes which are the next nodes of
1026      the back-references.  */
1027   if (dfa->nbackref > 0)
1028     for (i = 0; i < init_nodes.nelem; ++i)
1029       {
1030 	Idx node_idx = init_nodes.elems[i];
1031 	re_token_type_t type = dfa->nodes[node_idx].type;
1032 
1033 	Idx clexp_idx;
1034 	if (type != OP_BACK_REF)
1035 	  continue;
1036 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1037 	  {
1038 	    re_token_t *clexp_node;
1039 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1040 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1041 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1042 	      break;
1043 	  }
1044 	if (clexp_idx == init_nodes.nelem)
1045 	  continue;
1046 
1047 	if (type == OP_BACK_REF)
1048 	  {
1049 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1050 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1051 	      {
1052 		reg_errcode_t merge_err
1053                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1054 		if (merge_err != REG_NOERROR)
1055 		  return merge_err;
1056 		i = 0;
1057 	      }
1058 	  }
1059       }
1060 
1061   /* It must be the first time to invoke acquire_state.  */
1062   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1063   /* We don't check ERR here, since the initial state must not be NULL.  */
1064   if (__glibc_unlikely (dfa->init_state == NULL))
1065     return err;
1066   if (dfa->init_state->has_constraint)
1067     {
1068       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1069 						       CONTEXT_WORD);
1070       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1071 						     CONTEXT_NEWLINE);
1072       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1073 							 &init_nodes,
1074 							 CONTEXT_NEWLINE
1075 							 | CONTEXT_BEGBUF);
1076       if (__glibc_unlikely (dfa->init_state_word == NULL
1077 			    || dfa->init_state_nl == NULL
1078 			    || dfa->init_state_begbuf == NULL))
1079 	return err;
1080     }
1081   else
1082     dfa->init_state_word = dfa->init_state_nl
1083       = dfa->init_state_begbuf = dfa->init_state;
1084 
1085   re_node_set_free (&init_nodes);
1086   return REG_NOERROR;
1087 }
1088 
1089 #ifdef RE_ENABLE_I18N
1090 /* If it is possible to do searching in single byte encoding instead of UTF-8
1091    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1092    DFA nodes where needed.  */
1093 
1094 static void
1095 optimize_utf8 (re_dfa_t *dfa)
1096 {
1097   Idx node;
1098   int i;
1099   bool mb_chars = false;
1100   bool has_period = false;
1101 
1102   for (node = 0; node < dfa->nodes_len; ++node)
1103     switch (dfa->nodes[node].type)
1104       {
1105       case CHARACTER:
1106 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1107 	  mb_chars = true;
1108 	break;
1109       case ANCHOR:
1110 	switch (dfa->nodes[node].opr.ctx_type)
1111 	  {
1112 	  case LINE_FIRST:
1113 	  case LINE_LAST:
1114 	  case BUF_FIRST:
1115 	  case BUF_LAST:
1116 	    break;
1117 	  default:
1118 	    /* Word anchors etc. cannot be handled.  It's okay to test
1119 	       opr.ctx_type since constraints (for all DFA nodes) are
1120 	       created by ORing one or more opr.ctx_type values.  */
1121 	    return;
1122 	  }
1123 	break;
1124       case OP_PERIOD:
1125 	has_period = true;
1126 	break;
1127       case OP_BACK_REF:
1128       case OP_ALT:
1129       case END_OF_RE:
1130       case OP_DUP_ASTERISK:
1131       case OP_OPEN_SUBEXP:
1132       case OP_CLOSE_SUBEXP:
1133 	break;
1134       case COMPLEX_BRACKET:
1135 	return;
1136       case SIMPLE_BRACKET:
1137 	/* Just double check.  */
1138 	{
1139 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1140 			? 0
1141 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1142 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1143 	    {
1144 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1145 		return;
1146 	      rshift = 0;
1147 	    }
1148 	}
1149 	break;
1150       default:
1151 	abort ();
1152       }
1153 
1154   if (mb_chars || has_period)
1155     for (node = 0; node < dfa->nodes_len; ++node)
1156       {
1157 	if (dfa->nodes[node].type == CHARACTER
1158 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1159 	  dfa->nodes[node].mb_partial = 0;
1160 	else if (dfa->nodes[node].type == OP_PERIOD)
1161 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1162       }
1163 
1164   /* The search can be in single byte locale.  */
1165   dfa->mb_cur_max = 1;
1166   dfa->is_utf8 = 0;
1167   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1168 }
1169 #endif
1170 
1171 /* Analyze the structure tree, and calculate "first", "next", "edest",
1172    "eclosure", and "inveclosure".  */
1173 
1174 static reg_errcode_t
1175 analyze (regex_t *preg)
1176 {
1177   re_dfa_t *dfa = preg->buffer;
1178   reg_errcode_t ret;
1179 
1180   /* Allocate arrays.  */
1181   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1182   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1183   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1184   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1185   if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1186 			|| dfa->edests == NULL || dfa->eclosures == NULL))
1187     return REG_ESPACE;
1188 
1189   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1190   if (dfa->subexp_map != NULL)
1191     {
1192       Idx i;
1193       for (i = 0; i < preg->re_nsub; i++)
1194 	dfa->subexp_map[i] = i;
1195       preorder (dfa->str_tree, optimize_subexps, dfa);
1196       for (i = 0; i < preg->re_nsub; i++)
1197 	if (dfa->subexp_map[i] != i)
1198 	  break;
1199       if (i == preg->re_nsub)
1200 	{
1201 	  re_free (dfa->subexp_map);
1202 	  dfa->subexp_map = NULL;
1203 	}
1204     }
1205 
1206   ret = postorder (dfa->str_tree, lower_subexps, preg);
1207   if (__glibc_unlikely (ret != REG_NOERROR))
1208     return ret;
1209   ret = postorder (dfa->str_tree, calc_first, dfa);
1210   if (__glibc_unlikely (ret != REG_NOERROR))
1211     return ret;
1212   preorder (dfa->str_tree, calc_next, dfa);
1213   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1214   if (__glibc_unlikely (ret != REG_NOERROR))
1215     return ret;
1216   ret = calc_eclosure (dfa);
1217   if (__glibc_unlikely (ret != REG_NOERROR))
1218     return ret;
1219 
1220   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1221      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1222   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1223       || dfa->nbackref)
1224     {
1225       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1226       if (__glibc_unlikely (dfa->inveclosures == NULL))
1227 	return REG_ESPACE;
1228       ret = calc_inveclosure (dfa);
1229     }
1230 
1231   return ret;
1232 }
1233 
1234 /* Our parse trees are very unbalanced, so we cannot use a stack to
1235    implement parse tree visits.  Instead, we use parent pointers and
1236    some hairy code in these two functions.  */
1237 static reg_errcode_t
1238 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1239 	   void *extra)
1240 {
1241   bin_tree_t *node, *prev;
1242 
1243   for (node = root; ; )
1244     {
1245       /* Descend down the tree, preferably to the left (or to the right
1246 	 if that's the only child).  */
1247       while (node->left || node->right)
1248 	if (node->left)
1249 	  node = node->left;
1250 	else
1251 	  node = node->right;
1252 
1253       do
1254 	{
1255 	  reg_errcode_t err = fn (extra, node);
1256 	  if (__glibc_unlikely (err != REG_NOERROR))
1257 	    return err;
1258 	  if (node->parent == NULL)
1259 	    return REG_NOERROR;
1260 	  prev = node;
1261 	  node = node->parent;
1262 	}
1263       /* Go up while we have a node that is reached from the right.  */
1264       while (node->right == prev || node->right == NULL);
1265       node = node->right;
1266     }
1267 }
1268 
1269 static reg_errcode_t
1270 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1271 	  void *extra)
1272 {
1273   bin_tree_t *node;
1274 
1275   for (node = root; ; )
1276     {
1277       reg_errcode_t err = fn (extra, node);
1278       if (__glibc_unlikely (err != REG_NOERROR))
1279 	return err;
1280 
1281       /* Go to the left node, or up and to the right.  */
1282       if (node->left)
1283 	node = node->left;
1284       else
1285 	{
1286 	  bin_tree_t *prev = NULL;
1287 	  while (node->right == prev || node->right == NULL)
1288 	    {
1289 	      prev = node;
1290 	      node = node->parent;
1291 	      if (!node)
1292 		return REG_NOERROR;
1293 	    }
1294 	  node = node->right;
1295 	}
1296     }
1297 }
1298 
1299 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1300    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1301    backreferences as well.  Requires a preorder visit.  */
1302 static reg_errcode_t
1303 optimize_subexps (void *extra, bin_tree_t *node)
1304 {
1305   re_dfa_t *dfa = (re_dfa_t *) extra;
1306 
1307   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1308     {
1309       int idx = node->token.opr.idx;
1310       node->token.opr.idx = dfa->subexp_map[idx];
1311       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1312     }
1313 
1314   else if (node->token.type == SUBEXP
1315 	   && node->left && node->left->token.type == SUBEXP)
1316     {
1317       Idx other_idx = node->left->token.opr.idx;
1318 
1319       node->left = node->left->left;
1320       if (node->left)
1321 	node->left->parent = node;
1322 
1323       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1324       if (other_idx < BITSET_WORD_BITS)
1325 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1326     }
1327 
1328   return REG_NOERROR;
1329 }
1330 
1331 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1332    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1333 static reg_errcode_t
1334 lower_subexps (void *extra, bin_tree_t *node)
1335 {
1336   regex_t *preg = (regex_t *) extra;
1337   reg_errcode_t err = REG_NOERROR;
1338 
1339   if (node->left && node->left->token.type == SUBEXP)
1340     {
1341       node->left = lower_subexp (&err, preg, node->left);
1342       if (node->left)
1343 	node->left->parent = node;
1344     }
1345   if (node->right && node->right->token.type == SUBEXP)
1346     {
1347       node->right = lower_subexp (&err, preg, node->right);
1348       if (node->right)
1349 	node->right->parent = node;
1350     }
1351 
1352   return err;
1353 }
1354 
1355 static bin_tree_t *
1356 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1357 {
1358   re_dfa_t *dfa = preg->buffer;
1359   bin_tree_t *body = node->left;
1360   bin_tree_t *op, *cls, *tree1, *tree;
1361 
1362   if (preg->no_sub
1363       /* We do not optimize empty subexpressions, because otherwise we may
1364 	 have bad CONCAT nodes with NULL children.  This is obviously not
1365 	 very common, so we do not lose much.  An example that triggers
1366 	 this case is the sed "script" /\(\)/x.  */
1367       && node->left != NULL
1368       && (node->token.opr.idx >= BITSET_WORD_BITS
1369 	  || !(dfa->used_bkref_map
1370 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1371     return node->left;
1372 
1373   /* Convert the SUBEXP node to the concatenation of an
1374      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1375   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1376   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1377   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1378   tree = create_tree (dfa, op, tree1, CONCAT);
1379   if (__glibc_unlikely (tree == NULL || tree1 == NULL
1380 			|| op == NULL || cls == NULL))
1381     {
1382       *err = REG_ESPACE;
1383       return NULL;
1384     }
1385 
1386   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1387   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1388   return tree;
1389 }
1390 
1391 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1392    nodes.  Requires a postorder visit.  */
1393 static reg_errcode_t
1394 calc_first (void *extra, bin_tree_t *node)
1395 {
1396   re_dfa_t *dfa = (re_dfa_t *) extra;
1397   if (node->token.type == CONCAT)
1398     {
1399       node->first = node->left->first;
1400       node->node_idx = node->left->node_idx;
1401     }
1402   else
1403     {
1404       node->first = node;
1405       node->node_idx = re_dfa_add_node (dfa, node->token);
1406       if (__glibc_unlikely (node->node_idx == -1))
1407 	return REG_ESPACE;
1408       if (node->token.type == ANCHOR)
1409 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1410     }
1411   return REG_NOERROR;
1412 }
1413 
1414 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1415 static reg_errcode_t
1416 calc_next (void *extra, bin_tree_t *node)
1417 {
1418   switch (node->token.type)
1419     {
1420     case OP_DUP_ASTERISK:
1421       node->left->next = node;
1422       break;
1423     case CONCAT:
1424       node->left->next = node->right->first;
1425       node->right->next = node->next;
1426       break;
1427     default:
1428       if (node->left)
1429 	node->left->next = node->next;
1430       if (node->right)
1431 	node->right->next = node->next;
1432       break;
1433     }
1434   return REG_NOERROR;
1435 }
1436 
1437 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1438 static reg_errcode_t
1439 link_nfa_nodes (void *extra, bin_tree_t *node)
1440 {
1441   re_dfa_t *dfa = (re_dfa_t *) extra;
1442   Idx idx = node->node_idx;
1443   reg_errcode_t err = REG_NOERROR;
1444 
1445   switch (node->token.type)
1446     {
1447     case CONCAT:
1448       break;
1449 
1450     case END_OF_RE:
1451       assert (node->next == NULL);
1452       break;
1453 
1454     case OP_DUP_ASTERISK:
1455     case OP_ALT:
1456       {
1457 	Idx left, right;
1458 	dfa->has_plural_match = 1;
1459 	if (node->left != NULL)
1460 	  left = node->left->first->node_idx;
1461 	else
1462 	  left = node->next->node_idx;
1463 	if (node->right != NULL)
1464 	  right = node->right->first->node_idx;
1465 	else
1466 	  right = node->next->node_idx;
1467 	assert (left > -1);
1468 	assert (right > -1);
1469 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1470       }
1471       break;
1472 
1473     case ANCHOR:
1474     case OP_OPEN_SUBEXP:
1475     case OP_CLOSE_SUBEXP:
1476       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1477       break;
1478 
1479     case OP_BACK_REF:
1480       dfa->nexts[idx] = node->next->node_idx;
1481       if (node->token.type == OP_BACK_REF)
1482 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1483       break;
1484 
1485     default:
1486       assert (!IS_EPSILON_NODE (node->token.type));
1487       dfa->nexts[idx] = node->next->node_idx;
1488       break;
1489     }
1490 
1491   return err;
1492 }
1493 
1494 /* Duplicate the epsilon closure of the node ROOT_NODE.
1495    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1496    to their own constraint.  */
1497 
1498 static reg_errcode_t
1499 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1500 			Idx root_node, unsigned int init_constraint)
1501 {
1502   Idx org_node, clone_node;
1503   bool ok;
1504   unsigned int constraint = init_constraint;
1505   for (org_node = top_org_node, clone_node = top_clone_node;;)
1506     {
1507       Idx org_dest, clone_dest;
1508       if (dfa->nodes[org_node].type == OP_BACK_REF)
1509 	{
1510 	  /* If the back reference epsilon-transit, its destination must
1511 	     also have the constraint.  Then duplicate the epsilon closure
1512 	     of the destination of the back reference, and store it in
1513 	     edests of the back reference.  */
1514 	  org_dest = dfa->nexts[org_node];
1515 	  re_node_set_empty (dfa->edests + clone_node);
1516 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1517 	  if (__glibc_unlikely (clone_dest == -1))
1518 	    return REG_ESPACE;
1519 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1520 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1521 	  if (__glibc_unlikely (! ok))
1522 	    return REG_ESPACE;
1523 	}
1524       else if (dfa->edests[org_node].nelem == 0)
1525 	{
1526 	  /* In case of the node can't epsilon-transit, don't duplicate the
1527 	     destination and store the original destination as the
1528 	     destination of the node.  */
1529 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1530 	  break;
1531 	}
1532       else if (dfa->edests[org_node].nelem == 1)
1533 	{
1534 	  /* In case of the node can epsilon-transit, and it has only one
1535 	     destination.  */
1536 	  org_dest = dfa->edests[org_node].elems[0];
1537 	  re_node_set_empty (dfa->edests + clone_node);
1538 	  /* If the node is root_node itself, it means the epsilon closure
1539 	     has a loop.  Then tie it to the destination of the root_node.  */
1540 	  if (org_node == root_node && clone_node != org_node)
1541 	    {
1542 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1543 	      if (__glibc_unlikely (! ok))
1544 	        return REG_ESPACE;
1545 	      break;
1546 	    }
1547 	  /* In case the node has another constraint, append it.  */
1548 	  constraint |= dfa->nodes[org_node].constraint;
1549 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1550 	  if (__glibc_unlikely (clone_dest == -1))
1551 	    return REG_ESPACE;
1552 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1553 	  if (__glibc_unlikely (! ok))
1554 	    return REG_ESPACE;
1555 	}
1556       else /* dfa->edests[org_node].nelem == 2 */
1557 	{
1558 	  /* In case of the node can epsilon-transit, and it has two
1559 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1560 	  org_dest = dfa->edests[org_node].elems[0];
1561 	  re_node_set_empty (dfa->edests + clone_node);
1562 	  /* Search for a duplicated node which satisfies the constraint.  */
1563 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1564 	  if (clone_dest == -1)
1565 	    {
1566 	      /* There is no such duplicated node, create a new one.  */
1567 	      reg_errcode_t err;
1568 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1569 	      if (__glibc_unlikely (clone_dest == -1))
1570 		return REG_ESPACE;
1571 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572 	      if (__glibc_unlikely (! ok))
1573 		return REG_ESPACE;
1574 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1575 					    root_node, constraint);
1576 	      if (__glibc_unlikely (err != REG_NOERROR))
1577 		return err;
1578 	    }
1579 	  else
1580 	    {
1581 	      /* There is a duplicated node which satisfies the constraint,
1582 		 use it to avoid infinite loop.  */
1583 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1584 	      if (__glibc_unlikely (! ok))
1585 		return REG_ESPACE;
1586 	    }
1587 
1588 	  org_dest = dfa->edests[org_node].elems[1];
1589 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1590 	  if (__glibc_unlikely (clone_dest == -1))
1591 	    return REG_ESPACE;
1592 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1593 	  if (__glibc_unlikely (! ok))
1594 	    return REG_ESPACE;
1595 	}
1596       org_node = org_dest;
1597       clone_node = clone_dest;
1598     }
1599   return REG_NOERROR;
1600 }
1601 
1602 /* Search for a node which is duplicated from the node ORG_NODE, and
1603    satisfies the constraint CONSTRAINT.  */
1604 
1605 static Idx
1606 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1607 			unsigned int constraint)
1608 {
1609   Idx idx;
1610   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1611     {
1612       if (org_node == dfa->org_indices[idx]
1613 	  && constraint == dfa->nodes[idx].constraint)
1614 	return idx; /* Found.  */
1615     }
1616   return -1; /* Not found.  */
1617 }
1618 
1619 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1620    Return the index of the new node, or -1 if insufficient storage is
1621    available.  */
1622 
1623 static Idx
1624 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1625 {
1626   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1627   if (__glibc_likely (dup_idx != -1))
1628     {
1629       dfa->nodes[dup_idx].constraint = constraint;
1630       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1631       dfa->nodes[dup_idx].duplicated = 1;
1632 
1633       /* Store the index of the original node.  */
1634       dfa->org_indices[dup_idx] = org_idx;
1635     }
1636   return dup_idx;
1637 }
1638 
1639 static reg_errcode_t
1640 calc_inveclosure (re_dfa_t *dfa)
1641 {
1642   Idx src, idx;
1643   bool ok;
1644   for (idx = 0; idx < dfa->nodes_len; ++idx)
1645     re_node_set_init_empty (dfa->inveclosures + idx);
1646 
1647   for (src = 0; src < dfa->nodes_len; ++src)
1648     {
1649       Idx *elems = dfa->eclosures[src].elems;
1650       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1651 	{
1652 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1653 	  if (__glibc_unlikely (! ok))
1654 	    return REG_ESPACE;
1655 	}
1656     }
1657 
1658   return REG_NOERROR;
1659 }
1660 
1661 /* Calculate "eclosure" for all the node in DFA.  */
1662 
1663 static reg_errcode_t
1664 calc_eclosure (re_dfa_t *dfa)
1665 {
1666   Idx node_idx;
1667   bool incomplete;
1668 #ifdef DEBUG
1669   assert (dfa->nodes_len > 0);
1670 #endif
1671   incomplete = false;
1672   /* For each nodes, calculate epsilon closure.  */
1673   for (node_idx = 0; ; ++node_idx)
1674     {
1675       reg_errcode_t err;
1676       re_node_set eclosure_elem;
1677       if (node_idx == dfa->nodes_len)
1678 	{
1679 	  if (!incomplete)
1680 	    break;
1681 	  incomplete = false;
1682 	  node_idx = 0;
1683 	}
1684 
1685 #ifdef DEBUG
1686       assert (dfa->eclosures[node_idx].nelem != -1);
1687 #endif
1688 
1689       /* If we have already calculated, skip it.  */
1690       if (dfa->eclosures[node_idx].nelem != 0)
1691 	continue;
1692       /* Calculate epsilon closure of 'node_idx'.  */
1693       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1694       if (__glibc_unlikely (err != REG_NOERROR))
1695 	return err;
1696 
1697       if (dfa->eclosures[node_idx].nelem == 0)
1698 	{
1699 	  incomplete = true;
1700 	  re_node_set_free (&eclosure_elem);
1701 	}
1702     }
1703   return REG_NOERROR;
1704 }
1705 
1706 /* Calculate epsilon closure of NODE.  */
1707 
1708 static reg_errcode_t
1709 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1710 {
1711   reg_errcode_t err;
1712   Idx i;
1713   re_node_set eclosure;
1714   bool ok;
1715   bool incomplete = false;
1716   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1717   if (__glibc_unlikely (err != REG_NOERROR))
1718     return err;
1719 
1720   /* This indicates that we are calculating this node now.
1721      We reference this value to avoid infinite loop.  */
1722   dfa->eclosures[node].nelem = -1;
1723 
1724   /* If the current node has constraints, duplicate all nodes
1725      since they must inherit the constraints.  */
1726   if (dfa->nodes[node].constraint
1727       && dfa->edests[node].nelem
1728       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1729     {
1730       err = duplicate_node_closure (dfa, node, node, node,
1731 				    dfa->nodes[node].constraint);
1732       if (__glibc_unlikely (err != REG_NOERROR))
1733 	return err;
1734     }
1735 
1736   /* Expand each epsilon destination nodes.  */
1737   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1738     for (i = 0; i < dfa->edests[node].nelem; ++i)
1739       {
1740 	re_node_set eclosure_elem;
1741 	Idx edest = dfa->edests[node].elems[i];
1742 	/* If calculating the epsilon closure of 'edest' is in progress,
1743 	   return intermediate result.  */
1744 	if (dfa->eclosures[edest].nelem == -1)
1745 	  {
1746 	    incomplete = true;
1747 	    continue;
1748 	  }
1749 	/* If we haven't calculated the epsilon closure of 'edest' yet,
1750 	   calculate now. Otherwise use calculated epsilon closure.  */
1751 	if (dfa->eclosures[edest].nelem == 0)
1752 	  {
1753 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1754 	    if (__glibc_unlikely (err != REG_NOERROR))
1755 	      return err;
1756 	  }
1757 	else
1758 	  eclosure_elem = dfa->eclosures[edest];
1759 	/* Merge the epsilon closure of 'edest'.  */
1760 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1761 	if (__glibc_unlikely (err != REG_NOERROR))
1762 	  return err;
1763 	/* If the epsilon closure of 'edest' is incomplete,
1764 	   the epsilon closure of this node is also incomplete.  */
1765 	if (dfa->eclosures[edest].nelem == 0)
1766 	  {
1767 	    incomplete = true;
1768 	    re_node_set_free (&eclosure_elem);
1769 	  }
1770       }
1771 
1772   /* An epsilon closure includes itself.  */
1773   ok = re_node_set_insert (&eclosure, node);
1774   if (__glibc_unlikely (! ok))
1775     return REG_ESPACE;
1776   if (incomplete && !root)
1777     dfa->eclosures[node].nelem = 0;
1778   else
1779     dfa->eclosures[node] = eclosure;
1780   *new_set = eclosure;
1781   return REG_NOERROR;
1782 }
1783 
1784 /* Functions for token which are used in the parser.  */
1785 
1786 /* Fetch a token from INPUT.
1787    We must not use this function inside bracket expressions.  */
1788 
1789 static void
1790 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1791 {
1792   re_string_skip_bytes (input, peek_token (result, input, syntax));
1793 }
1794 
1795 /* Peek a token from INPUT, and return the length of the token.
1796    We must not use this function inside bracket expressions.  */
1797 
1798 static int
1799 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1800 {
1801   unsigned char c;
1802 
1803   if (re_string_eoi (input))
1804     {
1805       token->type = END_OF_RE;
1806       return 0;
1807     }
1808 
1809   c = re_string_peek_byte (input, 0);
1810   token->opr.c = c;
1811 
1812   token->word_char = 0;
1813 #ifdef RE_ENABLE_I18N
1814   token->mb_partial = 0;
1815   if (input->mb_cur_max > 1 &&
1816       !re_string_first_byte (input, re_string_cur_idx (input)))
1817     {
1818       token->type = CHARACTER;
1819       token->mb_partial = 1;
1820       return 1;
1821     }
1822 #endif
1823   if (c == '\\')
1824     {
1825       unsigned char c2;
1826       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1827 	{
1828 	  token->type = BACK_SLASH;
1829 	  return 1;
1830 	}
1831 
1832       c2 = re_string_peek_byte_case (input, 1);
1833       token->opr.c = c2;
1834       token->type = CHARACTER;
1835 #ifdef RE_ENABLE_I18N
1836       if (input->mb_cur_max > 1)
1837 	{
1838 	  wint_t wc = re_string_wchar_at (input,
1839 					  re_string_cur_idx (input) + 1);
1840 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1841 	}
1842       else
1843 #endif
1844 	token->word_char = IS_WORD_CHAR (c2) != 0;
1845 
1846       switch (c2)
1847 	{
1848 	case '|':
1849 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1850 	    token->type = OP_ALT;
1851 	  break;
1852 	case '1': case '2': case '3': case '4': case '5':
1853 	case '6': case '7': case '8': case '9':
1854 	  if (!(syntax & RE_NO_BK_REFS))
1855 	    {
1856 	      token->type = OP_BACK_REF;
1857 	      token->opr.idx = c2 - '1';
1858 	    }
1859 	  break;
1860 	case '<':
1861 	  if (!(syntax & RE_NO_GNU_OPS))
1862 	    {
1863 	      token->type = ANCHOR;
1864 	      token->opr.ctx_type = WORD_FIRST;
1865 	    }
1866 	  break;
1867 	case '>':
1868 	  if (!(syntax & RE_NO_GNU_OPS))
1869 	    {
1870 	      token->type = ANCHOR;
1871 	      token->opr.ctx_type = WORD_LAST;
1872 	    }
1873 	  break;
1874 	case 'b':
1875 	  if (!(syntax & RE_NO_GNU_OPS))
1876 	    {
1877 	      token->type = ANCHOR;
1878 	      token->opr.ctx_type = WORD_DELIM;
1879 	    }
1880 	  break;
1881 	case 'B':
1882 	  if (!(syntax & RE_NO_GNU_OPS))
1883 	    {
1884 	      token->type = ANCHOR;
1885 	      token->opr.ctx_type = NOT_WORD_DELIM;
1886 	    }
1887 	  break;
1888 	case 'w':
1889 	  if (!(syntax & RE_NO_GNU_OPS))
1890 	    token->type = OP_WORD;
1891 	  break;
1892 	case 'W':
1893 	  if (!(syntax & RE_NO_GNU_OPS))
1894 	    token->type = OP_NOTWORD;
1895 	  break;
1896 	case 's':
1897 	  if (!(syntax & RE_NO_GNU_OPS))
1898 	    token->type = OP_SPACE;
1899 	  break;
1900 	case 'S':
1901 	  if (!(syntax & RE_NO_GNU_OPS))
1902 	    token->type = OP_NOTSPACE;
1903 	  break;
1904 	case '`':
1905 	  if (!(syntax & RE_NO_GNU_OPS))
1906 	    {
1907 	      token->type = ANCHOR;
1908 	      token->opr.ctx_type = BUF_FIRST;
1909 	    }
1910 	  break;
1911 	case '\'':
1912 	  if (!(syntax & RE_NO_GNU_OPS))
1913 	    {
1914 	      token->type = ANCHOR;
1915 	      token->opr.ctx_type = BUF_LAST;
1916 	    }
1917 	  break;
1918 	case '(':
1919 	  if (!(syntax & RE_NO_BK_PARENS))
1920 	    token->type = OP_OPEN_SUBEXP;
1921 	  break;
1922 	case ')':
1923 	  if (!(syntax & RE_NO_BK_PARENS))
1924 	    token->type = OP_CLOSE_SUBEXP;
1925 	  break;
1926 	case '+':
1927 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1928 	    token->type = OP_DUP_PLUS;
1929 	  break;
1930 	case '?':
1931 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1932 	    token->type = OP_DUP_QUESTION;
1933 	  break;
1934 	case '{':
1935 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1936 	    token->type = OP_OPEN_DUP_NUM;
1937 	  break;
1938 	case '}':
1939 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1940 	    token->type = OP_CLOSE_DUP_NUM;
1941 	  break;
1942 	default:
1943 	  break;
1944 	}
1945       return 2;
1946     }
1947 
1948   token->type = CHARACTER;
1949 #ifdef RE_ENABLE_I18N
1950   if (input->mb_cur_max > 1)
1951     {
1952       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1953       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1954     }
1955   else
1956 #endif
1957     token->word_char = IS_WORD_CHAR (token->opr.c);
1958 
1959   switch (c)
1960     {
1961     case '\n':
1962       if (syntax & RE_NEWLINE_ALT)
1963 	token->type = OP_ALT;
1964       break;
1965     case '|':
1966       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1967 	token->type = OP_ALT;
1968       break;
1969     case '*':
1970       token->type = OP_DUP_ASTERISK;
1971       break;
1972     case '+':
1973       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1974 	token->type = OP_DUP_PLUS;
1975       break;
1976     case '?':
1977       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1978 	token->type = OP_DUP_QUESTION;
1979       break;
1980     case '{':
1981       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1982 	token->type = OP_OPEN_DUP_NUM;
1983       break;
1984     case '}':
1985       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1986 	token->type = OP_CLOSE_DUP_NUM;
1987       break;
1988     case '(':
1989       if (syntax & RE_NO_BK_PARENS)
1990 	token->type = OP_OPEN_SUBEXP;
1991       break;
1992     case ')':
1993       if (syntax & RE_NO_BK_PARENS)
1994 	token->type = OP_CLOSE_SUBEXP;
1995       break;
1996     case '[':
1997       token->type = OP_OPEN_BRACKET;
1998       break;
1999     case '.':
2000       token->type = OP_PERIOD;
2001       break;
2002     case '^':
2003       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2004 	  re_string_cur_idx (input) != 0)
2005 	{
2006 	  char prev = re_string_peek_byte (input, -1);
2007 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2008 	    break;
2009 	}
2010       token->type = ANCHOR;
2011       token->opr.ctx_type = LINE_FIRST;
2012       break;
2013     case '$':
2014       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2015 	  re_string_cur_idx (input) + 1 != re_string_length (input))
2016 	{
2017 	  re_token_t next;
2018 	  re_string_skip_bytes (input, 1);
2019 	  peek_token (&next, input, syntax);
2020 	  re_string_skip_bytes (input, -1);
2021 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2022 	    break;
2023 	}
2024       token->type = ANCHOR;
2025       token->opr.ctx_type = LINE_LAST;
2026       break;
2027     default:
2028       break;
2029     }
2030   return 1;
2031 }
2032 
2033 /* Peek a token from INPUT, and return the length of the token.
2034    We must not use this function out of bracket expressions.  */
2035 
2036 static int
2037 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2038 {
2039   unsigned char c;
2040   if (re_string_eoi (input))
2041     {
2042       token->type = END_OF_RE;
2043       return 0;
2044     }
2045   c = re_string_peek_byte (input, 0);
2046   token->opr.c = c;
2047 
2048 #ifdef RE_ENABLE_I18N
2049   if (input->mb_cur_max > 1 &&
2050       !re_string_first_byte (input, re_string_cur_idx (input)))
2051     {
2052       token->type = CHARACTER;
2053       return 1;
2054     }
2055 #endif /* RE_ENABLE_I18N */
2056 
2057   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2058       && re_string_cur_idx (input) + 1 < re_string_length (input))
2059     {
2060       /* In this case, '\' escape a character.  */
2061       unsigned char c2;
2062       re_string_skip_bytes (input, 1);
2063       c2 = re_string_peek_byte (input, 0);
2064       token->opr.c = c2;
2065       token->type = CHARACTER;
2066       return 1;
2067     }
2068   if (c == '[') /* '[' is a special char in a bracket exps.  */
2069     {
2070       unsigned char c2;
2071       int token_len;
2072       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2073 	c2 = re_string_peek_byte (input, 1);
2074       else
2075 	c2 = 0;
2076       token->opr.c = c2;
2077       token_len = 2;
2078       switch (c2)
2079 	{
2080 	case '.':
2081 	  token->type = OP_OPEN_COLL_ELEM;
2082 	  break;
2083 
2084 	case '=':
2085 	  token->type = OP_OPEN_EQUIV_CLASS;
2086 	  break;
2087 
2088 	case ':':
2089 	  if (syntax & RE_CHAR_CLASSES)
2090 	    {
2091 	      token->type = OP_OPEN_CHAR_CLASS;
2092 	      break;
2093 	    }
2094 	  FALLTHROUGH;
2095 	default:
2096 	  token->type = CHARACTER;
2097 	  token->opr.c = c;
2098 	  token_len = 1;
2099 	  break;
2100 	}
2101       return token_len;
2102     }
2103   switch (c)
2104     {
2105     case '-':
2106       token->type = OP_CHARSET_RANGE;
2107       break;
2108     case ']':
2109       token->type = OP_CLOSE_BRACKET;
2110       break;
2111     case '^':
2112       token->type = OP_NON_MATCH_LIST;
2113       break;
2114     default:
2115       token->type = CHARACTER;
2116     }
2117   return 1;
2118 }
2119 
2120 /* Functions for parser.  */
2121 
2122 /* Entry point of the parser.
2123    Parse the regular expression REGEXP and return the structure tree.
2124    If an error occurs, ERR is set by error code, and return NULL.
2125    This function build the following tree, from regular expression <reg_exp>:
2126 	   CAT
2127 	   / \
2128 	  /   \
2129    <reg_exp>  EOR
2130 
2131    CAT means concatenation.
2132    EOR means end of regular expression.  */
2133 
2134 static bin_tree_t *
2135 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2136        reg_errcode_t *err)
2137 {
2138   re_dfa_t *dfa = preg->buffer;
2139   bin_tree_t *tree, *eor, *root;
2140   re_token_t current_token;
2141   dfa->syntax = syntax;
2142   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2143   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2144   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2145     return NULL;
2146   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2147   if (tree != NULL)
2148     root = create_tree (dfa, tree, eor, CONCAT);
2149   else
2150     root = eor;
2151   if (__glibc_unlikely (eor == NULL || root == NULL))
2152     {
2153       *err = REG_ESPACE;
2154       return NULL;
2155     }
2156   return root;
2157 }
2158 
2159 /* This function build the following tree, from regular expression
2160    <branch1>|<branch2>:
2161 	   ALT
2162 	   / \
2163 	  /   \
2164    <branch1> <branch2>
2165 
2166    ALT means alternative, which represents the operator '|'.  */
2167 
2168 static bin_tree_t *
2169 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2170 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2171 {
2172   re_dfa_t *dfa = preg->buffer;
2173   bin_tree_t *tree, *branch = NULL;
2174   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2175   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2176   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2177     return NULL;
2178 
2179   while (token->type == OP_ALT)
2180     {
2181       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2182       if (token->type != OP_ALT && token->type != END_OF_RE
2183 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2184 	{
2185 	  bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2186 	  dfa->completed_bkref_map = initial_bkref_map;
2187 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2188 	  if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2189 	    {
2190 	      if (tree != NULL)
2191 		postorder (tree, free_tree, NULL);
2192 	      return NULL;
2193 	    }
2194 	  dfa->completed_bkref_map |= accumulated_bkref_map;
2195 	}
2196       else
2197 	branch = NULL;
2198       tree = create_tree (dfa, tree, branch, OP_ALT);
2199       if (__glibc_unlikely (tree == NULL))
2200 	{
2201 	  *err = REG_ESPACE;
2202 	  return NULL;
2203 	}
2204     }
2205   return tree;
2206 }
2207 
2208 /* This function build the following tree, from regular expression
2209    <exp1><exp2>:
2210 	CAT
2211 	/ \
2212        /   \
2213    <exp1> <exp2>
2214 
2215    CAT means concatenation.  */
2216 
2217 static bin_tree_t *
2218 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2219 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2220 {
2221   bin_tree_t *tree, *expr;
2222   re_dfa_t *dfa = preg->buffer;
2223   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2224   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2225     return NULL;
2226 
2227   while (token->type != OP_ALT && token->type != END_OF_RE
2228 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2229     {
2230       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2231       if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2232 	{
2233 	  if (tree != NULL)
2234 	    postorder (tree, free_tree, NULL);
2235 	  return NULL;
2236 	}
2237       if (tree != NULL && expr != NULL)
2238 	{
2239 	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2240 	  if (newtree == NULL)
2241 	    {
2242 	      postorder (expr, free_tree, NULL);
2243 	      postorder (tree, free_tree, NULL);
2244 	      *err = REG_ESPACE;
2245 	      return NULL;
2246 	    }
2247 	  tree = newtree;
2248 	}
2249       else if (tree == NULL)
2250 	tree = expr;
2251       /* Otherwise expr == NULL, we don't need to create new tree.  */
2252     }
2253   return tree;
2254 }
2255 
2256 /* This function build the following tree, from regular expression a*:
2257 	 *
2258 	 |
2259 	 a
2260 */
2261 
2262 static bin_tree_t *
2263 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2264 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2265 {
2266   re_dfa_t *dfa = preg->buffer;
2267   bin_tree_t *tree;
2268   switch (token->type)
2269     {
2270     case CHARACTER:
2271       tree = create_token_tree (dfa, NULL, NULL, token);
2272       if (__glibc_unlikely (tree == NULL))
2273 	{
2274 	  *err = REG_ESPACE;
2275 	  return NULL;
2276 	}
2277 #ifdef RE_ENABLE_I18N
2278       if (dfa->mb_cur_max > 1)
2279 	{
2280 	  while (!re_string_eoi (regexp)
2281 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2282 	    {
2283 	      bin_tree_t *mbc_remain;
2284 	      fetch_token (token, regexp, syntax);
2285 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2286 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2287 	      if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2288 		{
2289 		  *err = REG_ESPACE;
2290 		  return NULL;
2291 		}
2292 	    }
2293 	}
2294 #endif
2295       break;
2296 
2297     case OP_OPEN_SUBEXP:
2298       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2299       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2300 	return NULL;
2301       break;
2302 
2303     case OP_OPEN_BRACKET:
2304       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2305       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2306 	return NULL;
2307       break;
2308 
2309     case OP_BACK_REF:
2310       if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2311 	{
2312 	  *err = REG_ESUBREG;
2313 	  return NULL;
2314 	}
2315       dfa->used_bkref_map |= 1 << token->opr.idx;
2316       tree = create_token_tree (dfa, NULL, NULL, token);
2317       if (__glibc_unlikely (tree == NULL))
2318 	{
2319 	  *err = REG_ESPACE;
2320 	  return NULL;
2321 	}
2322       ++dfa->nbackref;
2323       dfa->has_mb_node = 1;
2324       break;
2325 
2326     case OP_OPEN_DUP_NUM:
2327       if (syntax & RE_CONTEXT_INVALID_DUP)
2328 	{
2329 	  *err = REG_BADRPT;
2330 	  return NULL;
2331 	}
2332       FALLTHROUGH;
2333     case OP_DUP_ASTERISK:
2334     case OP_DUP_PLUS:
2335     case OP_DUP_QUESTION:
2336       if (syntax & RE_CONTEXT_INVALID_OPS)
2337 	{
2338 	  *err = REG_BADRPT;
2339 	  return NULL;
2340 	}
2341       else if (syntax & RE_CONTEXT_INDEP_OPS)
2342 	{
2343 	  fetch_token (token, regexp, syntax);
2344 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2345 	}
2346       FALLTHROUGH;
2347     case OP_CLOSE_SUBEXP:
2348       if ((token->type == OP_CLOSE_SUBEXP) &&
2349 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2350 	{
2351 	  *err = REG_ERPAREN;
2352 	  return NULL;
2353 	}
2354       FALLTHROUGH;
2355     case OP_CLOSE_DUP_NUM:
2356       /* We treat it as a normal character.  */
2357 
2358       /* Then we can these characters as normal characters.  */
2359       token->type = CHARACTER;
2360       /* mb_partial and word_char bits should be initialized already
2361 	 by peek_token.  */
2362       tree = create_token_tree (dfa, NULL, NULL, token);
2363       if (__glibc_unlikely (tree == NULL))
2364 	{
2365 	  *err = REG_ESPACE;
2366 	  return NULL;
2367 	}
2368       break;
2369 
2370     case ANCHOR:
2371       if ((token->opr.ctx_type
2372 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2373 	  && dfa->word_ops_used == 0)
2374 	init_word_char (dfa);
2375       if (token->opr.ctx_type == WORD_DELIM
2376 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2377 	{
2378 	  bin_tree_t *tree_first, *tree_last;
2379 	  if (token->opr.ctx_type == WORD_DELIM)
2380 	    {
2381 	      token->opr.ctx_type = WORD_FIRST;
2382 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2383 	      token->opr.ctx_type = WORD_LAST;
2384 	    }
2385 	  else
2386 	    {
2387 	      token->opr.ctx_type = INSIDE_WORD;
2388 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2389 	      token->opr.ctx_type = INSIDE_NOTWORD;
2390 	    }
2391 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2392 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2393 	  if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2394 				|| tree == NULL))
2395 	    {
2396 	      *err = REG_ESPACE;
2397 	      return NULL;
2398 	    }
2399 	}
2400       else
2401 	{
2402 	  tree = create_token_tree (dfa, NULL, NULL, token);
2403 	  if (__glibc_unlikely (tree == NULL))
2404 	    {
2405 	      *err = REG_ESPACE;
2406 	      return NULL;
2407 	    }
2408 	}
2409       /* We must return here, since ANCHORs can't be followed
2410 	 by repetition operators.
2411 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2412 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2413       fetch_token (token, regexp, syntax);
2414       return tree;
2415 
2416     case OP_PERIOD:
2417       tree = create_token_tree (dfa, NULL, NULL, token);
2418       if (__glibc_unlikely (tree == NULL))
2419 	{
2420 	  *err = REG_ESPACE;
2421 	  return NULL;
2422 	}
2423       if (dfa->mb_cur_max > 1)
2424 	dfa->has_mb_node = 1;
2425       break;
2426 
2427     case OP_WORD:
2428     case OP_NOTWORD:
2429       tree = build_charclass_op (dfa, regexp->trans,
2430 				 "alnum",
2431 				 "_",
2432 				 token->type == OP_NOTWORD, err);
2433       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2434 	return NULL;
2435       break;
2436 
2437     case OP_SPACE:
2438     case OP_NOTSPACE:
2439       tree = build_charclass_op (dfa, regexp->trans,
2440 				 "space",
2441 				 "",
2442 				 token->type == OP_NOTSPACE, err);
2443       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2444 	return NULL;
2445       break;
2446 
2447     case OP_ALT:
2448     case END_OF_RE:
2449       return NULL;
2450 
2451     case BACK_SLASH:
2452       *err = REG_EESCAPE;
2453       return NULL;
2454 
2455     default:
2456       /* Must not happen?  */
2457 #ifdef DEBUG
2458       assert (0);
2459 #endif
2460       return NULL;
2461     }
2462   fetch_token (token, regexp, syntax);
2463 
2464   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2465 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2466     {
2467       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2468 					   syntax, err);
2469       if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2470 	{
2471 	  if (tree != NULL)
2472 	    postorder (tree, free_tree, NULL);
2473 	  return NULL;
2474 	}
2475       tree = dup_tree;
2476       /* In BRE consecutive duplications are not allowed.  */
2477       if ((syntax & RE_CONTEXT_INVALID_DUP)
2478 	  && (token->type == OP_DUP_ASTERISK
2479 	      || token->type == OP_OPEN_DUP_NUM))
2480 	{
2481 	  if (tree != NULL)
2482 	    postorder (tree, free_tree, NULL);
2483 	  *err = REG_BADRPT;
2484 	  return NULL;
2485 	}
2486     }
2487 
2488   return tree;
2489 }
2490 
2491 /* This function build the following tree, from regular expression
2492    (<reg_exp>):
2493 	 SUBEXP
2494 	    |
2495 	<reg_exp>
2496 */
2497 
2498 static bin_tree_t *
2499 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2500 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2501 {
2502   re_dfa_t *dfa = preg->buffer;
2503   bin_tree_t *tree;
2504   size_t cur_nsub;
2505   cur_nsub = preg->re_nsub++;
2506 
2507   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2508 
2509   /* The subexpression may be a null string.  */
2510   if (token->type == OP_CLOSE_SUBEXP)
2511     tree = NULL;
2512   else
2513     {
2514       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2515       if (__glibc_unlikely (*err == REG_NOERROR
2516 			    && token->type != OP_CLOSE_SUBEXP))
2517 	{
2518 	  if (tree != NULL)
2519 	    postorder (tree, free_tree, NULL);
2520 	  *err = REG_EPAREN;
2521 	}
2522       if (__glibc_unlikely (*err != REG_NOERROR))
2523 	return NULL;
2524     }
2525 
2526   if (cur_nsub <= '9' - '1')
2527     dfa->completed_bkref_map |= 1 << cur_nsub;
2528 
2529   tree = create_tree (dfa, tree, NULL, SUBEXP);
2530   if (__glibc_unlikely (tree == NULL))
2531     {
2532       *err = REG_ESPACE;
2533       return NULL;
2534     }
2535   tree->token.opr.idx = cur_nsub;
2536   return tree;
2537 }
2538 
2539 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2540 
2541 static bin_tree_t *
2542 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2543 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2544 {
2545   bin_tree_t *tree = NULL, *old_tree = NULL;
2546   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2547   re_token_t start_token = *token;
2548 
2549   if (token->type == OP_OPEN_DUP_NUM)
2550     {
2551       end = 0;
2552       start = fetch_number (regexp, token, syntax);
2553       if (start == -1)
2554 	{
2555 	  if (token->type == CHARACTER && token->opr.c == ',')
2556 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2557 	  else
2558 	    {
2559 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2560 	      return NULL;
2561 	    }
2562 	}
2563       if (__glibc_likely (start != -2))
2564 	{
2565 	  /* We treat "{n}" as "{n,n}".  */
2566 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2567 		 : ((token->type == CHARACTER && token->opr.c == ',')
2568 		    ? fetch_number (regexp, token, syntax) : -2));
2569 	}
2570       if (__glibc_unlikely (start == -2 || end == -2))
2571 	{
2572 	  /* Invalid sequence.  */
2573 	  if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2574 	    {
2575 	      if (token->type == END_OF_RE)
2576 		*err = REG_EBRACE;
2577 	      else
2578 		*err = REG_BADBR;
2579 
2580 	      return NULL;
2581 	    }
2582 
2583 	  /* If the syntax bit is set, rollback.  */
2584 	  re_string_set_index (regexp, start_idx);
2585 	  *token = start_token;
2586 	  token->type = CHARACTER;
2587 	  /* mb_partial and word_char bits should be already initialized by
2588 	     peek_token.  */
2589 	  return elem;
2590 	}
2591 
2592       if (__glibc_unlikely ((end != -1 && start > end)
2593 			    || token->type != OP_CLOSE_DUP_NUM))
2594 	{
2595 	  /* First number greater than second.  */
2596 	  *err = REG_BADBR;
2597 	  return NULL;
2598 	}
2599 
2600       if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2601 	{
2602 	  *err = REG_ESIZE;
2603 	  return NULL;
2604 	}
2605     }
2606   else
2607     {
2608       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2609       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2610     }
2611 
2612   fetch_token (token, regexp, syntax);
2613 
2614   if (__glibc_unlikely (elem == NULL))
2615     return NULL;
2616   if (__glibc_unlikely (start == 0 && end == 0))
2617     {
2618       postorder (elem, free_tree, NULL);
2619       return NULL;
2620     }
2621 
2622   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2623   if (__glibc_unlikely (start > 0))
2624     {
2625       tree = elem;
2626       for (i = 2; i <= start; ++i)
2627 	{
2628 	  elem = duplicate_tree (elem, dfa);
2629 	  tree = create_tree (dfa, tree, elem, CONCAT);
2630 	  if (__glibc_unlikely (elem == NULL || tree == NULL))
2631 	    goto parse_dup_op_espace;
2632 	}
2633 
2634       if (start == end)
2635 	return tree;
2636 
2637       /* Duplicate ELEM before it is marked optional.  */
2638       elem = duplicate_tree (elem, dfa);
2639       if (__glibc_unlikely (elem == NULL))
2640         goto parse_dup_op_espace;
2641       old_tree = tree;
2642     }
2643   else
2644     old_tree = NULL;
2645 
2646   if (elem->token.type == SUBEXP)
2647     {
2648       uintptr_t subidx = elem->token.opr.idx;
2649       postorder (elem, mark_opt_subexp, (void *) subidx);
2650     }
2651 
2652   tree = create_tree (dfa, elem, NULL,
2653 		      (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2654   if (__glibc_unlikely (tree == NULL))
2655     goto parse_dup_op_espace;
2656 
2657   /* This loop is actually executed only when end != -1,
2658      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2659      already created the start+1-th copy.  */
2660   if (TYPE_SIGNED (Idx) || end != -1)
2661     for (i = start + 2; i <= end; ++i)
2662       {
2663 	elem = duplicate_tree (elem, dfa);
2664 	tree = create_tree (dfa, tree, elem, CONCAT);
2665 	if (__glibc_unlikely (elem == NULL || tree == NULL))
2666 	  goto parse_dup_op_espace;
2667 
2668 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2669 	if (__glibc_unlikely (tree == NULL))
2670 	  goto parse_dup_op_espace;
2671       }
2672 
2673   if (old_tree)
2674     tree = create_tree (dfa, old_tree, tree, CONCAT);
2675 
2676   return tree;
2677 
2678  parse_dup_op_espace:
2679   *err = REG_ESPACE;
2680   return NULL;
2681 }
2682 
2683 /* Size of the names for collating symbol/equivalence_class/character_class.
2684    I'm not sure, but maybe enough.  */
2685 #define BRACKET_NAME_BUF_SIZE 32
2686 
2687 #ifndef _LIBC
2688 
2689 # ifdef RE_ENABLE_I18N
2690 /* Convert the byte B to the corresponding wide character.  In a
2691    unibyte locale, treat B as itself.  In a multibyte locale, return
2692    WEOF if B is an encoding error.  */
2693 static wint_t
2694 parse_byte (unsigned char b, re_charset_t *mbcset)
2695 {
2696   return mbcset == NULL ? b : __btowc (b);
2697 }
2698 # endif
2699 
2700   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2701      Build the range expression which starts from START_ELEM, and ends
2702      at END_ELEM.  The result are written to MBCSET and SBCSET.
2703      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2704      mbcset->range_ends, is a pointer argument since we may
2705      update it.  */
2706 
2707 static reg_errcode_t
2708 # ifdef RE_ENABLE_I18N
2709 build_range_exp (const reg_syntax_t syntax,
2710                  bitset_t sbcset,
2711                  re_charset_t *mbcset,
2712                  Idx *range_alloc,
2713                  const bracket_elem_t *start_elem,
2714                  const bracket_elem_t *end_elem)
2715 # else /* not RE_ENABLE_I18N */
2716 build_range_exp (const reg_syntax_t syntax,
2717                  bitset_t sbcset,
2718                  const bracket_elem_t *start_elem,
2719                  const bracket_elem_t *end_elem)
2720 # endif /* not RE_ENABLE_I18N */
2721 {
2722   unsigned int start_ch, end_ch;
2723   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2724   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2725 			|| start_elem->type == CHAR_CLASS
2726 			|| end_elem->type == EQUIV_CLASS
2727 			|| end_elem->type == CHAR_CLASS))
2728     return REG_ERANGE;
2729 
2730   /* We can handle no multi character collating elements without libc
2731      support.  */
2732   if (__glibc_unlikely ((start_elem->type == COLL_SYM
2733 			 && strlen ((char *) start_elem->opr.name) > 1)
2734 			|| (end_elem->type == COLL_SYM
2735 			    && strlen ((char *) end_elem->opr.name) > 1)))
2736     return REG_ECOLLATE;
2737 
2738 # ifdef RE_ENABLE_I18N
2739   {
2740     wchar_t wc;
2741     wint_t start_wc;
2742     wint_t end_wc;
2743 
2744     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2745 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2746 		   : 0));
2747     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2748 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2749 		 : 0));
2750     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2751 		? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2752     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2753 	      ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2754     if (start_wc == WEOF || end_wc == WEOF)
2755       return REG_ECOLLATE;
2756     else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2757 			       && start_wc > end_wc))
2758       return REG_ERANGE;
2759 
2760     /* Got valid collation sequence values, add them as a new entry.
2761        However, for !_LIBC we have no collation elements: if the
2762        character set is single byte, the single byte character set
2763        that we build below suffices.  parse_bracket_exp passes
2764        no MBCSET if dfa->mb_cur_max == 1.  */
2765     if (mbcset)
2766       {
2767 	/* Check the space of the arrays.  */
2768 	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2769 	  {
2770 	    /* There is not enough space, need realloc.  */
2771 	    wchar_t *new_array_start, *new_array_end;
2772 	    Idx new_nranges;
2773 
2774 	    /* +1 in case of mbcset->nranges is 0.  */
2775 	    new_nranges = 2 * mbcset->nranges + 1;
2776 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2777 	       are NULL if *range_alloc == 0.  */
2778 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2779 					  new_nranges);
2780 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2781 					new_nranges);
2782 
2783 	    if (__glibc_unlikely (new_array_start == NULL
2784 				  || new_array_end == NULL))
2785 	      {
2786 		re_free (new_array_start);
2787 		re_free (new_array_end);
2788 		return REG_ESPACE;
2789 	      }
2790 
2791 	    mbcset->range_starts = new_array_start;
2792 	    mbcset->range_ends = new_array_end;
2793 	    *range_alloc = new_nranges;
2794 	  }
2795 
2796 	mbcset->range_starts[mbcset->nranges] = start_wc;
2797 	mbcset->range_ends[mbcset->nranges++] = end_wc;
2798       }
2799 
2800     /* Build the table for single byte characters.  */
2801     for (wc = 0; wc < SBC_MAX; ++wc)
2802       {
2803 	if (start_wc <= wc && wc <= end_wc)
2804 	  bitset_set (sbcset, wc);
2805       }
2806   }
2807 # else /* not RE_ENABLE_I18N */
2808   {
2809     unsigned int ch;
2810     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2811 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2812 		   : 0));
2813     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2814 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2815 		 : 0));
2816     if (start_ch > end_ch)
2817       return REG_ERANGE;
2818     /* Build the table for single byte characters.  */
2819     for (ch = 0; ch < SBC_MAX; ++ch)
2820       if (start_ch <= ch  && ch <= end_ch)
2821 	bitset_set (sbcset, ch);
2822   }
2823 # endif /* not RE_ENABLE_I18N */
2824   return REG_NOERROR;
2825 }
2826 #endif /* not _LIBC */
2827 
2828 #ifndef _LIBC
2829 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2830    Build the collating element which is represented by NAME.
2831    The result are written to MBCSET and SBCSET.
2832    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2833    pointer argument since we may update it.  */
2834 
2835 static reg_errcode_t
2836 # ifdef RE_ENABLE_I18N
2837 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2838 			Idx *coll_sym_alloc, const unsigned char *name)
2839 # else /* not RE_ENABLE_I18N */
2840 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2841 # endif /* not RE_ENABLE_I18N */
2842 {
2843   size_t name_len = strlen ((const char *) name);
2844   if (__glibc_unlikely (name_len != 1))
2845     return REG_ECOLLATE;
2846   else
2847     {
2848       bitset_set (sbcset, name[0]);
2849       return REG_NOERROR;
2850     }
2851 }
2852 #endif /* not _LIBC */
2853 
2854 /* This function parse bracket expression like "[abc]", "[a-c]",
2855    "[[.a-a.]]" etc.  */
2856 
2857 static bin_tree_t *
2858 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2859 		   reg_syntax_t syntax, reg_errcode_t *err)
2860 {
2861 #ifdef _LIBC
2862   const unsigned char *collseqmb;
2863   const char *collseqwc;
2864   uint32_t nrules;
2865   int32_t table_size;
2866   const int32_t *symb_table;
2867   const unsigned char *extra;
2868 
2869   /* Local function for parse_bracket_exp used in _LIBC environment.
2870      Seek the collating symbol entry corresponding to NAME.
2871      Return the index of the symbol in the SYMB_TABLE,
2872      or -1 if not found.  */
2873 
2874   auto inline int32_t
2875   __attribute__ ((always_inline))
2876   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2877     {
2878       int32_t elem;
2879 
2880       for (elem = 0; elem < table_size; elem++)
2881 	if (symb_table[2 * elem] != 0)
2882 	  {
2883 	    int32_t idx = symb_table[2 * elem + 1];
2884 	    /* Skip the name of collating element name.  */
2885 	    idx += 1 + extra[idx];
2886 	    if (/* Compare the length of the name.  */
2887 		name_len == extra[idx]
2888 		/* Compare the name.  */
2889 		&& memcmp (name, &extra[idx + 1], name_len) == 0)
2890 	      /* Yep, this is the entry.  */
2891 	      return elem;
2892 	  }
2893       return -1;
2894     }
2895 
2896   /* Local function for parse_bracket_exp used in _LIBC environment.
2897      Look up the collation sequence value of BR_ELEM.
2898      Return the value if succeeded, UINT_MAX otherwise.  */
2899 
2900   auto inline unsigned int
2901   __attribute__ ((always_inline))
2902   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2903     {
2904       if (br_elem->type == SB_CHAR)
2905 	{
2906 	  /*
2907 	  if (MB_CUR_MAX == 1)
2908 	  */
2909 	  if (nrules == 0)
2910 	    return collseqmb[br_elem->opr.ch];
2911 	  else
2912 	    {
2913 	      wint_t wc = __btowc (br_elem->opr.ch);
2914 	      return __collseq_table_lookup (collseqwc, wc);
2915 	    }
2916 	}
2917       else if (br_elem->type == MB_CHAR)
2918 	{
2919 	  if (nrules != 0)
2920 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2921 	}
2922       else if (br_elem->type == COLL_SYM)
2923 	{
2924 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2925 	  if (nrules != 0)
2926 	    {
2927 	      int32_t elem, idx;
2928 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2929 						  sym_name_len);
2930 	      if (elem != -1)
2931 		{
2932 		  /* We found the entry.  */
2933 		  idx = symb_table[2 * elem + 1];
2934 		  /* Skip the name of collating element name.  */
2935 		  idx += 1 + extra[idx];
2936 		  /* Skip the byte sequence of the collating element.  */
2937 		  idx += 1 + extra[idx];
2938 		  /* Adjust for the alignment.  */
2939 		  idx = (idx + 3) & ~3;
2940 		  /* Skip the multibyte collation sequence value.  */
2941 		  idx += sizeof (unsigned int);
2942 		  /* Skip the wide char sequence of the collating element.  */
2943 		  idx += sizeof (unsigned int) *
2944 		    (1 + *(unsigned int *) (extra + idx));
2945 		  /* Return the collation sequence value.  */
2946 		  return *(unsigned int *) (extra + idx);
2947 		}
2948 	      else if (sym_name_len == 1)
2949 		{
2950 		  /* No valid character.  Match it as a single byte
2951 		     character.  */
2952 		  return collseqmb[br_elem->opr.name[0]];
2953 		}
2954 	    }
2955 	  else if (sym_name_len == 1)
2956 	    return collseqmb[br_elem->opr.name[0]];
2957 	}
2958       return UINT_MAX;
2959     }
2960 
2961   /* Local function for parse_bracket_exp used in _LIBC environment.
2962      Build the range expression which starts from START_ELEM, and ends
2963      at END_ELEM.  The result are written to MBCSET and SBCSET.
2964      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2965      mbcset->range_ends, is a pointer argument since we may
2966      update it.  */
2967 
2968   auto inline reg_errcode_t
2969   __attribute__ ((always_inline))
2970   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2971 		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2972     {
2973       unsigned int ch;
2974       uint32_t start_collseq;
2975       uint32_t end_collseq;
2976 
2977       /* Equivalence Classes and Character Classes can't be a range
2978 	 start/end.  */
2979       if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2980 			    || start_elem->type == CHAR_CLASS
2981 			    || end_elem->type == EQUIV_CLASS
2982 			    || end_elem->type == CHAR_CLASS))
2983 	return REG_ERANGE;
2984 
2985       /* FIXME: Implement rational ranges here, too.  */
2986       start_collseq = lookup_collation_sequence_value (start_elem);
2987       end_collseq = lookup_collation_sequence_value (end_elem);
2988       /* Check start/end collation sequence values.  */
2989       if (__glibc_unlikely (start_collseq == UINT_MAX
2990 			    || end_collseq == UINT_MAX))
2991 	return REG_ECOLLATE;
2992       if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2993 			    && start_collseq > end_collseq))
2994 	return REG_ERANGE;
2995 
2996       /* Got valid collation sequence values, add them as a new entry.
2997 	 However, if we have no collation elements, and the character set
2998 	 is single byte, the single byte character set that we
2999 	 build below suffices. */
3000       if (nrules > 0 || dfa->mb_cur_max > 1)
3001 	{
3002 	  /* Check the space of the arrays.  */
3003 	  if (__glibc_unlikely (*range_alloc == mbcset->nranges))
3004 	    {
3005 	      /* There is not enough space, need realloc.  */
3006 	      uint32_t *new_array_start;
3007 	      uint32_t *new_array_end;
3008 	      Idx new_nranges;
3009 
3010 	      /* +1 in case of mbcset->nranges is 0.  */
3011 	      new_nranges = 2 * mbcset->nranges + 1;
3012 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3013 					    new_nranges);
3014 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3015 					  new_nranges);
3016 
3017 	      if (__glibc_unlikely (new_array_start == NULL
3018 				    || new_array_end == NULL))
3019 		return REG_ESPACE;
3020 
3021 	      mbcset->range_starts = new_array_start;
3022 	      mbcset->range_ends = new_array_end;
3023 	      *range_alloc = new_nranges;
3024 	    }
3025 
3026 	  mbcset->range_starts[mbcset->nranges] = start_collseq;
3027 	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
3028 	}
3029 
3030       /* Build the table for single byte characters.  */
3031       for (ch = 0; ch < SBC_MAX; ch++)
3032 	{
3033 	  uint32_t ch_collseq;
3034 	  /*
3035 	  if (MB_CUR_MAX == 1)
3036 	  */
3037 	  if (nrules == 0)
3038 	    ch_collseq = collseqmb[ch];
3039 	  else
3040 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3041 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3042 	    bitset_set (sbcset, ch);
3043 	}
3044       return REG_NOERROR;
3045     }
3046 
3047   /* Local function for parse_bracket_exp used in _LIBC environment.
3048      Build the collating element which is represented by NAME.
3049      The result are written to MBCSET and SBCSET.
3050      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3051      pointer argument since we may update it.  */
3052 
3053   auto inline reg_errcode_t
3054   __attribute__ ((always_inline))
3055   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3056 			  Idx *coll_sym_alloc, const unsigned char *name)
3057     {
3058       int32_t elem, idx;
3059       size_t name_len = strlen ((const char *) name);
3060       if (nrules != 0)
3061 	{
3062 	  elem = seek_collating_symbol_entry (name, name_len);
3063 	  if (elem != -1)
3064 	    {
3065 	      /* We found the entry.  */
3066 	      idx = symb_table[2 * elem + 1];
3067 	      /* Skip the name of collating element name.  */
3068 	      idx += 1 + extra[idx];
3069 	    }
3070 	  else if (name_len == 1)
3071 	    {
3072 	      /* No valid character, treat it as a normal
3073 		 character.  */
3074 	      bitset_set (sbcset, name[0]);
3075 	      return REG_NOERROR;
3076 	    }
3077 	  else
3078 	    return REG_ECOLLATE;
3079 
3080 	  /* Got valid collation sequence, add it as a new entry.  */
3081 	  /* Check the space of the arrays.  */
3082 	  if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3083 	    {
3084 	      /* Not enough, realloc it.  */
3085 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3086 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3087 	      /* Use realloc since mbcset->coll_syms is NULL
3088 		 if *alloc == 0.  */
3089 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3090 						   new_coll_sym_alloc);
3091 	      if (__glibc_unlikely (new_coll_syms == NULL))
3092 		return REG_ESPACE;
3093 	      mbcset->coll_syms = new_coll_syms;
3094 	      *coll_sym_alloc = new_coll_sym_alloc;
3095 	    }
3096 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3097 	  return REG_NOERROR;
3098 	}
3099       else
3100 	{
3101 	  if (__glibc_unlikely (name_len != 1))
3102 	    return REG_ECOLLATE;
3103 	  else
3104 	    {
3105 	      bitset_set (sbcset, name[0]);
3106 	      return REG_NOERROR;
3107 	    }
3108 	}
3109     }
3110 #endif
3111 
3112   re_token_t br_token;
3113   re_bitset_ptr_t sbcset;
3114 #ifdef RE_ENABLE_I18N
3115   re_charset_t *mbcset;
3116   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3117   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3118 #endif /* not RE_ENABLE_I18N */
3119   bool non_match = false;
3120   bin_tree_t *work_tree;
3121   int token_len;
3122   bool first_round = true;
3123 #ifdef _LIBC
3124   collseqmb = (const unsigned char *)
3125     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3126   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3127   if (nrules)
3128     {
3129       /*
3130       if (MB_CUR_MAX > 1)
3131       */
3132       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3133       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3134       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3135 						  _NL_COLLATE_SYMB_TABLEMB);
3136       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3137 						   _NL_COLLATE_SYMB_EXTRAMB);
3138     }
3139 #endif
3140   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3141 #ifdef RE_ENABLE_I18N
3142   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3143 #endif /* RE_ENABLE_I18N */
3144 #ifdef RE_ENABLE_I18N
3145   if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3146 #else
3147   if (__glibc_unlikely (sbcset == NULL))
3148 #endif /* RE_ENABLE_I18N */
3149     {
3150       re_free (sbcset);
3151 #ifdef RE_ENABLE_I18N
3152       re_free (mbcset);
3153 #endif
3154       *err = REG_ESPACE;
3155       return NULL;
3156     }
3157 
3158   token_len = peek_token_bracket (token, regexp, syntax);
3159   if (__glibc_unlikely (token->type == END_OF_RE))
3160     {
3161       *err = REG_BADPAT;
3162       goto parse_bracket_exp_free_return;
3163     }
3164   if (token->type == OP_NON_MATCH_LIST)
3165     {
3166 #ifdef RE_ENABLE_I18N
3167       mbcset->non_match = 1;
3168 #endif /* not RE_ENABLE_I18N */
3169       non_match = true;
3170       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3171 	bitset_set (sbcset, '\n');
3172       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3173       token_len = peek_token_bracket (token, regexp, syntax);
3174       if (__glibc_unlikely (token->type == END_OF_RE))
3175 	{
3176 	  *err = REG_BADPAT;
3177 	  goto parse_bracket_exp_free_return;
3178 	}
3179     }
3180 
3181   /* We treat the first ']' as a normal character.  */
3182   if (token->type == OP_CLOSE_BRACKET)
3183     token->type = CHARACTER;
3184 
3185   while (1)
3186     {
3187       bracket_elem_t start_elem, end_elem;
3188       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3189       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3190       reg_errcode_t ret;
3191       int token_len2 = 0;
3192       bool is_range_exp = false;
3193       re_token_t token2;
3194 
3195       start_elem.opr.name = start_name_buf;
3196       start_elem.type = COLL_SYM;
3197       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3198 				   syntax, first_round);
3199       if (__glibc_unlikely (ret != REG_NOERROR))
3200 	{
3201 	  *err = ret;
3202 	  goto parse_bracket_exp_free_return;
3203 	}
3204       first_round = false;
3205 
3206       /* Get information about the next token.  We need it in any case.  */
3207       token_len = peek_token_bracket (token, regexp, syntax);
3208 
3209       /* Do not check for ranges if we know they are not allowed.  */
3210       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3211 	{
3212 	  if (__glibc_unlikely (token->type == END_OF_RE))
3213 	    {
3214 	      *err = REG_EBRACK;
3215 	      goto parse_bracket_exp_free_return;
3216 	    }
3217 	  if (token->type == OP_CHARSET_RANGE)
3218 	    {
3219 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3220 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3221 	      if (__glibc_unlikely (token2.type == END_OF_RE))
3222 		{
3223 		  *err = REG_EBRACK;
3224 		  goto parse_bracket_exp_free_return;
3225 		}
3226 	      if (token2.type == OP_CLOSE_BRACKET)
3227 		{
3228 		  /* We treat the last '-' as a normal character.  */
3229 		  re_string_skip_bytes (regexp, -token_len);
3230 		  token->type = CHARACTER;
3231 		}
3232 	      else
3233 		is_range_exp = true;
3234 	    }
3235 	}
3236 
3237       if (is_range_exp == true)
3238 	{
3239 	  end_elem.opr.name = end_name_buf;
3240 	  end_elem.type = COLL_SYM;
3241 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3242 				       dfa, syntax, true);
3243 	  if (__glibc_unlikely (ret != REG_NOERROR))
3244 	    {
3245 	      *err = ret;
3246 	      goto parse_bracket_exp_free_return;
3247 	    }
3248 
3249 	  token_len = peek_token_bracket (token, regexp, syntax);
3250 
3251 #ifdef _LIBC
3252 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3253 				  &start_elem, &end_elem);
3254 #else
3255 # ifdef RE_ENABLE_I18N
3256 	  *err = build_range_exp (syntax, sbcset,
3257 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3258 				  &range_alloc, &start_elem, &end_elem);
3259 # else
3260 	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3261 # endif
3262 #endif /* RE_ENABLE_I18N */
3263 	  if (__glibc_unlikely (*err != REG_NOERROR))
3264 	    goto parse_bracket_exp_free_return;
3265 	}
3266       else
3267 	{
3268 	  switch (start_elem.type)
3269 	    {
3270 	    case SB_CHAR:
3271 	      bitset_set (sbcset, start_elem.opr.ch);
3272 	      break;
3273 #ifdef RE_ENABLE_I18N
3274 	    case MB_CHAR:
3275 	      /* Check whether the array has enough space.  */
3276 	      if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3277 		{
3278 		  wchar_t *new_mbchars;
3279 		  /* Not enough, realloc it.  */
3280 		  /* +1 in case of mbcset->nmbchars is 0.  */
3281 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3282 		  /* Use realloc since array is NULL if *alloc == 0.  */
3283 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3284 					    mbchar_alloc);
3285 		  if (__glibc_unlikely (new_mbchars == NULL))
3286 		    goto parse_bracket_exp_espace;
3287 		  mbcset->mbchars = new_mbchars;
3288 		}
3289 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3290 	      break;
3291 #endif /* RE_ENABLE_I18N */
3292 	    case EQUIV_CLASS:
3293 	      *err = build_equiv_class (sbcset,
3294 #ifdef RE_ENABLE_I18N
3295 					mbcset, &equiv_class_alloc,
3296 #endif /* RE_ENABLE_I18N */
3297 					start_elem.opr.name);
3298 	      if (__glibc_unlikely (*err != REG_NOERROR))
3299 		goto parse_bracket_exp_free_return;
3300 	      break;
3301 	    case COLL_SYM:
3302 	      *err = build_collating_symbol (sbcset,
3303 #ifdef RE_ENABLE_I18N
3304 					     mbcset, &coll_sym_alloc,
3305 #endif /* RE_ENABLE_I18N */
3306 					     start_elem.opr.name);
3307 	      if (__glibc_unlikely (*err != REG_NOERROR))
3308 		goto parse_bracket_exp_free_return;
3309 	      break;
3310 	    case CHAR_CLASS:
3311 	      *err = build_charclass (regexp->trans, sbcset,
3312 #ifdef RE_ENABLE_I18N
3313 				      mbcset, &char_class_alloc,
3314 #endif /* RE_ENABLE_I18N */
3315 				      (const char *) start_elem.opr.name,
3316 				      syntax);
3317 	      if (__glibc_unlikely (*err != REG_NOERROR))
3318 	       goto parse_bracket_exp_free_return;
3319 	      break;
3320 	    default:
3321 	      assert (0);
3322 	      break;
3323 	    }
3324 	}
3325       if (__glibc_unlikely (token->type == END_OF_RE))
3326 	{
3327 	  *err = REG_EBRACK;
3328 	  goto parse_bracket_exp_free_return;
3329 	}
3330       if (token->type == OP_CLOSE_BRACKET)
3331 	break;
3332     }
3333 
3334   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3335 
3336   /* If it is non-matching list.  */
3337   if (non_match)
3338     bitset_not (sbcset);
3339 
3340 #ifdef RE_ENABLE_I18N
3341   /* Ensure only single byte characters are set.  */
3342   if (dfa->mb_cur_max > 1)
3343     bitset_mask (sbcset, dfa->sb_char);
3344 
3345   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3346       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3347 						     || mbcset->non_match)))
3348     {
3349       bin_tree_t *mbc_tree;
3350       int sbc_idx;
3351       /* Build a tree for complex bracket.  */
3352       dfa->has_mb_node = 1;
3353       br_token.type = COMPLEX_BRACKET;
3354       br_token.opr.mbcset = mbcset;
3355       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3356       if (__glibc_unlikely (mbc_tree == NULL))
3357 	goto parse_bracket_exp_espace;
3358       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3359 	if (sbcset[sbc_idx])
3360 	  break;
3361       /* If there are no bits set in sbcset, there is no point
3362 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3363       if (sbc_idx < BITSET_WORDS)
3364 	{
3365 	  /* Build a tree for simple bracket.  */
3366 	  br_token.type = SIMPLE_BRACKET;
3367 	  br_token.opr.sbcset = sbcset;
3368 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3369 	  if (__glibc_unlikely (work_tree == NULL))
3370 	    goto parse_bracket_exp_espace;
3371 
3372 	  /* Then join them by ALT node.  */
3373 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3374 	  if (__glibc_unlikely (work_tree == NULL))
3375 	    goto parse_bracket_exp_espace;
3376 	}
3377       else
3378 	{
3379 	  re_free (sbcset);
3380 	  work_tree = mbc_tree;
3381 	}
3382     }
3383   else
3384 #endif /* not RE_ENABLE_I18N */
3385     {
3386 #ifdef RE_ENABLE_I18N
3387       free_charset (mbcset);
3388 #endif
3389       /* Build a tree for simple bracket.  */
3390       br_token.type = SIMPLE_BRACKET;
3391       br_token.opr.sbcset = sbcset;
3392       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3393       if (__glibc_unlikely (work_tree == NULL))
3394 	goto parse_bracket_exp_espace;
3395     }
3396   return work_tree;
3397 
3398  parse_bracket_exp_espace:
3399   *err = REG_ESPACE;
3400  parse_bracket_exp_free_return:
3401   re_free (sbcset);
3402 #ifdef RE_ENABLE_I18N
3403   free_charset (mbcset);
3404 #endif /* RE_ENABLE_I18N */
3405   return NULL;
3406 }
3407 
3408 /* Parse an element in the bracket expression.  */
3409 
3410 static reg_errcode_t
3411 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3412 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3413 		       reg_syntax_t syntax, bool accept_hyphen)
3414 {
3415 #ifdef RE_ENABLE_I18N
3416   int cur_char_size;
3417   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3418   if (cur_char_size > 1)
3419     {
3420       elem->type = MB_CHAR;
3421       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3422       re_string_skip_bytes (regexp, cur_char_size);
3423       return REG_NOERROR;
3424     }
3425 #endif /* RE_ENABLE_I18N */
3426   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3427   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3428       || token->type == OP_OPEN_EQUIV_CLASS)
3429     return parse_bracket_symbol (elem, regexp, token);
3430   if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3431     {
3432       /* A '-' must only appear as anything but a range indicator before
3433 	 the closing bracket.  Everything else is an error.  */
3434       re_token_t token2;
3435       (void) peek_token_bracket (&token2, regexp, syntax);
3436       if (token2.type != OP_CLOSE_BRACKET)
3437 	/* The actual error value is not standardized since this whole
3438 	   case is undefined.  But ERANGE makes good sense.  */
3439 	return REG_ERANGE;
3440     }
3441   elem->type = SB_CHAR;
3442   elem->opr.ch = token->opr.c;
3443   return REG_NOERROR;
3444 }
3445 
3446 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3447    such as [:<character_class>:], [.<collating_element>.], and
3448    [=<equivalent_class>=].  */
3449 
3450 static reg_errcode_t
3451 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3452 		      re_token_t *token)
3453 {
3454   unsigned char ch, delim = token->opr.c;
3455   int i = 0;
3456   if (re_string_eoi(regexp))
3457     return REG_EBRACK;
3458   for (;; ++i)
3459     {
3460       if (i >= BRACKET_NAME_BUF_SIZE)
3461 	return REG_EBRACK;
3462       if (token->type == OP_OPEN_CHAR_CLASS)
3463 	ch = re_string_fetch_byte_case (regexp);
3464       else
3465 	ch = re_string_fetch_byte (regexp);
3466       if (re_string_eoi(regexp))
3467 	return REG_EBRACK;
3468       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3469 	break;
3470       elem->opr.name[i] = ch;
3471     }
3472   re_string_skip_bytes (regexp, 1);
3473   elem->opr.name[i] = '\0';
3474   switch (token->type)
3475     {
3476     case OP_OPEN_COLL_ELEM:
3477       elem->type = COLL_SYM;
3478       break;
3479     case OP_OPEN_EQUIV_CLASS:
3480       elem->type = EQUIV_CLASS;
3481       break;
3482     case OP_OPEN_CHAR_CLASS:
3483       elem->type = CHAR_CLASS;
3484       break;
3485     default:
3486       break;
3487     }
3488   return REG_NOERROR;
3489 }
3490 
3491   /* Helper function for parse_bracket_exp.
3492      Build the equivalence class which is represented by NAME.
3493      The result are written to MBCSET and SBCSET.
3494      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3495      is a pointer argument since we may update it.  */
3496 
3497 static reg_errcode_t
3498 #ifdef RE_ENABLE_I18N
3499 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3500 		   Idx *equiv_class_alloc, const unsigned char *name)
3501 #else /* not RE_ENABLE_I18N */
3502 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3503 #endif /* not RE_ENABLE_I18N */
3504 {
3505 #ifdef _LIBC
3506   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3507   if (nrules != 0)
3508     {
3509       const int32_t *table, *indirect;
3510       const unsigned char *weights, *extra, *cp;
3511       unsigned char char_buf[2];
3512       int32_t idx1, idx2;
3513       unsigned int ch;
3514       size_t len;
3515       /* Calculate the index for equivalence class.  */
3516       cp = name;
3517       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3518       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3519 					       _NL_COLLATE_WEIGHTMB);
3520       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3521 						   _NL_COLLATE_EXTRAMB);
3522       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3523 						_NL_COLLATE_INDIRECTMB);
3524       idx1 = findidx (table, indirect, extra, &cp, -1);
3525       if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3526 	/* This isn't a valid character.  */
3527 	return REG_ECOLLATE;
3528 
3529       /* Build single byte matching table for this equivalence class.  */
3530       len = weights[idx1 & 0xffffff];
3531       for (ch = 0; ch < SBC_MAX; ++ch)
3532 	{
3533 	  char_buf[0] = ch;
3534 	  cp = char_buf;
3535 	  idx2 = findidx (table, indirect, extra, &cp, 1);
3536 /*
3537 	  idx2 = table[ch];
3538 */
3539 	  if (idx2 == 0)
3540 	    /* This isn't a valid character.  */
3541 	    continue;
3542 	  /* Compare only if the length matches and the collation rule
3543 	     index is the same.  */
3544 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3545 	      && memcmp (weights + (idx1 & 0xffffff) + 1,
3546 			 weights + (idx2 & 0xffffff) + 1, len) == 0)
3547 	    bitset_set (sbcset, ch);
3548 	}
3549       /* Check whether the array has enough space.  */
3550       if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3551 	{
3552 	  /* Not enough, realloc it.  */
3553 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3554 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3555 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3556 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3557 						   int32_t,
3558 						   new_equiv_class_alloc);
3559 	  if (__glibc_unlikely (new_equiv_classes == NULL))
3560 	    return REG_ESPACE;
3561 	  mbcset->equiv_classes = new_equiv_classes;
3562 	  *equiv_class_alloc = new_equiv_class_alloc;
3563 	}
3564       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3565     }
3566   else
3567 #endif /* _LIBC */
3568     {
3569       if (__glibc_unlikely (strlen ((const char *) name) != 1))
3570 	return REG_ECOLLATE;
3571       bitset_set (sbcset, *name);
3572     }
3573   return REG_NOERROR;
3574 }
3575 
3576   /* Helper function for parse_bracket_exp.
3577      Build the character class which is represented by NAME.
3578      The result are written to MBCSET and SBCSET.
3579      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3580      is a pointer argument since we may update it.  */
3581 
3582 static reg_errcode_t
3583 #ifdef RE_ENABLE_I18N
3584 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3585 		 re_charset_t *mbcset, Idx *char_class_alloc,
3586 		 const char *class_name, reg_syntax_t syntax)
3587 #else /* not RE_ENABLE_I18N */
3588 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3589 		 const char *class_name, reg_syntax_t syntax)
3590 #endif /* not RE_ENABLE_I18N */
3591 {
3592   int i;
3593   const char *name = class_name;
3594 
3595   /* In case of REG_ICASE "upper" and "lower" match the both of
3596      upper and lower cases.  */
3597   if ((syntax & RE_ICASE)
3598       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3599     name = "alpha";
3600 
3601 #ifdef RE_ENABLE_I18N
3602   /* Check the space of the arrays.  */
3603   if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3604     {
3605       /* Not enough, realloc it.  */
3606       /* +1 in case of mbcset->nchar_classes is 0.  */
3607       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3608       /* Use realloc since array is NULL if *alloc == 0.  */
3609       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3610 					       new_char_class_alloc);
3611       if (__glibc_unlikely (new_char_classes == NULL))
3612 	return REG_ESPACE;
3613       mbcset->char_classes = new_char_classes;
3614       *char_class_alloc = new_char_class_alloc;
3615     }
3616   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3617 #endif /* RE_ENABLE_I18N */
3618 
3619 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3620   do {						\
3621     if (__glibc_unlikely (trans != NULL))			\
3622       {						\
3623 	for (i = 0; i < SBC_MAX; ++i)		\
3624 	  if (ctype_func (i))			\
3625 	    bitset_set (sbcset, trans[i]);	\
3626       }						\
3627     else					\
3628       {						\
3629 	for (i = 0; i < SBC_MAX; ++i)		\
3630 	  if (ctype_func (i))			\
3631 	    bitset_set (sbcset, i);		\
3632       }						\
3633   } while (0)
3634 
3635   if (strcmp (name, "alnum") == 0)
3636     BUILD_CHARCLASS_LOOP (isalnum);
3637   else if (strcmp (name, "cntrl") == 0)
3638     BUILD_CHARCLASS_LOOP (iscntrl);
3639   else if (strcmp (name, "lower") == 0)
3640     BUILD_CHARCLASS_LOOP (islower);
3641   else if (strcmp (name, "space") == 0)
3642     BUILD_CHARCLASS_LOOP (isspace);
3643   else if (strcmp (name, "alpha") == 0)
3644     BUILD_CHARCLASS_LOOP (isalpha);
3645   else if (strcmp (name, "digit") == 0)
3646     BUILD_CHARCLASS_LOOP (isdigit);
3647   else if (strcmp (name, "print") == 0)
3648     BUILD_CHARCLASS_LOOP (isprint);
3649   else if (strcmp (name, "upper") == 0)
3650     BUILD_CHARCLASS_LOOP (isupper);
3651   else if (strcmp (name, "blank") == 0)
3652     BUILD_CHARCLASS_LOOP (isblank);
3653   else if (strcmp (name, "graph") == 0)
3654     BUILD_CHARCLASS_LOOP (isgraph);
3655   else if (strcmp (name, "punct") == 0)
3656     BUILD_CHARCLASS_LOOP (ispunct);
3657   else if (strcmp (name, "xdigit") == 0)
3658     BUILD_CHARCLASS_LOOP (isxdigit);
3659   else
3660     return REG_ECTYPE;
3661 
3662   return REG_NOERROR;
3663 }
3664 
3665 static bin_tree_t *
3666 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3667 		    const char *class_name,
3668 		    const char *extra, bool non_match,
3669 		    reg_errcode_t *err)
3670 {
3671   re_bitset_ptr_t sbcset;
3672 #ifdef RE_ENABLE_I18N
3673   re_charset_t *mbcset;
3674   Idx alloc = 0;
3675 #endif /* not RE_ENABLE_I18N */
3676   reg_errcode_t ret;
3677   re_token_t br_token;
3678   bin_tree_t *tree;
3679 
3680   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3681   if (__glibc_unlikely (sbcset == NULL))
3682     {
3683       *err = REG_ESPACE;
3684       return NULL;
3685     }
3686 #ifdef RE_ENABLE_I18N
3687   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3688   if (__glibc_unlikely (mbcset == NULL))
3689     {
3690       re_free (sbcset);
3691       *err = REG_ESPACE;
3692       return NULL;
3693     }
3694   mbcset->non_match = non_match;
3695 #endif /* RE_ENABLE_I18N */
3696 
3697   /* We don't care the syntax in this case.  */
3698   ret = build_charclass (trans, sbcset,
3699 #ifdef RE_ENABLE_I18N
3700 			 mbcset, &alloc,
3701 #endif /* RE_ENABLE_I18N */
3702 			 class_name, 0);
3703 
3704   if (__glibc_unlikely (ret != REG_NOERROR))
3705     {
3706       re_free (sbcset);
3707 #ifdef RE_ENABLE_I18N
3708       free_charset (mbcset);
3709 #endif /* RE_ENABLE_I18N */
3710       *err = ret;
3711       return NULL;
3712     }
3713   /* \w match '_' also.  */
3714   for (; *extra; extra++)
3715     bitset_set (sbcset, *extra);
3716 
3717   /* If it is non-matching list.  */
3718   if (non_match)
3719     bitset_not (sbcset);
3720 
3721 #ifdef RE_ENABLE_I18N
3722   /* Ensure only single byte characters are set.  */
3723   if (dfa->mb_cur_max > 1)
3724     bitset_mask (sbcset, dfa->sb_char);
3725 #endif
3726 
3727   /* Build a tree for simple bracket.  */
3728 #if defined GCC_LINT || defined lint
3729   memset (&br_token, 0, sizeof br_token);
3730 #endif
3731   br_token.type = SIMPLE_BRACKET;
3732   br_token.opr.sbcset = sbcset;
3733   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3734   if (__glibc_unlikely (tree == NULL))
3735     goto build_word_op_espace;
3736 
3737 #ifdef RE_ENABLE_I18N
3738   if (dfa->mb_cur_max > 1)
3739     {
3740       bin_tree_t *mbc_tree;
3741       /* Build a tree for complex bracket.  */
3742       br_token.type = COMPLEX_BRACKET;
3743       br_token.opr.mbcset = mbcset;
3744       dfa->has_mb_node = 1;
3745       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3746       if (__glibc_unlikely (mbc_tree == NULL))
3747 	goto build_word_op_espace;
3748       /* Then join them by ALT node.  */
3749       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3750       if (__glibc_likely (mbc_tree != NULL))
3751 	return tree;
3752     }
3753   else
3754     {
3755       free_charset (mbcset);
3756       return tree;
3757     }
3758 #else /* not RE_ENABLE_I18N */
3759   return tree;
3760 #endif /* not RE_ENABLE_I18N */
3761 
3762  build_word_op_espace:
3763   re_free (sbcset);
3764 #ifdef RE_ENABLE_I18N
3765   free_charset (mbcset);
3766 #endif /* RE_ENABLE_I18N */
3767   *err = REG_ESPACE;
3768   return NULL;
3769 }
3770 
3771 /* This is intended for the expressions like "a{1,3}".
3772    Fetch a number from 'input', and return the number.
3773    Return -1 if the number field is empty like "{,1}".
3774    Return RE_DUP_MAX + 1 if the number field is too large.
3775    Return -2 if an error occurred.  */
3776 
3777 static Idx
3778 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3779 {
3780   Idx num = -1;
3781   unsigned char c;
3782   while (1)
3783     {
3784       fetch_token (token, input, syntax);
3785       c = token->opr.c;
3786       if (__glibc_unlikely (token->type == END_OF_RE))
3787 	return -2;
3788       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3789 	break;
3790       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3791 	     ? -2
3792 	     : num == -1
3793 	     ? c - '0'
3794 	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3795     }
3796   return num;
3797 }
3798 
3799 #ifdef RE_ENABLE_I18N
3800 static void
3801 free_charset (re_charset_t *cset)
3802 {
3803   re_free (cset->mbchars);
3804 # ifdef _LIBC
3805   re_free (cset->coll_syms);
3806   re_free (cset->equiv_classes);
3807 # endif
3808   re_free (cset->range_starts);
3809   re_free (cset->range_ends);
3810   re_free (cset->char_classes);
3811   re_free (cset);
3812 }
3813 #endif /* RE_ENABLE_I18N */
3814 
3815 /* Functions for binary tree operation.  */
3816 
3817 /* Create a tree node.  */
3818 
3819 static bin_tree_t *
3820 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3821 	     re_token_type_t type)
3822 {
3823   re_token_t t;
3824 #if defined GCC_LINT || defined lint
3825   memset (&t, 0, sizeof t);
3826 #endif
3827   t.type = type;
3828   return create_token_tree (dfa, left, right, &t);
3829 }
3830 
3831 static bin_tree_t *
3832 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3833 		   const re_token_t *token)
3834 {
3835   bin_tree_t *tree;
3836   if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3837     {
3838       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3839 
3840       if (storage == NULL)
3841 	return NULL;
3842       storage->next = dfa->str_tree_storage;
3843       dfa->str_tree_storage = storage;
3844       dfa->str_tree_storage_idx = 0;
3845     }
3846   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3847 
3848   tree->parent = NULL;
3849   tree->left = left;
3850   tree->right = right;
3851   tree->token = *token;
3852   tree->token.duplicated = 0;
3853   tree->token.opt_subexp = 0;
3854   tree->first = NULL;
3855   tree->next = NULL;
3856   tree->node_idx = -1;
3857 
3858   if (left != NULL)
3859     left->parent = tree;
3860   if (right != NULL)
3861     right->parent = tree;
3862   return tree;
3863 }
3864 
3865 /* Mark the tree SRC as an optional subexpression.
3866    To be called from preorder or postorder.  */
3867 
3868 static reg_errcode_t
3869 mark_opt_subexp (void *extra, bin_tree_t *node)
3870 {
3871   Idx idx = (uintptr_t) extra;
3872   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3873     node->token.opt_subexp = 1;
3874 
3875   return REG_NOERROR;
3876 }
3877 
3878 /* Free the allocated memory inside NODE. */
3879 
3880 static void
3881 free_token (re_token_t *node)
3882 {
3883 #ifdef RE_ENABLE_I18N
3884   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3885     free_charset (node->opr.mbcset);
3886   else
3887 #endif /* RE_ENABLE_I18N */
3888     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3889       re_free (node->opr.sbcset);
3890 }
3891 
3892 /* Worker function for tree walking.  Free the allocated memory inside NODE
3893    and its children. */
3894 
3895 static reg_errcode_t
3896 free_tree (void *extra, bin_tree_t *node)
3897 {
3898   free_token (&node->token);
3899   return REG_NOERROR;
3900 }
3901 
3902 
3903 /* Duplicate the node SRC, and return new node.  This is a preorder
3904    visit similar to the one implemented by the generic visitor, but
3905    we need more infrastructure to maintain two parallel trees --- so,
3906    it's easier to duplicate.  */
3907 
3908 static bin_tree_t *
3909 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3910 {
3911   const bin_tree_t *node;
3912   bin_tree_t *dup_root;
3913   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3914 
3915   for (node = root; ; )
3916     {
3917       /* Create a new tree and link it back to the current parent.  */
3918       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3919       if (*p_new == NULL)
3920 	return NULL;
3921       (*p_new)->parent = dup_node;
3922       (*p_new)->token.duplicated = 1;
3923       dup_node = *p_new;
3924 
3925       /* Go to the left node, or up and to the right.  */
3926       if (node->left)
3927 	{
3928 	  node = node->left;
3929 	  p_new = &dup_node->left;
3930 	}
3931       else
3932 	{
3933 	  const bin_tree_t *prev = NULL;
3934 	  while (node->right == prev || node->right == NULL)
3935 	    {
3936 	      prev = node;
3937 	      node = node->parent;
3938 	      dup_node = dup_node->parent;
3939 	      if (!node)
3940 		return dup_root;
3941 	    }
3942 	  node = node->right;
3943 	  p_new = &dup_node->right;
3944 	}
3945     }
3946 }
3947