xref: /dragonfly/contrib/grep/lib/regcomp.c (revision 33311965)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2012 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License along
17    with this program; if not, see <http://www.gnu.org/licenses/>.  */
18 
19 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
20 					  size_t length, reg_syntax_t syntax);
21 static void re_compile_fastmap_iter (regex_t *bufp,
22 				     const re_dfastate_t *init_state,
23 				     char *fastmap);
24 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
25 #ifdef RE_ENABLE_I18N
26 static void free_charset (re_charset_t *cset);
27 #endif /* RE_ENABLE_I18N */
28 static void free_workarea_compile (regex_t *preg);
29 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
30 #ifdef RE_ENABLE_I18N
31 static void optimize_utf8 (re_dfa_t *dfa);
32 #endif
33 static reg_errcode_t analyze (regex_t *preg);
34 static reg_errcode_t preorder (bin_tree_t *root,
35 			       reg_errcode_t (fn (void *, bin_tree_t *)),
36 			       void *extra);
37 static reg_errcode_t postorder (bin_tree_t *root,
38 				reg_errcode_t (fn (void *, bin_tree_t *)),
39 				void *extra);
40 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
41 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
42 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
43 				 bin_tree_t *node);
44 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
45 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
46 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
47 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
48 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
49 				   unsigned int constraint);
50 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
51 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
52 					 Idx node, bool root);
53 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
54 static Idx fetch_number (re_string_t *input, re_token_t *token,
55 			 reg_syntax_t syntax);
56 static int peek_token (re_token_t *token, re_string_t *input,
57 			reg_syntax_t syntax) internal_function;
58 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
59 			  reg_syntax_t syntax, reg_errcode_t *err);
60 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
61 				  re_token_t *token, reg_syntax_t syntax,
62 				  Idx nest, reg_errcode_t *err);
63 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
64 				 re_token_t *token, reg_syntax_t syntax,
65 				 Idx nest, reg_errcode_t *err);
66 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
67 				     re_token_t *token, reg_syntax_t syntax,
68 				     Idx nest, reg_errcode_t *err);
69 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
70 				  re_token_t *token, reg_syntax_t syntax,
71 				  Idx nest, reg_errcode_t *err);
72 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
73 				 re_dfa_t *dfa, re_token_t *token,
74 				 reg_syntax_t syntax, reg_errcode_t *err);
75 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
76 				      re_token_t *token, reg_syntax_t syntax,
77 				      reg_errcode_t *err);
78 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
79 					    re_string_t *regexp,
80 					    re_token_t *token, int token_len,
81 					    re_dfa_t *dfa,
82 					    reg_syntax_t syntax,
83 					    bool accept_hyphen);
84 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
85 					  re_string_t *regexp,
86 					  re_token_t *token);
87 #ifdef RE_ENABLE_I18N
88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
89 					re_charset_t *mbcset,
90 					Idx *equiv_class_alloc,
91 					const unsigned char *name);
92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
93 				      bitset_t sbcset,
94 				      re_charset_t *mbcset,
95 				      Idx *char_class_alloc,
96 				      const unsigned char *class_name,
97 				      reg_syntax_t syntax);
98 #else  /* not RE_ENABLE_I18N */
99 static reg_errcode_t build_equiv_class (bitset_t sbcset,
100 					const unsigned char *name);
101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
102 				      bitset_t sbcset,
103 				      const unsigned char *class_name,
104 				      reg_syntax_t syntax);
105 #endif /* not RE_ENABLE_I18N */
106 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
107 				       RE_TRANSLATE_TYPE trans,
108 				       const unsigned char *class_name,
109 				       const unsigned char *extra,
110 				       bool non_match, reg_errcode_t *err);
111 static bin_tree_t *create_tree (re_dfa_t *dfa,
112 				bin_tree_t *left, bin_tree_t *right,
113 				re_token_type_t type);
114 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
115 				      bin_tree_t *left, bin_tree_t *right,
116 				      const re_token_t *token);
117 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
118 static void free_token (re_token_t *node);
119 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
120 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
121 
122 /* This table gives an error message for each of the error codes listed
123    in regex.h.  Obviously the order here has to be same as there.
124    POSIX doesn't require that we do anything for REG_NOERROR,
125    but why not be nice?  */
126 
127 static const char __re_error_msgid[] =
128   {
129 #define REG_NOERROR_IDX	0
130     gettext_noop ("Success")	/* REG_NOERROR */
131     "\0"
132 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
133     gettext_noop ("No match")	/* REG_NOMATCH */
134     "\0"
135 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
136     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
137     "\0"
138 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
139     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
140     "\0"
141 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
142     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
143     "\0"
144 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
145     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
146     "\0"
147 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
148     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
149     "\0"
150 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
151     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
152     "\0"
153 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
154     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
155     "\0"
156 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
157     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
158     "\0"
159 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
160     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
161     "\0"
162 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
163     gettext_noop ("Invalid range end")	/* REG_ERANGE */
164     "\0"
165 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
166     gettext_noop ("Memory exhausted") /* REG_ESPACE */
167     "\0"
168 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
169     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
170     "\0"
171 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
172     gettext_noop ("Premature end of regular expression") /* REG_EEND */
173     "\0"
174 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
175     gettext_noop ("Regular expression too big") /* REG_ESIZE */
176     "\0"
177 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
178     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
179   };
180 
181 static const size_t __re_error_msgid_idx[] =
182   {
183     REG_NOERROR_IDX,
184     REG_NOMATCH_IDX,
185     REG_BADPAT_IDX,
186     REG_ECOLLATE_IDX,
187     REG_ECTYPE_IDX,
188     REG_EESCAPE_IDX,
189     REG_ESUBREG_IDX,
190     REG_EBRACK_IDX,
191     REG_EPAREN_IDX,
192     REG_EBRACE_IDX,
193     REG_BADBR_IDX,
194     REG_ERANGE_IDX,
195     REG_ESPACE_IDX,
196     REG_BADRPT_IDX,
197     REG_EEND_IDX,
198     REG_ESIZE_IDX,
199     REG_ERPAREN_IDX
200   };
201 
202 /* Entry points for GNU code.  */
203 
204 /* re_compile_pattern is the GNU regular expression compiler: it
205    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
206    Returns 0 if the pattern was valid, otherwise an error string.
207 
208    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
209    are set in BUFP on entry.  */
210 
211 #ifdef _LIBC
212 const char *
213 re_compile_pattern (pattern, length, bufp)
214     const char *pattern;
215     size_t length;
216     struct re_pattern_buffer *bufp;
217 #else /* size_t might promote */
218 const char *
219 re_compile_pattern (const char *pattern, size_t length,
220 		    struct re_pattern_buffer *bufp)
221 #endif
222 {
223   reg_errcode_t ret;
224 
225   /* And GNU code determines whether or not to get register information
226      by passing null for the REGS argument to re_match, etc., not by
227      setting no_sub, unless RE_NO_SUB is set.  */
228   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
229 
230   /* Match anchors at newline.  */
231   bufp->newline_anchor = 1;
232 
233   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
234 
235   if (!ret)
236     return NULL;
237   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
238 }
239 #ifdef _LIBC
240 weak_alias (__re_compile_pattern, re_compile_pattern)
241 #endif
242 
243 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
244    also be assigned to arbitrarily: each pattern buffer stores its own
245    syntax, so it can be changed between regex compilations.  */
246 /* This has no initializer because initialized variables in Emacs
247    become read-only after dumping.  */
248 reg_syntax_t re_syntax_options;
249 
250 
251 /* Specify the precise syntax of regexps for compilation.  This provides
252    for compatibility for various utilities which historically have
253    different, incompatible syntaxes.
254 
255    The argument SYNTAX is a bit mask comprised of the various bits
256    defined in regex.h.  We return the old syntax.  */
257 
258 reg_syntax_t
259 re_set_syntax (syntax)
260     reg_syntax_t syntax;
261 {
262   reg_syntax_t ret = re_syntax_options;
263 
264   re_syntax_options = syntax;
265   return ret;
266 }
267 #ifdef _LIBC
268 weak_alias (__re_set_syntax, re_set_syntax)
269 #endif
270 
271 int
272 re_compile_fastmap (bufp)
273     struct re_pattern_buffer *bufp;
274 {
275   re_dfa_t *dfa = bufp->buffer;
276   char *fastmap = bufp->fastmap;
277 
278   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
279   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
280   if (dfa->init_state != dfa->init_state_word)
281     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
282   if (dfa->init_state != dfa->init_state_nl)
283     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
284   if (dfa->init_state != dfa->init_state_begbuf)
285     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
286   bufp->fastmap_accurate = 1;
287   return 0;
288 }
289 #ifdef _LIBC
290 weak_alias (__re_compile_fastmap, re_compile_fastmap)
291 #endif
292 
293 static inline void
294 __attribute ((always_inline))
295 re_set_fastmap (char *fastmap, bool icase, int ch)
296 {
297   fastmap[ch] = 1;
298   if (icase)
299     fastmap[tolower (ch)] = 1;
300 }
301 
302 /* Helper function for re_compile_fastmap.
303    Compile fastmap for the initial_state INIT_STATE.  */
304 
305 static void
306 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
307 			 char *fastmap)
308 {
309   re_dfa_t *dfa = bufp->buffer;
310   Idx node_cnt;
311   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
312   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
313     {
314       Idx node = init_state->nodes.elems[node_cnt];
315       re_token_type_t type = dfa->nodes[node].type;
316 
317       if (type == CHARACTER)
318 	{
319 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
320 #ifdef RE_ENABLE_I18N
321 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
322 	    {
323 	      unsigned char buf[MB_LEN_MAX];
324 	      unsigned char *p;
325 	      wchar_t wc;
326 	      mbstate_t state;
327 
328 	      p = buf;
329 	      *p++ = dfa->nodes[node].opr.c;
330 	      while (++node < dfa->nodes_len
331 		     &&	dfa->nodes[node].type == CHARACTER
332 		     && dfa->nodes[node].mb_partial)
333 		*p++ = dfa->nodes[node].opr.c;
334 	      memset (&state, '\0', sizeof (state));
335 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
336 			     &state) == p - buf
337 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
338 		      != (size_t) -1))
339 		re_set_fastmap (fastmap, false, buf[0]);
340 	    }
341 #endif
342 	}
343       else if (type == SIMPLE_BRACKET)
344 	{
345 	  int i, ch;
346 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
347 	    {
348 	      int j;
349 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
350 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
351 		if (w & ((bitset_word_t) 1 << j))
352 		  re_set_fastmap (fastmap, icase, ch);
353 	    }
354 	}
355 #ifdef RE_ENABLE_I18N
356       else if (type == COMPLEX_BRACKET)
357 	{
358 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
359 	  Idx i;
360 
361 # ifdef _LIBC
362 	  /* See if we have to try all bytes which start multiple collation
363 	     elements.
364 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
365 		  collation element, and don't catch 'b' since 'b' is
366 		  the only collation element which starts from 'b' (and
367 		  it is caught by SIMPLE_BRACKET).  */
368 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
369 		  && (cset->ncoll_syms || cset->nranges))
370 		{
371 		  const int32_t *table = (const int32_t *)
372 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
373 		  for (i = 0; i < SBC_MAX; ++i)
374 		    if (table[i] < 0)
375 		      re_set_fastmap (fastmap, icase, i);
376 		}
377 # endif /* _LIBC */
378 
379 	  /* See if we have to start the match at all multibyte characters,
380 	     i.e. where we would not find an invalid sequence.  This only
381 	     applies to multibyte character sets; for single byte character
382 	     sets, the SIMPLE_BRACKET again suffices.  */
383 	  if (dfa->mb_cur_max > 1
384 	      && (cset->nchar_classes || cset->non_match || cset->nranges
385 # ifdef _LIBC
386 		  || cset->nequiv_classes
387 # endif /* _LIBC */
388 		 ))
389 	    {
390 	      unsigned char c = 0;
391 	      do
392 		{
393 		  mbstate_t mbs;
394 		  memset (&mbs, 0, sizeof (mbs));
395 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
396 		    re_set_fastmap (fastmap, false, (int) c);
397 		}
398 	      while (++c != 0);
399 	    }
400 
401 	  else
402 	    {
403 	      /* ... Else catch all bytes which can start the mbchars.  */
404 	      for (i = 0; i < cset->nmbchars; ++i)
405 		{
406 		  char buf[256];
407 		  mbstate_t state;
408 		  memset (&state, '\0', sizeof (state));
409 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
410 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
411 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
412 		    {
413 		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
414 			  != (size_t) -1)
415 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
416 		    }
417 		}
418 	    }
419 	}
420 #endif /* RE_ENABLE_I18N */
421       else if (type == OP_PERIOD
422 #ifdef RE_ENABLE_I18N
423 	       || type == OP_UTF8_PERIOD
424 #endif /* RE_ENABLE_I18N */
425 	       || type == END_OF_RE)
426 	{
427 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
428 	  if (type == END_OF_RE)
429 	    bufp->can_be_null = 1;
430 	  return;
431 	}
432     }
433 }
434 
435 /* Entry point for POSIX code.  */
436 /* regcomp takes a regular expression as a string and compiles it.
437 
438    PREG is a regex_t *.  We do not expect any fields to be initialized,
439    since POSIX says we shouldn't.  Thus, we set
440 
441      'buffer' to the compiled pattern;
442      'used' to the length of the compiled pattern;
443      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
444        REG_EXTENDED bit in CFLAGS is set; otherwise, to
445        RE_SYNTAX_POSIX_BASIC;
446      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
447      'fastmap' to an allocated space for the fastmap;
448      'fastmap_accurate' to zero;
449      're_nsub' to the number of subexpressions in PATTERN.
450 
451    PATTERN is the address of the pattern string.
452 
453    CFLAGS is a series of bits which affect compilation.
454 
455      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
456      use POSIX basic syntax.
457 
458      If REG_NEWLINE is set, then . and [^...] don't match newline.
459      Also, regexec will try a match beginning after every newline.
460 
461      If REG_ICASE is set, then we considers upper- and lowercase
462      versions of letters to be equivalent when matching.
463 
464      If REG_NOSUB is set, then when PREG is passed to regexec, that
465      routine will report only success or failure, and nothing about the
466      registers.
467 
468    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
469    the return codes and their meanings.)  */
470 
471 int
472 regcomp (preg, pattern, cflags)
473     regex_t *_Restrict_ preg;
474     const char *_Restrict_ pattern;
475     int cflags;
476 {
477   reg_errcode_t ret;
478   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
479 			 : RE_SYNTAX_POSIX_BASIC);
480 
481   preg->buffer = NULL;
482   preg->allocated = 0;
483   preg->used = 0;
484 
485   /* Try to allocate space for the fastmap.  */
486   preg->fastmap = re_malloc (char, SBC_MAX);
487   if (BE (preg->fastmap == NULL, 0))
488     return REG_ESPACE;
489 
490   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
491 
492   /* If REG_NEWLINE is set, newlines are treated differently.  */
493   if (cflags & REG_NEWLINE)
494     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
495       syntax &= ~RE_DOT_NEWLINE;
496       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
497       /* It also changes the matching behavior.  */
498       preg->newline_anchor = 1;
499     }
500   else
501     preg->newline_anchor = 0;
502   preg->no_sub = !!(cflags & REG_NOSUB);
503   preg->translate = NULL;
504 
505   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
506 
507   /* POSIX doesn't distinguish between an unmatched open-group and an
508      unmatched close-group: both are REG_EPAREN.  */
509   if (ret == REG_ERPAREN)
510     ret = REG_EPAREN;
511 
512   /* We have already checked preg->fastmap != NULL.  */
513   if (BE (ret == REG_NOERROR, 1))
514     /* Compute the fastmap now, since regexec cannot modify the pattern
515        buffer.  This function never fails in this implementation.  */
516     (void) re_compile_fastmap (preg);
517   else
518     {
519       /* Some error occurred while compiling the expression.  */
520       re_free (preg->fastmap);
521       preg->fastmap = NULL;
522     }
523 
524   return (int) ret;
525 }
526 #ifdef _LIBC
527 weak_alias (__regcomp, regcomp)
528 #endif
529 
530 /* Returns a message corresponding to an error code, ERRCODE, returned
531    from either regcomp or regexec.   We don't use PREG here.  */
532 
533 #ifdef _LIBC
534 size_t
535 regerror (errcode, preg, errbuf, errbuf_size)
536     int errcode;
537     const regex_t *_Restrict_ preg;
538     char *_Restrict_ errbuf;
539     size_t errbuf_size;
540 #else /* size_t might promote */
541 size_t
542 regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_,
543 	  char *_Restrict_ errbuf, size_t errbuf_size)
544 #endif
545 {
546   const char *msg;
547   size_t msg_size;
548 
549   if (BE (errcode < 0
550 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
551 			       / sizeof (__re_error_msgid_idx[0])), 0))
552     /* Only error codes returned by the rest of the code should be passed
553        to this routine.  If we are given anything else, or if other regex
554        code generates an invalid error code, then the program has a bug.
555        Dump core so we can fix it.  */
556     abort ();
557 
558   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
559 
560   msg_size = strlen (msg) + 1; /* Includes the null.  */
561 
562   if (BE (errbuf_size != 0, 1))
563     {
564       size_t cpy_size = msg_size;
565       if (BE (msg_size > errbuf_size, 0))
566 	{
567 	  cpy_size = errbuf_size - 1;
568 	  errbuf[cpy_size] = '\0';
569 	}
570       memcpy (errbuf, msg, cpy_size);
571     }
572 
573   return msg_size;
574 }
575 #ifdef _LIBC
576 weak_alias (__regerror, regerror)
577 #endif
578 
579 
580 #ifdef RE_ENABLE_I18N
581 /* This static array is used for the map to single-byte characters when
582    UTF-8 is used.  Otherwise we would allocate memory just to initialize
583    it the same all the time.  UTF-8 is the preferred encoding so this is
584    a worthwhile optimization.  */
585 static const bitset_t utf8_sb_map =
586 {
587   /* Set the first 128 bits.  */
588 # ifdef __GNUC__
589   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
590 # else
591 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
592 #   error "bitset_word_t is narrower than 32 bits"
593 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
594   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
595 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
596   BITSET_WORD_MAX, BITSET_WORD_MAX,
597 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
598   BITSET_WORD_MAX,
599 #  endif
600   (BITSET_WORD_MAX
601    >> (SBC_MAX % BITSET_WORD_BITS == 0
602        ? 0
603        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
604 # endif
605 };
606 #endif
607 
608 
609 static void
610 free_dfa_content (re_dfa_t *dfa)
611 {
612   Idx i, j;
613 
614   if (dfa->nodes)
615     for (i = 0; i < dfa->nodes_len; ++i)
616       free_token (dfa->nodes + i);
617   re_free (dfa->nexts);
618   for (i = 0; i < dfa->nodes_len; ++i)
619     {
620       if (dfa->eclosures != NULL)
621 	re_node_set_free (dfa->eclosures + i);
622       if (dfa->inveclosures != NULL)
623 	re_node_set_free (dfa->inveclosures + i);
624       if (dfa->edests != NULL)
625 	re_node_set_free (dfa->edests + i);
626     }
627   re_free (dfa->edests);
628   re_free (dfa->eclosures);
629   re_free (dfa->inveclosures);
630   re_free (dfa->nodes);
631 
632   if (dfa->state_table)
633     for (i = 0; i <= dfa->state_hash_mask; ++i)
634       {
635 	struct re_state_table_entry *entry = dfa->state_table + i;
636 	for (j = 0; j < entry->num; ++j)
637 	  {
638 	    re_dfastate_t *state = entry->array[j];
639 	    free_state (state);
640 	  }
641 	re_free (entry->array);
642       }
643   re_free (dfa->state_table);
644 #ifdef RE_ENABLE_I18N
645   if (dfa->sb_char != utf8_sb_map)
646     re_free (dfa->sb_char);
647 #endif
648   re_free (dfa->subexp_map);
649 #ifdef DEBUG
650   re_free (dfa->re_str);
651 #endif
652 
653   re_free (dfa);
654 }
655 
656 
657 /* Free dynamically allocated space used by PREG.  */
658 
659 void
660 regfree (preg)
661     regex_t *preg;
662 {
663   re_dfa_t *dfa = preg->buffer;
664   if (BE (dfa != NULL, 1))
665     free_dfa_content (dfa);
666   preg->buffer = NULL;
667   preg->allocated = 0;
668 
669   re_free (preg->fastmap);
670   preg->fastmap = NULL;
671 
672   re_free (preg->translate);
673   preg->translate = NULL;
674 }
675 #ifdef _LIBC
676 weak_alias (__regfree, regfree)
677 #endif
678 
679 /* Entry points compatible with 4.2 BSD regex library.  We don't define
680    them unless specifically requested.  */
681 
682 #if defined _REGEX_RE_COMP || defined _LIBC
683 
684 /* BSD has one and only one pattern buffer.  */
685 static struct re_pattern_buffer re_comp_buf;
686 
687 char *
688 # ifdef _LIBC
689 /* Make these definitions weak in libc, so POSIX programs can redefine
690    these names if they don't use our functions, and still use
691    regcomp/regexec above without link errors.  */
692 weak_function
693 # endif
694 re_comp (s)
695      const char *s;
696 {
697   reg_errcode_t ret;
698   char *fastmap;
699 
700   if (!s)
701     {
702       if (!re_comp_buf.buffer)
703 	return gettext ("No previous regular expression");
704       return 0;
705     }
706 
707   if (re_comp_buf.buffer)
708     {
709       fastmap = re_comp_buf.fastmap;
710       re_comp_buf.fastmap = NULL;
711       __regfree (&re_comp_buf);
712       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
713       re_comp_buf.fastmap = fastmap;
714     }
715 
716   if (re_comp_buf.fastmap == NULL)
717     {
718       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
719       if (re_comp_buf.fastmap == NULL)
720 	return (char *) gettext (__re_error_msgid
721 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
722     }
723 
724   /* Since 're_exec' always passes NULL for the 'regs' argument, we
725      don't need to initialize the pattern buffer fields which affect it.  */
726 
727   /* Match anchors at newlines.  */
728   re_comp_buf.newline_anchor = 1;
729 
730   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
731 
732   if (!ret)
733     return NULL;
734 
735   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
736   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
737 }
738 
739 #ifdef _LIBC
740 libc_freeres_fn (free_mem)
741 {
742   __regfree (&re_comp_buf);
743 }
744 #endif
745 
746 #endif /* _REGEX_RE_COMP */
747 
748 /* Internal entry point.
749    Compile the regular expression PATTERN, whose length is LENGTH.
750    SYNTAX indicate regular expression's syntax.  */
751 
752 static reg_errcode_t
753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
754 		     reg_syntax_t syntax)
755 {
756   reg_errcode_t err = REG_NOERROR;
757   re_dfa_t *dfa;
758   re_string_t regexp;
759 
760   /* Initialize the pattern buffer.  */
761   preg->fastmap_accurate = 0;
762   preg->syntax = syntax;
763   preg->not_bol = preg->not_eol = 0;
764   preg->used = 0;
765   preg->re_nsub = 0;
766   preg->can_be_null = 0;
767   preg->regs_allocated = REGS_UNALLOCATED;
768 
769   /* Initialize the dfa.  */
770   dfa = preg->buffer;
771   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
772     {
773       /* If zero allocated, but buffer is non-null, try to realloc
774 	 enough space.  This loses if buffer's address is bogus, but
775 	 that is the user's responsibility.  If ->buffer is NULL this
776 	 is a simple allocation.  */
777       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
778       if (dfa == NULL)
779 	return REG_ESPACE;
780       preg->allocated = sizeof (re_dfa_t);
781       preg->buffer = dfa;
782     }
783   preg->used = sizeof (re_dfa_t);
784 
785   err = init_dfa (dfa, length);
786   if (BE (err != REG_NOERROR, 0))
787     {
788       free_dfa_content (dfa);
789       preg->buffer = NULL;
790       preg->allocated = 0;
791       return err;
792     }
793 #ifdef DEBUG
794   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
795   dfa->re_str = re_malloc (char, length + 1);
796   strncpy (dfa->re_str, pattern, length + 1);
797 #endif
798 
799   __libc_lock_init (dfa->lock);
800 
801   err = re_string_construct (&regexp, pattern, length, preg->translate,
802 			     (syntax & RE_ICASE) != 0, dfa);
803   if (BE (err != REG_NOERROR, 0))
804     {
805     re_compile_internal_free_return:
806       free_workarea_compile (preg);
807       re_string_destruct (&regexp);
808       free_dfa_content (dfa);
809       preg->buffer = NULL;
810       preg->allocated = 0;
811       return err;
812     }
813 
814   /* Parse the regular expression, and build a structure tree.  */
815   preg->re_nsub = 0;
816   dfa->str_tree = parse (&regexp, preg, syntax, &err);
817   if (BE (dfa->str_tree == NULL, 0))
818     goto re_compile_internal_free_return;
819 
820   /* Analyze the tree and create the nfa.  */
821   err = analyze (preg);
822   if (BE (err != REG_NOERROR, 0))
823     goto re_compile_internal_free_return;
824 
825 #ifdef RE_ENABLE_I18N
826   /* If possible, do searching in single byte encoding to speed things up.  */
827   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
828     optimize_utf8 (dfa);
829 #endif
830 
831   /* Then create the initial state of the dfa.  */
832   err = create_initial_state (dfa);
833 
834   /* Release work areas.  */
835   free_workarea_compile (preg);
836   re_string_destruct (&regexp);
837 
838   if (BE (err != REG_NOERROR, 0))
839     {
840       free_dfa_content (dfa);
841       preg->buffer = NULL;
842       preg->allocated = 0;
843     }
844 
845   return err;
846 }
847 
848 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
849    as the initial length of some arrays.  */
850 
851 static reg_errcode_t
852 init_dfa (re_dfa_t *dfa, size_t pat_len)
853 {
854   __re_size_t table_size;
855 #ifndef _LIBC
856   const char *codeset_name;
857 #endif
858 #ifdef RE_ENABLE_I18N
859   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
860 #else
861   size_t max_i18n_object_size = 0;
862 #endif
863   size_t max_object_size =
864     MAX (sizeof (struct re_state_table_entry),
865 	 MAX (sizeof (re_token_t),
866 	      MAX (sizeof (re_node_set),
867 		   MAX (sizeof (regmatch_t),
868 			max_i18n_object_size))));
869 
870   memset (dfa, '\0', sizeof (re_dfa_t));
871 
872   /* Force allocation of str_tree_storage the first time.  */
873   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
874 
875   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
876      calculation below, and for similar doubling calculations
877      elsewhere.  And it's <= rather than <, because some of the
878      doubling calculations add 1 afterwards.  */
879   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
880     return REG_ESPACE;
881 
882   dfa->nodes_alloc = pat_len + 1;
883   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
884 
885   /*  table_size = 2 ^ ceil(log pat_len) */
886   for (table_size = 1; ; table_size <<= 1)
887     if (table_size > pat_len)
888       break;
889 
890   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
891   dfa->state_hash_mask = table_size - 1;
892 
893   dfa->mb_cur_max = MB_CUR_MAX;
894 #ifdef _LIBC
895   if (dfa->mb_cur_max == 6
896       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
897     dfa->is_utf8 = 1;
898   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
899 		       != 0);
900 #else
901   codeset_name = nl_langinfo (CODESET);
902   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
903       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
904       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
905       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
906     dfa->is_utf8 = 1;
907 
908   /* We check exhaustively in the loop below if this charset is a
909      superset of ASCII.  */
910   dfa->map_notascii = 0;
911 #endif
912 
913 #ifdef RE_ENABLE_I18N
914   if (dfa->mb_cur_max > 1)
915     {
916       if (dfa->is_utf8)
917 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
918       else
919 	{
920 	  int i, j, ch;
921 
922 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
923 	  if (BE (dfa->sb_char == NULL, 0))
924 	    return REG_ESPACE;
925 
926 	  /* Set the bits corresponding to single byte chars.  */
927 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
928 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
929 	      {
930 		wint_t wch = __btowc (ch);
931 		if (wch != WEOF)
932 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
933 # ifndef _LIBC
934 		if (isascii (ch) && wch != ch)
935 		  dfa->map_notascii = 1;
936 # endif
937 	      }
938 	}
939     }
940 #endif
941 
942   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
943     return REG_ESPACE;
944   return REG_NOERROR;
945 }
946 
947 /* Initialize WORD_CHAR table, which indicate which character is
948    "word".  In this case "word" means that it is the word construction
949    character used by some operators like "\<", "\>", etc.  */
950 
951 static void
952 internal_function
953 init_word_char (re_dfa_t *dfa)
954 {
955   dfa->word_ops_used = 1;
956   int i = 0;
957   int j;
958   int ch = 0;
959   if (BE (dfa->map_notascii == 0, 1))
960     {
961       bitset_word_t bits0 = 0x00000000;
962       bitset_word_t bits1 = 0x03ff0000;
963       bitset_word_t bits2 = 0x87fffffe;
964       bitset_word_t bits3 = 0x07fffffe;
965       if (BITSET_WORD_BITS == 64)
966 	{
967 	  dfa->word_char[0] = bits1 << 31 << 1 | bits0;
968 	  dfa->word_char[1] = bits3 << 31 << 1 | bits2;
969 	  i = 2;
970 	}
971       else if (BITSET_WORD_BITS == 32)
972 	{
973 	  dfa->word_char[0] = bits0;
974 	  dfa->word_char[1] = bits1;
975 	  dfa->word_char[2] = bits2;
976 	  dfa->word_char[3] = bits3;
977 	  i = 4;
978 	}
979       else
980         goto general_case;
981       ch = 128;
982 
983       if (BE (dfa->is_utf8, 1))
984 	{
985 	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
986 	  return;
987 	}
988     }
989 
990  general_case:
991   for (; i < BITSET_WORDS; ++i)
992     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
993       if (isalnum (ch) || ch == '_')
994 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
995 }
996 
997 /* Free the work area which are only used while compiling.  */
998 
999 static void
1000 free_workarea_compile (regex_t *preg)
1001 {
1002   re_dfa_t *dfa = preg->buffer;
1003   bin_tree_storage_t *storage, *next;
1004   for (storage = dfa->str_tree_storage; storage; storage = next)
1005     {
1006       next = storage->next;
1007       re_free (storage);
1008     }
1009   dfa->str_tree_storage = NULL;
1010   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1011   dfa->str_tree = NULL;
1012   re_free (dfa->org_indices);
1013   dfa->org_indices = NULL;
1014 }
1015 
1016 /* Create initial states for all contexts.  */
1017 
1018 static reg_errcode_t
1019 create_initial_state (re_dfa_t *dfa)
1020 {
1021   Idx first, i;
1022   reg_errcode_t err;
1023   re_node_set init_nodes;
1024 
1025   /* Initial states have the epsilon closure of the node which is
1026      the first node of the regular expression.  */
1027   first = dfa->str_tree->first->node_idx;
1028   dfa->init_node = first;
1029   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1030   if (BE (err != REG_NOERROR, 0))
1031     return err;
1032 
1033   /* The back-references which are in initial states can epsilon transit,
1034      since in this case all of the subexpressions can be null.
1035      Then we add epsilon closures of the nodes which are the next nodes of
1036      the back-references.  */
1037   if (dfa->nbackref > 0)
1038     for (i = 0; i < init_nodes.nelem; ++i)
1039       {
1040 	Idx node_idx = init_nodes.elems[i];
1041 	re_token_type_t type = dfa->nodes[node_idx].type;
1042 
1043 	Idx clexp_idx;
1044 	if (type != OP_BACK_REF)
1045 	  continue;
1046 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1047 	  {
1048 	    re_token_t *clexp_node;
1049 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1050 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1051 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1052 	      break;
1053 	  }
1054 	if (clexp_idx == init_nodes.nelem)
1055 	  continue;
1056 
1057 	if (type == OP_BACK_REF)
1058 	  {
1059 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1060 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1061 	      {
1062 		reg_errcode_t merge_err
1063                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1064 		if (merge_err != REG_NOERROR)
1065 		  return merge_err;
1066 		i = 0;
1067 	      }
1068 	  }
1069       }
1070 
1071   /* It must be the first time to invoke acquire_state.  */
1072   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1073   /* We don't check ERR here, since the initial state must not be NULL.  */
1074   if (BE (dfa->init_state == NULL, 0))
1075     return err;
1076   if (dfa->init_state->has_constraint)
1077     {
1078       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1079 						       CONTEXT_WORD);
1080       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1081 						     CONTEXT_NEWLINE);
1082       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1083 							 &init_nodes,
1084 							 CONTEXT_NEWLINE
1085 							 | CONTEXT_BEGBUF);
1086       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1087 	      || dfa->init_state_begbuf == NULL, 0))
1088 	return err;
1089     }
1090   else
1091     dfa->init_state_word = dfa->init_state_nl
1092       = dfa->init_state_begbuf = dfa->init_state;
1093 
1094   re_node_set_free (&init_nodes);
1095   return REG_NOERROR;
1096 }
1097 
1098 #ifdef RE_ENABLE_I18N
1099 /* If it is possible to do searching in single byte encoding instead of UTF-8
1100    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1101    DFA nodes where needed.  */
1102 
1103 static void
1104 optimize_utf8 (re_dfa_t *dfa)
1105 {
1106   Idx node;
1107   int i;
1108   bool mb_chars = false;
1109   bool has_period = false;
1110 
1111   for (node = 0; node < dfa->nodes_len; ++node)
1112     switch (dfa->nodes[node].type)
1113       {
1114       case CHARACTER:
1115 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1116 	  mb_chars = true;
1117 	break;
1118       case ANCHOR:
1119 	switch (dfa->nodes[node].opr.ctx_type)
1120 	  {
1121 	  case LINE_FIRST:
1122 	  case LINE_LAST:
1123 	  case BUF_FIRST:
1124 	  case BUF_LAST:
1125 	    break;
1126 	  default:
1127 	    /* Word anchors etc. cannot be handled.  It's okay to test
1128 	       opr.ctx_type since constraints (for all DFA nodes) are
1129 	       created by ORing one or more opr.ctx_type values.  */
1130 	    return;
1131 	  }
1132 	break;
1133       case OP_PERIOD:
1134 	has_period = true;
1135 	break;
1136       case OP_BACK_REF:
1137       case OP_ALT:
1138       case END_OF_RE:
1139       case OP_DUP_ASTERISK:
1140       case OP_OPEN_SUBEXP:
1141       case OP_CLOSE_SUBEXP:
1142 	break;
1143       case COMPLEX_BRACKET:
1144 	return;
1145       case SIMPLE_BRACKET:
1146 	/* Just double check.  */
1147 	{
1148 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1149 			? 0
1150 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1151 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1152 	    {
1153 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1154 		return;
1155 	      rshift = 0;
1156 	    }
1157 	}
1158 	break;
1159       default:
1160 	abort ();
1161       }
1162 
1163   if (mb_chars || has_period)
1164     for (node = 0; node < dfa->nodes_len; ++node)
1165       {
1166 	if (dfa->nodes[node].type == CHARACTER
1167 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1168 	  dfa->nodes[node].mb_partial = 0;
1169 	else if (dfa->nodes[node].type == OP_PERIOD)
1170 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1171       }
1172 
1173   /* The search can be in single byte locale.  */
1174   dfa->mb_cur_max = 1;
1175   dfa->is_utf8 = 0;
1176   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1177 }
1178 #endif
1179 
1180 /* Analyze the structure tree, and calculate "first", "next", "edest",
1181    "eclosure", and "inveclosure".  */
1182 
1183 static reg_errcode_t
1184 analyze (regex_t *preg)
1185 {
1186   re_dfa_t *dfa = preg->buffer;
1187   reg_errcode_t ret;
1188 
1189   /* Allocate arrays.  */
1190   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1191   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1192   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1193   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1194   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1195 	  || dfa->eclosures == NULL, 0))
1196     return REG_ESPACE;
1197 
1198   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1199   if (dfa->subexp_map != NULL)
1200     {
1201       Idx i;
1202       for (i = 0; i < preg->re_nsub; i++)
1203 	dfa->subexp_map[i] = i;
1204       preorder (dfa->str_tree, optimize_subexps, dfa);
1205       for (i = 0; i < preg->re_nsub; i++)
1206 	if (dfa->subexp_map[i] != i)
1207 	  break;
1208       if (i == preg->re_nsub)
1209 	{
1210 	  free (dfa->subexp_map);
1211 	  dfa->subexp_map = NULL;
1212 	}
1213     }
1214 
1215   ret = postorder (dfa->str_tree, lower_subexps, preg);
1216   if (BE (ret != REG_NOERROR, 0))
1217     return ret;
1218   ret = postorder (dfa->str_tree, calc_first, dfa);
1219   if (BE (ret != REG_NOERROR, 0))
1220     return ret;
1221   preorder (dfa->str_tree, calc_next, dfa);
1222   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1223   if (BE (ret != REG_NOERROR, 0))
1224     return ret;
1225   ret = calc_eclosure (dfa);
1226   if (BE (ret != REG_NOERROR, 0))
1227     return ret;
1228 
1229   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1230      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1231   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1232       || dfa->nbackref)
1233     {
1234       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1235       if (BE (dfa->inveclosures == NULL, 0))
1236 	return REG_ESPACE;
1237       ret = calc_inveclosure (dfa);
1238     }
1239 
1240   return ret;
1241 }
1242 
1243 /* Our parse trees are very unbalanced, so we cannot use a stack to
1244    implement parse tree visits.  Instead, we use parent pointers and
1245    some hairy code in these two functions.  */
1246 static reg_errcode_t
1247 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1248 	   void *extra)
1249 {
1250   bin_tree_t *node, *prev;
1251 
1252   for (node = root; ; )
1253     {
1254       /* Descend down the tree, preferably to the left (or to the right
1255 	 if that's the only child).  */
1256       while (node->left || node->right)
1257 	if (node->left)
1258 	  node = node->left;
1259 	else
1260 	  node = node->right;
1261 
1262       do
1263 	{
1264 	  reg_errcode_t err = fn (extra, node);
1265 	  if (BE (err != REG_NOERROR, 0))
1266 	    return err;
1267 	  if (node->parent == NULL)
1268 	    return REG_NOERROR;
1269 	  prev = node;
1270 	  node = node->parent;
1271 	}
1272       /* Go up while we have a node that is reached from the right.  */
1273       while (node->right == prev || node->right == NULL);
1274       node = node->right;
1275     }
1276 }
1277 
1278 static reg_errcode_t
1279 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1280 	  void *extra)
1281 {
1282   bin_tree_t *node;
1283 
1284   for (node = root; ; )
1285     {
1286       reg_errcode_t err = fn (extra, node);
1287       if (BE (err != REG_NOERROR, 0))
1288 	return err;
1289 
1290       /* Go to the left node, or up and to the right.  */
1291       if (node->left)
1292 	node = node->left;
1293       else
1294 	{
1295 	  bin_tree_t *prev = NULL;
1296 	  while (node->right == prev || node->right == NULL)
1297 	    {
1298 	      prev = node;
1299 	      node = node->parent;
1300 	      if (!node)
1301 		return REG_NOERROR;
1302 	    }
1303 	  node = node->right;
1304 	}
1305     }
1306 }
1307 
1308 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1309    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1310    backreferences as well.  Requires a preorder visit.  */
1311 static reg_errcode_t
1312 optimize_subexps (void *extra, bin_tree_t *node)
1313 {
1314   re_dfa_t *dfa = (re_dfa_t *) extra;
1315 
1316   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1317     {
1318       int idx = node->token.opr.idx;
1319       node->token.opr.idx = dfa->subexp_map[idx];
1320       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1321     }
1322 
1323   else if (node->token.type == SUBEXP
1324 	   && node->left && node->left->token.type == SUBEXP)
1325     {
1326       Idx other_idx = node->left->token.opr.idx;
1327 
1328       node->left = node->left->left;
1329       if (node->left)
1330 	node->left->parent = node;
1331 
1332       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1333       if (other_idx < BITSET_WORD_BITS)
1334 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1335     }
1336 
1337   return REG_NOERROR;
1338 }
1339 
1340 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1341    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1342 static reg_errcode_t
1343 lower_subexps (void *extra, bin_tree_t *node)
1344 {
1345   regex_t *preg = (regex_t *) extra;
1346   reg_errcode_t err = REG_NOERROR;
1347 
1348   if (node->left && node->left->token.type == SUBEXP)
1349     {
1350       node->left = lower_subexp (&err, preg, node->left);
1351       if (node->left)
1352 	node->left->parent = node;
1353     }
1354   if (node->right && node->right->token.type == SUBEXP)
1355     {
1356       node->right = lower_subexp (&err, preg, node->right);
1357       if (node->right)
1358 	node->right->parent = node;
1359     }
1360 
1361   return err;
1362 }
1363 
1364 static bin_tree_t *
1365 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1366 {
1367   re_dfa_t *dfa = preg->buffer;
1368   bin_tree_t *body = node->left;
1369   bin_tree_t *op, *cls, *tree1, *tree;
1370 
1371   if (preg->no_sub
1372       /* We do not optimize empty subexpressions, because otherwise we may
1373 	 have bad CONCAT nodes with NULL children.  This is obviously not
1374 	 very common, so we do not lose much.  An example that triggers
1375 	 this case is the sed "script" /\(\)/x.  */
1376       && node->left != NULL
1377       && (node->token.opr.idx >= BITSET_WORD_BITS
1378 	  || !(dfa->used_bkref_map
1379 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1380     return node->left;
1381 
1382   /* Convert the SUBEXP node to the concatenation of an
1383      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1384   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1385   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1386   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1387   tree = create_tree (dfa, op, tree1, CONCAT);
1388   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1389     {
1390       *err = REG_ESPACE;
1391       return NULL;
1392     }
1393 
1394   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1395   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1396   return tree;
1397 }
1398 
1399 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1400    nodes.  Requires a postorder visit.  */
1401 static reg_errcode_t
1402 calc_first (void *extra, bin_tree_t *node)
1403 {
1404   re_dfa_t *dfa = (re_dfa_t *) extra;
1405   if (node->token.type == CONCAT)
1406     {
1407       node->first = node->left->first;
1408       node->node_idx = node->left->node_idx;
1409     }
1410   else
1411     {
1412       node->first = node;
1413       node->node_idx = re_dfa_add_node (dfa, node->token);
1414       if (BE (node->node_idx == REG_MISSING, 0))
1415 	return REG_ESPACE;
1416       if (node->token.type == ANCHOR)
1417 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1418     }
1419   return REG_NOERROR;
1420 }
1421 
1422 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1423 static reg_errcode_t
1424 calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
1425 {
1426   switch (node->token.type)
1427     {
1428     case OP_DUP_ASTERISK:
1429       node->left->next = node;
1430       break;
1431     case CONCAT:
1432       node->left->next = node->right->first;
1433       node->right->next = node->next;
1434       break;
1435     default:
1436       if (node->left)
1437 	node->left->next = node->next;
1438       if (node->right)
1439 	node->right->next = node->next;
1440       break;
1441     }
1442   return REG_NOERROR;
1443 }
1444 
1445 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1446 static reg_errcode_t
1447 link_nfa_nodes (void *extra, bin_tree_t *node)
1448 {
1449   re_dfa_t *dfa = (re_dfa_t *) extra;
1450   Idx idx = node->node_idx;
1451   reg_errcode_t err = REG_NOERROR;
1452 
1453   switch (node->token.type)
1454     {
1455     case CONCAT:
1456       break;
1457 
1458     case END_OF_RE:
1459       assert (node->next == NULL);
1460       break;
1461 
1462     case OP_DUP_ASTERISK:
1463     case OP_ALT:
1464       {
1465 	Idx left, right;
1466 	dfa->has_plural_match = 1;
1467 	if (node->left != NULL)
1468 	  left = node->left->first->node_idx;
1469 	else
1470 	  left = node->next->node_idx;
1471 	if (node->right != NULL)
1472 	  right = node->right->first->node_idx;
1473 	else
1474 	  right = node->next->node_idx;
1475 	assert (REG_VALID_INDEX (left));
1476 	assert (REG_VALID_INDEX (right));
1477 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1478       }
1479       break;
1480 
1481     case ANCHOR:
1482     case OP_OPEN_SUBEXP:
1483     case OP_CLOSE_SUBEXP:
1484       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1485       break;
1486 
1487     case OP_BACK_REF:
1488       dfa->nexts[idx] = node->next->node_idx;
1489       if (node->token.type == OP_BACK_REF)
1490 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1491       break;
1492 
1493     default:
1494       assert (!IS_EPSILON_NODE (node->token.type));
1495       dfa->nexts[idx] = node->next->node_idx;
1496       break;
1497     }
1498 
1499   return err;
1500 }
1501 
1502 /* Duplicate the epsilon closure of the node ROOT_NODE.
1503    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1504    to their own constraint.  */
1505 
1506 static reg_errcode_t
1507 internal_function
1508 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1509 			Idx root_node, unsigned int init_constraint)
1510 {
1511   Idx org_node, clone_node;
1512   bool ok;
1513   unsigned int constraint = init_constraint;
1514   for (org_node = top_org_node, clone_node = top_clone_node;;)
1515     {
1516       Idx org_dest, clone_dest;
1517       if (dfa->nodes[org_node].type == OP_BACK_REF)
1518 	{
1519 	  /* If the back reference epsilon-transit, its destination must
1520 	     also have the constraint.  Then duplicate the epsilon closure
1521 	     of the destination of the back reference, and store it in
1522 	     edests of the back reference.  */
1523 	  org_dest = dfa->nexts[org_node];
1524 	  re_node_set_empty (dfa->edests + clone_node);
1525 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1526 	  if (BE (clone_dest == REG_MISSING, 0))
1527 	    return REG_ESPACE;
1528 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1529 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1530 	  if (BE (! ok, 0))
1531 	    return REG_ESPACE;
1532 	}
1533       else if (dfa->edests[org_node].nelem == 0)
1534 	{
1535 	  /* In case of the node can't epsilon-transit, don't duplicate the
1536 	     destination and store the original destination as the
1537 	     destination of the node.  */
1538 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1539 	  break;
1540 	}
1541       else if (dfa->edests[org_node].nelem == 1)
1542 	{
1543 	  /* In case of the node can epsilon-transit, and it has only one
1544 	     destination.  */
1545 	  org_dest = dfa->edests[org_node].elems[0];
1546 	  re_node_set_empty (dfa->edests + clone_node);
1547 	  /* If the node is root_node itself, it means the epsilon closure
1548 	     has a loop.  Then tie it to the destination of the root_node.  */
1549 	  if (org_node == root_node && clone_node != org_node)
1550 	    {
1551 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1552 	      if (BE (! ok, 0))
1553 	        return REG_ESPACE;
1554 	      break;
1555 	    }
1556 	  /* In case the node has another constraint, append it.  */
1557 	  constraint |= dfa->nodes[org_node].constraint;
1558 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1559 	  if (BE (clone_dest == REG_MISSING, 0))
1560 	    return REG_ESPACE;
1561 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1562 	  if (BE (! ok, 0))
1563 	    return REG_ESPACE;
1564 	}
1565       else /* dfa->edests[org_node].nelem == 2 */
1566 	{
1567 	  /* In case of the node can epsilon-transit, and it has two
1568 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1569 	  org_dest = dfa->edests[org_node].elems[0];
1570 	  re_node_set_empty (dfa->edests + clone_node);
1571 	  /* Search for a duplicated node which satisfies the constraint.  */
1572 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1573 	  if (clone_dest == REG_MISSING)
1574 	    {
1575 	      /* There is no such duplicated node, create a new one.  */
1576 	      reg_errcode_t err;
1577 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1578 	      if (BE (clone_dest == REG_MISSING, 0))
1579 		return REG_ESPACE;
1580 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581 	      if (BE (! ok, 0))
1582 		return REG_ESPACE;
1583 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1584 					    root_node, constraint);
1585 	      if (BE (err != REG_NOERROR, 0))
1586 		return err;
1587 	    }
1588 	  else
1589 	    {
1590 	      /* There is a duplicated node which satisfies the constraint,
1591 		 use it to avoid infinite loop.  */
1592 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1593 	      if (BE (! ok, 0))
1594 		return REG_ESPACE;
1595 	    }
1596 
1597 	  org_dest = dfa->edests[org_node].elems[1];
1598 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1599 	  if (BE (clone_dest == REG_MISSING, 0))
1600 	    return REG_ESPACE;
1601 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1602 	  if (BE (! ok, 0))
1603 	    return REG_ESPACE;
1604 	}
1605       org_node = org_dest;
1606       clone_node = clone_dest;
1607     }
1608   return REG_NOERROR;
1609 }
1610 
1611 /* Search for a node which is duplicated from the node ORG_NODE, and
1612    satisfies the constraint CONSTRAINT.  */
1613 
1614 static Idx
1615 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1616 			unsigned int constraint)
1617 {
1618   Idx idx;
1619   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1620     {
1621       if (org_node == dfa->org_indices[idx]
1622 	  && constraint == dfa->nodes[idx].constraint)
1623 	return idx; /* Found.  */
1624     }
1625   return REG_MISSING; /* Not found.  */
1626 }
1627 
1628 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1629    Return the index of the new node, or REG_MISSING if insufficient storage is
1630    available.  */
1631 
1632 static Idx
1633 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1634 {
1635   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1636   if (BE (dup_idx != REG_MISSING, 1))
1637     {
1638       dfa->nodes[dup_idx].constraint = constraint;
1639       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1640       dfa->nodes[dup_idx].duplicated = 1;
1641 
1642       /* Store the index of the original node.  */
1643       dfa->org_indices[dup_idx] = org_idx;
1644     }
1645   return dup_idx;
1646 }
1647 
1648 static reg_errcode_t
1649 calc_inveclosure (re_dfa_t *dfa)
1650 {
1651   Idx src, idx;
1652   bool ok;
1653   for (idx = 0; idx < dfa->nodes_len; ++idx)
1654     re_node_set_init_empty (dfa->inveclosures + idx);
1655 
1656   for (src = 0; src < dfa->nodes_len; ++src)
1657     {
1658       Idx *elems = dfa->eclosures[src].elems;
1659       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1660 	{
1661 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1662 	  if (BE (! ok, 0))
1663 	    return REG_ESPACE;
1664 	}
1665     }
1666 
1667   return REG_NOERROR;
1668 }
1669 
1670 /* Calculate "eclosure" for all the node in DFA.  */
1671 
1672 static reg_errcode_t
1673 calc_eclosure (re_dfa_t *dfa)
1674 {
1675   Idx node_idx;
1676   bool incomplete;
1677 #ifdef DEBUG
1678   assert (dfa->nodes_len > 0);
1679 #endif
1680   incomplete = false;
1681   /* For each nodes, calculate epsilon closure.  */
1682   for (node_idx = 0; ; ++node_idx)
1683     {
1684       reg_errcode_t err;
1685       re_node_set eclosure_elem;
1686       if (node_idx == dfa->nodes_len)
1687 	{
1688 	  if (!incomplete)
1689 	    break;
1690 	  incomplete = false;
1691 	  node_idx = 0;
1692 	}
1693 
1694 #ifdef DEBUG
1695       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1696 #endif
1697 
1698       /* If we have already calculated, skip it.  */
1699       if (dfa->eclosures[node_idx].nelem != 0)
1700 	continue;
1701       /* Calculate epsilon closure of 'node_idx'.  */
1702       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1703       if (BE (err != REG_NOERROR, 0))
1704 	return err;
1705 
1706       if (dfa->eclosures[node_idx].nelem == 0)
1707 	{
1708 	  incomplete = true;
1709 	  re_node_set_free (&eclosure_elem);
1710 	}
1711     }
1712   return REG_NOERROR;
1713 }
1714 
1715 /* Calculate epsilon closure of NODE.  */
1716 
1717 static reg_errcode_t
1718 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1719 {
1720   reg_errcode_t err;
1721   Idx i;
1722   re_node_set eclosure;
1723   bool ok;
1724   bool incomplete = false;
1725   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1726   if (BE (err != REG_NOERROR, 0))
1727     return err;
1728 
1729   /* This indicates that we are calculating this node now.
1730      We reference this value to avoid infinite loop.  */
1731   dfa->eclosures[node].nelem = REG_MISSING;
1732 
1733   /* If the current node has constraints, duplicate all nodes
1734      since they must inherit the constraints.  */
1735   if (dfa->nodes[node].constraint
1736       && dfa->edests[node].nelem
1737       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1738     {
1739       err = duplicate_node_closure (dfa, node, node, node,
1740 				    dfa->nodes[node].constraint);
1741       if (BE (err != REG_NOERROR, 0))
1742 	return err;
1743     }
1744 
1745   /* Expand each epsilon destination nodes.  */
1746   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1747     for (i = 0; i < dfa->edests[node].nelem; ++i)
1748       {
1749 	re_node_set eclosure_elem;
1750 	Idx edest = dfa->edests[node].elems[i];
1751 	/* If calculating the epsilon closure of 'edest' is in progress,
1752 	   return intermediate result.  */
1753 	if (dfa->eclosures[edest].nelem == REG_MISSING)
1754 	  {
1755 	    incomplete = true;
1756 	    continue;
1757 	  }
1758 	/* If we haven't calculated the epsilon closure of 'edest' yet,
1759 	   calculate now. Otherwise use calculated epsilon closure.  */
1760 	if (dfa->eclosures[edest].nelem == 0)
1761 	  {
1762 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1763 	    if (BE (err != REG_NOERROR, 0))
1764 	      return err;
1765 	  }
1766 	else
1767 	  eclosure_elem = dfa->eclosures[edest];
1768 	/* Merge the epsilon closure of 'edest'.  */
1769 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1770 	if (BE (err != REG_NOERROR, 0))
1771 	  return err;
1772 	/* If the epsilon closure of 'edest' is incomplete,
1773 	   the epsilon closure of this node is also incomplete.  */
1774 	if (dfa->eclosures[edest].nelem == 0)
1775 	  {
1776 	    incomplete = true;
1777 	    re_node_set_free (&eclosure_elem);
1778 	  }
1779       }
1780 
1781   /* An epsilon closure includes itself.  */
1782   ok = re_node_set_insert (&eclosure, node);
1783   if (BE (! ok, 0))
1784     return REG_ESPACE;
1785   if (incomplete && !root)
1786     dfa->eclosures[node].nelem = 0;
1787   else
1788     dfa->eclosures[node] = eclosure;
1789   *new_set = eclosure;
1790   return REG_NOERROR;
1791 }
1792 
1793 /* Functions for token which are used in the parser.  */
1794 
1795 /* Fetch a token from INPUT.
1796    We must not use this function inside bracket expressions.  */
1797 
1798 static void
1799 internal_function
1800 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1801 {
1802   re_string_skip_bytes (input, peek_token (result, input, syntax));
1803 }
1804 
1805 /* Peek a token from INPUT, and return the length of the token.
1806    We must not use this function inside bracket expressions.  */
1807 
1808 static int
1809 internal_function
1810 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1811 {
1812   unsigned char c;
1813 
1814   if (re_string_eoi (input))
1815     {
1816       token->type = END_OF_RE;
1817       return 0;
1818     }
1819 
1820   c = re_string_peek_byte (input, 0);
1821   token->opr.c = c;
1822 
1823   token->word_char = 0;
1824 #ifdef RE_ENABLE_I18N
1825   token->mb_partial = 0;
1826   if (input->mb_cur_max > 1 &&
1827       !re_string_first_byte (input, re_string_cur_idx (input)))
1828     {
1829       token->type = CHARACTER;
1830       token->mb_partial = 1;
1831       return 1;
1832     }
1833 #endif
1834   if (c == '\\')
1835     {
1836       unsigned char c2;
1837       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1838 	{
1839 	  token->type = BACK_SLASH;
1840 	  return 1;
1841 	}
1842 
1843       c2 = re_string_peek_byte_case (input, 1);
1844       token->opr.c = c2;
1845       token->type = CHARACTER;
1846 #ifdef RE_ENABLE_I18N
1847       if (input->mb_cur_max > 1)
1848 	{
1849 	  wint_t wc = re_string_wchar_at (input,
1850 					  re_string_cur_idx (input) + 1);
1851 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1852 	}
1853       else
1854 #endif
1855 	token->word_char = IS_WORD_CHAR (c2) != 0;
1856 
1857       switch (c2)
1858 	{
1859 	case '|':
1860 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1861 	    token->type = OP_ALT;
1862 	  break;
1863 	case '1': case '2': case '3': case '4': case '5':
1864 	case '6': case '7': case '8': case '9':
1865 	  if (!(syntax & RE_NO_BK_REFS))
1866 	    {
1867 	      token->type = OP_BACK_REF;
1868 	      token->opr.idx = c2 - '1';
1869 	    }
1870 	  break;
1871 	case '<':
1872 	  if (!(syntax & RE_NO_GNU_OPS))
1873 	    {
1874 	      token->type = ANCHOR;
1875 	      token->opr.ctx_type = WORD_FIRST;
1876 	    }
1877 	  break;
1878 	case '>':
1879 	  if (!(syntax & RE_NO_GNU_OPS))
1880 	    {
1881 	      token->type = ANCHOR;
1882 	      token->opr.ctx_type = WORD_LAST;
1883 	    }
1884 	  break;
1885 	case 'b':
1886 	  if (!(syntax & RE_NO_GNU_OPS))
1887 	    {
1888 	      token->type = ANCHOR;
1889 	      token->opr.ctx_type = WORD_DELIM;
1890 	    }
1891 	  break;
1892 	case 'B':
1893 	  if (!(syntax & RE_NO_GNU_OPS))
1894 	    {
1895 	      token->type = ANCHOR;
1896 	      token->opr.ctx_type = NOT_WORD_DELIM;
1897 	    }
1898 	  break;
1899 	case 'w':
1900 	  if (!(syntax & RE_NO_GNU_OPS))
1901 	    token->type = OP_WORD;
1902 	  break;
1903 	case 'W':
1904 	  if (!(syntax & RE_NO_GNU_OPS))
1905 	    token->type = OP_NOTWORD;
1906 	  break;
1907 	case 's':
1908 	  if (!(syntax & RE_NO_GNU_OPS))
1909 	    token->type = OP_SPACE;
1910 	  break;
1911 	case 'S':
1912 	  if (!(syntax & RE_NO_GNU_OPS))
1913 	    token->type = OP_NOTSPACE;
1914 	  break;
1915 	case '`':
1916 	  if (!(syntax & RE_NO_GNU_OPS))
1917 	    {
1918 	      token->type = ANCHOR;
1919 	      token->opr.ctx_type = BUF_FIRST;
1920 	    }
1921 	  break;
1922 	case '\'':
1923 	  if (!(syntax & RE_NO_GNU_OPS))
1924 	    {
1925 	      token->type = ANCHOR;
1926 	      token->opr.ctx_type = BUF_LAST;
1927 	    }
1928 	  break;
1929 	case '(':
1930 	  if (!(syntax & RE_NO_BK_PARENS))
1931 	    token->type = OP_OPEN_SUBEXP;
1932 	  break;
1933 	case ')':
1934 	  if (!(syntax & RE_NO_BK_PARENS))
1935 	    token->type = OP_CLOSE_SUBEXP;
1936 	  break;
1937 	case '+':
1938 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1939 	    token->type = OP_DUP_PLUS;
1940 	  break;
1941 	case '?':
1942 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1943 	    token->type = OP_DUP_QUESTION;
1944 	  break;
1945 	case '{':
1946 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1947 	    token->type = OP_OPEN_DUP_NUM;
1948 	  break;
1949 	case '}':
1950 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1951 	    token->type = OP_CLOSE_DUP_NUM;
1952 	  break;
1953 	default:
1954 	  break;
1955 	}
1956       return 2;
1957     }
1958 
1959   token->type = CHARACTER;
1960 #ifdef RE_ENABLE_I18N
1961   if (input->mb_cur_max > 1)
1962     {
1963       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1964       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1965     }
1966   else
1967 #endif
1968     token->word_char = IS_WORD_CHAR (token->opr.c);
1969 
1970   switch (c)
1971     {
1972     case '\n':
1973       if (syntax & RE_NEWLINE_ALT)
1974 	token->type = OP_ALT;
1975       break;
1976     case '|':
1977       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1978 	token->type = OP_ALT;
1979       break;
1980     case '*':
1981       token->type = OP_DUP_ASTERISK;
1982       break;
1983     case '+':
1984       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1985 	token->type = OP_DUP_PLUS;
1986       break;
1987     case '?':
1988       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1989 	token->type = OP_DUP_QUESTION;
1990       break;
1991     case '{':
1992       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1993 	token->type = OP_OPEN_DUP_NUM;
1994       break;
1995     case '}':
1996       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1997 	token->type = OP_CLOSE_DUP_NUM;
1998       break;
1999     case '(':
2000       if (syntax & RE_NO_BK_PARENS)
2001 	token->type = OP_OPEN_SUBEXP;
2002       break;
2003     case ')':
2004       if (syntax & RE_NO_BK_PARENS)
2005 	token->type = OP_CLOSE_SUBEXP;
2006       break;
2007     case '[':
2008       token->type = OP_OPEN_BRACKET;
2009       break;
2010     case '.':
2011       token->type = OP_PERIOD;
2012       break;
2013     case '^':
2014       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2015 	  re_string_cur_idx (input) != 0)
2016 	{
2017 	  char prev = re_string_peek_byte (input, -1);
2018 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2019 	    break;
2020 	}
2021       token->type = ANCHOR;
2022       token->opr.ctx_type = LINE_FIRST;
2023       break;
2024     case '$':
2025       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2026 	  re_string_cur_idx (input) + 1 != re_string_length (input))
2027 	{
2028 	  re_token_t next;
2029 	  re_string_skip_bytes (input, 1);
2030 	  peek_token (&next, input, syntax);
2031 	  re_string_skip_bytes (input, -1);
2032 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2033 	    break;
2034 	}
2035       token->type = ANCHOR;
2036       token->opr.ctx_type = LINE_LAST;
2037       break;
2038     default:
2039       break;
2040     }
2041   return 1;
2042 }
2043 
2044 /* Peek a token from INPUT, and return the length of the token.
2045    We must not use this function out of bracket expressions.  */
2046 
2047 static int
2048 internal_function
2049 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2050 {
2051   unsigned char c;
2052   if (re_string_eoi (input))
2053     {
2054       token->type = END_OF_RE;
2055       return 0;
2056     }
2057   c = re_string_peek_byte (input, 0);
2058   token->opr.c = c;
2059 
2060 #ifdef RE_ENABLE_I18N
2061   if (input->mb_cur_max > 1 &&
2062       !re_string_first_byte (input, re_string_cur_idx (input)))
2063     {
2064       token->type = CHARACTER;
2065       return 1;
2066     }
2067 #endif /* RE_ENABLE_I18N */
2068 
2069   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2070       && re_string_cur_idx (input) + 1 < re_string_length (input))
2071     {
2072       /* In this case, '\' escape a character.  */
2073       unsigned char c2;
2074       re_string_skip_bytes (input, 1);
2075       c2 = re_string_peek_byte (input, 0);
2076       token->opr.c = c2;
2077       token->type = CHARACTER;
2078       return 1;
2079     }
2080   if (c == '[') /* '[' is a special char in a bracket exps.  */
2081     {
2082       unsigned char c2;
2083       int token_len;
2084       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2085 	c2 = re_string_peek_byte (input, 1);
2086       else
2087 	c2 = 0;
2088       token->opr.c = c2;
2089       token_len = 2;
2090       switch (c2)
2091 	{
2092 	case '.':
2093 	  token->type = OP_OPEN_COLL_ELEM;
2094 	  break;
2095 	case '=':
2096 	  token->type = OP_OPEN_EQUIV_CLASS;
2097 	  break;
2098 	case ':':
2099 	  if (syntax & RE_CHAR_CLASSES)
2100 	    {
2101 	      token->type = OP_OPEN_CHAR_CLASS;
2102 	      break;
2103 	    }
2104 	  /* else fall through.  */
2105 	default:
2106 	  token->type = CHARACTER;
2107 	  token->opr.c = c;
2108 	  token_len = 1;
2109 	  break;
2110 	}
2111       return token_len;
2112     }
2113   switch (c)
2114     {
2115     case '-':
2116       token->type = OP_CHARSET_RANGE;
2117       break;
2118     case ']':
2119       token->type = OP_CLOSE_BRACKET;
2120       break;
2121     case '^':
2122       token->type = OP_NON_MATCH_LIST;
2123       break;
2124     default:
2125       token->type = CHARACTER;
2126     }
2127   return 1;
2128 }
2129 
2130 /* Functions for parser.  */
2131 
2132 /* Entry point of the parser.
2133    Parse the regular expression REGEXP and return the structure tree.
2134    If an error occurs, ERR is set by error code, and return NULL.
2135    This function build the following tree, from regular expression <reg_exp>:
2136 	   CAT
2137 	   / \
2138 	  /   \
2139    <reg_exp>  EOR
2140 
2141    CAT means concatenation.
2142    EOR means end of regular expression.  */
2143 
2144 static bin_tree_t *
2145 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2146        reg_errcode_t *err)
2147 {
2148   re_dfa_t *dfa = preg->buffer;
2149   bin_tree_t *tree, *eor, *root;
2150   re_token_t current_token;
2151   dfa->syntax = syntax;
2152   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2153   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2154   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2155     return NULL;
2156   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2157   if (tree != NULL)
2158     root = create_tree (dfa, tree, eor, CONCAT);
2159   else
2160     root = eor;
2161   if (BE (eor == NULL || root == NULL, 0))
2162     {
2163       *err = REG_ESPACE;
2164       return NULL;
2165     }
2166   return root;
2167 }
2168 
2169 /* This function build the following tree, from regular expression
2170    <branch1>|<branch2>:
2171 	   ALT
2172 	   / \
2173 	  /   \
2174    <branch1> <branch2>
2175 
2176    ALT means alternative, which represents the operator '|'.  */
2177 
2178 static bin_tree_t *
2179 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2180 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2181 {
2182   re_dfa_t *dfa = preg->buffer;
2183   bin_tree_t *tree, *branch = NULL;
2184   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2185   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2186     return NULL;
2187 
2188   while (token->type == OP_ALT)
2189     {
2190       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2191       if (token->type != OP_ALT && token->type != END_OF_RE
2192 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2193 	{
2194 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2195 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2196 	    return NULL;
2197 	}
2198       else
2199 	branch = NULL;
2200       tree = create_tree (dfa, tree, branch, OP_ALT);
2201       if (BE (tree == NULL, 0))
2202 	{
2203 	  *err = REG_ESPACE;
2204 	  return NULL;
2205 	}
2206     }
2207   return tree;
2208 }
2209 
2210 /* This function build the following tree, from regular expression
2211    <exp1><exp2>:
2212 	CAT
2213 	/ \
2214        /   \
2215    <exp1> <exp2>
2216 
2217    CAT means concatenation.  */
2218 
2219 static bin_tree_t *
2220 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2221 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2222 {
2223   bin_tree_t *tree, *expr;
2224   re_dfa_t *dfa = preg->buffer;
2225   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2226   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2227     return NULL;
2228 
2229   while (token->type != OP_ALT && token->type != END_OF_RE
2230 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2231     {
2232       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2233       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2234 	{
2235 	  if (tree != NULL)
2236 	    postorder (tree, free_tree, NULL);
2237 	  return NULL;
2238 	}
2239       if (tree != NULL && expr != NULL)
2240 	{
2241 	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2242 	  if (newtree == NULL)
2243 	    {
2244 	      postorder (expr, free_tree, NULL);
2245 	      postorder (tree, free_tree, NULL);
2246 	      *err = REG_ESPACE;
2247 	      return NULL;
2248 	    }
2249 	  tree = newtree;
2250 	}
2251       else if (tree == NULL)
2252 	tree = expr;
2253       /* Otherwise expr == NULL, we don't need to create new tree.  */
2254     }
2255   return tree;
2256 }
2257 
2258 /* This function build the following tree, from regular expression a*:
2259 	 *
2260 	 |
2261 	 a
2262 */
2263 
2264 static bin_tree_t *
2265 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2266 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2267 {
2268   re_dfa_t *dfa = preg->buffer;
2269   bin_tree_t *tree;
2270   switch (token->type)
2271     {
2272     case CHARACTER:
2273       tree = create_token_tree (dfa, NULL, NULL, token);
2274       if (BE (tree == NULL, 0))
2275 	{
2276 	  *err = REG_ESPACE;
2277 	  return NULL;
2278 	}
2279 #ifdef RE_ENABLE_I18N
2280       if (dfa->mb_cur_max > 1)
2281 	{
2282 	  while (!re_string_eoi (regexp)
2283 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2284 	    {
2285 	      bin_tree_t *mbc_remain;
2286 	      fetch_token (token, regexp, syntax);
2287 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2288 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2289 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2290 		{
2291 		  *err = REG_ESPACE;
2292 		  return NULL;
2293 		}
2294 	    }
2295 	}
2296 #endif
2297       break;
2298     case OP_OPEN_SUBEXP:
2299       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2300       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2301 	return NULL;
2302       break;
2303     case OP_OPEN_BRACKET:
2304       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2305       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2306 	return NULL;
2307       break;
2308     case OP_BACK_REF:
2309       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2310 	{
2311 	  *err = REG_ESUBREG;
2312 	  return NULL;
2313 	}
2314       dfa->used_bkref_map |= 1 << token->opr.idx;
2315       tree = create_token_tree (dfa, NULL, NULL, token);
2316       if (BE (tree == NULL, 0))
2317 	{
2318 	  *err = REG_ESPACE;
2319 	  return NULL;
2320 	}
2321       ++dfa->nbackref;
2322       dfa->has_mb_node = 1;
2323       break;
2324     case OP_OPEN_DUP_NUM:
2325       if (syntax & RE_CONTEXT_INVALID_DUP)
2326 	{
2327 	  *err = REG_BADRPT;
2328 	  return NULL;
2329 	}
2330       /* FALLTHROUGH */
2331     case OP_DUP_ASTERISK:
2332     case OP_DUP_PLUS:
2333     case OP_DUP_QUESTION:
2334       if (syntax & RE_CONTEXT_INVALID_OPS)
2335 	{
2336 	  *err = REG_BADRPT;
2337 	  return NULL;
2338 	}
2339       else if (syntax & RE_CONTEXT_INDEP_OPS)
2340 	{
2341 	  fetch_token (token, regexp, syntax);
2342 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2343 	}
2344       /* else fall through  */
2345     case OP_CLOSE_SUBEXP:
2346       if ((token->type == OP_CLOSE_SUBEXP) &&
2347 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2348 	{
2349 	  *err = REG_ERPAREN;
2350 	  return NULL;
2351 	}
2352       /* else fall through  */
2353     case OP_CLOSE_DUP_NUM:
2354       /* We treat it as a normal character.  */
2355 
2356       /* Then we can these characters as normal characters.  */
2357       token->type = CHARACTER;
2358       /* mb_partial and word_char bits should be initialized already
2359 	 by peek_token.  */
2360       tree = create_token_tree (dfa, NULL, NULL, token);
2361       if (BE (tree == NULL, 0))
2362 	{
2363 	  *err = REG_ESPACE;
2364 	  return NULL;
2365 	}
2366       break;
2367     case ANCHOR:
2368       if ((token->opr.ctx_type
2369 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2370 	  && dfa->word_ops_used == 0)
2371 	init_word_char (dfa);
2372       if (token->opr.ctx_type == WORD_DELIM
2373 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2374 	{
2375 	  bin_tree_t *tree_first, *tree_last;
2376 	  if (token->opr.ctx_type == WORD_DELIM)
2377 	    {
2378 	      token->opr.ctx_type = WORD_FIRST;
2379 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2380 	      token->opr.ctx_type = WORD_LAST;
2381 	    }
2382 	  else
2383 	    {
2384 	      token->opr.ctx_type = INSIDE_WORD;
2385 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2386 	      token->opr.ctx_type = INSIDE_NOTWORD;
2387 	    }
2388 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2389 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2390 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2391 	    {
2392 	      *err = REG_ESPACE;
2393 	      return NULL;
2394 	    }
2395 	}
2396       else
2397 	{
2398 	  tree = create_token_tree (dfa, NULL, NULL, token);
2399 	  if (BE (tree == NULL, 0))
2400 	    {
2401 	      *err = REG_ESPACE;
2402 	      return NULL;
2403 	    }
2404 	}
2405       /* We must return here, since ANCHORs can't be followed
2406 	 by repetition operators.
2407 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2408 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2409       fetch_token (token, regexp, syntax);
2410       return tree;
2411     case OP_PERIOD:
2412       tree = create_token_tree (dfa, NULL, NULL, token);
2413       if (BE (tree == NULL, 0))
2414 	{
2415 	  *err = REG_ESPACE;
2416 	  return NULL;
2417 	}
2418       if (dfa->mb_cur_max > 1)
2419 	dfa->has_mb_node = 1;
2420       break;
2421     case OP_WORD:
2422     case OP_NOTWORD:
2423       tree = build_charclass_op (dfa, regexp->trans,
2424 				 (const unsigned char *) "alnum",
2425 				 (const unsigned char *) "_",
2426 				 token->type == OP_NOTWORD, err);
2427       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2428 	return NULL;
2429       break;
2430     case OP_SPACE:
2431     case OP_NOTSPACE:
2432       tree = build_charclass_op (dfa, regexp->trans,
2433 				 (const unsigned char *) "space",
2434 				 (const unsigned char *) "",
2435 				 token->type == OP_NOTSPACE, err);
2436       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2437 	return NULL;
2438       break;
2439     case OP_ALT:
2440     case END_OF_RE:
2441       return NULL;
2442     case BACK_SLASH:
2443       *err = REG_EESCAPE;
2444       return NULL;
2445     default:
2446       /* Must not happen?  */
2447 #ifdef DEBUG
2448       assert (0);
2449 #endif
2450       return NULL;
2451     }
2452   fetch_token (token, regexp, syntax);
2453 
2454   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2455 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2456     {
2457       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2458       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2459 	return NULL;
2460       /* In BRE consecutive duplications are not allowed.  */
2461       if ((syntax & RE_CONTEXT_INVALID_DUP)
2462 	  && (token->type == OP_DUP_ASTERISK
2463 	      || token->type == OP_OPEN_DUP_NUM))
2464 	{
2465 	  *err = REG_BADRPT;
2466 	  return NULL;
2467 	}
2468     }
2469 
2470   return tree;
2471 }
2472 
2473 /* This function build the following tree, from regular expression
2474    (<reg_exp>):
2475 	 SUBEXP
2476 	    |
2477 	<reg_exp>
2478 */
2479 
2480 static bin_tree_t *
2481 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2482 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2483 {
2484   re_dfa_t *dfa = preg->buffer;
2485   bin_tree_t *tree;
2486   size_t cur_nsub;
2487   cur_nsub = preg->re_nsub++;
2488 
2489   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2490 
2491   /* The subexpression may be a null string.  */
2492   if (token->type == OP_CLOSE_SUBEXP)
2493     tree = NULL;
2494   else
2495     {
2496       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2497       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2498 	{
2499 	  if (tree != NULL)
2500 	    postorder (tree, free_tree, NULL);
2501 	  *err = REG_EPAREN;
2502 	}
2503       if (BE (*err != REG_NOERROR, 0))
2504 	return NULL;
2505     }
2506 
2507   if (cur_nsub <= '9' - '1')
2508     dfa->completed_bkref_map |= 1 << cur_nsub;
2509 
2510   tree = create_tree (dfa, tree, NULL, SUBEXP);
2511   if (BE (tree == NULL, 0))
2512     {
2513       *err = REG_ESPACE;
2514       return NULL;
2515     }
2516   tree->token.opr.idx = cur_nsub;
2517   return tree;
2518 }
2519 
2520 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2521 
2522 static bin_tree_t *
2523 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2524 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2525 {
2526   bin_tree_t *tree = NULL, *old_tree = NULL;
2527   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2528   re_token_t start_token = *token;
2529 
2530   if (token->type == OP_OPEN_DUP_NUM)
2531     {
2532       end = 0;
2533       start = fetch_number (regexp, token, syntax);
2534       if (start == REG_MISSING)
2535 	{
2536 	  if (token->type == CHARACTER && token->opr.c == ',')
2537 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2538 	  else
2539 	    {
2540 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2541 	      return NULL;
2542 	    }
2543 	}
2544       if (BE (start != REG_ERROR, 1))
2545 	{
2546 	  /* We treat "{n}" as "{n,n}".  */
2547 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2548 		 : ((token->type == CHARACTER && token->opr.c == ',')
2549 		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2550 	}
2551       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2552 	{
2553 	  /* Invalid sequence.  */
2554 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2555 	    {
2556 	      if (token->type == END_OF_RE)
2557 		*err = REG_EBRACE;
2558 	      else
2559 		*err = REG_BADBR;
2560 
2561 	      return NULL;
2562 	    }
2563 
2564 	  /* If the syntax bit is set, rollback.  */
2565 	  re_string_set_index (regexp, start_idx);
2566 	  *token = start_token;
2567 	  token->type = CHARACTER;
2568 	  /* mb_partial and word_char bits should be already initialized by
2569 	     peek_token.  */
2570 	  return elem;
2571 	}
2572 
2573       if (BE ((end != REG_MISSING && start > end)
2574 	      || token->type != OP_CLOSE_DUP_NUM, 0))
2575 	{
2576 	  /* First number greater than second.  */
2577 	  *err = REG_BADBR;
2578 	  return NULL;
2579 	}
2580 
2581       if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2582 	{
2583 	  *err = REG_ESIZE;
2584 	  return NULL;
2585 	}
2586     }
2587   else
2588     {
2589       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2590       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2591     }
2592 
2593   fetch_token (token, regexp, syntax);
2594 
2595   if (BE (elem == NULL, 0))
2596     return NULL;
2597   if (BE (start == 0 && end == 0, 0))
2598     {
2599       postorder (elem, free_tree, NULL);
2600       return NULL;
2601     }
2602 
2603   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2604   if (BE (start > 0, 0))
2605     {
2606       tree = elem;
2607       for (i = 2; i <= start; ++i)
2608 	{
2609 	  elem = duplicate_tree (elem, dfa);
2610 	  tree = create_tree (dfa, tree, elem, CONCAT);
2611 	  if (BE (elem == NULL || tree == NULL, 0))
2612 	    goto parse_dup_op_espace;
2613 	}
2614 
2615       if (start == end)
2616 	return tree;
2617 
2618       /* Duplicate ELEM before it is marked optional.  */
2619       elem = duplicate_tree (elem, dfa);
2620       old_tree = tree;
2621     }
2622   else
2623     old_tree = NULL;
2624 
2625   if (elem->token.type == SUBEXP)
2626     {
2627       uintptr_t subidx = elem->token.opr.idx;
2628       postorder (elem, mark_opt_subexp, (void *) subidx);
2629     }
2630 
2631   tree = create_tree (dfa, elem, NULL,
2632 		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2633   if (BE (tree == NULL, 0))
2634     goto parse_dup_op_espace;
2635 
2636 /* From gnulib's "intprops.h":
2637    True if the arithmetic type T is signed.  */
2638 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2639 
2640   /* This loop is actually executed only when end != REG_MISSING,
2641      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2642      already created the start+1-th copy.  */
2643   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2644     for (i = start + 2; i <= end; ++i)
2645       {
2646 	elem = duplicate_tree (elem, dfa);
2647 	tree = create_tree (dfa, tree, elem, CONCAT);
2648 	if (BE (elem == NULL || tree == NULL, 0))
2649 	  goto parse_dup_op_espace;
2650 
2651 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2652 	if (BE (tree == NULL, 0))
2653 	  goto parse_dup_op_espace;
2654       }
2655 
2656   if (old_tree)
2657     tree = create_tree (dfa, old_tree, tree, CONCAT);
2658 
2659   return tree;
2660 
2661  parse_dup_op_espace:
2662   *err = REG_ESPACE;
2663   return NULL;
2664 }
2665 
2666 /* Size of the names for collating symbol/equivalence_class/character_class.
2667    I'm not sure, but maybe enough.  */
2668 #define BRACKET_NAME_BUF_SIZE 32
2669 
2670 #ifndef _LIBC
2671   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2672      Build the range expression which starts from START_ELEM, and ends
2673      at END_ELEM.  The result are written to MBCSET and SBCSET.
2674      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2675      mbcset->range_ends, is a pointer argument since we may
2676      update it.  */
2677 
2678 static reg_errcode_t
2679 internal_function
2680 # ifdef RE_ENABLE_I18N
2681 build_range_exp (const reg_syntax_t syntax,
2682                  bitset_t sbcset,
2683                  re_charset_t *mbcset,
2684                  Idx *range_alloc,
2685                  const bracket_elem_t *start_elem,
2686                  const bracket_elem_t *end_elem)
2687 # else /* not RE_ENABLE_I18N */
2688 build_range_exp (const reg_syntax_t syntax,
2689                  bitset_t sbcset,
2690                  const bracket_elem_t *start_elem,
2691                  const bracket_elem_t *end_elem)
2692 # endif /* not RE_ENABLE_I18N */
2693 {
2694   unsigned int start_ch, end_ch;
2695   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2696   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2697 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2698 	  0))
2699     return REG_ERANGE;
2700 
2701   /* We can handle no multi character collating elements without libc
2702      support.  */
2703   if (BE ((start_elem->type == COLL_SYM
2704 	   && strlen ((char *) start_elem->opr.name) > 1)
2705 	  || (end_elem->type == COLL_SYM
2706 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2707     return REG_ECOLLATE;
2708 
2709 # ifdef RE_ENABLE_I18N
2710   {
2711     wchar_t wc;
2712     wint_t start_wc;
2713     wint_t end_wc;
2714     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2715 
2716     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2717 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2718 		   : 0));
2719     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2720 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2721 		 : 0));
2722     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2723 		? __btowc (start_ch) : start_elem->opr.wch);
2724     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2725 	      ? __btowc (end_ch) : end_elem->opr.wch);
2726     if (start_wc == WEOF || end_wc == WEOF)
2727       return REG_ECOLLATE;
2728     cmp_buf[0] = start_wc;
2729     cmp_buf[4] = end_wc;
2730 
2731     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2732             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2733       return REG_ERANGE;
2734 
2735     /* Got valid collation sequence values, add them as a new entry.
2736        However, for !_LIBC we have no collation elements: if the
2737        character set is single byte, the single byte character set
2738        that we build below suffices.  parse_bracket_exp passes
2739        no MBCSET if dfa->mb_cur_max == 1.  */
2740     if (mbcset)
2741       {
2742 	/* Check the space of the arrays.  */
2743 	if (BE (*range_alloc == mbcset->nranges, 0))
2744 	  {
2745 	    /* There is not enough space, need realloc.  */
2746 	    wchar_t *new_array_start, *new_array_end;
2747 	    Idx new_nranges;
2748 
2749 	    /* +1 in case of mbcset->nranges is 0.  */
2750 	    new_nranges = 2 * mbcset->nranges + 1;
2751 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2752 	       are NULL if *range_alloc == 0.  */
2753 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2754 					  new_nranges);
2755 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2756 					new_nranges);
2757 
2758 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2759 	      return REG_ESPACE;
2760 
2761 	    mbcset->range_starts = new_array_start;
2762 	    mbcset->range_ends = new_array_end;
2763 	    *range_alloc = new_nranges;
2764 	  }
2765 
2766 	mbcset->range_starts[mbcset->nranges] = start_wc;
2767 	mbcset->range_ends[mbcset->nranges++] = end_wc;
2768       }
2769 
2770     /* Build the table for single byte characters.  */
2771     for (wc = 0; wc < SBC_MAX; ++wc)
2772       {
2773 	cmp_buf[2] = wc;
2774 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2775 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2776 	  bitset_set (sbcset, wc);
2777       }
2778   }
2779 # else /* not RE_ENABLE_I18N */
2780   {
2781     unsigned int ch;
2782     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2783 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2784 		   : 0));
2785     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2786 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2787 		 : 0));
2788     if (start_ch > end_ch)
2789       return REG_ERANGE;
2790     /* Build the table for single byte characters.  */
2791     for (ch = 0; ch < SBC_MAX; ++ch)
2792       if (start_ch <= ch  && ch <= end_ch)
2793 	bitset_set (sbcset, ch);
2794   }
2795 # endif /* not RE_ENABLE_I18N */
2796   return REG_NOERROR;
2797 }
2798 #endif /* not _LIBC */
2799 
2800 #ifndef _LIBC
2801 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2802    Build the collating element which is represented by NAME.
2803    The result are written to MBCSET and SBCSET.
2804    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2805    pointer argument since we may update it.  */
2806 
2807 static reg_errcode_t
2808 internal_function
2809 # ifdef RE_ENABLE_I18N
2810 build_collating_symbol (bitset_t sbcset,
2811                         re_charset_t *mbcset _UNUSED_PARAMETER_,
2812 			Idx *coll_sym_alloc _UNUSED_PARAMETER_,
2813                         const unsigned char *name)
2814 # else /* not RE_ENABLE_I18N */
2815 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2816 # endif /* not RE_ENABLE_I18N */
2817 {
2818   size_t name_len = strlen ((const char *) name);
2819   if (BE (name_len != 1, 0))
2820     return REG_ECOLLATE;
2821   else
2822     {
2823       bitset_set (sbcset, name[0]);
2824       return REG_NOERROR;
2825     }
2826 }
2827 #endif /* not _LIBC */
2828 
2829 /* This function parse bracket expression like "[abc]", "[a-c]",
2830    "[[.a-a.]]" etc.  */
2831 
2832 static bin_tree_t *
2833 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2834 		   reg_syntax_t syntax, reg_errcode_t *err)
2835 {
2836 #ifdef _LIBC
2837   const unsigned char *collseqmb;
2838   const char *collseqwc;
2839   uint32_t nrules;
2840   int32_t table_size;
2841   const int32_t *symb_table;
2842   const unsigned char *extra;
2843 
2844   /* Local function for parse_bracket_exp used in _LIBC environment.
2845      Seek the collating symbol entry corresponding to NAME.
2846      Return the index of the symbol in the SYMB_TABLE.  */
2847 
2848   auto inline int32_t
2849   __attribute ((always_inline))
2850   seek_collating_symbol_entry (name, name_len)
2851 	 const unsigned char *name;
2852 	 size_t name_len;
2853     {
2854       int32_t hash = elem_hash ((const char *) name, name_len);
2855       int32_t elem = hash % table_size;
2856       if (symb_table[2 * elem] != 0)
2857 	{
2858 	  int32_t second = hash % (table_size - 2) + 1;
2859 
2860 	  do
2861 	    {
2862 	      /* First compare the hashing value.  */
2863 	      if (symb_table[2 * elem] == hash
2864 		  /* Compare the length of the name.  */
2865 		  && name_len == extra[symb_table[2 * elem + 1]]
2866 		  /* Compare the name.  */
2867 		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2868 			     name_len) == 0)
2869 		{
2870 		  /* Yep, this is the entry.  */
2871 		  break;
2872 		}
2873 
2874 	      /* Next entry.  */
2875 	      elem += second;
2876 	    }
2877 	  while (symb_table[2 * elem] != 0);
2878 	}
2879       return elem;
2880     }
2881 
2882   /* Local function for parse_bracket_exp used in _LIBC environment.
2883      Look up the collation sequence value of BR_ELEM.
2884      Return the value if succeeded, UINT_MAX otherwise.  */
2885 
2886   auto inline unsigned int
2887   __attribute ((always_inline))
2888   lookup_collation_sequence_value (br_elem)
2889 	 bracket_elem_t *br_elem;
2890     {
2891       if (br_elem->type == SB_CHAR)
2892 	{
2893 	  /*
2894 	  if (MB_CUR_MAX == 1)
2895 	  */
2896 	  if (nrules == 0)
2897 	    return collseqmb[br_elem->opr.ch];
2898 	  else
2899 	    {
2900 	      wint_t wc = __btowc (br_elem->opr.ch);
2901 	      return __collseq_table_lookup (collseqwc, wc);
2902 	    }
2903 	}
2904       else if (br_elem->type == MB_CHAR)
2905 	{
2906 	  if (nrules != 0)
2907 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2908 	}
2909       else if (br_elem->type == COLL_SYM)
2910 	{
2911 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2912 	  if (nrules != 0)
2913 	    {
2914 	      int32_t elem, idx;
2915 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2916 						  sym_name_len);
2917 	      if (symb_table[2 * elem] != 0)
2918 		{
2919 		  /* We found the entry.  */
2920 		  idx = symb_table[2 * elem + 1];
2921 		  /* Skip the name of collating element name.  */
2922 		  idx += 1 + extra[idx];
2923 		  /* Skip the byte sequence of the collating element.  */
2924 		  idx += 1 + extra[idx];
2925 		  /* Adjust for the alignment.  */
2926 		  idx = (idx + 3) & ~3;
2927 		  /* Skip the multibyte collation sequence value.  */
2928 		  idx += sizeof (unsigned int);
2929 		  /* Skip the wide char sequence of the collating element.  */
2930 		  idx += sizeof (unsigned int) *
2931 		    (1 + *(unsigned int *) (extra + idx));
2932 		  /* Return the collation sequence value.  */
2933 		  return *(unsigned int *) (extra + idx);
2934 		}
2935 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2936 		{
2937 		  /* No valid character.  Match it as a single byte
2938 		     character.  */
2939 		  return collseqmb[br_elem->opr.name[0]];
2940 		}
2941 	    }
2942 	  else if (sym_name_len == 1)
2943 	    return collseqmb[br_elem->opr.name[0]];
2944 	}
2945       return UINT_MAX;
2946     }
2947 
2948   /* Local function for parse_bracket_exp used in _LIBC environment.
2949      Build the range expression which starts from START_ELEM, and ends
2950      at END_ELEM.  The result are written to MBCSET and SBCSET.
2951      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2952      mbcset->range_ends, is a pointer argument since we may
2953      update it.  */
2954 
2955   auto inline reg_errcode_t
2956   __attribute ((always_inline))
2957   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2958 	 re_charset_t *mbcset;
2959 	 Idx *range_alloc;
2960 	 bitset_t sbcset;
2961 	 bracket_elem_t *start_elem, *end_elem;
2962     {
2963       unsigned int ch;
2964       uint32_t start_collseq;
2965       uint32_t end_collseq;
2966 
2967       /* Equivalence Classes and Character Classes can't be a range
2968 	 start/end.  */
2969       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2970 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2971 	      0))
2972 	return REG_ERANGE;
2973 
2974       start_collseq = lookup_collation_sequence_value (start_elem);
2975       end_collseq = lookup_collation_sequence_value (end_elem);
2976       /* Check start/end collation sequence values.  */
2977       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2978 	return REG_ECOLLATE;
2979       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2980 	return REG_ERANGE;
2981 
2982       /* Got valid collation sequence values, add them as a new entry.
2983 	 However, if we have no collation elements, and the character set
2984 	 is single byte, the single byte character set that we
2985 	 build below suffices. */
2986       if (nrules > 0 || dfa->mb_cur_max > 1)
2987 	{
2988 	  /* Check the space of the arrays.  */
2989 	  if (BE (*range_alloc == mbcset->nranges, 0))
2990 	    {
2991 	      /* There is not enough space, need realloc.  */
2992 	      uint32_t *new_array_start;
2993 	      uint32_t *new_array_end;
2994 	      Idx new_nranges;
2995 
2996 	      /* +1 in case of mbcset->nranges is 0.  */
2997 	      new_nranges = 2 * mbcset->nranges + 1;
2998 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2999 					    new_nranges);
3000 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3001 					  new_nranges);
3002 
3003 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3004 		return REG_ESPACE;
3005 
3006 	      mbcset->range_starts = new_array_start;
3007 	      mbcset->range_ends = new_array_end;
3008 	      *range_alloc = new_nranges;
3009 	    }
3010 
3011 	  mbcset->range_starts[mbcset->nranges] = start_collseq;
3012 	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
3013 	}
3014 
3015       /* Build the table for single byte characters.  */
3016       for (ch = 0; ch < SBC_MAX; ch++)
3017 	{
3018 	  uint32_t ch_collseq;
3019 	  /*
3020 	  if (MB_CUR_MAX == 1)
3021 	  */
3022 	  if (nrules == 0)
3023 	    ch_collseq = collseqmb[ch];
3024 	  else
3025 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3026 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3027 	    bitset_set (sbcset, ch);
3028 	}
3029       return REG_NOERROR;
3030     }
3031 
3032   /* Local function for parse_bracket_exp used in _LIBC environment.
3033      Build the collating element which is represented by NAME.
3034      The result are written to MBCSET and SBCSET.
3035      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3036      pointer argument since we may update it.  */
3037 
3038   auto inline reg_errcode_t
3039   __attribute ((always_inline))
3040   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3041 	 re_charset_t *mbcset;
3042 	 Idx *coll_sym_alloc;
3043 	 bitset_t sbcset;
3044 	 const unsigned char *name;
3045     {
3046       int32_t elem, idx;
3047       size_t name_len = strlen ((const char *) name);
3048       if (nrules != 0)
3049 	{
3050 	  elem = seek_collating_symbol_entry (name, name_len);
3051 	  if (symb_table[2 * elem] != 0)
3052 	    {
3053 	      /* We found the entry.  */
3054 	      idx = symb_table[2 * elem + 1];
3055 	      /* Skip the name of collating element name.  */
3056 	      idx += 1 + extra[idx];
3057 	    }
3058 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
3059 	    {
3060 	      /* No valid character, treat it as a normal
3061 		 character.  */
3062 	      bitset_set (sbcset, name[0]);
3063 	      return REG_NOERROR;
3064 	    }
3065 	  else
3066 	    return REG_ECOLLATE;
3067 
3068 	  /* Got valid collation sequence, add it as a new entry.  */
3069 	  /* Check the space of the arrays.  */
3070 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3071 	    {
3072 	      /* Not enough, realloc it.  */
3073 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3074 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3075 	      /* Use realloc since mbcset->coll_syms is NULL
3076 		 if *alloc == 0.  */
3077 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3078 						   new_coll_sym_alloc);
3079 	      if (BE (new_coll_syms == NULL, 0))
3080 		return REG_ESPACE;
3081 	      mbcset->coll_syms = new_coll_syms;
3082 	      *coll_sym_alloc = new_coll_sym_alloc;
3083 	    }
3084 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3085 	  return REG_NOERROR;
3086 	}
3087       else
3088 	{
3089 	  if (BE (name_len != 1, 0))
3090 	    return REG_ECOLLATE;
3091 	  else
3092 	    {
3093 	      bitset_set (sbcset, name[0]);
3094 	      return REG_NOERROR;
3095 	    }
3096 	}
3097     }
3098 #endif
3099 
3100   re_token_t br_token;
3101   re_bitset_ptr_t sbcset;
3102 #ifdef RE_ENABLE_I18N
3103   re_charset_t *mbcset;
3104   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3105   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3106 #endif /* not RE_ENABLE_I18N */
3107   bool non_match = false;
3108   bin_tree_t *work_tree;
3109   int token_len;
3110   bool first_round = true;
3111 #ifdef _LIBC
3112   collseqmb = (const unsigned char *)
3113     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3114   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3115   if (nrules)
3116     {
3117       /*
3118       if (MB_CUR_MAX > 1)
3119       */
3120       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3121       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3122       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3123 						  _NL_COLLATE_SYMB_TABLEMB);
3124       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3125 						   _NL_COLLATE_SYMB_EXTRAMB);
3126     }
3127 #endif
3128   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3129 #ifdef RE_ENABLE_I18N
3130   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3131 #endif /* RE_ENABLE_I18N */
3132 #ifdef RE_ENABLE_I18N
3133   if (BE (sbcset == NULL || mbcset == NULL, 0))
3134 #else
3135   if (BE (sbcset == NULL, 0))
3136 #endif /* RE_ENABLE_I18N */
3137     {
3138       re_free (sbcset);
3139 #ifdef RE_ENABLE_I18N
3140       re_free (mbcset);
3141 #endif
3142       *err = REG_ESPACE;
3143       return NULL;
3144     }
3145 
3146   token_len = peek_token_bracket (token, regexp, syntax);
3147   if (BE (token->type == END_OF_RE, 0))
3148     {
3149       *err = REG_BADPAT;
3150       goto parse_bracket_exp_free_return;
3151     }
3152   if (token->type == OP_NON_MATCH_LIST)
3153     {
3154 #ifdef RE_ENABLE_I18N
3155       mbcset->non_match = 1;
3156 #endif /* not RE_ENABLE_I18N */
3157       non_match = true;
3158       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3159 	bitset_set (sbcset, '\n');
3160       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3161       token_len = peek_token_bracket (token, regexp, syntax);
3162       if (BE (token->type == END_OF_RE, 0))
3163 	{
3164 	  *err = REG_BADPAT;
3165 	  goto parse_bracket_exp_free_return;
3166 	}
3167     }
3168 
3169   /* We treat the first ']' as a normal character.  */
3170   if (token->type == OP_CLOSE_BRACKET)
3171     token->type = CHARACTER;
3172 
3173   while (1)
3174     {
3175       bracket_elem_t start_elem, end_elem;
3176       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3177       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3178       reg_errcode_t ret;
3179       int token_len2 = 0;
3180       bool is_range_exp = false;
3181       re_token_t token2;
3182 
3183       start_elem.opr.name = start_name_buf;
3184       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3185 				   syntax, first_round);
3186       if (BE (ret != REG_NOERROR, 0))
3187 	{
3188 	  *err = ret;
3189 	  goto parse_bracket_exp_free_return;
3190 	}
3191       first_round = false;
3192 
3193       /* Get information about the next token.  We need it in any case.  */
3194       token_len = peek_token_bracket (token, regexp, syntax);
3195 
3196       /* Do not check for ranges if we know they are not allowed.  */
3197       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3198 	{
3199 	  if (BE (token->type == END_OF_RE, 0))
3200 	    {
3201 	      *err = REG_EBRACK;
3202 	      goto parse_bracket_exp_free_return;
3203 	    }
3204 	  if (token->type == OP_CHARSET_RANGE)
3205 	    {
3206 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3207 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3208 	      if (BE (token2.type == END_OF_RE, 0))
3209 		{
3210 		  *err = REG_EBRACK;
3211 		  goto parse_bracket_exp_free_return;
3212 		}
3213 	      if (token2.type == OP_CLOSE_BRACKET)
3214 		{
3215 		  /* We treat the last '-' as a normal character.  */
3216 		  re_string_skip_bytes (regexp, -token_len);
3217 		  token->type = CHARACTER;
3218 		}
3219 	      else
3220 		is_range_exp = true;
3221 	    }
3222 	}
3223 
3224       if (is_range_exp == true)
3225 	{
3226 	  end_elem.opr.name = end_name_buf;
3227 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3228 				       dfa, syntax, true);
3229 	  if (BE (ret != REG_NOERROR, 0))
3230 	    {
3231 	      *err = ret;
3232 	      goto parse_bracket_exp_free_return;
3233 	    }
3234 
3235 	  token_len = peek_token_bracket (token, regexp, syntax);
3236 
3237 #ifdef _LIBC
3238 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3239 				  &start_elem, &end_elem);
3240 #else
3241 # ifdef RE_ENABLE_I18N
3242 	  *err = build_range_exp (syntax, sbcset,
3243 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3244 				  &range_alloc, &start_elem, &end_elem);
3245 # else
3246 	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3247 # endif
3248 #endif /* RE_ENABLE_I18N */
3249 	  if (BE (*err != REG_NOERROR, 0))
3250 	    goto parse_bracket_exp_free_return;
3251 	}
3252       else
3253 	{
3254 	  switch (start_elem.type)
3255 	    {
3256 	    case SB_CHAR:
3257 	      bitset_set (sbcset, start_elem.opr.ch);
3258 	      break;
3259 #ifdef RE_ENABLE_I18N
3260 	    case MB_CHAR:
3261 	      /* Check whether the array has enough space.  */
3262 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3263 		{
3264 		  wchar_t *new_mbchars;
3265 		  /* Not enough, realloc it.  */
3266 		  /* +1 in case of mbcset->nmbchars is 0.  */
3267 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3268 		  /* Use realloc since array is NULL if *alloc == 0.  */
3269 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3270 					    mbchar_alloc);
3271 		  if (BE (new_mbchars == NULL, 0))
3272 		    goto parse_bracket_exp_espace;
3273 		  mbcset->mbchars = new_mbchars;
3274 		}
3275 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3276 	      break;
3277 #endif /* RE_ENABLE_I18N */
3278 	    case EQUIV_CLASS:
3279 	      *err = build_equiv_class (sbcset,
3280 #ifdef RE_ENABLE_I18N
3281 					mbcset, &equiv_class_alloc,
3282 #endif /* RE_ENABLE_I18N */
3283 					start_elem.opr.name);
3284 	      if (BE (*err != REG_NOERROR, 0))
3285 		goto parse_bracket_exp_free_return;
3286 	      break;
3287 	    case COLL_SYM:
3288 	      *err = build_collating_symbol (sbcset,
3289 #ifdef RE_ENABLE_I18N
3290 					     mbcset, &coll_sym_alloc,
3291 #endif /* RE_ENABLE_I18N */
3292 					     start_elem.opr.name);
3293 	      if (BE (*err != REG_NOERROR, 0))
3294 		goto parse_bracket_exp_free_return;
3295 	      break;
3296 	    case CHAR_CLASS:
3297 	      *err = build_charclass (regexp->trans, sbcset,
3298 #ifdef RE_ENABLE_I18N
3299 				      mbcset, &char_class_alloc,
3300 #endif /* RE_ENABLE_I18N */
3301 				      start_elem.opr.name, syntax);
3302 	      if (BE (*err != REG_NOERROR, 0))
3303 	       goto parse_bracket_exp_free_return;
3304 	      break;
3305 	    default:
3306 	      assert (0);
3307 	      break;
3308 	    }
3309 	}
3310       if (BE (token->type == END_OF_RE, 0))
3311 	{
3312 	  *err = REG_EBRACK;
3313 	  goto parse_bracket_exp_free_return;
3314 	}
3315       if (token->type == OP_CLOSE_BRACKET)
3316 	break;
3317     }
3318 
3319   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3320 
3321   /* If it is non-matching list.  */
3322   if (non_match)
3323     bitset_not (sbcset);
3324 
3325 #ifdef RE_ENABLE_I18N
3326   /* Ensure only single byte characters are set.  */
3327   if (dfa->mb_cur_max > 1)
3328     bitset_mask (sbcset, dfa->sb_char);
3329 
3330   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3331       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3332 						     || mbcset->non_match)))
3333     {
3334       bin_tree_t *mbc_tree;
3335       int sbc_idx;
3336       /* Build a tree for complex bracket.  */
3337       dfa->has_mb_node = 1;
3338       br_token.type = COMPLEX_BRACKET;
3339       br_token.opr.mbcset = mbcset;
3340       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3341       if (BE (mbc_tree == NULL, 0))
3342 	goto parse_bracket_exp_espace;
3343       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3344 	if (sbcset[sbc_idx])
3345 	  break;
3346       /* If there are no bits set in sbcset, there is no point
3347 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3348       if (sbc_idx < BITSET_WORDS)
3349 	{
3350 	  /* Build a tree for simple bracket.  */
3351 	  br_token.type = SIMPLE_BRACKET;
3352 	  br_token.opr.sbcset = sbcset;
3353 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3354 	  if (BE (work_tree == NULL, 0))
3355 	    goto parse_bracket_exp_espace;
3356 
3357 	  /* Then join them by ALT node.  */
3358 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3359 	  if (BE (work_tree == NULL, 0))
3360 	    goto parse_bracket_exp_espace;
3361 	}
3362       else
3363 	{
3364 	  re_free (sbcset);
3365 	  work_tree = mbc_tree;
3366 	}
3367     }
3368   else
3369 #endif /* not RE_ENABLE_I18N */
3370     {
3371 #ifdef RE_ENABLE_I18N
3372       free_charset (mbcset);
3373 #endif
3374       /* Build a tree for simple bracket.  */
3375       br_token.type = SIMPLE_BRACKET;
3376       br_token.opr.sbcset = sbcset;
3377       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3378       if (BE (work_tree == NULL, 0))
3379 	goto parse_bracket_exp_espace;
3380     }
3381   return work_tree;
3382 
3383  parse_bracket_exp_espace:
3384   *err = REG_ESPACE;
3385  parse_bracket_exp_free_return:
3386   re_free (sbcset);
3387 #ifdef RE_ENABLE_I18N
3388   free_charset (mbcset);
3389 #endif /* RE_ENABLE_I18N */
3390   return NULL;
3391 }
3392 
3393 /* Parse an element in the bracket expression.  */
3394 
3395 static reg_errcode_t
3396 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3397 		       re_token_t *token, int token_len,
3398 		       re_dfa_t *dfa _UNUSED_PARAMETER_,
3399 		       reg_syntax_t syntax, bool accept_hyphen)
3400 {
3401 #ifdef RE_ENABLE_I18N
3402   int cur_char_size;
3403   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3404   if (cur_char_size > 1)
3405     {
3406       elem->type = MB_CHAR;
3407       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3408       re_string_skip_bytes (regexp, cur_char_size);
3409       return REG_NOERROR;
3410     }
3411 #endif /* RE_ENABLE_I18N */
3412   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3413   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3414       || token->type == OP_OPEN_EQUIV_CLASS)
3415     return parse_bracket_symbol (elem, regexp, token);
3416   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3417     {
3418       /* A '-' must only appear as anything but a range indicator before
3419 	 the closing bracket.  Everything else is an error.  */
3420       re_token_t token2;
3421       (void) peek_token_bracket (&token2, regexp, syntax);
3422       if (token2.type != OP_CLOSE_BRACKET)
3423 	/* The actual error value is not standardized since this whole
3424 	   case is undefined.  But ERANGE makes good sense.  */
3425 	return REG_ERANGE;
3426     }
3427   elem->type = SB_CHAR;
3428   elem->opr.ch = token->opr.c;
3429   return REG_NOERROR;
3430 }
3431 
3432 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3433    such as [:<character_class>:], [.<collating_element>.], and
3434    [=<equivalent_class>=].  */
3435 
3436 static reg_errcode_t
3437 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3438 		      re_token_t *token)
3439 {
3440   unsigned char ch, delim = token->opr.c;
3441   int i = 0;
3442   if (re_string_eoi(regexp))
3443     return REG_EBRACK;
3444   for (;; ++i)
3445     {
3446       if (i >= BRACKET_NAME_BUF_SIZE)
3447 	return REG_EBRACK;
3448       if (token->type == OP_OPEN_CHAR_CLASS)
3449 	ch = re_string_fetch_byte_case (regexp);
3450       else
3451 	ch = re_string_fetch_byte (regexp);
3452       if (re_string_eoi(regexp))
3453 	return REG_EBRACK;
3454       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3455 	break;
3456       elem->opr.name[i] = ch;
3457     }
3458   re_string_skip_bytes (regexp, 1);
3459   elem->opr.name[i] = '\0';
3460   switch (token->type)
3461     {
3462     case OP_OPEN_COLL_ELEM:
3463       elem->type = COLL_SYM;
3464       break;
3465     case OP_OPEN_EQUIV_CLASS:
3466       elem->type = EQUIV_CLASS;
3467       break;
3468     case OP_OPEN_CHAR_CLASS:
3469       elem->type = CHAR_CLASS;
3470       break;
3471     default:
3472       break;
3473     }
3474   return REG_NOERROR;
3475 }
3476 
3477   /* Helper function for parse_bracket_exp.
3478      Build the equivalence class which is represented by NAME.
3479      The result are written to MBCSET and SBCSET.
3480      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3481      is a pointer argument since we may update it.  */
3482 
3483 static reg_errcode_t
3484 #ifdef RE_ENABLE_I18N
3485 build_equiv_class (bitset_t sbcset,
3486                    re_charset_t *mbcset _UNUSED_PARAMETER_,
3487 		   Idx *equiv_class_alloc _UNUSED_PARAMETER_,
3488 		   const unsigned char *name)
3489 #else /* not RE_ENABLE_I18N */
3490 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3491 #endif /* not RE_ENABLE_I18N */
3492 {
3493 #ifdef _LIBC
3494   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3495   if (nrules != 0)
3496     {
3497       const int32_t *table, *indirect;
3498       const unsigned char *weights, *extra, *cp;
3499       unsigned char char_buf[2];
3500       int32_t idx1, idx2;
3501       unsigned int ch;
3502       size_t len;
3503       /* This #include defines a local function!  */
3504 # include <locale/weight.h>
3505       /* Calculate the index for equivalence class.  */
3506       cp = name;
3507       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3508       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3509 					       _NL_COLLATE_WEIGHTMB);
3510       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3511 						   _NL_COLLATE_EXTRAMB);
3512       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3513 						_NL_COLLATE_INDIRECTMB);
3514       idx1 = findidx (&cp, -1);
3515       if (BE (idx1 == 0 || *cp != '\0', 0))
3516 	/* This isn't a valid character.  */
3517 	return REG_ECOLLATE;
3518 
3519       /* Build single byte matching table for this equivalence class.  */
3520       len = weights[idx1 & 0xffffff];
3521       for (ch = 0; ch < SBC_MAX; ++ch)
3522 	{
3523 	  char_buf[0] = ch;
3524 	  cp = char_buf;
3525 	  idx2 = findidx (&cp, 1);
3526 /*
3527 	  idx2 = table[ch];
3528 */
3529 	  if (idx2 == 0)
3530 	    /* This isn't a valid character.  */
3531 	    continue;
3532 	  /* Compare only if the length matches and the collation rule
3533 	     index is the same.  */
3534 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3535 	    {
3536 	      int cnt = 0;
3537 
3538 	      while (cnt <= len &&
3539 		     weights[(idx1 & 0xffffff) + 1 + cnt]
3540 		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3541 		++cnt;
3542 
3543 	      if (cnt > len)
3544 		bitset_set (sbcset, ch);
3545 	    }
3546 	}
3547       /* Check whether the array has enough space.  */
3548       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3549 	{
3550 	  /* Not enough, realloc it.  */
3551 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3552 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3553 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3554 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3555 						   int32_t,
3556 						   new_equiv_class_alloc);
3557 	  if (BE (new_equiv_classes == NULL, 0))
3558 	    return REG_ESPACE;
3559 	  mbcset->equiv_classes = new_equiv_classes;
3560 	  *equiv_class_alloc = new_equiv_class_alloc;
3561 	}
3562       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3563     }
3564   else
3565 #endif /* _LIBC */
3566     {
3567       if (BE (strlen ((const char *) name) != 1, 0))
3568 	return REG_ECOLLATE;
3569       bitset_set (sbcset, *name);
3570     }
3571   return REG_NOERROR;
3572 }
3573 
3574   /* Helper function for parse_bracket_exp.
3575      Build the character class which is represented by NAME.
3576      The result are written to MBCSET and SBCSET.
3577      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3578      is a pointer argument since we may update it.  */
3579 
3580 static reg_errcode_t
3581 #ifdef RE_ENABLE_I18N
3582 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3583 		 re_charset_t *mbcset, Idx *char_class_alloc,
3584 		 const unsigned char *class_name, reg_syntax_t syntax)
3585 #else /* not RE_ENABLE_I18N */
3586 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3587 		 const unsigned char *class_name, reg_syntax_t syntax)
3588 #endif /* not RE_ENABLE_I18N */
3589 {
3590   int i;
3591   const char *name = (const char *) class_name;
3592 
3593   /* In case of REG_ICASE "upper" and "lower" match the both of
3594      upper and lower cases.  */
3595   if ((syntax & RE_ICASE)
3596       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3597     name = "alpha";
3598 
3599 #ifdef RE_ENABLE_I18N
3600   /* Check the space of the arrays.  */
3601   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3602     {
3603       /* Not enough, realloc it.  */
3604       /* +1 in case of mbcset->nchar_classes is 0.  */
3605       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3606       /* Use realloc since array is NULL if *alloc == 0.  */
3607       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3608 					       new_char_class_alloc);
3609       if (BE (new_char_classes == NULL, 0))
3610 	return REG_ESPACE;
3611       mbcset->char_classes = new_char_classes;
3612       *char_class_alloc = new_char_class_alloc;
3613     }
3614   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3615 #endif /* RE_ENABLE_I18N */
3616 
3617 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3618   do {						\
3619     if (BE (trans != NULL, 0))			\
3620       {						\
3621 	for (i = 0; i < SBC_MAX; ++i)		\
3622 	  if (ctype_func (i))			\
3623 	    bitset_set (sbcset, trans[i]);	\
3624       }						\
3625     else					\
3626       {						\
3627 	for (i = 0; i < SBC_MAX; ++i)		\
3628 	  if (ctype_func (i))			\
3629 	    bitset_set (sbcset, i);		\
3630       }						\
3631   } while (0)
3632 
3633   if (strcmp (name, "alnum") == 0)
3634     BUILD_CHARCLASS_LOOP (isalnum);
3635   else if (strcmp (name, "cntrl") == 0)
3636     BUILD_CHARCLASS_LOOP (iscntrl);
3637   else if (strcmp (name, "lower") == 0)
3638     BUILD_CHARCLASS_LOOP (islower);
3639   else if (strcmp (name, "space") == 0)
3640     BUILD_CHARCLASS_LOOP (isspace);
3641   else if (strcmp (name, "alpha") == 0)
3642     BUILD_CHARCLASS_LOOP (isalpha);
3643   else if (strcmp (name, "digit") == 0)
3644     BUILD_CHARCLASS_LOOP (isdigit);
3645   else if (strcmp (name, "print") == 0)
3646     BUILD_CHARCLASS_LOOP (isprint);
3647   else if (strcmp (name, "upper") == 0)
3648     BUILD_CHARCLASS_LOOP (isupper);
3649   else if (strcmp (name, "blank") == 0)
3650     BUILD_CHARCLASS_LOOP (isblank);
3651   else if (strcmp (name, "graph") == 0)
3652     BUILD_CHARCLASS_LOOP (isgraph);
3653   else if (strcmp (name, "punct") == 0)
3654     BUILD_CHARCLASS_LOOP (ispunct);
3655   else if (strcmp (name, "xdigit") == 0)
3656     BUILD_CHARCLASS_LOOP (isxdigit);
3657   else
3658     return REG_ECTYPE;
3659 
3660   return REG_NOERROR;
3661 }
3662 
3663 static bin_tree_t *
3664 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3665 		    const unsigned char *class_name,
3666 		    const unsigned char *extra, bool non_match,
3667 		    reg_errcode_t *err)
3668 {
3669   re_bitset_ptr_t sbcset;
3670 #ifdef RE_ENABLE_I18N
3671   re_charset_t *mbcset;
3672   Idx alloc = 0;
3673 #endif /* not RE_ENABLE_I18N */
3674   reg_errcode_t ret;
3675   re_token_t br_token;
3676   bin_tree_t *tree;
3677 
3678   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3679 #ifdef RE_ENABLE_I18N
3680   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3681 #endif /* RE_ENABLE_I18N */
3682 
3683 #ifdef RE_ENABLE_I18N
3684   if (BE (sbcset == NULL || mbcset == NULL, 0))
3685 #else /* not RE_ENABLE_I18N */
3686   if (BE (sbcset == NULL, 0))
3687 #endif /* not RE_ENABLE_I18N */
3688     {
3689       *err = REG_ESPACE;
3690       return NULL;
3691     }
3692 
3693   if (non_match)
3694     {
3695 #ifdef RE_ENABLE_I18N
3696       mbcset->non_match = 1;
3697 #endif /* not RE_ENABLE_I18N */
3698     }
3699 
3700   /* We don't care the syntax in this case.  */
3701   ret = build_charclass (trans, sbcset,
3702 #ifdef RE_ENABLE_I18N
3703 			 mbcset, &alloc,
3704 #endif /* RE_ENABLE_I18N */
3705 			 class_name, 0);
3706 
3707   if (BE (ret != REG_NOERROR, 0))
3708     {
3709       re_free (sbcset);
3710 #ifdef RE_ENABLE_I18N
3711       free_charset (mbcset);
3712 #endif /* RE_ENABLE_I18N */
3713       *err = ret;
3714       return NULL;
3715     }
3716   /* \w match '_' also.  */
3717   for (; *extra; extra++)
3718     bitset_set (sbcset, *extra);
3719 
3720   /* If it is non-matching list.  */
3721   if (non_match)
3722     bitset_not (sbcset);
3723 
3724 #ifdef RE_ENABLE_I18N
3725   /* Ensure only single byte characters are set.  */
3726   if (dfa->mb_cur_max > 1)
3727     bitset_mask (sbcset, dfa->sb_char);
3728 #endif
3729 
3730   /* Build a tree for simple bracket.  */
3731   br_token.type = SIMPLE_BRACKET;
3732   br_token.opr.sbcset = sbcset;
3733   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3734   if (BE (tree == NULL, 0))
3735     goto build_word_op_espace;
3736 
3737 #ifdef RE_ENABLE_I18N
3738   if (dfa->mb_cur_max > 1)
3739     {
3740       bin_tree_t *mbc_tree;
3741       /* Build a tree for complex bracket.  */
3742       br_token.type = COMPLEX_BRACKET;
3743       br_token.opr.mbcset = mbcset;
3744       dfa->has_mb_node = 1;
3745       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3746       if (BE (mbc_tree == NULL, 0))
3747 	goto build_word_op_espace;
3748       /* Then join them by ALT node.  */
3749       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3750       if (BE (mbc_tree != NULL, 1))
3751 	return tree;
3752     }
3753   else
3754     {
3755       free_charset (mbcset);
3756       return tree;
3757     }
3758 #else /* not RE_ENABLE_I18N */
3759   return tree;
3760 #endif /* not RE_ENABLE_I18N */
3761 
3762  build_word_op_espace:
3763   re_free (sbcset);
3764 #ifdef RE_ENABLE_I18N
3765   free_charset (mbcset);
3766 #endif /* RE_ENABLE_I18N */
3767   *err = REG_ESPACE;
3768   return NULL;
3769 }
3770 
3771 /* This is intended for the expressions like "a{1,3}".
3772    Fetch a number from 'input', and return the number.
3773    Return REG_MISSING if the number field is empty like "{,1}".
3774    Return RE_DUP_MAX + 1 if the number field is too large.
3775    Return REG_ERROR if an error occurred.  */
3776 
3777 static Idx
3778 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3779 {
3780   Idx num = REG_MISSING;
3781   unsigned char c;
3782   while (1)
3783     {
3784       fetch_token (token, input, syntax);
3785       c = token->opr.c;
3786       if (BE (token->type == END_OF_RE, 0))
3787 	return REG_ERROR;
3788       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3789 	break;
3790       num = ((token->type != CHARACTER || c < '0' || '9' < c
3791 	      || num == REG_ERROR)
3792 	     ? REG_ERROR
3793 	     : num == REG_MISSING
3794 	     ? c - '0'
3795 	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3796     }
3797   return num;
3798 }
3799 
3800 #ifdef RE_ENABLE_I18N
3801 static void
3802 free_charset (re_charset_t *cset)
3803 {
3804   re_free (cset->mbchars);
3805 # ifdef _LIBC
3806   re_free (cset->coll_syms);
3807   re_free (cset->equiv_classes);
3808   re_free (cset->range_starts);
3809   re_free (cset->range_ends);
3810 # endif
3811   re_free (cset->char_classes);
3812   re_free (cset);
3813 }
3814 #endif /* RE_ENABLE_I18N */
3815 
3816 /* Functions for binary tree operation.  */
3817 
3818 /* Create a tree node.  */
3819 
3820 static bin_tree_t *
3821 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3822 	     re_token_type_t type)
3823 {
3824   re_token_t t;
3825   t.type = type;
3826   return create_token_tree (dfa, left, right, &t);
3827 }
3828 
3829 static bin_tree_t *
3830 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3831 		   const re_token_t *token)
3832 {
3833   bin_tree_t *tree;
3834   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3835     {
3836       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3837 
3838       if (storage == NULL)
3839 	return NULL;
3840       storage->next = dfa->str_tree_storage;
3841       dfa->str_tree_storage = storage;
3842       dfa->str_tree_storage_idx = 0;
3843     }
3844   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3845 
3846   tree->parent = NULL;
3847   tree->left = left;
3848   tree->right = right;
3849   tree->token = *token;
3850   tree->token.duplicated = 0;
3851   tree->token.opt_subexp = 0;
3852   tree->first = NULL;
3853   tree->next = NULL;
3854   tree->node_idx = REG_MISSING;
3855 
3856   if (left != NULL)
3857     left->parent = tree;
3858   if (right != NULL)
3859     right->parent = tree;
3860   return tree;
3861 }
3862 
3863 /* Mark the tree SRC as an optional subexpression.
3864    To be called from preorder or postorder.  */
3865 
3866 static reg_errcode_t
3867 mark_opt_subexp (void *extra, bin_tree_t *node)
3868 {
3869   Idx idx = (uintptr_t) extra;
3870   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3871     node->token.opt_subexp = 1;
3872 
3873   return REG_NOERROR;
3874 }
3875 
3876 /* Free the allocated memory inside NODE. */
3877 
3878 static void
3879 free_token (re_token_t *node)
3880 {
3881 #ifdef RE_ENABLE_I18N
3882   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3883     free_charset (node->opr.mbcset);
3884   else
3885 #endif /* RE_ENABLE_I18N */
3886     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3887       re_free (node->opr.sbcset);
3888 }
3889 
3890 /* Worker function for tree walking.  Free the allocated memory inside NODE
3891    and its children. */
3892 
3893 static reg_errcode_t
3894 free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
3895 {
3896   free_token (&node->token);
3897   return REG_NOERROR;
3898 }
3899 
3900 
3901 /* Duplicate the node SRC, and return new node.  This is a preorder
3902    visit similar to the one implemented by the generic visitor, but
3903    we need more infrastructure to maintain two parallel trees --- so,
3904    it's easier to duplicate.  */
3905 
3906 static bin_tree_t *
3907 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3908 {
3909   const bin_tree_t *node;
3910   bin_tree_t *dup_root;
3911   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3912 
3913   for (node = root; ; )
3914     {
3915       /* Create a new tree and link it back to the current parent.  */
3916       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3917       if (*p_new == NULL)
3918 	return NULL;
3919       (*p_new)->parent = dup_node;
3920       (*p_new)->token.duplicated = 1;
3921       dup_node = *p_new;
3922 
3923       /* Go to the left node, or up and to the right.  */
3924       if (node->left)
3925 	{
3926 	  node = node->left;
3927 	  p_new = &dup_node->left;
3928 	}
3929       else
3930 	{
3931 	  const bin_tree_t *prev = NULL;
3932 	  while (node->right == prev || node->right == NULL)
3933 	    {
3934 	      prev = node;
3935 	      node = node->parent;
3936 	      dup_node = dup_node->parent;
3937 	      if (!node)
3938 		return dup_root;
3939 	    }
3940 	  node = node->right;
3941 	  p_new = &dup_node->right;
3942 	}
3943     }
3944 }
3945