xref: /dragonfly/contrib/grep/lib/regcomp.c (revision 92fc8b5c)
1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4    Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
5    Software Foundation, Inc.
6    This file is part of the GNU C Library.
7    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8 
9    This program is free software; you can redistribute it and/or modify
10    it under the terms of the GNU General Public License as published by
11    the Free Software Foundation; either version 3, or (at your option)
12    any later version.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18 
19    You should have received a copy of the GNU General Public License along
20    with this program; if not, write to the Free Software Foundation,
21    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22 
23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
24 					  size_t length, reg_syntax_t syntax);
25 static void re_compile_fastmap_iter (regex_t *bufp,
26 				     const re_dfastate_t *init_state,
27 				     char *fastmap);
28 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
29 #ifdef RE_ENABLE_I18N
30 static void free_charset (re_charset_t *cset);
31 #endif /* RE_ENABLE_I18N */
32 static void free_workarea_compile (regex_t *preg);
33 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
34 #ifdef RE_ENABLE_I18N
35 static void optimize_utf8 (re_dfa_t *dfa);
36 #endif
37 static reg_errcode_t analyze (regex_t *preg);
38 static reg_errcode_t preorder (bin_tree_t *root,
39 			       reg_errcode_t (fn (void *, bin_tree_t *)),
40 			       void *extra);
41 static reg_errcode_t postorder (bin_tree_t *root,
42 				reg_errcode_t (fn (void *, bin_tree_t *)),
43 				void *extra);
44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 				 bin_tree_t *node);
48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
52 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
53 				   unsigned int constraint);
54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
56 					 Idx node, bool root);
57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
58 static Idx fetch_number (re_string_t *input, re_token_t *token,
59 			 reg_syntax_t syntax);
60 static int peek_token (re_token_t *token, re_string_t *input,
61 			reg_syntax_t syntax) internal_function;
62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
63 			  reg_syntax_t syntax, reg_errcode_t *err);
64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
65 				  re_token_t *token, reg_syntax_t syntax,
66 				  Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
68 				 re_token_t *token, reg_syntax_t syntax,
69 				 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
71 				     re_token_t *token, reg_syntax_t syntax,
72 				     Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
74 				  re_token_t *token, reg_syntax_t syntax,
75 				  Idx nest, reg_errcode_t *err);
76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
77 				 re_dfa_t *dfa, re_token_t *token,
78 				 reg_syntax_t syntax, reg_errcode_t *err);
79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
80 				      re_token_t *token, reg_syntax_t syntax,
81 				      reg_errcode_t *err);
82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
83 					    re_string_t *regexp,
84 					    re_token_t *token, int token_len,
85 					    re_dfa_t *dfa,
86 					    reg_syntax_t syntax,
87 					    bool accept_hyphen);
88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 					  re_string_t *regexp,
90 					  re_token_t *token);
91 #ifdef RE_ENABLE_I18N
92 static reg_errcode_t build_equiv_class (bitset_t sbcset,
93 					re_charset_t *mbcset,
94 					Idx *equiv_class_alloc,
95 					const unsigned char *name);
96 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
97 				      bitset_t sbcset,
98 				      re_charset_t *mbcset,
99 				      Idx *char_class_alloc,
100 				      const unsigned char *class_name,
101 				      reg_syntax_t syntax);
102 #else  /* not RE_ENABLE_I18N */
103 static reg_errcode_t build_equiv_class (bitset_t sbcset,
104 					const unsigned char *name);
105 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
106 				      bitset_t sbcset,
107 				      const unsigned char *class_name,
108 				      reg_syntax_t syntax);
109 #endif /* not RE_ENABLE_I18N */
110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
111 				       RE_TRANSLATE_TYPE trans,
112 				       const unsigned char *class_name,
113 				       const unsigned char *extra,
114 				       bool non_match, reg_errcode_t *err);
115 static bin_tree_t *create_tree (re_dfa_t *dfa,
116 				bin_tree_t *left, bin_tree_t *right,
117 				re_token_type_t type);
118 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
119 				      bin_tree_t *left, bin_tree_t *right,
120 				      const re_token_t *token);
121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
122 static void free_token (re_token_t *node);
123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
125 
126 /* This table gives an error message for each of the error codes listed
127    in regex.h.  Obviously the order here has to be same as there.
128    POSIX doesn't require that we do anything for REG_NOERROR,
129    but why not be nice?  */
130 
131 static const char __re_error_msgid[] =
132   {
133 #define REG_NOERROR_IDX	0
134     gettext_noop ("Success")	/* REG_NOERROR */
135     "\0"
136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
137     gettext_noop ("No match")	/* REG_NOMATCH */
138     "\0"
139 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
140     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
141     "\0"
142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
143     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
144     "\0"
145 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
146     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
147     "\0"
148 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
149     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
150     "\0"
151 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
152     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
153     "\0"
154 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
155     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
156     "\0"
157 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
158     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
159     "\0"
160 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
161     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
162     "\0"
163 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
164     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
165     "\0"
166 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
167     gettext_noop ("Invalid range end")	/* REG_ERANGE */
168     "\0"
169 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
170     gettext_noop ("Memory exhausted") /* REG_ESPACE */
171     "\0"
172 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
173     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
174     "\0"
175 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
176     gettext_noop ("Premature end of regular expression") /* REG_EEND */
177     "\0"
178 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
179     gettext_noop ("Regular expression too big") /* REG_ESIZE */
180     "\0"
181 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
182     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183   };
184 
185 static const size_t __re_error_msgid_idx[] =
186   {
187     REG_NOERROR_IDX,
188     REG_NOMATCH_IDX,
189     REG_BADPAT_IDX,
190     REG_ECOLLATE_IDX,
191     REG_ECTYPE_IDX,
192     REG_EESCAPE_IDX,
193     REG_ESUBREG_IDX,
194     REG_EBRACK_IDX,
195     REG_EPAREN_IDX,
196     REG_EBRACE_IDX,
197     REG_BADBR_IDX,
198     REG_ERANGE_IDX,
199     REG_ESPACE_IDX,
200     REG_BADRPT_IDX,
201     REG_EEND_IDX,
202     REG_ESIZE_IDX,
203     REG_ERPAREN_IDX
204   };
205 
206 /* Entry points for GNU code.  */
207 
208 /* re_compile_pattern is the GNU regular expression compiler: it
209    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
210    Returns 0 if the pattern was valid, otherwise an error string.
211 
212    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
213    are set in BUFP on entry.  */
214 
215 #ifdef _LIBC
216 const char *
217 re_compile_pattern (pattern, length, bufp)
218     const char *pattern;
219     size_t length;
220     struct re_pattern_buffer *bufp;
221 #else /* size_t might promote */
222 const char *
223 re_compile_pattern (const char *pattern, size_t length,
224 		    struct re_pattern_buffer *bufp)
225 #endif
226 {
227   reg_errcode_t ret;
228 
229   /* And GNU code determines whether or not to get register information
230      by passing null for the REGS argument to re_match, etc., not by
231      setting no_sub, unless RE_NO_SUB is set.  */
232   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
233 
234   /* Match anchors at newline.  */
235   bufp->newline_anchor = 1;
236 
237   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
238 
239   if (!ret)
240     return NULL;
241   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
242 }
243 #ifdef _LIBC
244 weak_alias (__re_compile_pattern, re_compile_pattern)
245 #endif
246 
247 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
248    also be assigned to arbitrarily: each pattern buffer stores its own
249    syntax, so it can be changed between regex compilations.  */
250 /* This has no initializer because initialized variables in Emacs
251    become read-only after dumping.  */
252 reg_syntax_t re_syntax_options;
253 
254 
255 /* Specify the precise syntax of regexps for compilation.  This provides
256    for compatibility for various utilities which historically have
257    different, incompatible syntaxes.
258 
259    The argument SYNTAX is a bit mask comprised of the various bits
260    defined in regex.h.  We return the old syntax.  */
261 
262 reg_syntax_t
263 re_set_syntax (syntax)
264     reg_syntax_t syntax;
265 {
266   reg_syntax_t ret = re_syntax_options;
267 
268   re_syntax_options = syntax;
269   return ret;
270 }
271 #ifdef _LIBC
272 weak_alias (__re_set_syntax, re_set_syntax)
273 #endif
274 
275 int
276 re_compile_fastmap (bufp)
277     struct re_pattern_buffer *bufp;
278 {
279   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
280   char *fastmap = bufp->fastmap;
281 
282   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
283   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
284   if (dfa->init_state != dfa->init_state_word)
285     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
286   if (dfa->init_state != dfa->init_state_nl)
287     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
288   if (dfa->init_state != dfa->init_state_begbuf)
289     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
290   bufp->fastmap_accurate = 1;
291   return 0;
292 }
293 #ifdef _LIBC
294 weak_alias (__re_compile_fastmap, re_compile_fastmap)
295 #endif
296 
297 static inline void
298 __attribute ((always_inline))
299 re_set_fastmap (char *fastmap, bool icase, int ch)
300 {
301   fastmap[ch] = 1;
302   if (icase)
303     fastmap[tolower (ch)] = 1;
304 }
305 
306 /* Helper function for re_compile_fastmap.
307    Compile fastmap for the initial_state INIT_STATE.  */
308 
309 static void
310 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
311 			 char *fastmap)
312 {
313   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
314   Idx node_cnt;
315   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
316   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
317     {
318       Idx node = init_state->nodes.elems[node_cnt];
319       re_token_type_t type = dfa->nodes[node].type;
320 
321       if (type == CHARACTER)
322 	{
323 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
324 #ifdef RE_ENABLE_I18N
325 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
326 	    {
327 	      unsigned char buf[MB_LEN_MAX];
328 	      unsigned char *p;
329 	      wchar_t wc;
330 	      mbstate_t state;
331 
332 	      p = buf;
333 	      *p++ = dfa->nodes[node].opr.c;
334 	      while (++node < dfa->nodes_len
335 		     &&	dfa->nodes[node].type == CHARACTER
336 		     && dfa->nodes[node].mb_partial)
337 		*p++ = dfa->nodes[node].opr.c;
338 	      memset (&state, '\0', sizeof (state));
339 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
340 			     &state) == p - buf
341 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
342 		      != (size_t) -1))
343 		re_set_fastmap (fastmap, false, buf[0]);
344 	    }
345 #endif
346 	}
347       else if (type == SIMPLE_BRACKET)
348 	{
349 	  int i, ch;
350 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
351 	    {
352 	      int j;
353 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
354 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
355 		if (w & ((bitset_word_t) 1 << j))
356 		  re_set_fastmap (fastmap, icase, ch);
357 	    }
358 	}
359 #ifdef RE_ENABLE_I18N
360       else if (type == COMPLEX_BRACKET)
361 	{
362 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
363 	  Idx i;
364 
365 # ifdef _LIBC
366 	  /* See if we have to try all bytes which start multiple collation
367 	     elements.
368 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
369 		  collation element, and don't catch 'b' since 'b' is
370 		  the only collation element which starts from 'b' (and
371 		  it is caught by SIMPLE_BRACKET).  */
372 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
373 		  && (cset->ncoll_syms || cset->nranges))
374 		{
375 		  const int32_t *table = (const int32_t *)
376 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
377 		  for (i = 0; i < SBC_MAX; ++i)
378 		    if (table[i] < 0)
379 		      re_set_fastmap (fastmap, icase, i);
380 		}
381 # endif /* _LIBC */
382 
383 	  /* See if we have to start the match at all multibyte characters,
384 	     i.e. where we would not find an invalid sequence.  This only
385 	     applies to multibyte character sets; for single byte character
386 	     sets, the SIMPLE_BRACKET again suffices.  */
387 	  if (dfa->mb_cur_max > 1
388 	      && (cset->nchar_classes || cset->non_match || cset->nranges
389 # ifdef _LIBC
390 		  || cset->nequiv_classes
391 # endif /* _LIBC */
392 		 ))
393 	    {
394 	      unsigned char c = 0;
395 	      do
396 		{
397 		  mbstate_t mbs;
398 		  memset (&mbs, 0, sizeof (mbs));
399 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
400 		    re_set_fastmap (fastmap, false, (int) c);
401 		}
402 	      while (++c != 0);
403 	    }
404 
405 	  else
406 	    {
407 	      /* ... Else catch all bytes which can start the mbchars.  */
408 	      for (i = 0; i < cset->nmbchars; ++i)
409 		{
410 		  char buf[256];
411 		  mbstate_t state;
412 		  memset (&state, '\0', sizeof (state));
413 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
414 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
415 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
416 		    {
417 		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
418 			  != (size_t) -1)
419 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
420 		    }
421 		}
422 	    }
423 	}
424 #endif /* RE_ENABLE_I18N */
425       else if (type == OP_PERIOD
426 #ifdef RE_ENABLE_I18N
427 	       || type == OP_UTF8_PERIOD
428 #endif /* RE_ENABLE_I18N */
429 	       || type == END_OF_RE)
430 	{
431 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
432 	  if (type == END_OF_RE)
433 	    bufp->can_be_null = 1;
434 	  return;
435 	}
436     }
437 }
438 
439 /* Entry point for POSIX code.  */
440 /* regcomp takes a regular expression as a string and compiles it.
441 
442    PREG is a regex_t *.  We do not expect any fields to be initialized,
443    since POSIX says we shouldn't.  Thus, we set
444 
445      `buffer' to the compiled pattern;
446      `used' to the length of the compiled pattern;
447      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
448        REG_EXTENDED bit in CFLAGS is set; otherwise, to
449        RE_SYNTAX_POSIX_BASIC;
450      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
451      `fastmap' to an allocated space for the fastmap;
452      `fastmap_accurate' to zero;
453      `re_nsub' to the number of subexpressions in PATTERN.
454 
455    PATTERN is the address of the pattern string.
456 
457    CFLAGS is a series of bits which affect compilation.
458 
459      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
460      use POSIX basic syntax.
461 
462      If REG_NEWLINE is set, then . and [^...] don't match newline.
463      Also, regexec will try a match beginning after every newline.
464 
465      If REG_ICASE is set, then we considers upper- and lowercase
466      versions of letters to be equivalent when matching.
467 
468      If REG_NOSUB is set, then when PREG is passed to regexec, that
469      routine will report only success or failure, and nothing about the
470      registers.
471 
472    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
473    the return codes and their meanings.)  */
474 
475 int
476 regcomp (preg, pattern, cflags)
477     regex_t *_Restrict_ preg;
478     const char *_Restrict_ pattern;
479     int cflags;
480 {
481   reg_errcode_t ret;
482   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
483 			 : RE_SYNTAX_POSIX_BASIC);
484 
485   preg->buffer = NULL;
486   preg->allocated = 0;
487   preg->used = 0;
488 
489   /* Try to allocate space for the fastmap.  */
490   preg->fastmap = re_malloc (char, SBC_MAX);
491   if (BE (preg->fastmap == NULL, 0))
492     return REG_ESPACE;
493 
494   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
495 
496   /* If REG_NEWLINE is set, newlines are treated differently.  */
497   if (cflags & REG_NEWLINE)
498     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
499       syntax &= ~RE_DOT_NEWLINE;
500       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
501       /* It also changes the matching behavior.  */
502       preg->newline_anchor = 1;
503     }
504   else
505     preg->newline_anchor = 0;
506   preg->no_sub = !!(cflags & REG_NOSUB);
507   preg->translate = NULL;
508 
509   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
510 
511   /* POSIX doesn't distinguish between an unmatched open-group and an
512      unmatched close-group: both are REG_EPAREN.  */
513   if (ret == REG_ERPAREN)
514     ret = REG_EPAREN;
515 
516   /* We have already checked preg->fastmap != NULL.  */
517   if (BE (ret == REG_NOERROR, 1))
518     /* Compute the fastmap now, since regexec cannot modify the pattern
519        buffer.  This function never fails in this implementation.  */
520     (void) re_compile_fastmap (preg);
521   else
522     {
523       /* Some error occurred while compiling the expression.  */
524       re_free (preg->fastmap);
525       preg->fastmap = NULL;
526     }
527 
528   return (int) ret;
529 }
530 #ifdef _LIBC
531 weak_alias (__regcomp, regcomp)
532 #endif
533 
534 /* Returns a message corresponding to an error code, ERRCODE, returned
535    from either regcomp or regexec.   We don't use PREG here.  */
536 
537 #ifdef _LIBC
538 size_t
539 regerror (errcode, preg, errbuf, errbuf_size)
540     int errcode;
541     const regex_t *_Restrict_ preg;
542     char *_Restrict_ errbuf;
543     size_t errbuf_size;
544 #else /* size_t might promote */
545 size_t
546 regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_,
547 	  char *_Restrict_ errbuf, size_t errbuf_size)
548 #endif
549 {
550   const char *msg;
551   size_t msg_size;
552 
553   if (BE (errcode < 0
554 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
555 			       / sizeof (__re_error_msgid_idx[0])), 0))
556     /* Only error codes returned by the rest of the code should be passed
557        to this routine.  If we are given anything else, or if other regex
558        code generates an invalid error code, then the program has a bug.
559        Dump core so we can fix it.  */
560     abort ();
561 
562   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
563 
564   msg_size = strlen (msg) + 1; /* Includes the null.  */
565 
566   if (BE (errbuf_size != 0, 1))
567     {
568       size_t cpy_size = msg_size;
569       if (BE (msg_size > errbuf_size, 0))
570 	{
571 	  cpy_size = errbuf_size - 1;
572 	  errbuf[cpy_size] = '\0';
573 	}
574       memcpy (errbuf, msg, cpy_size);
575     }
576 
577   return msg_size;
578 }
579 #ifdef _LIBC
580 weak_alias (__regerror, regerror)
581 #endif
582 
583 
584 #ifdef RE_ENABLE_I18N
585 /* This static array is used for the map to single-byte characters when
586    UTF-8 is used.  Otherwise we would allocate memory just to initialize
587    it the same all the time.  UTF-8 is the preferred encoding so this is
588    a worthwhile optimization.  */
589 static const bitset_t utf8_sb_map =
590 {
591   /* Set the first 128 bits.  */
592 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
593 #  error "bitset_word_t is narrower than 32 bits"
594 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
595   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
596 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
597   BITSET_WORD_MAX, BITSET_WORD_MAX,
598 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
599   BITSET_WORD_MAX,
600 # endif
601   (BITSET_WORD_MAX
602    >> (SBC_MAX % BITSET_WORD_BITS == 0
603        ? 0
604        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
605 };
606 #endif
607 
608 
609 static void
610 free_dfa_content (re_dfa_t *dfa)
611 {
612   Idx i, j;
613 
614   if (dfa->nodes)
615     for (i = 0; i < dfa->nodes_len; ++i)
616       free_token (dfa->nodes + i);
617   re_free (dfa->nexts);
618   for (i = 0; i < dfa->nodes_len; ++i)
619     {
620       if (dfa->eclosures != NULL)
621 	re_node_set_free (dfa->eclosures + i);
622       if (dfa->inveclosures != NULL)
623 	re_node_set_free (dfa->inveclosures + i);
624       if (dfa->edests != NULL)
625 	re_node_set_free (dfa->edests + i);
626     }
627   re_free (dfa->edests);
628   re_free (dfa->eclosures);
629   re_free (dfa->inveclosures);
630   re_free (dfa->nodes);
631 
632   if (dfa->state_table)
633     for (i = 0; i <= dfa->state_hash_mask; ++i)
634       {
635 	struct re_state_table_entry *entry = dfa->state_table + i;
636 	for (j = 0; j < entry->num; ++j)
637 	  {
638 	    re_dfastate_t *state = entry->array[j];
639 	    free_state (state);
640 	  }
641 	re_free (entry->array);
642       }
643   re_free (dfa->state_table);
644 #ifdef RE_ENABLE_I18N
645   if (dfa->sb_char != utf8_sb_map)
646     re_free (dfa->sb_char);
647 #endif
648   re_free (dfa->subexp_map);
649 #ifdef DEBUG
650   re_free (dfa->re_str);
651 #endif
652 
653   re_free (dfa);
654 }
655 
656 
657 /* Free dynamically allocated space used by PREG.  */
658 
659 void
660 regfree (preg)
661     regex_t *preg;
662 {
663   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
664   if (BE (dfa != NULL, 1))
665     free_dfa_content (dfa);
666   preg->buffer = NULL;
667   preg->allocated = 0;
668 
669   re_free (preg->fastmap);
670   preg->fastmap = NULL;
671 
672   re_free (preg->translate);
673   preg->translate = NULL;
674 }
675 #ifdef _LIBC
676 weak_alias (__regfree, regfree)
677 #endif
678 
679 /* Entry points compatible with 4.2 BSD regex library.  We don't define
680    them unless specifically requested.  */
681 
682 #if defined _REGEX_RE_COMP || defined _LIBC
683 
684 /* BSD has one and only one pattern buffer.  */
685 static struct re_pattern_buffer re_comp_buf;
686 
687 char *
688 # ifdef _LIBC
689 /* Make these definitions weak in libc, so POSIX programs can redefine
690    these names if they don't use our functions, and still use
691    regcomp/regexec above without link errors.  */
692 weak_function
693 # endif
694 re_comp (s)
695      const char *s;
696 {
697   reg_errcode_t ret;
698   char *fastmap;
699 
700   if (!s)
701     {
702       if (!re_comp_buf.buffer)
703 	return gettext ("No previous regular expression");
704       return 0;
705     }
706 
707   if (re_comp_buf.buffer)
708     {
709       fastmap = re_comp_buf.fastmap;
710       re_comp_buf.fastmap = NULL;
711       __regfree (&re_comp_buf);
712       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
713       re_comp_buf.fastmap = fastmap;
714     }
715 
716   if (re_comp_buf.fastmap == NULL)
717     {
718       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
719       if (re_comp_buf.fastmap == NULL)
720 	return (char *) gettext (__re_error_msgid
721 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
722     }
723 
724   /* Since `re_exec' always passes NULL for the `regs' argument, we
725      don't need to initialize the pattern buffer fields which affect it.  */
726 
727   /* Match anchors at newlines.  */
728   re_comp_buf.newline_anchor = 1;
729 
730   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
731 
732   if (!ret)
733     return NULL;
734 
735   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
736   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
737 }
738 
739 #ifdef _LIBC
740 libc_freeres_fn (free_mem)
741 {
742   __regfree (&re_comp_buf);
743 }
744 #endif
745 
746 #endif /* _REGEX_RE_COMP */
747 
748 /* Internal entry point.
749    Compile the regular expression PATTERN, whose length is LENGTH.
750    SYNTAX indicate regular expression's syntax.  */
751 
752 static reg_errcode_t
753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
754 		     reg_syntax_t syntax)
755 {
756   reg_errcode_t err = REG_NOERROR;
757   re_dfa_t *dfa;
758   re_string_t regexp;
759 
760   /* Initialize the pattern buffer.  */
761   preg->fastmap_accurate = 0;
762   preg->syntax = syntax;
763   preg->not_bol = preg->not_eol = 0;
764   preg->used = 0;
765   preg->re_nsub = 0;
766   preg->can_be_null = 0;
767   preg->regs_allocated = REGS_UNALLOCATED;
768 
769   /* Initialize the dfa.  */
770   dfa = (re_dfa_t *) preg->buffer;
771   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
772     {
773       /* If zero allocated, but buffer is non-null, try to realloc
774 	 enough space.  This loses if buffer's address is bogus, but
775 	 that is the user's responsibility.  If ->buffer is NULL this
776 	 is a simple allocation.  */
777       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
778       if (dfa == NULL)
779 	return REG_ESPACE;
780       preg->allocated = sizeof (re_dfa_t);
781       preg->buffer = (unsigned char *) dfa;
782     }
783   preg->used = sizeof (re_dfa_t);
784 
785   err = init_dfa (dfa, length);
786   if (BE (err != REG_NOERROR, 0))
787     {
788       free_dfa_content (dfa);
789       preg->buffer = NULL;
790       preg->allocated = 0;
791       return err;
792     }
793 #ifdef DEBUG
794   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
795   dfa->re_str = re_malloc (char, length + 1);
796   strncpy (dfa->re_str, pattern, length + 1);
797 #endif
798 
799   __libc_lock_init (dfa->lock);
800 
801   err = re_string_construct (&regexp, pattern, length, preg->translate,
802 			     (syntax & RE_ICASE) != 0, dfa);
803   if (BE (err != REG_NOERROR, 0))
804     {
805     re_compile_internal_free_return:
806       free_workarea_compile (preg);
807       re_string_destruct (&regexp);
808       free_dfa_content (dfa);
809       preg->buffer = NULL;
810       preg->allocated = 0;
811       return err;
812     }
813 
814   /* Parse the regular expression, and build a structure tree.  */
815   preg->re_nsub = 0;
816   dfa->str_tree = parse (&regexp, preg, syntax, &err);
817   if (BE (dfa->str_tree == NULL, 0))
818     goto re_compile_internal_free_return;
819 
820   /* Analyze the tree and create the nfa.  */
821   err = analyze (preg);
822   if (BE (err != REG_NOERROR, 0))
823     goto re_compile_internal_free_return;
824 
825 #ifdef RE_ENABLE_I18N
826   /* If possible, do searching in single byte encoding to speed things up.  */
827   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
828     optimize_utf8 (dfa);
829 #endif
830 
831   /* Then create the initial state of the dfa.  */
832   err = create_initial_state (dfa);
833 
834   /* Release work areas.  */
835   free_workarea_compile (preg);
836   re_string_destruct (&regexp);
837 
838   if (BE (err != REG_NOERROR, 0))
839     {
840       free_dfa_content (dfa);
841       preg->buffer = NULL;
842       preg->allocated = 0;
843     }
844 
845   return err;
846 }
847 
848 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
849    as the initial length of some arrays.  */
850 
851 static reg_errcode_t
852 init_dfa (re_dfa_t *dfa, size_t pat_len)
853 {
854   __re_size_t table_size;
855 #ifndef _LIBC
856   char *codeset_name;
857 #endif
858 #ifdef RE_ENABLE_I18N
859   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
860 #else
861   size_t max_i18n_object_size = 0;
862 #endif
863   size_t max_object_size =
864     MAX (sizeof (struct re_state_table_entry),
865 	 MAX (sizeof (re_token_t),
866 	      MAX (sizeof (re_node_set),
867 		   MAX (sizeof (regmatch_t),
868 			max_i18n_object_size))));
869 
870   memset (dfa, '\0', sizeof (re_dfa_t));
871 
872   /* Force allocation of str_tree_storage the first time.  */
873   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
874 
875   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
876      calculation below, and for similar doubling calculations
877      elsewhere.  And it's <= rather than <, because some of the
878      doubling calculations add 1 afterwards.  */
879   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
880     return REG_ESPACE;
881 
882   dfa->nodes_alloc = pat_len + 1;
883   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
884 
885   /*  table_size = 2 ^ ceil(log pat_len) */
886   for (table_size = 1; ; table_size <<= 1)
887     if (table_size > pat_len)
888       break;
889 
890   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
891   dfa->state_hash_mask = table_size - 1;
892 
893   dfa->mb_cur_max = MB_CUR_MAX;
894 #ifdef _LIBC
895   if (dfa->mb_cur_max == 6
896       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
897     dfa->is_utf8 = 1;
898   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
899 		       != 0);
900 #else
901   codeset_name = nl_langinfo (CODESET);
902   if (strcasecmp (codeset_name, "UTF-8") == 0
903       || strcasecmp (codeset_name, "UTF8") == 0)
904     dfa->is_utf8 = 1;
905 
906   /* We check exhaustively in the loop below if this charset is a
907      superset of ASCII.  */
908   dfa->map_notascii = 0;
909 #endif
910 
911 #ifdef RE_ENABLE_I18N
912   if (dfa->mb_cur_max > 1)
913     {
914       if (dfa->is_utf8)
915 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
916       else
917 	{
918 	  int i, j, ch;
919 
920 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
921 	  if (BE (dfa->sb_char == NULL, 0))
922 	    return REG_ESPACE;
923 
924 	  /* Set the bits corresponding to single byte chars.  */
925 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
926 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
927 	      {
928 		wint_t wch = __btowc (ch);
929 		if (wch != WEOF)
930 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
931 # ifndef _LIBC
932 		if (isascii (ch) && wch != ch)
933 		  dfa->map_notascii = 1;
934 # endif
935 	      }
936 	}
937     }
938 #endif
939 
940   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
941     return REG_ESPACE;
942   return REG_NOERROR;
943 }
944 
945 /* Initialize WORD_CHAR table, which indicate which character is
946    "word".  In this case "word" means that it is the word construction
947    character used by some operators like "\<", "\>", etc.  */
948 
949 static void
950 internal_function
951 init_word_char (re_dfa_t *dfa)
952 {
953   int i, j, ch;
954   dfa->word_ops_used = 1;
955   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
956     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
957       if (isalnum (ch) || ch == '_')
958 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
959 }
960 
961 /* Free the work area which are only used while compiling.  */
962 
963 static void
964 free_workarea_compile (regex_t *preg)
965 {
966   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
967   bin_tree_storage_t *storage, *next;
968   for (storage = dfa->str_tree_storage; storage; storage = next)
969     {
970       next = storage->next;
971       re_free (storage);
972     }
973   dfa->str_tree_storage = NULL;
974   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
975   dfa->str_tree = NULL;
976   re_free (dfa->org_indices);
977   dfa->org_indices = NULL;
978 }
979 
980 /* Create initial states for all contexts.  */
981 
982 static reg_errcode_t
983 create_initial_state (re_dfa_t *dfa)
984 {
985   Idx first, i;
986   reg_errcode_t err;
987   re_node_set init_nodes;
988 
989   /* Initial states have the epsilon closure of the node which is
990      the first node of the regular expression.  */
991   first = dfa->str_tree->first->node_idx;
992   dfa->init_node = first;
993   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
994   if (BE (err != REG_NOERROR, 0))
995     return err;
996 
997   /* The back-references which are in initial states can epsilon transit,
998      since in this case all of the subexpressions can be null.
999      Then we add epsilon closures of the nodes which are the next nodes of
1000      the back-references.  */
1001   if (dfa->nbackref > 0)
1002     for (i = 0; i < init_nodes.nelem; ++i)
1003       {
1004 	Idx node_idx = init_nodes.elems[i];
1005 	re_token_type_t type = dfa->nodes[node_idx].type;
1006 
1007 	Idx clexp_idx;
1008 	if (type != OP_BACK_REF)
1009 	  continue;
1010 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1011 	  {
1012 	    re_token_t *clexp_node;
1013 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1014 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1015 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1016 	      break;
1017 	  }
1018 	if (clexp_idx == init_nodes.nelem)
1019 	  continue;
1020 
1021 	if (type == OP_BACK_REF)
1022 	  {
1023 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1024 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1025 	      {
1026 		reg_errcode_t merge_err
1027                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1028 		if (merge_err != REG_NOERROR)
1029 		  return merge_err;
1030 		i = 0;
1031 	      }
1032 	  }
1033       }
1034 
1035   /* It must be the first time to invoke acquire_state.  */
1036   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1037   /* We don't check ERR here, since the initial state must not be NULL.  */
1038   if (BE (dfa->init_state == NULL, 0))
1039     return err;
1040   if (dfa->init_state->has_constraint)
1041     {
1042       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1043 						       CONTEXT_WORD);
1044       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1045 						     CONTEXT_NEWLINE);
1046       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1047 							 &init_nodes,
1048 							 CONTEXT_NEWLINE
1049 							 | CONTEXT_BEGBUF);
1050       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1051 	      || dfa->init_state_begbuf == NULL, 0))
1052 	return err;
1053     }
1054   else
1055     dfa->init_state_word = dfa->init_state_nl
1056       = dfa->init_state_begbuf = dfa->init_state;
1057 
1058   re_node_set_free (&init_nodes);
1059   return REG_NOERROR;
1060 }
1061 
1062 #ifdef RE_ENABLE_I18N
1063 /* If it is possible to do searching in single byte encoding instead of UTF-8
1064    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1065    DFA nodes where needed.  */
1066 
1067 static void
1068 optimize_utf8 (re_dfa_t *dfa)
1069 {
1070   Idx node;
1071   int i;
1072   bool mb_chars = false;
1073   bool has_period = false;
1074 
1075   for (node = 0; node < dfa->nodes_len; ++node)
1076     switch (dfa->nodes[node].type)
1077       {
1078       case CHARACTER:
1079 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1080 	  mb_chars = true;
1081 	break;
1082       case ANCHOR:
1083 	switch (dfa->nodes[node].opr.ctx_type)
1084 	  {
1085 	  case LINE_FIRST:
1086 	  case LINE_LAST:
1087 	  case BUF_FIRST:
1088 	  case BUF_LAST:
1089 	    break;
1090 	  default:
1091 	    /* Word anchors etc. cannot be handled.  It's okay to test
1092 	       opr.ctx_type since constraints (for all DFA nodes) are
1093 	       created by ORing one or more opr.ctx_type values.  */
1094 	    return;
1095 	  }
1096 	break;
1097       case OP_PERIOD:
1098 	has_period = true;
1099 	break;
1100       case OP_BACK_REF:
1101       case OP_ALT:
1102       case END_OF_RE:
1103       case OP_DUP_ASTERISK:
1104       case OP_OPEN_SUBEXP:
1105       case OP_CLOSE_SUBEXP:
1106 	break;
1107       case COMPLEX_BRACKET:
1108 	return;
1109       case SIMPLE_BRACKET:
1110 	/* Just double check.  */
1111 	{
1112 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1113 			? 0
1114 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1115 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1116 	    {
1117 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1118 		return;
1119 	      rshift = 0;
1120 	    }
1121 	}
1122 	break;
1123       default:
1124 	abort ();
1125       }
1126 
1127   if (mb_chars || has_period)
1128     for (node = 0; node < dfa->nodes_len; ++node)
1129       {
1130 	if (dfa->nodes[node].type == CHARACTER
1131 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1132 	  dfa->nodes[node].mb_partial = 0;
1133 	else if (dfa->nodes[node].type == OP_PERIOD)
1134 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1135       }
1136 
1137   /* The search can be in single byte locale.  */
1138   dfa->mb_cur_max = 1;
1139   dfa->is_utf8 = 0;
1140   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1141 }
1142 #endif
1143 
1144 /* Analyze the structure tree, and calculate "first", "next", "edest",
1145    "eclosure", and "inveclosure".  */
1146 
1147 static reg_errcode_t
1148 analyze (regex_t *preg)
1149 {
1150   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1151   reg_errcode_t ret;
1152 
1153   /* Allocate arrays.  */
1154   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1155   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1156   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1157   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1158   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1159 	  || dfa->eclosures == NULL, 0))
1160     return REG_ESPACE;
1161 
1162   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1163   if (dfa->subexp_map != NULL)
1164     {
1165       Idx i;
1166       for (i = 0; i < preg->re_nsub; i++)
1167 	dfa->subexp_map[i] = i;
1168       preorder (dfa->str_tree, optimize_subexps, dfa);
1169       for (i = 0; i < preg->re_nsub; i++)
1170 	if (dfa->subexp_map[i] != i)
1171 	  break;
1172       if (i == preg->re_nsub)
1173 	{
1174 	  free (dfa->subexp_map);
1175 	  dfa->subexp_map = NULL;
1176 	}
1177     }
1178 
1179   ret = postorder (dfa->str_tree, lower_subexps, preg);
1180   if (BE (ret != REG_NOERROR, 0))
1181     return ret;
1182   ret = postorder (dfa->str_tree, calc_first, dfa);
1183   if (BE (ret != REG_NOERROR, 0))
1184     return ret;
1185   preorder (dfa->str_tree, calc_next, dfa);
1186   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1187   if (BE (ret != REG_NOERROR, 0))
1188     return ret;
1189   ret = calc_eclosure (dfa);
1190   if (BE (ret != REG_NOERROR, 0))
1191     return ret;
1192 
1193   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1194      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1195   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1196       || dfa->nbackref)
1197     {
1198       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1199       if (BE (dfa->inveclosures == NULL, 0))
1200 	return REG_ESPACE;
1201       ret = calc_inveclosure (dfa);
1202     }
1203 
1204   return ret;
1205 }
1206 
1207 /* Our parse trees are very unbalanced, so we cannot use a stack to
1208    implement parse tree visits.  Instead, we use parent pointers and
1209    some hairy code in these two functions.  */
1210 static reg_errcode_t
1211 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1212 	   void *extra)
1213 {
1214   bin_tree_t *node, *prev;
1215 
1216   for (node = root; ; )
1217     {
1218       /* Descend down the tree, preferably to the left (or to the right
1219 	 if that's the only child).  */
1220       while (node->left || node->right)
1221 	if (node->left)
1222 	  node = node->left;
1223 	else
1224 	  node = node->right;
1225 
1226       do
1227 	{
1228 	  reg_errcode_t err = fn (extra, node);
1229 	  if (BE (err != REG_NOERROR, 0))
1230 	    return err;
1231 	  if (node->parent == NULL)
1232 	    return REG_NOERROR;
1233 	  prev = node;
1234 	  node = node->parent;
1235 	}
1236       /* Go up while we have a node that is reached from the right.  */
1237       while (node->right == prev || node->right == NULL);
1238       node = node->right;
1239     }
1240 }
1241 
1242 static reg_errcode_t
1243 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1244 	  void *extra)
1245 {
1246   bin_tree_t *node;
1247 
1248   for (node = root; ; )
1249     {
1250       reg_errcode_t err = fn (extra, node);
1251       if (BE (err != REG_NOERROR, 0))
1252 	return err;
1253 
1254       /* Go to the left node, or up and to the right.  */
1255       if (node->left)
1256 	node = node->left;
1257       else
1258 	{
1259 	  bin_tree_t *prev = NULL;
1260 	  while (node->right == prev || node->right == NULL)
1261 	    {
1262 	      prev = node;
1263 	      node = node->parent;
1264 	      if (!node)
1265 		return REG_NOERROR;
1266 	    }
1267 	  node = node->right;
1268 	}
1269     }
1270 }
1271 
1272 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1273    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1274    backreferences as well.  Requires a preorder visit.  */
1275 static reg_errcode_t
1276 optimize_subexps (void *extra, bin_tree_t *node)
1277 {
1278   re_dfa_t *dfa = (re_dfa_t *) extra;
1279 
1280   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1281     {
1282       int idx = node->token.opr.idx;
1283       node->token.opr.idx = dfa->subexp_map[idx];
1284       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1285     }
1286 
1287   else if (node->token.type == SUBEXP
1288 	   && node->left && node->left->token.type == SUBEXP)
1289     {
1290       Idx other_idx = node->left->token.opr.idx;
1291 
1292       node->left = node->left->left;
1293       if (node->left)
1294 	node->left->parent = node;
1295 
1296       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1297       if (other_idx < BITSET_WORD_BITS)
1298 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1299     }
1300 
1301   return REG_NOERROR;
1302 }
1303 
1304 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1305    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1306 static reg_errcode_t
1307 lower_subexps (void *extra, bin_tree_t *node)
1308 {
1309   regex_t *preg = (regex_t *) extra;
1310   reg_errcode_t err = REG_NOERROR;
1311 
1312   if (node->left && node->left->token.type == SUBEXP)
1313     {
1314       node->left = lower_subexp (&err, preg, node->left);
1315       if (node->left)
1316 	node->left->parent = node;
1317     }
1318   if (node->right && node->right->token.type == SUBEXP)
1319     {
1320       node->right = lower_subexp (&err, preg, node->right);
1321       if (node->right)
1322 	node->right->parent = node;
1323     }
1324 
1325   return err;
1326 }
1327 
1328 static bin_tree_t *
1329 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1330 {
1331   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1332   bin_tree_t *body = node->left;
1333   bin_tree_t *op, *cls, *tree1, *tree;
1334 
1335   if (preg->no_sub
1336       /* We do not optimize empty subexpressions, because otherwise we may
1337 	 have bad CONCAT nodes with NULL children.  This is obviously not
1338 	 very common, so we do not lose much.  An example that triggers
1339 	 this case is the sed "script" /\(\)/x.  */
1340       && node->left != NULL
1341       && (node->token.opr.idx >= BITSET_WORD_BITS
1342 	  || !(dfa->used_bkref_map
1343 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1344     return node->left;
1345 
1346   /* Convert the SUBEXP node to the concatenation of an
1347      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1348   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1349   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1350   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1351   tree = create_tree (dfa, op, tree1, CONCAT);
1352   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1353     {
1354       *err = REG_ESPACE;
1355       return NULL;
1356     }
1357 
1358   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1359   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1360   return tree;
1361 }
1362 
1363 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1364    nodes.  Requires a postorder visit.  */
1365 static reg_errcode_t
1366 calc_first (void *extra, bin_tree_t *node)
1367 {
1368   re_dfa_t *dfa = (re_dfa_t *) extra;
1369   if (node->token.type == CONCAT)
1370     {
1371       node->first = node->left->first;
1372       node->node_idx = node->left->node_idx;
1373     }
1374   else
1375     {
1376       node->first = node;
1377       node->node_idx = re_dfa_add_node (dfa, node->token);
1378       if (BE (node->node_idx == REG_MISSING, 0))
1379 	return REG_ESPACE;
1380       if (node->token.type == ANCHOR)
1381 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1382     }
1383   return REG_NOERROR;
1384 }
1385 
1386 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1387 static reg_errcode_t
1388 calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
1389 {
1390   switch (node->token.type)
1391     {
1392     case OP_DUP_ASTERISK:
1393       node->left->next = node;
1394       break;
1395     case CONCAT:
1396       node->left->next = node->right->first;
1397       node->right->next = node->next;
1398       break;
1399     default:
1400       if (node->left)
1401 	node->left->next = node->next;
1402       if (node->right)
1403 	node->right->next = node->next;
1404       break;
1405     }
1406   return REG_NOERROR;
1407 }
1408 
1409 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1410 static reg_errcode_t
1411 link_nfa_nodes (void *extra, bin_tree_t *node)
1412 {
1413   re_dfa_t *dfa = (re_dfa_t *) extra;
1414   Idx idx = node->node_idx;
1415   reg_errcode_t err = REG_NOERROR;
1416 
1417   switch (node->token.type)
1418     {
1419     case CONCAT:
1420       break;
1421 
1422     case END_OF_RE:
1423       assert (node->next == NULL);
1424       break;
1425 
1426     case OP_DUP_ASTERISK:
1427     case OP_ALT:
1428       {
1429 	Idx left, right;
1430 	dfa->has_plural_match = 1;
1431 	if (node->left != NULL)
1432 	  left = node->left->first->node_idx;
1433 	else
1434 	  left = node->next->node_idx;
1435 	if (node->right != NULL)
1436 	  right = node->right->first->node_idx;
1437 	else
1438 	  right = node->next->node_idx;
1439 	assert (REG_VALID_INDEX (left));
1440 	assert (REG_VALID_INDEX (right));
1441 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1442       }
1443       break;
1444 
1445     case ANCHOR:
1446     case OP_OPEN_SUBEXP:
1447     case OP_CLOSE_SUBEXP:
1448       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1449       break;
1450 
1451     case OP_BACK_REF:
1452       dfa->nexts[idx] = node->next->node_idx;
1453       if (node->token.type == OP_BACK_REF)
1454 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1455       break;
1456 
1457     default:
1458       assert (!IS_EPSILON_NODE (node->token.type));
1459       dfa->nexts[idx] = node->next->node_idx;
1460       break;
1461     }
1462 
1463   return err;
1464 }
1465 
1466 /* Duplicate the epsilon closure of the node ROOT_NODE.
1467    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1468    to their own constraint.  */
1469 
1470 static reg_errcode_t
1471 internal_function
1472 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1473 			Idx root_node, unsigned int init_constraint)
1474 {
1475   Idx org_node, clone_node;
1476   bool ok;
1477   unsigned int constraint = init_constraint;
1478   for (org_node = top_org_node, clone_node = top_clone_node;;)
1479     {
1480       Idx org_dest, clone_dest;
1481       if (dfa->nodes[org_node].type == OP_BACK_REF)
1482 	{
1483 	  /* If the back reference epsilon-transit, its destination must
1484 	     also have the constraint.  Then duplicate the epsilon closure
1485 	     of the destination of the back reference, and store it in
1486 	     edests of the back reference.  */
1487 	  org_dest = dfa->nexts[org_node];
1488 	  re_node_set_empty (dfa->edests + clone_node);
1489 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1490 	  if (BE (clone_dest == REG_MISSING, 0))
1491 	    return REG_ESPACE;
1492 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1493 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1494 	  if (BE (! ok, 0))
1495 	    return REG_ESPACE;
1496 	}
1497       else if (dfa->edests[org_node].nelem == 0)
1498 	{
1499 	  /* In case of the node can't epsilon-transit, don't duplicate the
1500 	     destination and store the original destination as the
1501 	     destination of the node.  */
1502 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1503 	  break;
1504 	}
1505       else if (dfa->edests[org_node].nelem == 1)
1506 	{
1507 	  /* In case of the node can epsilon-transit, and it has only one
1508 	     destination.  */
1509 	  org_dest = dfa->edests[org_node].elems[0];
1510 	  re_node_set_empty (dfa->edests + clone_node);
1511 	  /* If the node is root_node itself, it means the epsilon closure
1512 	     has a loop.  Then tie it to the destination of the root_node.  */
1513 	  if (org_node == root_node && clone_node != org_node)
1514 	    {
1515 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1516 	      if (BE (! ok, 0))
1517 	        return REG_ESPACE;
1518 	      break;
1519 	    }
1520 	  /* In case the node has another constraint, append it.  */
1521 	  constraint |= dfa->nodes[org_node].constraint;
1522 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1523 	  if (BE (clone_dest == REG_MISSING, 0))
1524 	    return REG_ESPACE;
1525 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1526 	  if (BE (! ok, 0))
1527 	    return REG_ESPACE;
1528 	}
1529       else /* dfa->edests[org_node].nelem == 2 */
1530 	{
1531 	  /* In case of the node can epsilon-transit, and it has two
1532 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1533 	  org_dest = dfa->edests[org_node].elems[0];
1534 	  re_node_set_empty (dfa->edests + clone_node);
1535 	  /* Search for a duplicated node which satisfies the constraint.  */
1536 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1537 	  if (clone_dest == REG_MISSING)
1538 	    {
1539 	      /* There is no such duplicated node, create a new one.  */
1540 	      reg_errcode_t err;
1541 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1542 	      if (BE (clone_dest == REG_MISSING, 0))
1543 		return REG_ESPACE;
1544 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1545 	      if (BE (! ok, 0))
1546 		return REG_ESPACE;
1547 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1548 					    root_node, constraint);
1549 	      if (BE (err != REG_NOERROR, 0))
1550 		return err;
1551 	    }
1552 	  else
1553 	    {
1554 	      /* There is a duplicated node which satisfies the constraint,
1555 		 use it to avoid infinite loop.  */
1556 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557 	      if (BE (! ok, 0))
1558 		return REG_ESPACE;
1559 	    }
1560 
1561 	  org_dest = dfa->edests[org_node].elems[1];
1562 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1563 	  if (BE (clone_dest == REG_MISSING, 0))
1564 	    return REG_ESPACE;
1565 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566 	  if (BE (! ok, 0))
1567 	    return REG_ESPACE;
1568 	}
1569       org_node = org_dest;
1570       clone_node = clone_dest;
1571     }
1572   return REG_NOERROR;
1573 }
1574 
1575 /* Search for a node which is duplicated from the node ORG_NODE, and
1576    satisfies the constraint CONSTRAINT.  */
1577 
1578 static Idx
1579 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1580 			unsigned int constraint)
1581 {
1582   Idx idx;
1583   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1584     {
1585       if (org_node == dfa->org_indices[idx]
1586 	  && constraint == dfa->nodes[idx].constraint)
1587 	return idx; /* Found.  */
1588     }
1589   return REG_MISSING; /* Not found.  */
1590 }
1591 
1592 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1593    Return the index of the new node, or REG_MISSING if insufficient storage is
1594    available.  */
1595 
1596 static Idx
1597 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1598 {
1599   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1600   if (BE (dup_idx != REG_MISSING, 1))
1601     {
1602       dfa->nodes[dup_idx].constraint = constraint;
1603       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1604       dfa->nodes[dup_idx].duplicated = 1;
1605 
1606       /* Store the index of the original node.  */
1607       dfa->org_indices[dup_idx] = org_idx;
1608     }
1609   return dup_idx;
1610 }
1611 
1612 static reg_errcode_t
1613 calc_inveclosure (re_dfa_t *dfa)
1614 {
1615   Idx src, idx;
1616   bool ok;
1617   for (idx = 0; idx < dfa->nodes_len; ++idx)
1618     re_node_set_init_empty (dfa->inveclosures + idx);
1619 
1620   for (src = 0; src < dfa->nodes_len; ++src)
1621     {
1622       Idx *elems = dfa->eclosures[src].elems;
1623       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1624 	{
1625 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1626 	  if (BE (! ok, 0))
1627 	    return REG_ESPACE;
1628 	}
1629     }
1630 
1631   return REG_NOERROR;
1632 }
1633 
1634 /* Calculate "eclosure" for all the node in DFA.  */
1635 
1636 static reg_errcode_t
1637 calc_eclosure (re_dfa_t *dfa)
1638 {
1639   Idx node_idx;
1640   bool incomplete;
1641 #ifdef DEBUG
1642   assert (dfa->nodes_len > 0);
1643 #endif
1644   incomplete = false;
1645   /* For each nodes, calculate epsilon closure.  */
1646   for (node_idx = 0; ; ++node_idx)
1647     {
1648       reg_errcode_t err;
1649       re_node_set eclosure_elem;
1650       if (node_idx == dfa->nodes_len)
1651 	{
1652 	  if (!incomplete)
1653 	    break;
1654 	  incomplete = false;
1655 	  node_idx = 0;
1656 	}
1657 
1658 #ifdef DEBUG
1659       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1660 #endif
1661 
1662       /* If we have already calculated, skip it.  */
1663       if (dfa->eclosures[node_idx].nelem != 0)
1664 	continue;
1665       /* Calculate epsilon closure of `node_idx'.  */
1666       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1667       if (BE (err != REG_NOERROR, 0))
1668 	return err;
1669 
1670       if (dfa->eclosures[node_idx].nelem == 0)
1671 	{
1672 	  incomplete = true;
1673 	  re_node_set_free (&eclosure_elem);
1674 	}
1675     }
1676   return REG_NOERROR;
1677 }
1678 
1679 /* Calculate epsilon closure of NODE.  */
1680 
1681 static reg_errcode_t
1682 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1683 {
1684   reg_errcode_t err;
1685   Idx i;
1686   re_node_set eclosure;
1687   bool ok;
1688   bool incomplete = false;
1689   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1690   if (BE (err != REG_NOERROR, 0))
1691     return err;
1692 
1693   /* This indicates that we are calculating this node now.
1694      We reference this value to avoid infinite loop.  */
1695   dfa->eclosures[node].nelem = REG_MISSING;
1696 
1697   /* If the current node has constraints, duplicate all nodes
1698      since they must inherit the constraints.  */
1699   if (dfa->nodes[node].constraint
1700       && dfa->edests[node].nelem
1701       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1702     {
1703       err = duplicate_node_closure (dfa, node, node, node,
1704 				    dfa->nodes[node].constraint);
1705       if (BE (err != REG_NOERROR, 0))
1706 	return err;
1707     }
1708 
1709   /* Expand each epsilon destination nodes.  */
1710   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1711     for (i = 0; i < dfa->edests[node].nelem; ++i)
1712       {
1713 	re_node_set eclosure_elem;
1714 	Idx edest = dfa->edests[node].elems[i];
1715 	/* If calculating the epsilon closure of `edest' is in progress,
1716 	   return intermediate result.  */
1717 	if (dfa->eclosures[edest].nelem == REG_MISSING)
1718 	  {
1719 	    incomplete = true;
1720 	    continue;
1721 	  }
1722 	/* If we haven't calculated the epsilon closure of `edest' yet,
1723 	   calculate now. Otherwise use calculated epsilon closure.  */
1724 	if (dfa->eclosures[edest].nelem == 0)
1725 	  {
1726 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1727 	    if (BE (err != REG_NOERROR, 0))
1728 	      return err;
1729 	  }
1730 	else
1731 	  eclosure_elem = dfa->eclosures[edest];
1732 	/* Merge the epsilon closure of `edest'.  */
1733 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1734 	if (BE (err != REG_NOERROR, 0))
1735 	  return err;
1736 	/* If the epsilon closure of `edest' is incomplete,
1737 	   the epsilon closure of this node is also incomplete.  */
1738 	if (dfa->eclosures[edest].nelem == 0)
1739 	  {
1740 	    incomplete = true;
1741 	    re_node_set_free (&eclosure_elem);
1742 	  }
1743       }
1744 
1745   /* An epsilon closure includes itself.  */
1746   ok = re_node_set_insert (&eclosure, node);
1747   if (BE (! ok, 0))
1748     return REG_ESPACE;
1749   if (incomplete && !root)
1750     dfa->eclosures[node].nelem = 0;
1751   else
1752     dfa->eclosures[node] = eclosure;
1753   *new_set = eclosure;
1754   return REG_NOERROR;
1755 }
1756 
1757 /* Functions for token which are used in the parser.  */
1758 
1759 /* Fetch a token from INPUT.
1760    We must not use this function inside bracket expressions.  */
1761 
1762 static void
1763 internal_function
1764 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1765 {
1766   re_string_skip_bytes (input, peek_token (result, input, syntax));
1767 }
1768 
1769 /* Peek a token from INPUT, and return the length of the token.
1770    We must not use this function inside bracket expressions.  */
1771 
1772 static int
1773 internal_function
1774 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1775 {
1776   unsigned char c;
1777 
1778   if (re_string_eoi (input))
1779     {
1780       token->type = END_OF_RE;
1781       return 0;
1782     }
1783 
1784   c = re_string_peek_byte (input, 0);
1785   token->opr.c = c;
1786 
1787   token->word_char = 0;
1788 #ifdef RE_ENABLE_I18N
1789   token->mb_partial = 0;
1790   if (input->mb_cur_max > 1 &&
1791       !re_string_first_byte (input, re_string_cur_idx (input)))
1792     {
1793       token->type = CHARACTER;
1794       token->mb_partial = 1;
1795       return 1;
1796     }
1797 #endif
1798   if (c == '\\')
1799     {
1800       unsigned char c2;
1801       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1802 	{
1803 	  token->type = BACK_SLASH;
1804 	  return 1;
1805 	}
1806 
1807       c2 = re_string_peek_byte_case (input, 1);
1808       token->opr.c = c2;
1809       token->type = CHARACTER;
1810 #ifdef RE_ENABLE_I18N
1811       if (input->mb_cur_max > 1)
1812 	{
1813 	  wint_t wc = re_string_wchar_at (input,
1814 					  re_string_cur_idx (input) + 1);
1815 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1816 	}
1817       else
1818 #endif
1819 	token->word_char = IS_WORD_CHAR (c2) != 0;
1820 
1821       switch (c2)
1822 	{
1823 	case '|':
1824 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1825 	    token->type = OP_ALT;
1826 	  break;
1827 	case '1': case '2': case '3': case '4': case '5':
1828 	case '6': case '7': case '8': case '9':
1829 	  if (!(syntax & RE_NO_BK_REFS))
1830 	    {
1831 	      token->type = OP_BACK_REF;
1832 	      token->opr.idx = c2 - '1';
1833 	    }
1834 	  break;
1835 	case '<':
1836 	  if (!(syntax & RE_NO_GNU_OPS))
1837 	    {
1838 	      token->type = ANCHOR;
1839 	      token->opr.ctx_type = WORD_FIRST;
1840 	    }
1841 	  break;
1842 	case '>':
1843 	  if (!(syntax & RE_NO_GNU_OPS))
1844 	    {
1845 	      token->type = ANCHOR;
1846 	      token->opr.ctx_type = WORD_LAST;
1847 	    }
1848 	  break;
1849 	case 'b':
1850 	  if (!(syntax & RE_NO_GNU_OPS))
1851 	    {
1852 	      token->type = ANCHOR;
1853 	      token->opr.ctx_type = WORD_DELIM;
1854 	    }
1855 	  break;
1856 	case 'B':
1857 	  if (!(syntax & RE_NO_GNU_OPS))
1858 	    {
1859 	      token->type = ANCHOR;
1860 	      token->opr.ctx_type = NOT_WORD_DELIM;
1861 	    }
1862 	  break;
1863 	case 'w':
1864 	  if (!(syntax & RE_NO_GNU_OPS))
1865 	    token->type = OP_WORD;
1866 	  break;
1867 	case 'W':
1868 	  if (!(syntax & RE_NO_GNU_OPS))
1869 	    token->type = OP_NOTWORD;
1870 	  break;
1871 	case 's':
1872 	  if (!(syntax & RE_NO_GNU_OPS))
1873 	    token->type = OP_SPACE;
1874 	  break;
1875 	case 'S':
1876 	  if (!(syntax & RE_NO_GNU_OPS))
1877 	    token->type = OP_NOTSPACE;
1878 	  break;
1879 	case '`':
1880 	  if (!(syntax & RE_NO_GNU_OPS))
1881 	    {
1882 	      token->type = ANCHOR;
1883 	      token->opr.ctx_type = BUF_FIRST;
1884 	    }
1885 	  break;
1886 	case '\'':
1887 	  if (!(syntax & RE_NO_GNU_OPS))
1888 	    {
1889 	      token->type = ANCHOR;
1890 	      token->opr.ctx_type = BUF_LAST;
1891 	    }
1892 	  break;
1893 	case '(':
1894 	  if (!(syntax & RE_NO_BK_PARENS))
1895 	    token->type = OP_OPEN_SUBEXP;
1896 	  break;
1897 	case ')':
1898 	  if (!(syntax & RE_NO_BK_PARENS))
1899 	    token->type = OP_CLOSE_SUBEXP;
1900 	  break;
1901 	case '+':
1902 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1903 	    token->type = OP_DUP_PLUS;
1904 	  break;
1905 	case '?':
1906 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907 	    token->type = OP_DUP_QUESTION;
1908 	  break;
1909 	case '{':
1910 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1911 	    token->type = OP_OPEN_DUP_NUM;
1912 	  break;
1913 	case '}':
1914 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915 	    token->type = OP_CLOSE_DUP_NUM;
1916 	  break;
1917 	default:
1918 	  break;
1919 	}
1920       return 2;
1921     }
1922 
1923   token->type = CHARACTER;
1924 #ifdef RE_ENABLE_I18N
1925   if (input->mb_cur_max > 1)
1926     {
1927       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1928       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1929     }
1930   else
1931 #endif
1932     token->word_char = IS_WORD_CHAR (token->opr.c);
1933 
1934   switch (c)
1935     {
1936     case '\n':
1937       if (syntax & RE_NEWLINE_ALT)
1938 	token->type = OP_ALT;
1939       break;
1940     case '|':
1941       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1942 	token->type = OP_ALT;
1943       break;
1944     case '*':
1945       token->type = OP_DUP_ASTERISK;
1946       break;
1947     case '+':
1948       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1949 	token->type = OP_DUP_PLUS;
1950       break;
1951     case '?':
1952       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953 	token->type = OP_DUP_QUESTION;
1954       break;
1955     case '{':
1956       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1957 	token->type = OP_OPEN_DUP_NUM;
1958       break;
1959     case '}':
1960       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961 	token->type = OP_CLOSE_DUP_NUM;
1962       break;
1963     case '(':
1964       if (syntax & RE_NO_BK_PARENS)
1965 	token->type = OP_OPEN_SUBEXP;
1966       break;
1967     case ')':
1968       if (syntax & RE_NO_BK_PARENS)
1969 	token->type = OP_CLOSE_SUBEXP;
1970       break;
1971     case '[':
1972       token->type = OP_OPEN_BRACKET;
1973       break;
1974     case '.':
1975       token->type = OP_PERIOD;
1976       break;
1977     case '^':
1978       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1979 	  re_string_cur_idx (input) != 0)
1980 	{
1981 	  char prev = re_string_peek_byte (input, -1);
1982 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1983 	    break;
1984 	}
1985       token->type = ANCHOR;
1986       token->opr.ctx_type = LINE_FIRST;
1987       break;
1988     case '$':
1989       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1990 	  re_string_cur_idx (input) + 1 != re_string_length (input))
1991 	{
1992 	  re_token_t next;
1993 	  re_string_skip_bytes (input, 1);
1994 	  peek_token (&next, input, syntax);
1995 	  re_string_skip_bytes (input, -1);
1996 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1997 	    break;
1998 	}
1999       token->type = ANCHOR;
2000       token->opr.ctx_type = LINE_LAST;
2001       break;
2002     default:
2003       break;
2004     }
2005   return 1;
2006 }
2007 
2008 /* Peek a token from INPUT, and return the length of the token.
2009    We must not use this function out of bracket expressions.  */
2010 
2011 static int
2012 internal_function
2013 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2014 {
2015   unsigned char c;
2016   if (re_string_eoi (input))
2017     {
2018       token->type = END_OF_RE;
2019       return 0;
2020     }
2021   c = re_string_peek_byte (input, 0);
2022   token->opr.c = c;
2023 
2024 #ifdef RE_ENABLE_I18N
2025   if (input->mb_cur_max > 1 &&
2026       !re_string_first_byte (input, re_string_cur_idx (input)))
2027     {
2028       token->type = CHARACTER;
2029       return 1;
2030     }
2031 #endif /* RE_ENABLE_I18N */
2032 
2033   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2034       && re_string_cur_idx (input) + 1 < re_string_length (input))
2035     {
2036       /* In this case, '\' escape a character.  */
2037       unsigned char c2;
2038       re_string_skip_bytes (input, 1);
2039       c2 = re_string_peek_byte (input, 0);
2040       token->opr.c = c2;
2041       token->type = CHARACTER;
2042       return 1;
2043     }
2044   if (c == '[') /* '[' is a special char in a bracket exps.  */
2045     {
2046       unsigned char c2;
2047       int token_len;
2048       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2049 	c2 = re_string_peek_byte (input, 1);
2050       else
2051 	c2 = 0;
2052       token->opr.c = c2;
2053       token_len = 2;
2054       switch (c2)
2055 	{
2056 	case '.':
2057 	  token->type = OP_OPEN_COLL_ELEM;
2058 	  break;
2059 	case '=':
2060 	  token->type = OP_OPEN_EQUIV_CLASS;
2061 	  break;
2062 	case ':':
2063 	  if (syntax & RE_CHAR_CLASSES)
2064 	    {
2065 	      token->type = OP_OPEN_CHAR_CLASS;
2066 	      break;
2067 	    }
2068 	  /* else fall through.  */
2069 	default:
2070 	  token->type = CHARACTER;
2071 	  token->opr.c = c;
2072 	  token_len = 1;
2073 	  break;
2074 	}
2075       return token_len;
2076     }
2077   switch (c)
2078     {
2079     case '-':
2080       token->type = OP_CHARSET_RANGE;
2081       break;
2082     case ']':
2083       token->type = OP_CLOSE_BRACKET;
2084       break;
2085     case '^':
2086       token->type = OP_NON_MATCH_LIST;
2087       break;
2088     default:
2089       token->type = CHARACTER;
2090     }
2091   return 1;
2092 }
2093 
2094 /* Functions for parser.  */
2095 
2096 /* Entry point of the parser.
2097    Parse the regular expression REGEXP and return the structure tree.
2098    If an error is occured, ERR is set by error code, and return NULL.
2099    This function build the following tree, from regular expression <reg_exp>:
2100 	   CAT
2101 	   / \
2102 	  /   \
2103    <reg_exp>  EOR
2104 
2105    CAT means concatenation.
2106    EOR means end of regular expression.  */
2107 
2108 static bin_tree_t *
2109 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2110        reg_errcode_t *err)
2111 {
2112   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113   bin_tree_t *tree, *eor, *root;
2114   re_token_t current_token;
2115   dfa->syntax = syntax;
2116   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2117   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2118   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2119     return NULL;
2120   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2121   if (tree != NULL)
2122     root = create_tree (dfa, tree, eor, CONCAT);
2123   else
2124     root = eor;
2125   if (BE (eor == NULL || root == NULL, 0))
2126     {
2127       *err = REG_ESPACE;
2128       return NULL;
2129     }
2130   return root;
2131 }
2132 
2133 /* This function build the following tree, from regular expression
2134    <branch1>|<branch2>:
2135 	   ALT
2136 	   / \
2137 	  /   \
2138    <branch1> <branch2>
2139 
2140    ALT means alternative, which represents the operator `|'.  */
2141 
2142 static bin_tree_t *
2143 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2144 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2145 {
2146   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2147   bin_tree_t *tree, *branch = NULL;
2148   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2149   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2150     return NULL;
2151 
2152   while (token->type == OP_ALT)
2153     {
2154       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155       if (token->type != OP_ALT && token->type != END_OF_RE
2156 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2157 	{
2158 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2159 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2160 	    return NULL;
2161 	}
2162       else
2163 	branch = NULL;
2164       tree = create_tree (dfa, tree, branch, OP_ALT);
2165       if (BE (tree == NULL, 0))
2166 	{
2167 	  *err = REG_ESPACE;
2168 	  return NULL;
2169 	}
2170     }
2171   return tree;
2172 }
2173 
2174 /* This function build the following tree, from regular expression
2175    <exp1><exp2>:
2176 	CAT
2177 	/ \
2178        /   \
2179    <exp1> <exp2>
2180 
2181    CAT means concatenation.  */
2182 
2183 static bin_tree_t *
2184 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2185 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2186 {
2187   bin_tree_t *tree, *expr;
2188   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2189   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2190   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2191     return NULL;
2192 
2193   while (token->type != OP_ALT && token->type != END_OF_RE
2194 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2195     {
2196       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2197       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2198 	{
2199 	  return NULL;
2200 	}
2201       if (tree != NULL && expr != NULL)
2202 	{
2203 	  tree = create_tree (dfa, tree, expr, CONCAT);
2204 	  if (tree == NULL)
2205 	    {
2206 	      *err = REG_ESPACE;
2207 	      return NULL;
2208 	    }
2209 	}
2210       else if (tree == NULL)
2211 	tree = expr;
2212       /* Otherwise expr == NULL, we don't need to create new tree.  */
2213     }
2214   return tree;
2215 }
2216 
2217 /* This function build the following tree, from regular expression a*:
2218 	 *
2219 	 |
2220 	 a
2221 */
2222 
2223 static bin_tree_t *
2224 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2226 {
2227   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228   bin_tree_t *tree;
2229   switch (token->type)
2230     {
2231     case CHARACTER:
2232       tree = create_token_tree (dfa, NULL, NULL, token);
2233       if (BE (tree == NULL, 0))
2234 	{
2235 	  *err = REG_ESPACE;
2236 	  return NULL;
2237 	}
2238 #ifdef RE_ENABLE_I18N
2239       if (dfa->mb_cur_max > 1)
2240 	{
2241 	  while (!re_string_eoi (regexp)
2242 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2243 	    {
2244 	      bin_tree_t *mbc_remain;
2245 	      fetch_token (token, regexp, syntax);
2246 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2249 		{
2250 		  *err = REG_ESPACE;
2251 		  return NULL;
2252 		}
2253 	    }
2254 	}
2255 #endif
2256       break;
2257     case OP_OPEN_SUBEXP:
2258       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260 	return NULL;
2261       break;
2262     case OP_OPEN_BRACKET:
2263       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265 	return NULL;
2266       break;
2267     case OP_BACK_REF:
2268       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2269 	{
2270 	  *err = REG_ESUBREG;
2271 	  return NULL;
2272 	}
2273       dfa->used_bkref_map |= 1 << token->opr.idx;
2274       tree = create_token_tree (dfa, NULL, NULL, token);
2275       if (BE (tree == NULL, 0))
2276 	{
2277 	  *err = REG_ESPACE;
2278 	  return NULL;
2279 	}
2280       ++dfa->nbackref;
2281       dfa->has_mb_node = 1;
2282       break;
2283     case OP_OPEN_DUP_NUM:
2284       if (syntax & RE_CONTEXT_INVALID_DUP)
2285 	{
2286 	  *err = REG_BADRPT;
2287 	  return NULL;
2288 	}
2289       /* FALLTHROUGH */
2290     case OP_DUP_ASTERISK:
2291     case OP_DUP_PLUS:
2292     case OP_DUP_QUESTION:
2293       if (syntax & RE_CONTEXT_INVALID_OPS)
2294 	{
2295 	  *err = REG_BADRPT;
2296 	  return NULL;
2297 	}
2298       else if (syntax & RE_CONTEXT_INDEP_OPS)
2299 	{
2300 	  fetch_token (token, regexp, syntax);
2301 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2302 	}
2303       /* else fall through  */
2304     case OP_CLOSE_SUBEXP:
2305       if ((token->type == OP_CLOSE_SUBEXP) &&
2306 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2307 	{
2308 	  *err = REG_ERPAREN;
2309 	  return NULL;
2310 	}
2311       /* else fall through  */
2312     case OP_CLOSE_DUP_NUM:
2313       /* We treat it as a normal character.  */
2314 
2315       /* Then we can these characters as normal characters.  */
2316       token->type = CHARACTER;
2317       /* mb_partial and word_char bits should be initialized already
2318 	 by peek_token.  */
2319       tree = create_token_tree (dfa, NULL, NULL, token);
2320       if (BE (tree == NULL, 0))
2321 	{
2322 	  *err = REG_ESPACE;
2323 	  return NULL;
2324 	}
2325       break;
2326     case ANCHOR:
2327       if ((token->opr.ctx_type
2328 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329 	  && dfa->word_ops_used == 0)
2330 	init_word_char (dfa);
2331       if (token->opr.ctx_type == WORD_DELIM
2332 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2333 	{
2334 	  bin_tree_t *tree_first, *tree_last;
2335 	  if (token->opr.ctx_type == WORD_DELIM)
2336 	    {
2337 	      token->opr.ctx_type = WORD_FIRST;
2338 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2339 	      token->opr.ctx_type = WORD_LAST;
2340 	    }
2341 	  else
2342 	    {
2343 	      token->opr.ctx_type = INSIDE_WORD;
2344 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2345 	      token->opr.ctx_type = INSIDE_NOTWORD;
2346 	    }
2347 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2348 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2350 	    {
2351 	      *err = REG_ESPACE;
2352 	      return NULL;
2353 	    }
2354 	}
2355       else
2356 	{
2357 	  tree = create_token_tree (dfa, NULL, NULL, token);
2358 	  if (BE (tree == NULL, 0))
2359 	    {
2360 	      *err = REG_ESPACE;
2361 	      return NULL;
2362 	    }
2363 	}
2364       /* We must return here, since ANCHORs can't be followed
2365 	 by repetition operators.
2366 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2368       fetch_token (token, regexp, syntax);
2369       return tree;
2370     case OP_PERIOD:
2371       tree = create_token_tree (dfa, NULL, NULL, token);
2372       if (BE (tree == NULL, 0))
2373 	{
2374 	  *err = REG_ESPACE;
2375 	  return NULL;
2376 	}
2377       if (dfa->mb_cur_max > 1)
2378 	dfa->has_mb_node = 1;
2379       break;
2380     case OP_WORD:
2381     case OP_NOTWORD:
2382       tree = build_charclass_op (dfa, regexp->trans,
2383 				 (const unsigned char *) "alnum",
2384 				 (const unsigned char *) "_",
2385 				 token->type == OP_NOTWORD, err);
2386       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387 	return NULL;
2388       break;
2389     case OP_SPACE:
2390     case OP_NOTSPACE:
2391       tree = build_charclass_op (dfa, regexp->trans,
2392 				 (const unsigned char *) "space",
2393 				 (const unsigned char *) "",
2394 				 token->type == OP_NOTSPACE, err);
2395       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396 	return NULL;
2397       break;
2398     case OP_ALT:
2399     case END_OF_RE:
2400       return NULL;
2401     case BACK_SLASH:
2402       *err = REG_EESCAPE;
2403       return NULL;
2404     default:
2405       /* Must not happen?  */
2406 #ifdef DEBUG
2407       assert (0);
2408 #endif
2409       return NULL;
2410     }
2411   fetch_token (token, regexp, syntax);
2412 
2413   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2415     {
2416       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418 	return NULL;
2419       /* In BRE consecutive duplications are not allowed.  */
2420       if ((syntax & RE_CONTEXT_INVALID_DUP)
2421 	  && (token->type == OP_DUP_ASTERISK
2422 	      || token->type == OP_OPEN_DUP_NUM))
2423 	{
2424 	  *err = REG_BADRPT;
2425 	  return NULL;
2426 	}
2427     }
2428 
2429   return tree;
2430 }
2431 
2432 /* This function build the following tree, from regular expression
2433    (<reg_exp>):
2434 	 SUBEXP
2435 	    |
2436 	<reg_exp>
2437 */
2438 
2439 static bin_tree_t *
2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2442 {
2443   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444   bin_tree_t *tree;
2445   size_t cur_nsub;
2446   cur_nsub = preg->re_nsub++;
2447 
2448   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2449 
2450   /* The subexpression may be a null string.  */
2451   if (token->type == OP_CLOSE_SUBEXP)
2452     tree = NULL;
2453   else
2454     {
2455       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2457 	*err = REG_EPAREN;
2458       if (BE (*err != REG_NOERROR, 0))
2459 	return NULL;
2460     }
2461 
2462   if (cur_nsub <= '9' - '1')
2463     dfa->completed_bkref_map |= 1 << cur_nsub;
2464 
2465   tree = create_tree (dfa, tree, NULL, SUBEXP);
2466   if (BE (tree == NULL, 0))
2467     {
2468       *err = REG_ESPACE;
2469       return NULL;
2470     }
2471   tree->token.opr.idx = cur_nsub;
2472   return tree;
2473 }
2474 
2475 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2476 
2477 static bin_tree_t *
2478 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2479 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2480 {
2481   bin_tree_t *tree = NULL, *old_tree = NULL;
2482   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2483   re_token_t start_token = *token;
2484 
2485   if (token->type == OP_OPEN_DUP_NUM)
2486     {
2487       end = 0;
2488       start = fetch_number (regexp, token, syntax);
2489       if (start == REG_MISSING)
2490 	{
2491 	  if (token->type == CHARACTER && token->opr.c == ',')
2492 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2493 	  else
2494 	    {
2495 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2496 	      return NULL;
2497 	    }
2498 	}
2499       if (BE (start != REG_ERROR, 1))
2500 	{
2501 	  /* We treat "{n}" as "{n,n}".  */
2502 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2503 		 : ((token->type == CHARACTER && token->opr.c == ',')
2504 		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2505 	}
2506       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2507 	{
2508 	  /* Invalid sequence.  */
2509 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2510 	    {
2511 	      if (token->type == END_OF_RE)
2512 		*err = REG_EBRACE;
2513 	      else
2514 		*err = REG_BADBR;
2515 
2516 	      return NULL;
2517 	    }
2518 
2519 	  /* If the syntax bit is set, rollback.  */
2520 	  re_string_set_index (regexp, start_idx);
2521 	  *token = start_token;
2522 	  token->type = CHARACTER;
2523 	  /* mb_partial and word_char bits should be already initialized by
2524 	     peek_token.  */
2525 	  return elem;
2526 	}
2527 
2528       if (BE ((end != REG_MISSING && start > end)
2529 	      || token->type != OP_CLOSE_DUP_NUM, 0))
2530 	{
2531 	  /* First number greater than second.  */
2532 	  *err = REG_BADBR;
2533 	  return NULL;
2534 	}
2535     }
2536   else
2537     {
2538       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2539       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2540     }
2541 
2542   fetch_token (token, regexp, syntax);
2543 
2544   if (BE (elem == NULL, 0))
2545     return NULL;
2546   if (BE (start == 0 && end == 0, 0))
2547     {
2548       postorder (elem, free_tree, NULL);
2549       return NULL;
2550     }
2551 
2552   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2553   if (BE (start > 0, 0))
2554     {
2555       tree = elem;
2556       for (i = 2; i <= start; ++i)
2557 	{
2558 	  elem = duplicate_tree (elem, dfa);
2559 	  tree = create_tree (dfa, tree, elem, CONCAT);
2560 	  if (BE (elem == NULL || tree == NULL, 0))
2561 	    goto parse_dup_op_espace;
2562 	}
2563 
2564       if (start == end)
2565 	return tree;
2566 
2567       /* Duplicate ELEM before it is marked optional.  */
2568       elem = duplicate_tree (elem, dfa);
2569       old_tree = tree;
2570     }
2571   else
2572     old_tree = NULL;
2573 
2574   if (elem->token.type == SUBEXP)
2575     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2576 
2577   tree = create_tree (dfa, elem, NULL,
2578 		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2579   if (BE (tree == NULL, 0))
2580     goto parse_dup_op_espace;
2581 
2582 /* From gnulib's "intprops.h":
2583    True if the arithmetic type T is signed.  */
2584 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2585 
2586   /* This loop is actually executed only when end != REG_MISSING,
2587      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2588      already created the start+1-th copy.  */
2589   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2590     for (i = start + 2; i <= end; ++i)
2591       {
2592 	elem = duplicate_tree (elem, dfa);
2593 	tree = create_tree (dfa, tree, elem, CONCAT);
2594 	if (BE (elem == NULL || tree == NULL, 0))
2595 	  goto parse_dup_op_espace;
2596 
2597 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2598 	if (BE (tree == NULL, 0))
2599 	  goto parse_dup_op_espace;
2600       }
2601 
2602   if (old_tree)
2603     tree = create_tree (dfa, old_tree, tree, CONCAT);
2604 
2605   return tree;
2606 
2607  parse_dup_op_espace:
2608   *err = REG_ESPACE;
2609   return NULL;
2610 }
2611 
2612 /* Size of the names for collating symbol/equivalence_class/character_class.
2613    I'm not sure, but maybe enough.  */
2614 #define BRACKET_NAME_BUF_SIZE 32
2615 
2616 #ifndef _LIBC
2617   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2618      Build the range expression which starts from START_ELEM, and ends
2619      at END_ELEM.  The result are written to MBCSET and SBCSET.
2620      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2621      mbcset->range_ends, is a pointer argument sinse we may
2622      update it.  */
2623 
2624 static reg_errcode_t
2625 internal_function
2626 # ifdef RE_ENABLE_I18N
2627 build_range_exp (const reg_syntax_t syntax,
2628                  bitset_t sbcset,
2629                  re_charset_t *mbcset,
2630                  Idx *range_alloc,
2631                  const bracket_elem_t *start_elem,
2632                  const bracket_elem_t *end_elem)
2633 # else /* not RE_ENABLE_I18N */
2634 build_range_exp (const reg_syntax_t syntax,
2635                  bitset_t sbcset,
2636                  const bracket_elem_t *start_elem,
2637                  const bracket_elem_t *end_elem)
2638 # endif /* not RE_ENABLE_I18N */
2639 {
2640   unsigned int start_ch, end_ch;
2641   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2642   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2643 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2644 	  0))
2645     return REG_ERANGE;
2646 
2647   /* We can handle no multi character collating elements without libc
2648      support.  */
2649   if (BE ((start_elem->type == COLL_SYM
2650 	   && strlen ((char *) start_elem->opr.name) > 1)
2651 	  || (end_elem->type == COLL_SYM
2652 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2653     return REG_ECOLLATE;
2654 
2655 # ifdef RE_ENABLE_I18N
2656   {
2657     wchar_t wc;
2658     wint_t start_wc;
2659     wint_t end_wc;
2660     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2661 
2662     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2663 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2664 		   : 0));
2665     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2666 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2667 		 : 0));
2668     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2669 		? __btowc (start_ch) : start_elem->opr.wch);
2670     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2671 	      ? __btowc (end_ch) : end_elem->opr.wch);
2672     if (start_wc == WEOF || end_wc == WEOF)
2673       return REG_ECOLLATE;
2674     cmp_buf[0] = start_wc;
2675     cmp_buf[4] = end_wc;
2676 
2677     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2678             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2679       return REG_ERANGE;
2680 
2681     /* Got valid collation sequence values, add them as a new entry.
2682        However, for !_LIBC we have no collation elements: if the
2683        character set is single byte, the single byte character set
2684        that we build below suffices.  parse_bracket_exp passes
2685        no MBCSET if dfa->mb_cur_max == 1.  */
2686     if (mbcset)
2687       {
2688 	/* Check the space of the arrays.  */
2689 	if (BE (*range_alloc == mbcset->nranges, 0))
2690 	  {
2691 	    /* There is not enough space, need realloc.  */
2692 	    wchar_t *new_array_start, *new_array_end;
2693 	    Idx new_nranges;
2694 
2695 	    /* +1 in case of mbcset->nranges is 0.  */
2696 	    new_nranges = 2 * mbcset->nranges + 1;
2697 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2698 	       are NULL if *range_alloc == 0.  */
2699 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2700 					  new_nranges);
2701 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2702 					new_nranges);
2703 
2704 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2705 	      return REG_ESPACE;
2706 
2707 	    mbcset->range_starts = new_array_start;
2708 	    mbcset->range_ends = new_array_end;
2709 	    *range_alloc = new_nranges;
2710 	  }
2711 
2712 	mbcset->range_starts[mbcset->nranges] = start_wc;
2713 	mbcset->range_ends[mbcset->nranges++] = end_wc;
2714       }
2715 
2716     /* Build the table for single byte characters.  */
2717     for (wc = 0; wc < SBC_MAX; ++wc)
2718       {
2719 	cmp_buf[2] = wc;
2720 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2721 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2722 	  bitset_set (sbcset, wc);
2723       }
2724   }
2725 # else /* not RE_ENABLE_I18N */
2726   {
2727     unsigned int ch;
2728     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2729 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2730 		   : 0));
2731     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2732 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2733 		 : 0));
2734     if (start_ch > end_ch)
2735       return REG_ERANGE;
2736     /* Build the table for single byte characters.  */
2737     for (ch = 0; ch < SBC_MAX; ++ch)
2738       if (start_ch <= ch  && ch <= end_ch)
2739 	bitset_set (sbcset, ch);
2740   }
2741 # endif /* not RE_ENABLE_I18N */
2742   return REG_NOERROR;
2743 }
2744 #endif /* not _LIBC */
2745 
2746 #ifndef _LIBC
2747 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2748    Build the collating element which is represented by NAME.
2749    The result are written to MBCSET and SBCSET.
2750    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2751    pointer argument since we may update it.  */
2752 
2753 static reg_errcode_t
2754 internal_function
2755 build_collating_symbol (bitset_t sbcset,
2756 # ifdef RE_ENABLE_I18N
2757 			re_charset_t *mbcset _UNUSED_PARAMETER_,
2758 			Idx *coll_sym_alloc _UNUSED_PARAMETER_,
2759 # endif
2760 			const unsigned char *name)
2761 {
2762   size_t name_len = strlen ((const char *) name);
2763   if (BE (name_len != 1, 0))
2764     return REG_ECOLLATE;
2765   else
2766     {
2767       bitset_set (sbcset, name[0]);
2768       return REG_NOERROR;
2769     }
2770 }
2771 #endif /* not _LIBC */
2772 
2773 /* This function parse bracket expression like "[abc]", "[a-c]",
2774    "[[.a-a.]]" etc.  */
2775 
2776 static bin_tree_t *
2777 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2778 		   reg_syntax_t syntax, reg_errcode_t *err)
2779 {
2780 #ifdef _LIBC
2781   const unsigned char *collseqmb;
2782   const char *collseqwc;
2783   uint32_t nrules;
2784   int32_t table_size;
2785   const int32_t *symb_table;
2786   const unsigned char *extra;
2787 
2788   /* Local function for parse_bracket_exp used in _LIBC environement.
2789      Seek the collating symbol entry correspondings to NAME.
2790      Return the index of the symbol in the SYMB_TABLE.  */
2791 
2792   auto inline int32_t
2793   __attribute ((always_inline))
2794   seek_collating_symbol_entry (name, name_len)
2795 	 const unsigned char *name;
2796 	 size_t name_len;
2797     {
2798       int32_t hash = elem_hash ((const char *) name, name_len);
2799       int32_t elem = hash % table_size;
2800       if (symb_table[2 * elem] != 0)
2801 	{
2802 	  int32_t second = hash % (table_size - 2) + 1;
2803 
2804 	  do
2805 	    {
2806 	      /* First compare the hashing value.  */
2807 	      if (symb_table[2 * elem] == hash
2808 		  /* Compare the length of the name.  */
2809 		  && name_len == extra[symb_table[2 * elem + 1]]
2810 		  /* Compare the name.  */
2811 		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2812 			     name_len) == 0)
2813 		{
2814 		  /* Yep, this is the entry.  */
2815 		  break;
2816 		}
2817 
2818 	      /* Next entry.  */
2819 	      elem += second;
2820 	    }
2821 	  while (symb_table[2 * elem] != 0);
2822 	}
2823       return elem;
2824     }
2825 
2826   /* Local function for parse_bracket_exp used in _LIBC environment.
2827      Look up the collation sequence value of BR_ELEM.
2828      Return the value if succeeded, UINT_MAX otherwise.  */
2829 
2830   auto inline unsigned int
2831   __attribute ((always_inline))
2832   lookup_collation_sequence_value (br_elem)
2833 	 bracket_elem_t *br_elem;
2834     {
2835       if (br_elem->type == SB_CHAR)
2836 	{
2837 	  /*
2838 	  if (MB_CUR_MAX == 1)
2839 	  */
2840 	  if (nrules == 0)
2841 	    return collseqmb[br_elem->opr.ch];
2842 	  else
2843 	    {
2844 	      wint_t wc = __btowc (br_elem->opr.ch);
2845 	      return __collseq_table_lookup (collseqwc, wc);
2846 	    }
2847 	}
2848       else if (br_elem->type == MB_CHAR)
2849 	{
2850 	  if (nrules != 0)
2851 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2852 	}
2853       else if (br_elem->type == COLL_SYM)
2854 	{
2855 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2856 	  if (nrules != 0)
2857 	    {
2858 	      int32_t elem, idx;
2859 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2860 						  sym_name_len);
2861 	      if (symb_table[2 * elem] != 0)
2862 		{
2863 		  /* We found the entry.  */
2864 		  idx = symb_table[2 * elem + 1];
2865 		  /* Skip the name of collating element name.  */
2866 		  idx += 1 + extra[idx];
2867 		  /* Skip the byte sequence of the collating element.  */
2868 		  idx += 1 + extra[idx];
2869 		  /* Adjust for the alignment.  */
2870 		  idx = (idx + 3) & ~3;
2871 		  /* Skip the multibyte collation sequence value.  */
2872 		  idx += sizeof (unsigned int);
2873 		  /* Skip the wide char sequence of the collating element.  */
2874 		  idx += sizeof (unsigned int) *
2875 		    (1 + *(unsigned int *) (extra + idx));
2876 		  /* Return the collation sequence value.  */
2877 		  return *(unsigned int *) (extra + idx);
2878 		}
2879 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2880 		{
2881 		  /* No valid character.  Match it as a single byte
2882 		     character.  */
2883 		  return collseqmb[br_elem->opr.name[0]];
2884 		}
2885 	    }
2886 	  else if (sym_name_len == 1)
2887 	    return collseqmb[br_elem->opr.name[0]];
2888 	}
2889       return UINT_MAX;
2890     }
2891 
2892   /* Local function for parse_bracket_exp used in _LIBC environement.
2893      Build the range expression which starts from START_ELEM, and ends
2894      at END_ELEM.  The result are written to MBCSET and SBCSET.
2895      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2896      mbcset->range_ends, is a pointer argument sinse we may
2897      update it.  */
2898 
2899   auto inline reg_errcode_t
2900   __attribute ((always_inline))
2901   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2902 	 re_charset_t *mbcset;
2903 	 Idx *range_alloc;
2904 	 bitset_t sbcset;
2905 	 bracket_elem_t *start_elem, *end_elem;
2906     {
2907       unsigned int ch;
2908       uint32_t start_collseq;
2909       uint32_t end_collseq;
2910 
2911       /* Equivalence Classes and Character Classes can't be a range
2912 	 start/end.  */
2913       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2914 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2915 	      0))
2916 	return REG_ERANGE;
2917 
2918       start_collseq = lookup_collation_sequence_value (start_elem);
2919       end_collseq = lookup_collation_sequence_value (end_elem);
2920       /* Check start/end collation sequence values.  */
2921       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2922 	return REG_ECOLLATE;
2923       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2924 	return REG_ERANGE;
2925 
2926       /* Got valid collation sequence values, add them as a new entry.
2927 	 However, if we have no collation elements, and the character set
2928 	 is single byte, the single byte character set that we
2929 	 build below suffices. */
2930       if (nrules > 0 || dfa->mb_cur_max > 1)
2931 	{
2932 	  /* Check the space of the arrays.  */
2933 	  if (BE (*range_alloc == mbcset->nranges, 0))
2934 	    {
2935 	      /* There is not enough space, need realloc.  */
2936 	      uint32_t *new_array_start;
2937 	      uint32_t *new_array_end;
2938 	      Idx new_nranges;
2939 
2940 	      /* +1 in case of mbcset->nranges is 0.  */
2941 	      new_nranges = 2 * mbcset->nranges + 1;
2942 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2943 					    new_nranges);
2944 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2945 					  new_nranges);
2946 
2947 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2948 		return REG_ESPACE;
2949 
2950 	      mbcset->range_starts = new_array_start;
2951 	      mbcset->range_ends = new_array_end;
2952 	      *range_alloc = new_nranges;
2953 	    }
2954 
2955 	  mbcset->range_starts[mbcset->nranges] = start_collseq;
2956 	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
2957 	}
2958 
2959       /* Build the table for single byte characters.  */
2960       for (ch = 0; ch < SBC_MAX; ch++)
2961 	{
2962 	  uint32_t ch_collseq;
2963 	  /*
2964 	  if (MB_CUR_MAX == 1)
2965 	  */
2966 	  if (nrules == 0)
2967 	    ch_collseq = collseqmb[ch];
2968 	  else
2969 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2970 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2971 	    bitset_set (sbcset, ch);
2972 	}
2973       return REG_NOERROR;
2974     }
2975 
2976   /* Local function for parse_bracket_exp used in _LIBC environement.
2977      Build the collating element which is represented by NAME.
2978      The result are written to MBCSET and SBCSET.
2979      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2980      pointer argument sinse we may update it.  */
2981 
2982   auto inline reg_errcode_t
2983   __attribute ((always_inline))
2984   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2985 	 re_charset_t *mbcset;
2986 	 Idx *coll_sym_alloc;
2987 	 bitset_t sbcset;
2988 	 const unsigned char *name;
2989     {
2990       int32_t elem, idx;
2991       size_t name_len = strlen ((const char *) name);
2992       if (nrules != 0)
2993 	{
2994 	  elem = seek_collating_symbol_entry (name, name_len);
2995 	  if (symb_table[2 * elem] != 0)
2996 	    {
2997 	      /* We found the entry.  */
2998 	      idx = symb_table[2 * elem + 1];
2999 	      /* Skip the name of collating element name.  */
3000 	      idx += 1 + extra[idx];
3001 	    }
3002 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
3003 	    {
3004 	      /* No valid character, treat it as a normal
3005 		 character.  */
3006 	      bitset_set (sbcset, name[0]);
3007 	      return REG_NOERROR;
3008 	    }
3009 	  else
3010 	    return REG_ECOLLATE;
3011 
3012 	  /* Got valid collation sequence, add it as a new entry.  */
3013 	  /* Check the space of the arrays.  */
3014 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3015 	    {
3016 	      /* Not enough, realloc it.  */
3017 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3018 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3019 	      /* Use realloc since mbcset->coll_syms is NULL
3020 		 if *alloc == 0.  */
3021 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3022 						   new_coll_sym_alloc);
3023 	      if (BE (new_coll_syms == NULL, 0))
3024 		return REG_ESPACE;
3025 	      mbcset->coll_syms = new_coll_syms;
3026 	      *coll_sym_alloc = new_coll_sym_alloc;
3027 	    }
3028 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3029 	  return REG_NOERROR;
3030 	}
3031       else
3032 	{
3033 	  if (BE (name_len != 1, 0))
3034 	    return REG_ECOLLATE;
3035 	  else
3036 	    {
3037 	      bitset_set (sbcset, name[0]);
3038 	      return REG_NOERROR;
3039 	    }
3040 	}
3041     }
3042 #endif
3043 
3044   re_token_t br_token;
3045   re_bitset_ptr_t sbcset;
3046 #ifdef RE_ENABLE_I18N
3047   re_charset_t *mbcset;
3048   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3049   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3050 #endif /* not RE_ENABLE_I18N */
3051   bool non_match = false;
3052   bin_tree_t *work_tree;
3053   int token_len;
3054   bool first_round = true;
3055 #ifdef _LIBC
3056   collseqmb = (const unsigned char *)
3057     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3058   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3059   if (nrules)
3060     {
3061       /*
3062       if (MB_CUR_MAX > 1)
3063       */
3064       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3065       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3066       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3067 						  _NL_COLLATE_SYMB_TABLEMB);
3068       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3069 						   _NL_COLLATE_SYMB_EXTRAMB);
3070     }
3071 #endif
3072   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3073 #ifdef RE_ENABLE_I18N
3074   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3075 #endif /* RE_ENABLE_I18N */
3076 #ifdef RE_ENABLE_I18N
3077   if (BE (sbcset == NULL || mbcset == NULL, 0))
3078 #else
3079   if (BE (sbcset == NULL, 0))
3080 #endif /* RE_ENABLE_I18N */
3081     {
3082       *err = REG_ESPACE;
3083       return NULL;
3084     }
3085 
3086   token_len = peek_token_bracket (token, regexp, syntax);
3087   if (BE (token->type == END_OF_RE, 0))
3088     {
3089       *err = REG_BADPAT;
3090       goto parse_bracket_exp_free_return;
3091     }
3092   if (token->type == OP_NON_MATCH_LIST)
3093     {
3094 #ifdef RE_ENABLE_I18N
3095       mbcset->non_match = 1;
3096 #endif /* not RE_ENABLE_I18N */
3097       non_match = true;
3098       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3099 	bitset_set (sbcset, '\n');
3100       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3101       token_len = peek_token_bracket (token, regexp, syntax);
3102       if (BE (token->type == END_OF_RE, 0))
3103 	{
3104 	  *err = REG_BADPAT;
3105 	  goto parse_bracket_exp_free_return;
3106 	}
3107     }
3108 
3109   /* We treat the first ']' as a normal character.  */
3110   if (token->type == OP_CLOSE_BRACKET)
3111     token->type = CHARACTER;
3112 
3113   while (1)
3114     {
3115       bracket_elem_t start_elem, end_elem;
3116       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3117       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3118       reg_errcode_t ret;
3119       int token_len2 = 0;
3120       bool is_range_exp = false;
3121       re_token_t token2;
3122 
3123       start_elem.opr.name = start_name_buf;
3124       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3125 				   syntax, first_round);
3126       if (BE (ret != REG_NOERROR, 0))
3127 	{
3128 	  *err = ret;
3129 	  goto parse_bracket_exp_free_return;
3130 	}
3131       first_round = false;
3132 
3133       /* Get information about the next token.  We need it in any case.  */
3134       token_len = peek_token_bracket (token, regexp, syntax);
3135 
3136       /* Do not check for ranges if we know they are not allowed.  */
3137       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3138 	{
3139 	  if (BE (token->type == END_OF_RE, 0))
3140 	    {
3141 	      *err = REG_EBRACK;
3142 	      goto parse_bracket_exp_free_return;
3143 	    }
3144 	  if (token->type == OP_CHARSET_RANGE)
3145 	    {
3146 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3147 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3148 	      if (BE (token2.type == END_OF_RE, 0))
3149 		{
3150 		  *err = REG_EBRACK;
3151 		  goto parse_bracket_exp_free_return;
3152 		}
3153 	      if (token2.type == OP_CLOSE_BRACKET)
3154 		{
3155 		  /* We treat the last '-' as a normal character.  */
3156 		  re_string_skip_bytes (regexp, -token_len);
3157 		  token->type = CHARACTER;
3158 		}
3159 	      else
3160 		is_range_exp = true;
3161 	    }
3162 	}
3163 
3164       if (is_range_exp == true)
3165 	{
3166 	  end_elem.opr.name = end_name_buf;
3167 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3168 				       dfa, syntax, true);
3169 	  if (BE (ret != REG_NOERROR, 0))
3170 	    {
3171 	      *err = ret;
3172 	      goto parse_bracket_exp_free_return;
3173 	    }
3174 
3175 	  token_len = peek_token_bracket (token, regexp, syntax);
3176 
3177 #ifdef _LIBC
3178 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3179 				  &start_elem, &end_elem);
3180 #else
3181 # ifdef RE_ENABLE_I18N
3182 	  *err = build_range_exp (syntax, sbcset,
3183 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3184 				  &range_alloc, &start_elem, &end_elem);
3185 # else
3186 	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3187 # endif
3188 #endif /* RE_ENABLE_I18N */
3189 	  if (BE (*err != REG_NOERROR, 0))
3190 	    goto parse_bracket_exp_free_return;
3191 	}
3192       else
3193 	{
3194 	  switch (start_elem.type)
3195 	    {
3196 	    case SB_CHAR:
3197 	      bitset_set (sbcset, start_elem.opr.ch);
3198 	      break;
3199 #ifdef RE_ENABLE_I18N
3200 	    case MB_CHAR:
3201 	      /* Check whether the array has enough space.  */
3202 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3203 		{
3204 		  wchar_t *new_mbchars;
3205 		  /* Not enough, realloc it.  */
3206 		  /* +1 in case of mbcset->nmbchars is 0.  */
3207 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3208 		  /* Use realloc since array is NULL if *alloc == 0.  */
3209 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3210 					    mbchar_alloc);
3211 		  if (BE (new_mbchars == NULL, 0))
3212 		    goto parse_bracket_exp_espace;
3213 		  mbcset->mbchars = new_mbchars;
3214 		}
3215 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3216 	      break;
3217 #endif /* RE_ENABLE_I18N */
3218 	    case EQUIV_CLASS:
3219 	      *err = build_equiv_class (sbcset,
3220 #ifdef RE_ENABLE_I18N
3221 					mbcset, &equiv_class_alloc,
3222 #endif /* RE_ENABLE_I18N */
3223 					start_elem.opr.name);
3224 	      if (BE (*err != REG_NOERROR, 0))
3225 		goto parse_bracket_exp_free_return;
3226 	      break;
3227 	    case COLL_SYM:
3228 	      *err = build_collating_symbol (sbcset,
3229 #ifdef RE_ENABLE_I18N
3230 					     mbcset, &coll_sym_alloc,
3231 #endif /* RE_ENABLE_I18N */
3232 					     start_elem.opr.name);
3233 	      if (BE (*err != REG_NOERROR, 0))
3234 		goto parse_bracket_exp_free_return;
3235 	      break;
3236 	    case CHAR_CLASS:
3237 	      *err = build_charclass (regexp->trans, sbcset,
3238 #ifdef RE_ENABLE_I18N
3239 				      mbcset, &char_class_alloc,
3240 #endif /* RE_ENABLE_I18N */
3241 				      start_elem.opr.name, syntax);
3242 	      if (BE (*err != REG_NOERROR, 0))
3243 	       goto parse_bracket_exp_free_return;
3244 	      break;
3245 	    default:
3246 	      assert (0);
3247 	      break;
3248 	    }
3249 	}
3250       if (BE (token->type == END_OF_RE, 0))
3251 	{
3252 	  *err = REG_EBRACK;
3253 	  goto parse_bracket_exp_free_return;
3254 	}
3255       if (token->type == OP_CLOSE_BRACKET)
3256 	break;
3257     }
3258 
3259   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3260 
3261   /* If it is non-matching list.  */
3262   if (non_match)
3263     bitset_not (sbcset);
3264 
3265 #ifdef RE_ENABLE_I18N
3266   /* Ensure only single byte characters are set.  */
3267   if (dfa->mb_cur_max > 1)
3268     bitset_mask (sbcset, dfa->sb_char);
3269 
3270   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3271       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3272 						     || mbcset->non_match)))
3273     {
3274       bin_tree_t *mbc_tree;
3275       int sbc_idx;
3276       /* Build a tree for complex bracket.  */
3277       dfa->has_mb_node = 1;
3278       br_token.type = COMPLEX_BRACKET;
3279       br_token.opr.mbcset = mbcset;
3280       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3281       if (BE (mbc_tree == NULL, 0))
3282 	goto parse_bracket_exp_espace;
3283       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3284 	if (sbcset[sbc_idx])
3285 	  break;
3286       /* If there are no bits set in sbcset, there is no point
3287 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3288       if (sbc_idx < BITSET_WORDS)
3289 	{
3290 	  /* Build a tree for simple bracket.  */
3291 	  br_token.type = SIMPLE_BRACKET;
3292 	  br_token.opr.sbcset = sbcset;
3293 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3294 	  if (BE (work_tree == NULL, 0))
3295 	    goto parse_bracket_exp_espace;
3296 
3297 	  /* Then join them by ALT node.  */
3298 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3299 	  if (BE (work_tree == NULL, 0))
3300 	    goto parse_bracket_exp_espace;
3301 	}
3302       else
3303 	{
3304 	  re_free (sbcset);
3305 	  work_tree = mbc_tree;
3306 	}
3307     }
3308   else
3309 #endif /* not RE_ENABLE_I18N */
3310     {
3311 #ifdef RE_ENABLE_I18N
3312       free_charset (mbcset);
3313 #endif
3314       /* Build a tree for simple bracket.  */
3315       br_token.type = SIMPLE_BRACKET;
3316       br_token.opr.sbcset = sbcset;
3317       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3318       if (BE (work_tree == NULL, 0))
3319 	goto parse_bracket_exp_espace;
3320     }
3321   return work_tree;
3322 
3323  parse_bracket_exp_espace:
3324   *err = REG_ESPACE;
3325  parse_bracket_exp_free_return:
3326   re_free (sbcset);
3327 #ifdef RE_ENABLE_I18N
3328   free_charset (mbcset);
3329 #endif /* RE_ENABLE_I18N */
3330   return NULL;
3331 }
3332 
3333 /* Parse an element in the bracket expression.  */
3334 
3335 static reg_errcode_t
3336 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3337 		       re_token_t *token, int token_len,
3338 		       re_dfa_t *dfa _UNUSED_PARAMETER_,
3339 		       reg_syntax_t syntax, bool accept_hyphen)
3340 {
3341 #ifdef RE_ENABLE_I18N
3342   int cur_char_size;
3343   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3344   if (cur_char_size > 1)
3345     {
3346       elem->type = MB_CHAR;
3347       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3348       re_string_skip_bytes (regexp, cur_char_size);
3349       return REG_NOERROR;
3350     }
3351 #endif /* RE_ENABLE_I18N */
3352   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3353   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3354       || token->type == OP_OPEN_EQUIV_CLASS)
3355     return parse_bracket_symbol (elem, regexp, token);
3356   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3357     {
3358       /* A '-' must only appear as anything but a range indicator before
3359 	 the closing bracket.  Everything else is an error.  */
3360       re_token_t token2;
3361       (void) peek_token_bracket (&token2, regexp, syntax);
3362       if (token2.type != OP_CLOSE_BRACKET)
3363 	/* The actual error value is not standardized since this whole
3364 	   case is undefined.  But ERANGE makes good sense.  */
3365 	return REG_ERANGE;
3366     }
3367   elem->type = SB_CHAR;
3368   elem->opr.ch = token->opr.c;
3369   return REG_NOERROR;
3370 }
3371 
3372 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3373    such as [:<character_class>:], [.<collating_element>.], and
3374    [=<equivalent_class>=].  */
3375 
3376 static reg_errcode_t
3377 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3378 		      re_token_t *token)
3379 {
3380   unsigned char ch, delim = token->opr.c;
3381   int i = 0;
3382   if (re_string_eoi(regexp))
3383     return REG_EBRACK;
3384   for (;; ++i)
3385     {
3386       if (i >= BRACKET_NAME_BUF_SIZE)
3387 	return REG_EBRACK;
3388       if (token->type == OP_OPEN_CHAR_CLASS)
3389 	ch = re_string_fetch_byte_case (regexp);
3390       else
3391 	ch = re_string_fetch_byte (regexp);
3392       if (re_string_eoi(regexp))
3393 	return REG_EBRACK;
3394       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3395 	break;
3396       elem->opr.name[i] = ch;
3397     }
3398   re_string_skip_bytes (regexp, 1);
3399   elem->opr.name[i] = '\0';
3400   switch (token->type)
3401     {
3402     case OP_OPEN_COLL_ELEM:
3403       elem->type = COLL_SYM;
3404       break;
3405     case OP_OPEN_EQUIV_CLASS:
3406       elem->type = EQUIV_CLASS;
3407       break;
3408     case OP_OPEN_CHAR_CLASS:
3409       elem->type = CHAR_CLASS;
3410       break;
3411     default:
3412       break;
3413     }
3414   return REG_NOERROR;
3415 }
3416 
3417   /* Helper function for parse_bracket_exp.
3418      Build the equivalence class which is represented by NAME.
3419      The result are written to MBCSET and SBCSET.
3420      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3421      is a pointer argument sinse we may update it.  */
3422 
3423 static reg_errcode_t
3424 #ifdef RE_ENABLE_I18N
3425 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset _UNUSED_PARAMETER_,
3426 		   Idx *equiv_class_alloc _UNUSED_PARAMETER_,
3427 		   const unsigned char *name)
3428 #else /* not RE_ENABLE_I18N */
3429 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3430 #endif /* not RE_ENABLE_I18N */
3431 {
3432 #ifdef _LIBC
3433   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3434   if (nrules != 0)
3435     {
3436       const int32_t *table, *indirect;
3437       const unsigned char *weights, *extra, *cp;
3438       unsigned char char_buf[2];
3439       int32_t idx1, idx2;
3440       unsigned int ch;
3441       size_t len;
3442       /* This #include defines a local function!  */
3443 # include <locale/weight.h>
3444       /* Calculate the index for equivalence class.  */
3445       cp = name;
3446       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3447       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3448 					       _NL_COLLATE_WEIGHTMB);
3449       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3450 						   _NL_COLLATE_EXTRAMB);
3451       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3452 						_NL_COLLATE_INDIRECTMB);
3453       idx1 = findidx (&cp);
3454       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3455 	/* This isn't a valid character.  */
3456 	return REG_ECOLLATE;
3457 
3458       /* Build single byte matcing table for this equivalence class.  */
3459       char_buf[1] = (unsigned char) '\0';
3460       len = weights[idx1 & 0xffffff];
3461       for (ch = 0; ch < SBC_MAX; ++ch)
3462 	{
3463 	  char_buf[0] = ch;
3464 	  cp = char_buf;
3465 	  idx2 = findidx (&cp);
3466 /*
3467 	  idx2 = table[ch];
3468 */
3469 	  if (idx2 == 0)
3470 	    /* This isn't a valid character.  */
3471 	    continue;
3472 	  /* Compare only if the length matches and the collation rule
3473 	     index is the same.  */
3474 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3475 	    {
3476 	      int cnt = 0;
3477 
3478 	      while (cnt <= len &&
3479 		     weights[(idx1 & 0xffffff) + 1 + cnt]
3480 		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3481 		++cnt;
3482 
3483 	      if (cnt > len)
3484 		bitset_set (sbcset, ch);
3485 	    }
3486 	}
3487       /* Check whether the array has enough space.  */
3488       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3489 	{
3490 	  /* Not enough, realloc it.  */
3491 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3492 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3493 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3494 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3495 						   int32_t,
3496 						   new_equiv_class_alloc);
3497 	  if (BE (new_equiv_classes == NULL, 0))
3498 	    return REG_ESPACE;
3499 	  mbcset->equiv_classes = new_equiv_classes;
3500 	  *equiv_class_alloc = new_equiv_class_alloc;
3501 	}
3502       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3503     }
3504   else
3505 #endif /* _LIBC */
3506     {
3507       if (BE (strlen ((const char *) name) != 1, 0))
3508 	return REG_ECOLLATE;
3509       bitset_set (sbcset, *name);
3510     }
3511   return REG_NOERROR;
3512 }
3513 
3514   /* Helper function for parse_bracket_exp.
3515      Build the character class which is represented by NAME.
3516      The result are written to MBCSET and SBCSET.
3517      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3518      is a pointer argument sinse we may update it.  */
3519 
3520 static reg_errcode_t
3521 #ifdef RE_ENABLE_I18N
3522 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3523 		 re_charset_t *mbcset, Idx *char_class_alloc,
3524 		 const unsigned char *class_name, reg_syntax_t syntax)
3525 #else /* not RE_ENABLE_I18N */
3526 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3527 		 const unsigned char *class_name, reg_syntax_t syntax)
3528 #endif /* not RE_ENABLE_I18N */
3529 {
3530   int i;
3531   const char *name = (const char *) class_name;
3532 
3533   /* In case of REG_ICASE "upper" and "lower" match the both of
3534      upper and lower cases.  */
3535   if ((syntax & RE_ICASE)
3536       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3537     name = "alpha";
3538 
3539 #ifdef RE_ENABLE_I18N
3540   /* Check the space of the arrays.  */
3541   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3542     {
3543       /* Not enough, realloc it.  */
3544       /* +1 in case of mbcset->nchar_classes is 0.  */
3545       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3546       /* Use realloc since array is NULL if *alloc == 0.  */
3547       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3548 					       new_char_class_alloc);
3549       if (BE (new_char_classes == NULL, 0))
3550 	return REG_ESPACE;
3551       mbcset->char_classes = new_char_classes;
3552       *char_class_alloc = new_char_class_alloc;
3553     }
3554   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3555 #endif /* RE_ENABLE_I18N */
3556 
3557 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3558   do {						\
3559     if (BE (trans != NULL, 0))			\
3560       {						\
3561 	for (i = 0; i < SBC_MAX; ++i)		\
3562 	  if (ctype_func (i))			\
3563 	    bitset_set (sbcset, trans[i]);	\
3564       }						\
3565     else					\
3566       {						\
3567 	for (i = 0; i < SBC_MAX; ++i)		\
3568 	  if (ctype_func (i))			\
3569 	    bitset_set (sbcset, i);		\
3570       }						\
3571   } while (0)
3572 
3573   if (strcmp (name, "alnum") == 0)
3574     BUILD_CHARCLASS_LOOP (isalnum);
3575   else if (strcmp (name, "cntrl") == 0)
3576     BUILD_CHARCLASS_LOOP (iscntrl);
3577   else if (strcmp (name, "lower") == 0)
3578     BUILD_CHARCLASS_LOOP (islower);
3579   else if (strcmp (name, "space") == 0)
3580     BUILD_CHARCLASS_LOOP (isspace);
3581   else if (strcmp (name, "alpha") == 0)
3582     BUILD_CHARCLASS_LOOP (isalpha);
3583   else if (strcmp (name, "digit") == 0)
3584     BUILD_CHARCLASS_LOOP (isdigit);
3585   else if (strcmp (name, "print") == 0)
3586     BUILD_CHARCLASS_LOOP (isprint);
3587   else if (strcmp (name, "upper") == 0)
3588     BUILD_CHARCLASS_LOOP (isupper);
3589   else if (strcmp (name, "blank") == 0)
3590     BUILD_CHARCLASS_LOOP (isblank);
3591   else if (strcmp (name, "graph") == 0)
3592     BUILD_CHARCLASS_LOOP (isgraph);
3593   else if (strcmp (name, "punct") == 0)
3594     BUILD_CHARCLASS_LOOP (ispunct);
3595   else if (strcmp (name, "xdigit") == 0)
3596     BUILD_CHARCLASS_LOOP (isxdigit);
3597   else
3598     return REG_ECTYPE;
3599 
3600   return REG_NOERROR;
3601 }
3602 
3603 static bin_tree_t *
3604 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3605 		    const unsigned char *class_name,
3606 		    const unsigned char *extra, bool non_match,
3607 		    reg_errcode_t *err)
3608 {
3609   re_bitset_ptr_t sbcset;
3610 #ifdef RE_ENABLE_I18N
3611   re_charset_t *mbcset;
3612   Idx alloc = 0;
3613 #endif /* not RE_ENABLE_I18N */
3614   reg_errcode_t ret;
3615   re_token_t br_token;
3616   bin_tree_t *tree;
3617 
3618   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3619 #ifdef RE_ENABLE_I18N
3620   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3621 #endif /* RE_ENABLE_I18N */
3622 
3623 #ifdef RE_ENABLE_I18N
3624   if (BE (sbcset == NULL || mbcset == NULL, 0))
3625 #else /* not RE_ENABLE_I18N */
3626   if (BE (sbcset == NULL, 0))
3627 #endif /* not RE_ENABLE_I18N */
3628     {
3629       *err = REG_ESPACE;
3630       return NULL;
3631     }
3632 
3633   if (non_match)
3634     {
3635 #ifdef RE_ENABLE_I18N
3636       mbcset->non_match = 1;
3637 #endif /* not RE_ENABLE_I18N */
3638     }
3639 
3640   /* We don't care the syntax in this case.  */
3641   ret = build_charclass (trans, sbcset,
3642 #ifdef RE_ENABLE_I18N
3643 			 mbcset, &alloc,
3644 #endif /* RE_ENABLE_I18N */
3645 			 class_name, 0);
3646 
3647   if (BE (ret != REG_NOERROR, 0))
3648     {
3649       re_free (sbcset);
3650 #ifdef RE_ENABLE_I18N
3651       free_charset (mbcset);
3652 #endif /* RE_ENABLE_I18N */
3653       *err = ret;
3654       return NULL;
3655     }
3656   /* \w match '_' also.  */
3657   for (; *extra; extra++)
3658     bitset_set (sbcset, *extra);
3659 
3660   /* If it is non-matching list.  */
3661   if (non_match)
3662     bitset_not (sbcset);
3663 
3664 #ifdef RE_ENABLE_I18N
3665   /* Ensure only single byte characters are set.  */
3666   if (dfa->mb_cur_max > 1)
3667     bitset_mask (sbcset, dfa->sb_char);
3668 #endif
3669 
3670   /* Build a tree for simple bracket.  */
3671   br_token.type = SIMPLE_BRACKET;
3672   br_token.opr.sbcset = sbcset;
3673   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3674   if (BE (tree == NULL, 0))
3675     goto build_word_op_espace;
3676 
3677 #ifdef RE_ENABLE_I18N
3678   if (dfa->mb_cur_max > 1)
3679     {
3680       bin_tree_t *mbc_tree;
3681       /* Build a tree for complex bracket.  */
3682       br_token.type = COMPLEX_BRACKET;
3683       br_token.opr.mbcset = mbcset;
3684       dfa->has_mb_node = 1;
3685       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3686       if (BE (mbc_tree == NULL, 0))
3687 	goto build_word_op_espace;
3688       /* Then join them by ALT node.  */
3689       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3690       if (BE (mbc_tree != NULL, 1))
3691 	return tree;
3692     }
3693   else
3694     {
3695       free_charset (mbcset);
3696       return tree;
3697     }
3698 #else /* not RE_ENABLE_I18N */
3699   return tree;
3700 #endif /* not RE_ENABLE_I18N */
3701 
3702  build_word_op_espace:
3703   re_free (sbcset);
3704 #ifdef RE_ENABLE_I18N
3705   free_charset (mbcset);
3706 #endif /* RE_ENABLE_I18N */
3707   *err = REG_ESPACE;
3708   return NULL;
3709 }
3710 
3711 /* This is intended for the expressions like "a{1,3}".
3712    Fetch a number from `input', and return the number.
3713    Return REG_MISSING if the number field is empty like "{,1}".
3714    Return REG_ERROR if an error occurred.  */
3715 
3716 static Idx
3717 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3718 {
3719   Idx num = REG_MISSING;
3720   unsigned char c;
3721   while (1)
3722     {
3723       fetch_token (token, input, syntax);
3724       c = token->opr.c;
3725       if (BE (token->type == END_OF_RE, 0))
3726 	return REG_ERROR;
3727       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3728 	break;
3729       num = ((token->type != CHARACTER || c < '0' || '9' < c
3730 	      || num == REG_ERROR)
3731 	     ? REG_ERROR
3732 	     : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3733       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3734     }
3735   return num;
3736 }
3737 
3738 #ifdef RE_ENABLE_I18N
3739 static void
3740 free_charset (re_charset_t *cset)
3741 {
3742   re_free (cset->mbchars);
3743 # ifdef _LIBC
3744   re_free (cset->coll_syms);
3745   re_free (cset->equiv_classes);
3746   re_free (cset->range_starts);
3747   re_free (cset->range_ends);
3748 # endif
3749   re_free (cset->char_classes);
3750   re_free (cset);
3751 }
3752 #endif /* RE_ENABLE_I18N */
3753 
3754 /* Functions for binary tree operation.  */
3755 
3756 /* Create a tree node.  */
3757 
3758 static bin_tree_t *
3759 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3760 	     re_token_type_t type)
3761 {
3762   re_token_t t;
3763   t.type = type;
3764   return create_token_tree (dfa, left, right, &t);
3765 }
3766 
3767 static bin_tree_t *
3768 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3769 		   const re_token_t *token)
3770 {
3771   bin_tree_t *tree;
3772   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3773     {
3774       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3775 
3776       if (storage == NULL)
3777 	return NULL;
3778       storage->next = dfa->str_tree_storage;
3779       dfa->str_tree_storage = storage;
3780       dfa->str_tree_storage_idx = 0;
3781     }
3782   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3783 
3784   tree->parent = NULL;
3785   tree->left = left;
3786   tree->right = right;
3787   tree->token = *token;
3788   tree->token.duplicated = 0;
3789   tree->token.opt_subexp = 0;
3790   tree->first = NULL;
3791   tree->next = NULL;
3792   tree->node_idx = REG_MISSING;
3793 
3794   if (left != NULL)
3795     left->parent = tree;
3796   if (right != NULL)
3797     right->parent = tree;
3798   return tree;
3799 }
3800 
3801 /* Mark the tree SRC as an optional subexpression.
3802    To be called from preorder or postorder.  */
3803 
3804 static reg_errcode_t
3805 mark_opt_subexp (void *extra, bin_tree_t *node)
3806 {
3807   Idx idx = (Idx) (long) extra;
3808   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3809     node->token.opt_subexp = 1;
3810 
3811   return REG_NOERROR;
3812 }
3813 
3814 /* Free the allocated memory inside NODE. */
3815 
3816 static void
3817 free_token (re_token_t *node)
3818 {
3819 #ifdef RE_ENABLE_I18N
3820   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3821     free_charset (node->opr.mbcset);
3822   else
3823 #endif /* RE_ENABLE_I18N */
3824     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3825       re_free (node->opr.sbcset);
3826 }
3827 
3828 /* Worker function for tree walking.  Free the allocated memory inside NODE
3829    and its children. */
3830 
3831 static reg_errcode_t
3832 free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
3833 {
3834   free_token (&node->token);
3835   return REG_NOERROR;
3836 }
3837 
3838 
3839 /* Duplicate the node SRC, and return new node.  This is a preorder
3840    visit similar to the one implemented by the generic visitor, but
3841    we need more infrastructure to maintain two parallel trees --- so,
3842    it's easier to duplicate.  */
3843 
3844 static bin_tree_t *
3845 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3846 {
3847   const bin_tree_t *node;
3848   bin_tree_t *dup_root;
3849   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3850 
3851   for (node = root; ; )
3852     {
3853       /* Create a new tree and link it back to the current parent.  */
3854       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3855       if (*p_new == NULL)
3856 	return NULL;
3857       (*p_new)->parent = dup_node;
3858       (*p_new)->token.duplicated = 1;
3859       dup_node = *p_new;
3860 
3861       /* Go to the left node, or up and to the right.  */
3862       if (node->left)
3863 	{
3864 	  node = node->left;
3865 	  p_new = &dup_node->left;
3866 	}
3867       else
3868 	{
3869 	  const bin_tree_t *prev = NULL;
3870 	  while (node->right == prev || node->right == NULL)
3871 	    {
3872 	      prev = node;
3873 	      node = node->parent;
3874 	      dup_node = dup_node->parent;
3875 	      if (!node)
3876 		return dup_root;
3877 	    }
3878 	  node = node->right;
3879 	  p_new = &dup_node->right;
3880 	}
3881     }
3882 }
3883