xref: /dragonfly/contrib/diffutils/lib/regcomp.c (revision 678e8cc6)
1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4    Copyright (C) 2002-2011 Free Software Foundation, Inc.
5    This file is part of the GNU C Library.
6    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7 
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3, or (at your option)
11    any later version.
12 
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License along
19    with this program; if not, write to the Free Software Foundation,
20    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 
22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
23 					  size_t length, reg_syntax_t syntax);
24 static void re_compile_fastmap_iter (regex_t *bufp,
25 				     const re_dfastate_t *init_state,
26 				     char *fastmap);
27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (regex_t *preg);
37 static reg_errcode_t preorder (bin_tree_t *root,
38 			       reg_errcode_t (fn (void *, bin_tree_t *)),
39 			       void *extra);
40 static reg_errcode_t postorder (bin_tree_t *root,
41 				reg_errcode_t (fn (void *, bin_tree_t *)),
42 				void *extra);
43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46 				 bin_tree_t *node);
47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
50 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
51 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
52 				   unsigned int constraint);
53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55 					 Idx node, bool root);
56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
57 static Idx fetch_number (re_string_t *input, re_token_t *token,
58 			 reg_syntax_t syntax);
59 static int peek_token (re_token_t *token, re_string_t *input,
60 			reg_syntax_t syntax) internal_function;
61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
62 			  reg_syntax_t syntax, reg_errcode_t *err);
63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
64 				  re_token_t *token, reg_syntax_t syntax,
65 				  Idx nest, reg_errcode_t *err);
66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
67 				 re_token_t *token, reg_syntax_t syntax,
68 				 Idx nest, reg_errcode_t *err);
69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
70 				     re_token_t *token, reg_syntax_t syntax,
71 				     Idx nest, reg_errcode_t *err);
72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
73 				  re_token_t *token, reg_syntax_t syntax,
74 				  Idx nest, reg_errcode_t *err);
75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
76 				 re_dfa_t *dfa, re_token_t *token,
77 				 reg_syntax_t syntax, reg_errcode_t *err);
78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
79 				      re_token_t *token, reg_syntax_t syntax,
80 				      reg_errcode_t *err);
81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82 					    re_string_t *regexp,
83 					    re_token_t *token, int token_len,
84 					    re_dfa_t *dfa,
85 					    reg_syntax_t syntax,
86 					    bool accept_hyphen);
87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
88 					  re_string_t *regexp,
89 					  re_token_t *token);
90 #ifdef RE_ENABLE_I18N
91 static reg_errcode_t build_equiv_class (bitset_t sbcset,
92 					re_charset_t *mbcset,
93 					Idx *equiv_class_alloc,
94 					const unsigned char *name);
95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96 				      bitset_t sbcset,
97 				      re_charset_t *mbcset,
98 				      Idx *char_class_alloc,
99 				      const unsigned char *class_name,
100 				      reg_syntax_t syntax);
101 #else  /* not RE_ENABLE_I18N */
102 static reg_errcode_t build_equiv_class (bitset_t sbcset,
103 					const unsigned char *name);
104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105 				      bitset_t sbcset,
106 				      const unsigned char *class_name,
107 				      reg_syntax_t syntax);
108 #endif /* not RE_ENABLE_I18N */
109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
110 				       RE_TRANSLATE_TYPE trans,
111 				       const unsigned char *class_name,
112 				       const unsigned char *extra,
113 				       bool non_match, reg_errcode_t *err);
114 static bin_tree_t *create_tree (re_dfa_t *dfa,
115 				bin_tree_t *left, bin_tree_t *right,
116 				re_token_type_t type);
117 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
118 				      bin_tree_t *left, bin_tree_t *right,
119 				      const re_token_t *token);
120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
121 static void free_token (re_token_t *node);
122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 
125 /* This table gives an error message for each of the error codes listed
126    in regex.h.  Obviously the order here has to be same as there.
127    POSIX doesn't require that we do anything for REG_NOERROR,
128    but why not be nice?  */
129 
130 static const char __re_error_msgid[] =
131   {
132 #define REG_NOERROR_IDX	0
133     gettext_noop ("Success")	/* REG_NOERROR */
134     "\0"
135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136     gettext_noop ("No match")	/* REG_NOMATCH */
137     "\0"
138 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
139     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140     "\0"
141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143     "\0"
144 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146     "\0"
147 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
148     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149     "\0"
150 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
151     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152     "\0"
153 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
154     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
155     "\0"
156 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158     "\0"
159 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161     "\0"
162 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
163     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164     "\0"
165 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166     gettext_noop ("Invalid range end")	/* REG_ERANGE */
167     "\0"
168 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
169     gettext_noop ("Memory exhausted") /* REG_ESPACE */
170     "\0"
171 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
172     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173     "\0"
174 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175     gettext_noop ("Premature end of regular expression") /* REG_EEND */
176     "\0"
177 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
178     gettext_noop ("Regular expression too big") /* REG_ESIZE */
179     "\0"
180 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
181     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182   };
183 
184 static const size_t __re_error_msgid_idx[] =
185   {
186     REG_NOERROR_IDX,
187     REG_NOMATCH_IDX,
188     REG_BADPAT_IDX,
189     REG_ECOLLATE_IDX,
190     REG_ECTYPE_IDX,
191     REG_EESCAPE_IDX,
192     REG_ESUBREG_IDX,
193     REG_EBRACK_IDX,
194     REG_EPAREN_IDX,
195     REG_EBRACE_IDX,
196     REG_BADBR_IDX,
197     REG_ERANGE_IDX,
198     REG_ESPACE_IDX,
199     REG_BADRPT_IDX,
200     REG_EEND_IDX,
201     REG_ESIZE_IDX,
202     REG_ERPAREN_IDX
203   };
204 
205 /* Entry points for GNU code.  */
206 
207 /* re_compile_pattern is the GNU regular expression compiler: it
208    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209    Returns 0 if the pattern was valid, otherwise an error string.
210 
211    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212    are set in BUFP on entry.  */
213 
214 #ifdef _LIBC
215 const char *
216 re_compile_pattern (pattern, length, bufp)
217     const char *pattern;
218     size_t length;
219     struct re_pattern_buffer *bufp;
220 #else /* size_t might promote */
221 const char *
222 re_compile_pattern (const char *pattern, size_t length,
223 		    struct re_pattern_buffer *bufp)
224 #endif
225 {
226   reg_errcode_t ret;
227 
228   /* And GNU code determines whether or not to get register information
229      by passing null for the REGS argument to re_match, etc., not by
230      setting no_sub, unless RE_NO_SUB is set.  */
231   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
232 
233   /* Match anchors at newline.  */
234   bufp->newline_anchor = 1;
235 
236   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
237 
238   if (!ret)
239     return NULL;
240   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
241 }
242 #ifdef _LIBC
243 weak_alias (__re_compile_pattern, re_compile_pattern)
244 #endif
245 
246 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
247    also be assigned to arbitrarily: each pattern buffer stores its own
248    syntax, so it can be changed between regex compilations.  */
249 /* This has no initializer because initialized variables in Emacs
250    become read-only after dumping.  */
251 reg_syntax_t re_syntax_options;
252 
253 
254 /* Specify the precise syntax of regexps for compilation.  This provides
255    for compatibility for various utilities which historically have
256    different, incompatible syntaxes.
257 
258    The argument SYNTAX is a bit mask comprised of the various bits
259    defined in regex.h.  We return the old syntax.  */
260 
261 reg_syntax_t
262 re_set_syntax (syntax)
263     reg_syntax_t syntax;
264 {
265   reg_syntax_t ret = re_syntax_options;
266 
267   re_syntax_options = syntax;
268   return ret;
269 }
270 #ifdef _LIBC
271 weak_alias (__re_set_syntax, re_set_syntax)
272 #endif
273 
274 int
275 re_compile_fastmap (bufp)
276     struct re_pattern_buffer *bufp;
277 {
278   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
279   char *fastmap = bufp->fastmap;
280 
281   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
282   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
283   if (dfa->init_state != dfa->init_state_word)
284     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
285   if (dfa->init_state != dfa->init_state_nl)
286     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
287   if (dfa->init_state != dfa->init_state_begbuf)
288     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
289   bufp->fastmap_accurate = 1;
290   return 0;
291 }
292 #ifdef _LIBC
293 weak_alias (__re_compile_fastmap, re_compile_fastmap)
294 #endif
295 
296 static inline void
297 __attribute ((always_inline))
298 re_set_fastmap (char *fastmap, bool icase, int ch)
299 {
300   fastmap[ch] = 1;
301   if (icase)
302     fastmap[tolower (ch)] = 1;
303 }
304 
305 /* Helper function for re_compile_fastmap.
306    Compile fastmap for the initial_state INIT_STATE.  */
307 
308 static void
309 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
310 			 char *fastmap)
311 {
312   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
313   Idx node_cnt;
314   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
315   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
316     {
317       Idx node = init_state->nodes.elems[node_cnt];
318       re_token_type_t type = dfa->nodes[node].type;
319 
320       if (type == CHARACTER)
321 	{
322 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
323 #ifdef RE_ENABLE_I18N
324 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
325 	    {
326 	      unsigned char buf[MB_LEN_MAX];
327 	      unsigned char *p;
328 	      wchar_t wc;
329 	      mbstate_t state;
330 
331 	      p = buf;
332 	      *p++ = dfa->nodes[node].opr.c;
333 	      while (++node < dfa->nodes_len
334 		     &&	dfa->nodes[node].type == CHARACTER
335 		     && dfa->nodes[node].mb_partial)
336 		*p++ = dfa->nodes[node].opr.c;
337 	      memset (&state, '\0', sizeof (state));
338 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
339 			     &state) == p - buf
340 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
341 		      != (size_t) -1))
342 		re_set_fastmap (fastmap, false, buf[0]);
343 	    }
344 #endif
345 	}
346       else if (type == SIMPLE_BRACKET)
347 	{
348 	  int i, ch;
349 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
350 	    {
351 	      int j;
352 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
353 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
354 		if (w & ((bitset_word_t) 1 << j))
355 		  re_set_fastmap (fastmap, icase, ch);
356 	    }
357 	}
358 #ifdef RE_ENABLE_I18N
359       else if (type == COMPLEX_BRACKET)
360 	{
361 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
362 	  Idx i;
363 
364 # ifdef _LIBC
365 	  /* See if we have to try all bytes which start multiple collation
366 	     elements.
367 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
368 		  collation element, and don't catch 'b' since 'b' is
369 		  the only collation element which starts from 'b' (and
370 		  it is caught by SIMPLE_BRACKET).  */
371 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
372 		  && (cset->ncoll_syms || cset->nranges))
373 		{
374 		  const int32_t *table = (const int32_t *)
375 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
376 		  for (i = 0; i < SBC_MAX; ++i)
377 		    if (table[i] < 0)
378 		      re_set_fastmap (fastmap, icase, i);
379 		}
380 # endif /* _LIBC */
381 
382 	  /* See if we have to start the match at all multibyte characters,
383 	     i.e. where we would not find an invalid sequence.  This only
384 	     applies to multibyte character sets; for single byte character
385 	     sets, the SIMPLE_BRACKET again suffices.  */
386 	  if (dfa->mb_cur_max > 1
387 	      && (cset->nchar_classes || cset->non_match || cset->nranges
388 # ifdef _LIBC
389 		  || cset->nequiv_classes
390 # endif /* _LIBC */
391 		 ))
392 	    {
393 	      unsigned char c = 0;
394 	      do
395 		{
396 		  mbstate_t mbs;
397 		  memset (&mbs, 0, sizeof (mbs));
398 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
399 		    re_set_fastmap (fastmap, false, (int) c);
400 		}
401 	      while (++c != 0);
402 	    }
403 
404 	  else
405 	    {
406 	      /* ... Else catch all bytes which can start the mbchars.  */
407 	      for (i = 0; i < cset->nmbchars; ++i)
408 		{
409 		  char buf[256];
410 		  mbstate_t state;
411 		  memset (&state, '\0', sizeof (state));
412 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
413 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
414 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
415 		    {
416 		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
417 			  != (size_t) -1)
418 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
419 		    }
420 		}
421 	    }
422 	}
423 #endif /* RE_ENABLE_I18N */
424       else if (type == OP_PERIOD
425 #ifdef RE_ENABLE_I18N
426 	       || type == OP_UTF8_PERIOD
427 #endif /* RE_ENABLE_I18N */
428 	       || type == END_OF_RE)
429 	{
430 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
431 	  if (type == END_OF_RE)
432 	    bufp->can_be_null = 1;
433 	  return;
434 	}
435     }
436 }
437 
438 /* Entry point for POSIX code.  */
439 /* regcomp takes a regular expression as a string and compiles it.
440 
441    PREG is a regex_t *.  We do not expect any fields to be initialized,
442    since POSIX says we shouldn't.  Thus, we set
443 
444      `buffer' to the compiled pattern;
445      `used' to the length of the compiled pattern;
446      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
447        REG_EXTENDED bit in CFLAGS is set; otherwise, to
448        RE_SYNTAX_POSIX_BASIC;
449      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
450      `fastmap' to an allocated space for the fastmap;
451      `fastmap_accurate' to zero;
452      `re_nsub' to the number of subexpressions in PATTERN.
453 
454    PATTERN is the address of the pattern string.
455 
456    CFLAGS is a series of bits which affect compilation.
457 
458      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
459      use POSIX basic syntax.
460 
461      If REG_NEWLINE is set, then . and [^...] don't match newline.
462      Also, regexec will try a match beginning after every newline.
463 
464      If REG_ICASE is set, then we considers upper- and lowercase
465      versions of letters to be equivalent when matching.
466 
467      If REG_NOSUB is set, then when PREG is passed to regexec, that
468      routine will report only success or failure, and nothing about the
469      registers.
470 
471    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
472    the return codes and their meanings.)  */
473 
474 int
475 regcomp (preg, pattern, cflags)
476     regex_t *_Restrict_ preg;
477     const char *_Restrict_ pattern;
478     int cflags;
479 {
480   reg_errcode_t ret;
481   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
482 			 : RE_SYNTAX_POSIX_BASIC);
483 
484   preg->buffer = NULL;
485   preg->allocated = 0;
486   preg->used = 0;
487 
488   /* Try to allocate space for the fastmap.  */
489   preg->fastmap = re_malloc (char, SBC_MAX);
490   if (BE (preg->fastmap == NULL, 0))
491     return REG_ESPACE;
492 
493   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
494 
495   /* If REG_NEWLINE is set, newlines are treated differently.  */
496   if (cflags & REG_NEWLINE)
497     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
498       syntax &= ~RE_DOT_NEWLINE;
499       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
500       /* It also changes the matching behavior.  */
501       preg->newline_anchor = 1;
502     }
503   else
504     preg->newline_anchor = 0;
505   preg->no_sub = !!(cflags & REG_NOSUB);
506   preg->translate = NULL;
507 
508   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
509 
510   /* POSIX doesn't distinguish between an unmatched open-group and an
511      unmatched close-group: both are REG_EPAREN.  */
512   if (ret == REG_ERPAREN)
513     ret = REG_EPAREN;
514 
515   /* We have already checked preg->fastmap != NULL.  */
516   if (BE (ret == REG_NOERROR, 1))
517     /* Compute the fastmap now, since regexec cannot modify the pattern
518        buffer.  This function never fails in this implementation.  */
519     (void) re_compile_fastmap (preg);
520   else
521     {
522       /* Some error occurred while compiling the expression.  */
523       re_free (preg->fastmap);
524       preg->fastmap = NULL;
525     }
526 
527   return (int) ret;
528 }
529 #ifdef _LIBC
530 weak_alias (__regcomp, regcomp)
531 #endif
532 
533 /* Returns a message corresponding to an error code, ERRCODE, returned
534    from either regcomp or regexec.   We don't use PREG here.  */
535 
536 #ifdef _LIBC
537 size_t
538 regerror (errcode, preg, errbuf, errbuf_size)
539     int errcode;
540     const regex_t *_Restrict_ preg;
541     char *_Restrict_ errbuf;
542     size_t errbuf_size;
543 #else /* size_t might promote */
544 size_t
545 regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_,
546 	  char *_Restrict_ errbuf, size_t errbuf_size)
547 #endif
548 {
549   const char *msg;
550   size_t msg_size;
551 
552   if (BE (errcode < 0
553 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
554 			       / sizeof (__re_error_msgid_idx[0])), 0))
555     /* Only error codes returned by the rest of the code should be passed
556        to this routine.  If we are given anything else, or if other regex
557        code generates an invalid error code, then the program has a bug.
558        Dump core so we can fix it.  */
559     abort ();
560 
561   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
562 
563   msg_size = strlen (msg) + 1; /* Includes the null.  */
564 
565   if (BE (errbuf_size != 0, 1))
566     {
567       size_t cpy_size = msg_size;
568       if (BE (msg_size > errbuf_size, 0))
569 	{
570 	  cpy_size = errbuf_size - 1;
571 	  errbuf[cpy_size] = '\0';
572 	}
573       memcpy (errbuf, msg, cpy_size);
574     }
575 
576   return msg_size;
577 }
578 #ifdef _LIBC
579 weak_alias (__regerror, regerror)
580 #endif
581 
582 
583 #ifdef RE_ENABLE_I18N
584 /* This static array is used for the map to single-byte characters when
585    UTF-8 is used.  Otherwise we would allocate memory just to initialize
586    it the same all the time.  UTF-8 is the preferred encoding so this is
587    a worthwhile optimization.  */
588 static const bitset_t utf8_sb_map =
589 {
590   /* Set the first 128 bits.  */
591 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
592 #  error "bitset_word_t is narrower than 32 bits"
593 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
594   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
595 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
596   BITSET_WORD_MAX, BITSET_WORD_MAX,
597 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
598   BITSET_WORD_MAX,
599 # endif
600   (BITSET_WORD_MAX
601    >> (SBC_MAX % BITSET_WORD_BITS == 0
602        ? 0
603        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
604 };
605 #endif
606 
607 
608 static void
609 free_dfa_content (re_dfa_t *dfa)
610 {
611   Idx i, j;
612 
613   if (dfa->nodes)
614     for (i = 0; i < dfa->nodes_len; ++i)
615       free_token (dfa->nodes + i);
616   re_free (dfa->nexts);
617   for (i = 0; i < dfa->nodes_len; ++i)
618     {
619       if (dfa->eclosures != NULL)
620 	re_node_set_free (dfa->eclosures + i);
621       if (dfa->inveclosures != NULL)
622 	re_node_set_free (dfa->inveclosures + i);
623       if (dfa->edests != NULL)
624 	re_node_set_free (dfa->edests + i);
625     }
626   re_free (dfa->edests);
627   re_free (dfa->eclosures);
628   re_free (dfa->inveclosures);
629   re_free (dfa->nodes);
630 
631   if (dfa->state_table)
632     for (i = 0; i <= dfa->state_hash_mask; ++i)
633       {
634 	struct re_state_table_entry *entry = dfa->state_table + i;
635 	for (j = 0; j < entry->num; ++j)
636 	  {
637 	    re_dfastate_t *state = entry->array[j];
638 	    free_state (state);
639 	  }
640 	re_free (entry->array);
641       }
642   re_free (dfa->state_table);
643 #ifdef RE_ENABLE_I18N
644   if (dfa->sb_char != utf8_sb_map)
645     re_free (dfa->sb_char);
646 #endif
647   re_free (dfa->subexp_map);
648 #ifdef DEBUG
649   re_free (dfa->re_str);
650 #endif
651 
652   re_free (dfa);
653 }
654 
655 
656 /* Free dynamically allocated space used by PREG.  */
657 
658 void
659 regfree (preg)
660     regex_t *preg;
661 {
662   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
663   if (BE (dfa != NULL, 1))
664     free_dfa_content (dfa);
665   preg->buffer = NULL;
666   preg->allocated = 0;
667 
668   re_free (preg->fastmap);
669   preg->fastmap = NULL;
670 
671   re_free (preg->translate);
672   preg->translate = NULL;
673 }
674 #ifdef _LIBC
675 weak_alias (__regfree, regfree)
676 #endif
677 
678 /* Entry points compatible with 4.2 BSD regex library.  We don't define
679    them unless specifically requested.  */
680 
681 #if defined _REGEX_RE_COMP || defined _LIBC
682 
683 /* BSD has one and only one pattern buffer.  */
684 static struct re_pattern_buffer re_comp_buf;
685 
686 char *
687 # ifdef _LIBC
688 /* Make these definitions weak in libc, so POSIX programs can redefine
689    these names if they don't use our functions, and still use
690    regcomp/regexec above without link errors.  */
691 weak_function
692 # endif
693 re_comp (s)
694      const char *s;
695 {
696   reg_errcode_t ret;
697   char *fastmap;
698 
699   if (!s)
700     {
701       if (!re_comp_buf.buffer)
702 	return gettext ("No previous regular expression");
703       return 0;
704     }
705 
706   if (re_comp_buf.buffer)
707     {
708       fastmap = re_comp_buf.fastmap;
709       re_comp_buf.fastmap = NULL;
710       __regfree (&re_comp_buf);
711       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
712       re_comp_buf.fastmap = fastmap;
713     }
714 
715   if (re_comp_buf.fastmap == NULL)
716     {
717       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
718       if (re_comp_buf.fastmap == NULL)
719 	return (char *) gettext (__re_error_msgid
720 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
721     }
722 
723   /* Since `re_exec' always passes NULL for the `regs' argument, we
724      don't need to initialize the pattern buffer fields which affect it.  */
725 
726   /* Match anchors at newlines.  */
727   re_comp_buf.newline_anchor = 1;
728 
729   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
730 
731   if (!ret)
732     return NULL;
733 
734   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
735   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
736 }
737 
738 #ifdef _LIBC
739 libc_freeres_fn (free_mem)
740 {
741   __regfree (&re_comp_buf);
742 }
743 #endif
744 
745 #endif /* _REGEX_RE_COMP */
746 
747 /* Internal entry point.
748    Compile the regular expression PATTERN, whose length is LENGTH.
749    SYNTAX indicate regular expression's syntax.  */
750 
751 static reg_errcode_t
752 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
753 		     reg_syntax_t syntax)
754 {
755   reg_errcode_t err = REG_NOERROR;
756   re_dfa_t *dfa;
757   re_string_t regexp;
758 
759   /* Initialize the pattern buffer.  */
760   preg->fastmap_accurate = 0;
761   preg->syntax = syntax;
762   preg->not_bol = preg->not_eol = 0;
763   preg->used = 0;
764   preg->re_nsub = 0;
765   preg->can_be_null = 0;
766   preg->regs_allocated = REGS_UNALLOCATED;
767 
768   /* Initialize the dfa.  */
769   dfa = (re_dfa_t *) preg->buffer;
770   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
771     {
772       /* If zero allocated, but buffer is non-null, try to realloc
773 	 enough space.  This loses if buffer's address is bogus, but
774 	 that is the user's responsibility.  If ->buffer is NULL this
775 	 is a simple allocation.  */
776       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
777       if (dfa == NULL)
778 	return REG_ESPACE;
779       preg->allocated = sizeof (re_dfa_t);
780       preg->buffer = (unsigned char *) dfa;
781     }
782   preg->used = sizeof (re_dfa_t);
783 
784   err = init_dfa (dfa, length);
785   if (BE (err != REG_NOERROR, 0))
786     {
787       free_dfa_content (dfa);
788       preg->buffer = NULL;
789       preg->allocated = 0;
790       return err;
791     }
792 #ifdef DEBUG
793   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
794   dfa->re_str = re_malloc (char, length + 1);
795   strncpy (dfa->re_str, pattern, length + 1);
796 #endif
797 
798   __libc_lock_init (dfa->lock);
799 
800   err = re_string_construct (&regexp, pattern, length, preg->translate,
801 			     (syntax & RE_ICASE) != 0, dfa);
802   if (BE (err != REG_NOERROR, 0))
803     {
804     re_compile_internal_free_return:
805       free_workarea_compile (preg);
806       re_string_destruct (&regexp);
807       free_dfa_content (dfa);
808       preg->buffer = NULL;
809       preg->allocated = 0;
810       return err;
811     }
812 
813   /* Parse the regular expression, and build a structure tree.  */
814   preg->re_nsub = 0;
815   dfa->str_tree = parse (&regexp, preg, syntax, &err);
816   if (BE (dfa->str_tree == NULL, 0))
817     goto re_compile_internal_free_return;
818 
819   /* Analyze the tree and create the nfa.  */
820   err = analyze (preg);
821   if (BE (err != REG_NOERROR, 0))
822     goto re_compile_internal_free_return;
823 
824 #ifdef RE_ENABLE_I18N
825   /* If possible, do searching in single byte encoding to speed things up.  */
826   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
827     optimize_utf8 (dfa);
828 #endif
829 
830   /* Then create the initial state of the dfa.  */
831   err = create_initial_state (dfa);
832 
833   /* Release work areas.  */
834   free_workarea_compile (preg);
835   re_string_destruct (&regexp);
836 
837   if (BE (err != REG_NOERROR, 0))
838     {
839       free_dfa_content (dfa);
840       preg->buffer = NULL;
841       preg->allocated = 0;
842     }
843 
844   return err;
845 }
846 
847 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
848    as the initial length of some arrays.  */
849 
850 static reg_errcode_t
851 init_dfa (re_dfa_t *dfa, size_t pat_len)
852 {
853   __re_size_t table_size;
854 #ifndef _LIBC
855   char *codeset_name;
856 #endif
857 #ifdef RE_ENABLE_I18N
858   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
859 #else
860   size_t max_i18n_object_size = 0;
861 #endif
862   size_t max_object_size =
863     MAX (sizeof (struct re_state_table_entry),
864 	 MAX (sizeof (re_token_t),
865 	      MAX (sizeof (re_node_set),
866 		   MAX (sizeof (regmatch_t),
867 			max_i18n_object_size))));
868 
869   memset (dfa, '\0', sizeof (re_dfa_t));
870 
871   /* Force allocation of str_tree_storage the first time.  */
872   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
873 
874   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
875      calculation below, and for similar doubling calculations
876      elsewhere.  And it's <= rather than <, because some of the
877      doubling calculations add 1 afterwards.  */
878   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
879     return REG_ESPACE;
880 
881   dfa->nodes_alloc = pat_len + 1;
882   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
883 
884   /*  table_size = 2 ^ ceil(log pat_len) */
885   for (table_size = 1; ; table_size <<= 1)
886     if (table_size > pat_len)
887       break;
888 
889   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
890   dfa->state_hash_mask = table_size - 1;
891 
892   dfa->mb_cur_max = MB_CUR_MAX;
893 #ifdef _LIBC
894   if (dfa->mb_cur_max == 6
895       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
896     dfa->is_utf8 = 1;
897   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
898 		       != 0);
899 #else
900   codeset_name = nl_langinfo (CODESET);
901   if (strcasecmp (codeset_name, "UTF-8") == 0
902       || strcasecmp (codeset_name, "UTF8") == 0)
903     dfa->is_utf8 = 1;
904 
905   /* We check exhaustively in the loop below if this charset is a
906      superset of ASCII.  */
907   dfa->map_notascii = 0;
908 #endif
909 
910 #ifdef RE_ENABLE_I18N
911   if (dfa->mb_cur_max > 1)
912     {
913       if (dfa->is_utf8)
914 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
915       else
916 	{
917 	  int i, j, ch;
918 
919 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
920 	  if (BE (dfa->sb_char == NULL, 0))
921 	    return REG_ESPACE;
922 
923 	  /* Set the bits corresponding to single byte chars.  */
924 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
925 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
926 	      {
927 		wint_t wch = __btowc (ch);
928 		if (wch != WEOF)
929 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
930 # ifndef _LIBC
931 		if (isascii (ch) && wch != ch)
932 		  dfa->map_notascii = 1;
933 # endif
934 	      }
935 	}
936     }
937 #endif
938 
939   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
940     return REG_ESPACE;
941   return REG_NOERROR;
942 }
943 
944 /* Initialize WORD_CHAR table, which indicate which character is
945    "word".  In this case "word" means that it is the word construction
946    character used by some operators like "\<", "\>", etc.  */
947 
948 static void
949 internal_function
950 init_word_char (re_dfa_t *dfa)
951 {
952   int i, j, ch;
953   dfa->word_ops_used = 1;
954   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
955     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
956       if (isalnum (ch) || ch == '_')
957 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
958 }
959 
960 /* Free the work area which are only used while compiling.  */
961 
962 static void
963 free_workarea_compile (regex_t *preg)
964 {
965   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
966   bin_tree_storage_t *storage, *next;
967   for (storage = dfa->str_tree_storage; storage; storage = next)
968     {
969       next = storage->next;
970       re_free (storage);
971     }
972   dfa->str_tree_storage = NULL;
973   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
974   dfa->str_tree = NULL;
975   re_free (dfa->org_indices);
976   dfa->org_indices = NULL;
977 }
978 
979 /* Create initial states for all contexts.  */
980 
981 static reg_errcode_t
982 create_initial_state (re_dfa_t *dfa)
983 {
984   Idx first, i;
985   reg_errcode_t err;
986   re_node_set init_nodes;
987 
988   /* Initial states have the epsilon closure of the node which is
989      the first node of the regular expression.  */
990   first = dfa->str_tree->first->node_idx;
991   dfa->init_node = first;
992   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
993   if (BE (err != REG_NOERROR, 0))
994     return err;
995 
996   /* The back-references which are in initial states can epsilon transit,
997      since in this case all of the subexpressions can be null.
998      Then we add epsilon closures of the nodes which are the next nodes of
999      the back-references.  */
1000   if (dfa->nbackref > 0)
1001     for (i = 0; i < init_nodes.nelem; ++i)
1002       {
1003 	Idx node_idx = init_nodes.elems[i];
1004 	re_token_type_t type = dfa->nodes[node_idx].type;
1005 
1006 	Idx clexp_idx;
1007 	if (type != OP_BACK_REF)
1008 	  continue;
1009 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1010 	  {
1011 	    re_token_t *clexp_node;
1012 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1013 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1014 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1015 	      break;
1016 	  }
1017 	if (clexp_idx == init_nodes.nelem)
1018 	  continue;
1019 
1020 	if (type == OP_BACK_REF)
1021 	  {
1022 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1023 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1024 	      {
1025 		reg_errcode_t merge_err
1026                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1027 		if (merge_err != REG_NOERROR)
1028 		  return merge_err;
1029 		i = 0;
1030 	      }
1031 	  }
1032       }
1033 
1034   /* It must be the first time to invoke acquire_state.  */
1035   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1036   /* We don't check ERR here, since the initial state must not be NULL.  */
1037   if (BE (dfa->init_state == NULL, 0))
1038     return err;
1039   if (dfa->init_state->has_constraint)
1040     {
1041       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1042 						       CONTEXT_WORD);
1043       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1044 						     CONTEXT_NEWLINE);
1045       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1046 							 &init_nodes,
1047 							 CONTEXT_NEWLINE
1048 							 | CONTEXT_BEGBUF);
1049       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1050 	      || dfa->init_state_begbuf == NULL, 0))
1051 	return err;
1052     }
1053   else
1054     dfa->init_state_word = dfa->init_state_nl
1055       = dfa->init_state_begbuf = dfa->init_state;
1056 
1057   re_node_set_free (&init_nodes);
1058   return REG_NOERROR;
1059 }
1060 
1061 #ifdef RE_ENABLE_I18N
1062 /* If it is possible to do searching in single byte encoding instead of UTF-8
1063    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1064    DFA nodes where needed.  */
1065 
1066 static void
1067 optimize_utf8 (re_dfa_t *dfa)
1068 {
1069   Idx node;
1070   int i;
1071   bool mb_chars = false;
1072   bool has_period = false;
1073 
1074   for (node = 0; node < dfa->nodes_len; ++node)
1075     switch (dfa->nodes[node].type)
1076       {
1077       case CHARACTER:
1078 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1079 	  mb_chars = true;
1080 	break;
1081       case ANCHOR:
1082 	switch (dfa->nodes[node].opr.ctx_type)
1083 	  {
1084 	  case LINE_FIRST:
1085 	  case LINE_LAST:
1086 	  case BUF_FIRST:
1087 	  case BUF_LAST:
1088 	    break;
1089 	  default:
1090 	    /* Word anchors etc. cannot be handled.  It's okay to test
1091 	       opr.ctx_type since constraints (for all DFA nodes) are
1092 	       created by ORing one or more opr.ctx_type values.  */
1093 	    return;
1094 	  }
1095 	break;
1096       case OP_PERIOD:
1097 	has_period = true;
1098 	break;
1099       case OP_BACK_REF:
1100       case OP_ALT:
1101       case END_OF_RE:
1102       case OP_DUP_ASTERISK:
1103       case OP_OPEN_SUBEXP:
1104       case OP_CLOSE_SUBEXP:
1105 	break;
1106       case COMPLEX_BRACKET:
1107 	return;
1108       case SIMPLE_BRACKET:
1109 	/* Just double check.  */
1110 	{
1111 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1112 			? 0
1113 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1114 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1115 	    {
1116 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1117 		return;
1118 	      rshift = 0;
1119 	    }
1120 	}
1121 	break;
1122       default:
1123 	abort ();
1124       }
1125 
1126   if (mb_chars || has_period)
1127     for (node = 0; node < dfa->nodes_len; ++node)
1128       {
1129 	if (dfa->nodes[node].type == CHARACTER
1130 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1131 	  dfa->nodes[node].mb_partial = 0;
1132 	else if (dfa->nodes[node].type == OP_PERIOD)
1133 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1134       }
1135 
1136   /* The search can be in single byte locale.  */
1137   dfa->mb_cur_max = 1;
1138   dfa->is_utf8 = 0;
1139   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1140 }
1141 #endif
1142 
1143 /* Analyze the structure tree, and calculate "first", "next", "edest",
1144    "eclosure", and "inveclosure".  */
1145 
1146 static reg_errcode_t
1147 analyze (regex_t *preg)
1148 {
1149   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1150   reg_errcode_t ret;
1151 
1152   /* Allocate arrays.  */
1153   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1154   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1155   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1156   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1157   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1158 	  || dfa->eclosures == NULL, 0))
1159     return REG_ESPACE;
1160 
1161   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1162   if (dfa->subexp_map != NULL)
1163     {
1164       Idx i;
1165       for (i = 0; i < preg->re_nsub; i++)
1166 	dfa->subexp_map[i] = i;
1167       preorder (dfa->str_tree, optimize_subexps, dfa);
1168       for (i = 0; i < preg->re_nsub; i++)
1169 	if (dfa->subexp_map[i] != i)
1170 	  break;
1171       if (i == preg->re_nsub)
1172 	{
1173 	  free (dfa->subexp_map);
1174 	  dfa->subexp_map = NULL;
1175 	}
1176     }
1177 
1178   ret = postorder (dfa->str_tree, lower_subexps, preg);
1179   if (BE (ret != REG_NOERROR, 0))
1180     return ret;
1181   ret = postorder (dfa->str_tree, calc_first, dfa);
1182   if (BE (ret != REG_NOERROR, 0))
1183     return ret;
1184   preorder (dfa->str_tree, calc_next, dfa);
1185   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1186   if (BE (ret != REG_NOERROR, 0))
1187     return ret;
1188   ret = calc_eclosure (dfa);
1189   if (BE (ret != REG_NOERROR, 0))
1190     return ret;
1191 
1192   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1193      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1194   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1195       || dfa->nbackref)
1196     {
1197       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1198       if (BE (dfa->inveclosures == NULL, 0))
1199 	return REG_ESPACE;
1200       ret = calc_inveclosure (dfa);
1201     }
1202 
1203   return ret;
1204 }
1205 
1206 /* Our parse trees are very unbalanced, so we cannot use a stack to
1207    implement parse tree visits.  Instead, we use parent pointers and
1208    some hairy code in these two functions.  */
1209 static reg_errcode_t
1210 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1211 	   void *extra)
1212 {
1213   bin_tree_t *node, *prev;
1214 
1215   for (node = root; ; )
1216     {
1217       /* Descend down the tree, preferably to the left (or to the right
1218 	 if that's the only child).  */
1219       while (node->left || node->right)
1220 	if (node->left)
1221 	  node = node->left;
1222 	else
1223 	  node = node->right;
1224 
1225       do
1226 	{
1227 	  reg_errcode_t err = fn (extra, node);
1228 	  if (BE (err != REG_NOERROR, 0))
1229 	    return err;
1230 	  if (node->parent == NULL)
1231 	    return REG_NOERROR;
1232 	  prev = node;
1233 	  node = node->parent;
1234 	}
1235       /* Go up while we have a node that is reached from the right.  */
1236       while (node->right == prev || node->right == NULL);
1237       node = node->right;
1238     }
1239 }
1240 
1241 static reg_errcode_t
1242 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1243 	  void *extra)
1244 {
1245   bin_tree_t *node;
1246 
1247   for (node = root; ; )
1248     {
1249       reg_errcode_t err = fn (extra, node);
1250       if (BE (err != REG_NOERROR, 0))
1251 	return err;
1252 
1253       /* Go to the left node, or up and to the right.  */
1254       if (node->left)
1255 	node = node->left;
1256       else
1257 	{
1258 	  bin_tree_t *prev = NULL;
1259 	  while (node->right == prev || node->right == NULL)
1260 	    {
1261 	      prev = node;
1262 	      node = node->parent;
1263 	      if (!node)
1264 		return REG_NOERROR;
1265 	    }
1266 	  node = node->right;
1267 	}
1268     }
1269 }
1270 
1271 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1272    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1273    backreferences as well.  Requires a preorder visit.  */
1274 static reg_errcode_t
1275 optimize_subexps (void *extra, bin_tree_t *node)
1276 {
1277   re_dfa_t *dfa = (re_dfa_t *) extra;
1278 
1279   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1280     {
1281       int idx = node->token.opr.idx;
1282       node->token.opr.idx = dfa->subexp_map[idx];
1283       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1284     }
1285 
1286   else if (node->token.type == SUBEXP
1287 	   && node->left && node->left->token.type == SUBEXP)
1288     {
1289       Idx other_idx = node->left->token.opr.idx;
1290 
1291       node->left = node->left->left;
1292       if (node->left)
1293 	node->left->parent = node;
1294 
1295       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1296       if (other_idx < BITSET_WORD_BITS)
1297 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1298     }
1299 
1300   return REG_NOERROR;
1301 }
1302 
1303 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1304    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1305 static reg_errcode_t
1306 lower_subexps (void *extra, bin_tree_t *node)
1307 {
1308   regex_t *preg = (regex_t *) extra;
1309   reg_errcode_t err = REG_NOERROR;
1310 
1311   if (node->left && node->left->token.type == SUBEXP)
1312     {
1313       node->left = lower_subexp (&err, preg, node->left);
1314       if (node->left)
1315 	node->left->parent = node;
1316     }
1317   if (node->right && node->right->token.type == SUBEXP)
1318     {
1319       node->right = lower_subexp (&err, preg, node->right);
1320       if (node->right)
1321 	node->right->parent = node;
1322     }
1323 
1324   return err;
1325 }
1326 
1327 static bin_tree_t *
1328 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1329 {
1330   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1331   bin_tree_t *body = node->left;
1332   bin_tree_t *op, *cls, *tree1, *tree;
1333 
1334   if (preg->no_sub
1335       /* We do not optimize empty subexpressions, because otherwise we may
1336 	 have bad CONCAT nodes with NULL children.  This is obviously not
1337 	 very common, so we do not lose much.  An example that triggers
1338 	 this case is the sed "script" /\(\)/x.  */
1339       && node->left != NULL
1340       && (node->token.opr.idx >= BITSET_WORD_BITS
1341 	  || !(dfa->used_bkref_map
1342 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1343     return node->left;
1344 
1345   /* Convert the SUBEXP node to the concatenation of an
1346      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1347   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1348   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1349   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1350   tree = create_tree (dfa, op, tree1, CONCAT);
1351   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1352     {
1353       *err = REG_ESPACE;
1354       return NULL;
1355     }
1356 
1357   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1358   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1359   return tree;
1360 }
1361 
1362 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1363    nodes.  Requires a postorder visit.  */
1364 static reg_errcode_t
1365 calc_first (void *extra, bin_tree_t *node)
1366 {
1367   re_dfa_t *dfa = (re_dfa_t *) extra;
1368   if (node->token.type == CONCAT)
1369     {
1370       node->first = node->left->first;
1371       node->node_idx = node->left->node_idx;
1372     }
1373   else
1374     {
1375       node->first = node;
1376       node->node_idx = re_dfa_add_node (dfa, node->token);
1377       if (BE (node->node_idx == REG_MISSING, 0))
1378 	return REG_ESPACE;
1379       if (node->token.type == ANCHOR)
1380 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1381     }
1382   return REG_NOERROR;
1383 }
1384 
1385 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1386 static reg_errcode_t
1387 calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
1388 {
1389   switch (node->token.type)
1390     {
1391     case OP_DUP_ASTERISK:
1392       node->left->next = node;
1393       break;
1394     case CONCAT:
1395       node->left->next = node->right->first;
1396       node->right->next = node->next;
1397       break;
1398     default:
1399       if (node->left)
1400 	node->left->next = node->next;
1401       if (node->right)
1402 	node->right->next = node->next;
1403       break;
1404     }
1405   return REG_NOERROR;
1406 }
1407 
1408 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1409 static reg_errcode_t
1410 link_nfa_nodes (void *extra, bin_tree_t *node)
1411 {
1412   re_dfa_t *dfa = (re_dfa_t *) extra;
1413   Idx idx = node->node_idx;
1414   reg_errcode_t err = REG_NOERROR;
1415 
1416   switch (node->token.type)
1417     {
1418     case CONCAT:
1419       break;
1420 
1421     case END_OF_RE:
1422       assert (node->next == NULL);
1423       break;
1424 
1425     case OP_DUP_ASTERISK:
1426     case OP_ALT:
1427       {
1428 	Idx left, right;
1429 	dfa->has_plural_match = 1;
1430 	if (node->left != NULL)
1431 	  left = node->left->first->node_idx;
1432 	else
1433 	  left = node->next->node_idx;
1434 	if (node->right != NULL)
1435 	  right = node->right->first->node_idx;
1436 	else
1437 	  right = node->next->node_idx;
1438 	assert (REG_VALID_INDEX (left));
1439 	assert (REG_VALID_INDEX (right));
1440 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1441       }
1442       break;
1443 
1444     case ANCHOR:
1445     case OP_OPEN_SUBEXP:
1446     case OP_CLOSE_SUBEXP:
1447       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1448       break;
1449 
1450     case OP_BACK_REF:
1451       dfa->nexts[idx] = node->next->node_idx;
1452       if (node->token.type == OP_BACK_REF)
1453 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1454       break;
1455 
1456     default:
1457       assert (!IS_EPSILON_NODE (node->token.type));
1458       dfa->nexts[idx] = node->next->node_idx;
1459       break;
1460     }
1461 
1462   return err;
1463 }
1464 
1465 /* Duplicate the epsilon closure of the node ROOT_NODE.
1466    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1467    to their own constraint.  */
1468 
1469 static reg_errcode_t
1470 internal_function
1471 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1472 			Idx root_node, unsigned int init_constraint)
1473 {
1474   Idx org_node, clone_node;
1475   bool ok;
1476   unsigned int constraint = init_constraint;
1477   for (org_node = top_org_node, clone_node = top_clone_node;;)
1478     {
1479       Idx org_dest, clone_dest;
1480       if (dfa->nodes[org_node].type == OP_BACK_REF)
1481 	{
1482 	  /* If the back reference epsilon-transit, its destination must
1483 	     also have the constraint.  Then duplicate the epsilon closure
1484 	     of the destination of the back reference, and store it in
1485 	     edests of the back reference.  */
1486 	  org_dest = dfa->nexts[org_node];
1487 	  re_node_set_empty (dfa->edests + clone_node);
1488 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1489 	  if (BE (clone_dest == REG_MISSING, 0))
1490 	    return REG_ESPACE;
1491 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1492 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1493 	  if (BE (! ok, 0))
1494 	    return REG_ESPACE;
1495 	}
1496       else if (dfa->edests[org_node].nelem == 0)
1497 	{
1498 	  /* In case of the node can't epsilon-transit, don't duplicate the
1499 	     destination and store the original destination as the
1500 	     destination of the node.  */
1501 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1502 	  break;
1503 	}
1504       else if (dfa->edests[org_node].nelem == 1)
1505 	{
1506 	  /* In case of the node can epsilon-transit, and it has only one
1507 	     destination.  */
1508 	  org_dest = dfa->edests[org_node].elems[0];
1509 	  re_node_set_empty (dfa->edests + clone_node);
1510 	  /* If the node is root_node itself, it means the epsilon closure
1511 	     has a loop.  Then tie it to the destination of the root_node.  */
1512 	  if (org_node == root_node && clone_node != org_node)
1513 	    {
1514 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1515 	      if (BE (! ok, 0))
1516 	        return REG_ESPACE;
1517 	      break;
1518 	    }
1519 	  /* In case the node has another constraint, append it.  */
1520 	  constraint |= dfa->nodes[org_node].constraint;
1521 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1522 	  if (BE (clone_dest == REG_MISSING, 0))
1523 	    return REG_ESPACE;
1524 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1525 	  if (BE (! ok, 0))
1526 	    return REG_ESPACE;
1527 	}
1528       else /* dfa->edests[org_node].nelem == 2 */
1529 	{
1530 	  /* In case of the node can epsilon-transit, and it has two
1531 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1532 	  org_dest = dfa->edests[org_node].elems[0];
1533 	  re_node_set_empty (dfa->edests + clone_node);
1534 	  /* Search for a duplicated node which satisfies the constraint.  */
1535 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1536 	  if (clone_dest == REG_MISSING)
1537 	    {
1538 	      /* There is no such duplicated node, create a new one.  */
1539 	      reg_errcode_t err;
1540 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1541 	      if (BE (clone_dest == REG_MISSING, 0))
1542 		return REG_ESPACE;
1543 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1544 	      if (BE (! ok, 0))
1545 		return REG_ESPACE;
1546 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1547 					    root_node, constraint);
1548 	      if (BE (err != REG_NOERROR, 0))
1549 		return err;
1550 	    }
1551 	  else
1552 	    {
1553 	      /* There is a duplicated node which satisfies the constraint,
1554 		 use it to avoid infinite loop.  */
1555 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556 	      if (BE (! ok, 0))
1557 		return REG_ESPACE;
1558 	    }
1559 
1560 	  org_dest = dfa->edests[org_node].elems[1];
1561 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1562 	  if (BE (clone_dest == REG_MISSING, 0))
1563 	    return REG_ESPACE;
1564 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565 	  if (BE (! ok, 0))
1566 	    return REG_ESPACE;
1567 	}
1568       org_node = org_dest;
1569       clone_node = clone_dest;
1570     }
1571   return REG_NOERROR;
1572 }
1573 
1574 /* Search for a node which is duplicated from the node ORG_NODE, and
1575    satisfies the constraint CONSTRAINT.  */
1576 
1577 static Idx
1578 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1579 			unsigned int constraint)
1580 {
1581   Idx idx;
1582   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1583     {
1584       if (org_node == dfa->org_indices[idx]
1585 	  && constraint == dfa->nodes[idx].constraint)
1586 	return idx; /* Found.  */
1587     }
1588   return REG_MISSING; /* Not found.  */
1589 }
1590 
1591 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1592    Return the index of the new node, or REG_MISSING if insufficient storage is
1593    available.  */
1594 
1595 static Idx
1596 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1597 {
1598   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1599   if (BE (dup_idx != REG_MISSING, 1))
1600     {
1601       dfa->nodes[dup_idx].constraint = constraint;
1602       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1603       dfa->nodes[dup_idx].duplicated = 1;
1604 
1605       /* Store the index of the original node.  */
1606       dfa->org_indices[dup_idx] = org_idx;
1607     }
1608   return dup_idx;
1609 }
1610 
1611 static reg_errcode_t
1612 calc_inveclosure (re_dfa_t *dfa)
1613 {
1614   Idx src, idx;
1615   bool ok;
1616   for (idx = 0; idx < dfa->nodes_len; ++idx)
1617     re_node_set_init_empty (dfa->inveclosures + idx);
1618 
1619   for (src = 0; src < dfa->nodes_len; ++src)
1620     {
1621       Idx *elems = dfa->eclosures[src].elems;
1622       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1623 	{
1624 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1625 	  if (BE (! ok, 0))
1626 	    return REG_ESPACE;
1627 	}
1628     }
1629 
1630   return REG_NOERROR;
1631 }
1632 
1633 /* Calculate "eclosure" for all the node in DFA.  */
1634 
1635 static reg_errcode_t
1636 calc_eclosure (re_dfa_t *dfa)
1637 {
1638   Idx node_idx;
1639   bool incomplete;
1640 #ifdef DEBUG
1641   assert (dfa->nodes_len > 0);
1642 #endif
1643   incomplete = false;
1644   /* For each nodes, calculate epsilon closure.  */
1645   for (node_idx = 0; ; ++node_idx)
1646     {
1647       reg_errcode_t err;
1648       re_node_set eclosure_elem;
1649       if (node_idx == dfa->nodes_len)
1650 	{
1651 	  if (!incomplete)
1652 	    break;
1653 	  incomplete = false;
1654 	  node_idx = 0;
1655 	}
1656 
1657 #ifdef DEBUG
1658       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1659 #endif
1660 
1661       /* If we have already calculated, skip it.  */
1662       if (dfa->eclosures[node_idx].nelem != 0)
1663 	continue;
1664       /* Calculate epsilon closure of `node_idx'.  */
1665       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1666       if (BE (err != REG_NOERROR, 0))
1667 	return err;
1668 
1669       if (dfa->eclosures[node_idx].nelem == 0)
1670 	{
1671 	  incomplete = true;
1672 	  re_node_set_free (&eclosure_elem);
1673 	}
1674     }
1675   return REG_NOERROR;
1676 }
1677 
1678 /* Calculate epsilon closure of NODE.  */
1679 
1680 static reg_errcode_t
1681 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1682 {
1683   reg_errcode_t err;
1684   Idx i;
1685   re_node_set eclosure;
1686   bool ok;
1687   bool incomplete = false;
1688   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1689   if (BE (err != REG_NOERROR, 0))
1690     return err;
1691 
1692   /* This indicates that we are calculating this node now.
1693      We reference this value to avoid infinite loop.  */
1694   dfa->eclosures[node].nelem = REG_MISSING;
1695 
1696   /* If the current node has constraints, duplicate all nodes
1697      since they must inherit the constraints.  */
1698   if (dfa->nodes[node].constraint
1699       && dfa->edests[node].nelem
1700       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1701     {
1702       err = duplicate_node_closure (dfa, node, node, node,
1703 				    dfa->nodes[node].constraint);
1704       if (BE (err != REG_NOERROR, 0))
1705 	return err;
1706     }
1707 
1708   /* Expand each epsilon destination nodes.  */
1709   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1710     for (i = 0; i < dfa->edests[node].nelem; ++i)
1711       {
1712 	re_node_set eclosure_elem;
1713 	Idx edest = dfa->edests[node].elems[i];
1714 	/* If calculating the epsilon closure of `edest' is in progress,
1715 	   return intermediate result.  */
1716 	if (dfa->eclosures[edest].nelem == REG_MISSING)
1717 	  {
1718 	    incomplete = true;
1719 	    continue;
1720 	  }
1721 	/* If we haven't calculated the epsilon closure of `edest' yet,
1722 	   calculate now. Otherwise use calculated epsilon closure.  */
1723 	if (dfa->eclosures[edest].nelem == 0)
1724 	  {
1725 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1726 	    if (BE (err != REG_NOERROR, 0))
1727 	      return err;
1728 	  }
1729 	else
1730 	  eclosure_elem = dfa->eclosures[edest];
1731 	/* Merge the epsilon closure of `edest'.  */
1732 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1733 	if (BE (err != REG_NOERROR, 0))
1734 	  return err;
1735 	/* If the epsilon closure of `edest' is incomplete,
1736 	   the epsilon closure of this node is also incomplete.  */
1737 	if (dfa->eclosures[edest].nelem == 0)
1738 	  {
1739 	    incomplete = true;
1740 	    re_node_set_free (&eclosure_elem);
1741 	  }
1742       }
1743 
1744   /* An epsilon closure includes itself.  */
1745   ok = re_node_set_insert (&eclosure, node);
1746   if (BE (! ok, 0))
1747     return REG_ESPACE;
1748   if (incomplete && !root)
1749     dfa->eclosures[node].nelem = 0;
1750   else
1751     dfa->eclosures[node] = eclosure;
1752   *new_set = eclosure;
1753   return REG_NOERROR;
1754 }
1755 
1756 /* Functions for token which are used in the parser.  */
1757 
1758 /* Fetch a token from INPUT.
1759    We must not use this function inside bracket expressions.  */
1760 
1761 static void
1762 internal_function
1763 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1764 {
1765   re_string_skip_bytes (input, peek_token (result, input, syntax));
1766 }
1767 
1768 /* Peek a token from INPUT, and return the length of the token.
1769    We must not use this function inside bracket expressions.  */
1770 
1771 static int
1772 internal_function
1773 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1774 {
1775   unsigned char c;
1776 
1777   if (re_string_eoi (input))
1778     {
1779       token->type = END_OF_RE;
1780       return 0;
1781     }
1782 
1783   c = re_string_peek_byte (input, 0);
1784   token->opr.c = c;
1785 
1786   token->word_char = 0;
1787 #ifdef RE_ENABLE_I18N
1788   token->mb_partial = 0;
1789   if (input->mb_cur_max > 1 &&
1790       !re_string_first_byte (input, re_string_cur_idx (input)))
1791     {
1792       token->type = CHARACTER;
1793       token->mb_partial = 1;
1794       return 1;
1795     }
1796 #endif
1797   if (c == '\\')
1798     {
1799       unsigned char c2;
1800       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1801 	{
1802 	  token->type = BACK_SLASH;
1803 	  return 1;
1804 	}
1805 
1806       c2 = re_string_peek_byte_case (input, 1);
1807       token->opr.c = c2;
1808       token->type = CHARACTER;
1809 #ifdef RE_ENABLE_I18N
1810       if (input->mb_cur_max > 1)
1811 	{
1812 	  wint_t wc = re_string_wchar_at (input,
1813 					  re_string_cur_idx (input) + 1);
1814 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1815 	}
1816       else
1817 #endif
1818 	token->word_char = IS_WORD_CHAR (c2) != 0;
1819 
1820       switch (c2)
1821 	{
1822 	case '|':
1823 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1824 	    token->type = OP_ALT;
1825 	  break;
1826 	case '1': case '2': case '3': case '4': case '5':
1827 	case '6': case '7': case '8': case '9':
1828 	  if (!(syntax & RE_NO_BK_REFS))
1829 	    {
1830 	      token->type = OP_BACK_REF;
1831 	      token->opr.idx = c2 - '1';
1832 	    }
1833 	  break;
1834 	case '<':
1835 	  if (!(syntax & RE_NO_GNU_OPS))
1836 	    {
1837 	      token->type = ANCHOR;
1838 	      token->opr.ctx_type = WORD_FIRST;
1839 	    }
1840 	  break;
1841 	case '>':
1842 	  if (!(syntax & RE_NO_GNU_OPS))
1843 	    {
1844 	      token->type = ANCHOR;
1845 	      token->opr.ctx_type = WORD_LAST;
1846 	    }
1847 	  break;
1848 	case 'b':
1849 	  if (!(syntax & RE_NO_GNU_OPS))
1850 	    {
1851 	      token->type = ANCHOR;
1852 	      token->opr.ctx_type = WORD_DELIM;
1853 	    }
1854 	  break;
1855 	case 'B':
1856 	  if (!(syntax & RE_NO_GNU_OPS))
1857 	    {
1858 	      token->type = ANCHOR;
1859 	      token->opr.ctx_type = NOT_WORD_DELIM;
1860 	    }
1861 	  break;
1862 	case 'w':
1863 	  if (!(syntax & RE_NO_GNU_OPS))
1864 	    token->type = OP_WORD;
1865 	  break;
1866 	case 'W':
1867 	  if (!(syntax & RE_NO_GNU_OPS))
1868 	    token->type = OP_NOTWORD;
1869 	  break;
1870 	case 's':
1871 	  if (!(syntax & RE_NO_GNU_OPS))
1872 	    token->type = OP_SPACE;
1873 	  break;
1874 	case 'S':
1875 	  if (!(syntax & RE_NO_GNU_OPS))
1876 	    token->type = OP_NOTSPACE;
1877 	  break;
1878 	case '`':
1879 	  if (!(syntax & RE_NO_GNU_OPS))
1880 	    {
1881 	      token->type = ANCHOR;
1882 	      token->opr.ctx_type = BUF_FIRST;
1883 	    }
1884 	  break;
1885 	case '\'':
1886 	  if (!(syntax & RE_NO_GNU_OPS))
1887 	    {
1888 	      token->type = ANCHOR;
1889 	      token->opr.ctx_type = BUF_LAST;
1890 	    }
1891 	  break;
1892 	case '(':
1893 	  if (!(syntax & RE_NO_BK_PARENS))
1894 	    token->type = OP_OPEN_SUBEXP;
1895 	  break;
1896 	case ')':
1897 	  if (!(syntax & RE_NO_BK_PARENS))
1898 	    token->type = OP_CLOSE_SUBEXP;
1899 	  break;
1900 	case '+':
1901 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902 	    token->type = OP_DUP_PLUS;
1903 	  break;
1904 	case '?':
1905 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1906 	    token->type = OP_DUP_QUESTION;
1907 	  break;
1908 	case '{':
1909 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910 	    token->type = OP_OPEN_DUP_NUM;
1911 	  break;
1912 	case '}':
1913 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1914 	    token->type = OP_CLOSE_DUP_NUM;
1915 	  break;
1916 	default:
1917 	  break;
1918 	}
1919       return 2;
1920     }
1921 
1922   token->type = CHARACTER;
1923 #ifdef RE_ENABLE_I18N
1924   if (input->mb_cur_max > 1)
1925     {
1926       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1927       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1928     }
1929   else
1930 #endif
1931     token->word_char = IS_WORD_CHAR (token->opr.c);
1932 
1933   switch (c)
1934     {
1935     case '\n':
1936       if (syntax & RE_NEWLINE_ALT)
1937 	token->type = OP_ALT;
1938       break;
1939     case '|':
1940       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1941 	token->type = OP_ALT;
1942       break;
1943     case '*':
1944       token->type = OP_DUP_ASTERISK;
1945       break;
1946     case '+':
1947       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948 	token->type = OP_DUP_PLUS;
1949       break;
1950     case '?':
1951       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1952 	token->type = OP_DUP_QUESTION;
1953       break;
1954     case '{':
1955       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956 	token->type = OP_OPEN_DUP_NUM;
1957       break;
1958     case '}':
1959       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1960 	token->type = OP_CLOSE_DUP_NUM;
1961       break;
1962     case '(':
1963       if (syntax & RE_NO_BK_PARENS)
1964 	token->type = OP_OPEN_SUBEXP;
1965       break;
1966     case ')':
1967       if (syntax & RE_NO_BK_PARENS)
1968 	token->type = OP_CLOSE_SUBEXP;
1969       break;
1970     case '[':
1971       token->type = OP_OPEN_BRACKET;
1972       break;
1973     case '.':
1974       token->type = OP_PERIOD;
1975       break;
1976     case '^':
1977       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1978 	  re_string_cur_idx (input) != 0)
1979 	{
1980 	  char prev = re_string_peek_byte (input, -1);
1981 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1982 	    break;
1983 	}
1984       token->type = ANCHOR;
1985       token->opr.ctx_type = LINE_FIRST;
1986       break;
1987     case '$':
1988       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1989 	  re_string_cur_idx (input) + 1 != re_string_length (input))
1990 	{
1991 	  re_token_t next;
1992 	  re_string_skip_bytes (input, 1);
1993 	  peek_token (&next, input, syntax);
1994 	  re_string_skip_bytes (input, -1);
1995 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1996 	    break;
1997 	}
1998       token->type = ANCHOR;
1999       token->opr.ctx_type = LINE_LAST;
2000       break;
2001     default:
2002       break;
2003     }
2004   return 1;
2005 }
2006 
2007 /* Peek a token from INPUT, and return the length of the token.
2008    We must not use this function out of bracket expressions.  */
2009 
2010 static int
2011 internal_function
2012 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2013 {
2014   unsigned char c;
2015   if (re_string_eoi (input))
2016     {
2017       token->type = END_OF_RE;
2018       return 0;
2019     }
2020   c = re_string_peek_byte (input, 0);
2021   token->opr.c = c;
2022 
2023 #ifdef RE_ENABLE_I18N
2024   if (input->mb_cur_max > 1 &&
2025       !re_string_first_byte (input, re_string_cur_idx (input)))
2026     {
2027       token->type = CHARACTER;
2028       return 1;
2029     }
2030 #endif /* RE_ENABLE_I18N */
2031 
2032   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2033       && re_string_cur_idx (input) + 1 < re_string_length (input))
2034     {
2035       /* In this case, '\' escape a character.  */
2036       unsigned char c2;
2037       re_string_skip_bytes (input, 1);
2038       c2 = re_string_peek_byte (input, 0);
2039       token->opr.c = c2;
2040       token->type = CHARACTER;
2041       return 1;
2042     }
2043   if (c == '[') /* '[' is a special char in a bracket exps.  */
2044     {
2045       unsigned char c2;
2046       int token_len;
2047       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2048 	c2 = re_string_peek_byte (input, 1);
2049       else
2050 	c2 = 0;
2051       token->opr.c = c2;
2052       token_len = 2;
2053       switch (c2)
2054 	{
2055 	case '.':
2056 	  token->type = OP_OPEN_COLL_ELEM;
2057 	  break;
2058 	case '=':
2059 	  token->type = OP_OPEN_EQUIV_CLASS;
2060 	  break;
2061 	case ':':
2062 	  if (syntax & RE_CHAR_CLASSES)
2063 	    {
2064 	      token->type = OP_OPEN_CHAR_CLASS;
2065 	      break;
2066 	    }
2067 	  /* else fall through.  */
2068 	default:
2069 	  token->type = CHARACTER;
2070 	  token->opr.c = c;
2071 	  token_len = 1;
2072 	  break;
2073 	}
2074       return token_len;
2075     }
2076   switch (c)
2077     {
2078     case '-':
2079       token->type = OP_CHARSET_RANGE;
2080       break;
2081     case ']':
2082       token->type = OP_CLOSE_BRACKET;
2083       break;
2084     case '^':
2085       token->type = OP_NON_MATCH_LIST;
2086       break;
2087     default:
2088       token->type = CHARACTER;
2089     }
2090   return 1;
2091 }
2092 
2093 /* Functions for parser.  */
2094 
2095 /* Entry point of the parser.
2096    Parse the regular expression REGEXP and return the structure tree.
2097    If an error is occured, ERR is set by error code, and return NULL.
2098    This function build the following tree, from regular expression <reg_exp>:
2099 	   CAT
2100 	   / \
2101 	  /   \
2102    <reg_exp>  EOR
2103 
2104    CAT means concatenation.
2105    EOR means end of regular expression.  */
2106 
2107 static bin_tree_t *
2108 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2109        reg_errcode_t *err)
2110 {
2111   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2112   bin_tree_t *tree, *eor, *root;
2113   re_token_t current_token;
2114   dfa->syntax = syntax;
2115   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2116   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2117   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2118     return NULL;
2119   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2120   if (tree != NULL)
2121     root = create_tree (dfa, tree, eor, CONCAT);
2122   else
2123     root = eor;
2124   if (BE (eor == NULL || root == NULL, 0))
2125     {
2126       *err = REG_ESPACE;
2127       return NULL;
2128     }
2129   return root;
2130 }
2131 
2132 /* This function build the following tree, from regular expression
2133    <branch1>|<branch2>:
2134 	   ALT
2135 	   / \
2136 	  /   \
2137    <branch1> <branch2>
2138 
2139    ALT means alternative, which represents the operator `|'.  */
2140 
2141 static bin_tree_t *
2142 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2143 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2144 {
2145   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2146   bin_tree_t *tree, *branch = NULL;
2147   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2148   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2149     return NULL;
2150 
2151   while (token->type == OP_ALT)
2152     {
2153       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2154       if (token->type != OP_ALT && token->type != END_OF_RE
2155 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2156 	{
2157 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2158 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2159 	    return NULL;
2160 	}
2161       else
2162 	branch = NULL;
2163       tree = create_tree (dfa, tree, branch, OP_ALT);
2164       if (BE (tree == NULL, 0))
2165 	{
2166 	  *err = REG_ESPACE;
2167 	  return NULL;
2168 	}
2169     }
2170   return tree;
2171 }
2172 
2173 /* This function build the following tree, from regular expression
2174    <exp1><exp2>:
2175 	CAT
2176 	/ \
2177        /   \
2178    <exp1> <exp2>
2179 
2180    CAT means concatenation.  */
2181 
2182 static bin_tree_t *
2183 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2184 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2185 {
2186   bin_tree_t *tree, *expr;
2187   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2188   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2189   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2190     return NULL;
2191 
2192   while (token->type != OP_ALT && token->type != END_OF_RE
2193 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194     {
2195       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2196       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2197 	{
2198 	  return NULL;
2199 	}
2200       if (tree != NULL && expr != NULL)
2201 	{
2202 	  tree = create_tree (dfa, tree, expr, CONCAT);
2203 	  if (tree == NULL)
2204 	    {
2205 	      *err = REG_ESPACE;
2206 	      return NULL;
2207 	    }
2208 	}
2209       else if (tree == NULL)
2210 	tree = expr;
2211       /* Otherwise expr == NULL, we don't need to create new tree.  */
2212     }
2213   return tree;
2214 }
2215 
2216 /* This function build the following tree, from regular expression a*:
2217 	 *
2218 	 |
2219 	 a
2220 */
2221 
2222 static bin_tree_t *
2223 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2224 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2225 {
2226   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227   bin_tree_t *tree;
2228   switch (token->type)
2229     {
2230     case CHARACTER:
2231       tree = create_token_tree (dfa, NULL, NULL, token);
2232       if (BE (tree == NULL, 0))
2233 	{
2234 	  *err = REG_ESPACE;
2235 	  return NULL;
2236 	}
2237 #ifdef RE_ENABLE_I18N
2238       if (dfa->mb_cur_max > 1)
2239 	{
2240 	  while (!re_string_eoi (regexp)
2241 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2242 	    {
2243 	      bin_tree_t *mbc_remain;
2244 	      fetch_token (token, regexp, syntax);
2245 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2246 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2247 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2248 		{
2249 		  *err = REG_ESPACE;
2250 		  return NULL;
2251 		}
2252 	    }
2253 	}
2254 #endif
2255       break;
2256     case OP_OPEN_SUBEXP:
2257       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2258       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2259 	return NULL;
2260       break;
2261     case OP_OPEN_BRACKET:
2262       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2263       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2264 	return NULL;
2265       break;
2266     case OP_BACK_REF:
2267       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2268 	{
2269 	  *err = REG_ESUBREG;
2270 	  return NULL;
2271 	}
2272       dfa->used_bkref_map |= 1 << token->opr.idx;
2273       tree = create_token_tree (dfa, NULL, NULL, token);
2274       if (BE (tree == NULL, 0))
2275 	{
2276 	  *err = REG_ESPACE;
2277 	  return NULL;
2278 	}
2279       ++dfa->nbackref;
2280       dfa->has_mb_node = 1;
2281       break;
2282     case OP_OPEN_DUP_NUM:
2283       if (syntax & RE_CONTEXT_INVALID_DUP)
2284 	{
2285 	  *err = REG_BADRPT;
2286 	  return NULL;
2287 	}
2288       /* FALLTHROUGH */
2289     case OP_DUP_ASTERISK:
2290     case OP_DUP_PLUS:
2291     case OP_DUP_QUESTION:
2292       if (syntax & RE_CONTEXT_INVALID_OPS)
2293 	{
2294 	  *err = REG_BADRPT;
2295 	  return NULL;
2296 	}
2297       else if (syntax & RE_CONTEXT_INDEP_OPS)
2298 	{
2299 	  fetch_token (token, regexp, syntax);
2300 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2301 	}
2302       /* else fall through  */
2303     case OP_CLOSE_SUBEXP:
2304       if ((token->type == OP_CLOSE_SUBEXP) &&
2305 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2306 	{
2307 	  *err = REG_ERPAREN;
2308 	  return NULL;
2309 	}
2310       /* else fall through  */
2311     case OP_CLOSE_DUP_NUM:
2312       /* We treat it as a normal character.  */
2313 
2314       /* Then we can these characters as normal characters.  */
2315       token->type = CHARACTER;
2316       /* mb_partial and word_char bits should be initialized already
2317 	 by peek_token.  */
2318       tree = create_token_tree (dfa, NULL, NULL, token);
2319       if (BE (tree == NULL, 0))
2320 	{
2321 	  *err = REG_ESPACE;
2322 	  return NULL;
2323 	}
2324       break;
2325     case ANCHOR:
2326       if ((token->opr.ctx_type
2327 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2328 	  && dfa->word_ops_used == 0)
2329 	init_word_char (dfa);
2330       if (token->opr.ctx_type == WORD_DELIM
2331 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2332 	{
2333 	  bin_tree_t *tree_first, *tree_last;
2334 	  if (token->opr.ctx_type == WORD_DELIM)
2335 	    {
2336 	      token->opr.ctx_type = WORD_FIRST;
2337 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2338 	      token->opr.ctx_type = WORD_LAST;
2339 	    }
2340 	  else
2341 	    {
2342 	      token->opr.ctx_type = INSIDE_WORD;
2343 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2344 	      token->opr.ctx_type = INSIDE_NOTWORD;
2345 	    }
2346 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2347 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2348 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2349 	    {
2350 	      *err = REG_ESPACE;
2351 	      return NULL;
2352 	    }
2353 	}
2354       else
2355 	{
2356 	  tree = create_token_tree (dfa, NULL, NULL, token);
2357 	  if (BE (tree == NULL, 0))
2358 	    {
2359 	      *err = REG_ESPACE;
2360 	      return NULL;
2361 	    }
2362 	}
2363       /* We must return here, since ANCHORs can't be followed
2364 	 by repetition operators.
2365 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2366 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2367       fetch_token (token, regexp, syntax);
2368       return tree;
2369     case OP_PERIOD:
2370       tree = create_token_tree (dfa, NULL, NULL, token);
2371       if (BE (tree == NULL, 0))
2372 	{
2373 	  *err = REG_ESPACE;
2374 	  return NULL;
2375 	}
2376       if (dfa->mb_cur_max > 1)
2377 	dfa->has_mb_node = 1;
2378       break;
2379     case OP_WORD:
2380     case OP_NOTWORD:
2381       tree = build_charclass_op (dfa, regexp->trans,
2382 				 (const unsigned char *) "alnum",
2383 				 (const unsigned char *) "_",
2384 				 token->type == OP_NOTWORD, err);
2385       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2386 	return NULL;
2387       break;
2388     case OP_SPACE:
2389     case OP_NOTSPACE:
2390       tree = build_charclass_op (dfa, regexp->trans,
2391 				 (const unsigned char *) "space",
2392 				 (const unsigned char *) "",
2393 				 token->type == OP_NOTSPACE, err);
2394       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395 	return NULL;
2396       break;
2397     case OP_ALT:
2398     case END_OF_RE:
2399       return NULL;
2400     case BACK_SLASH:
2401       *err = REG_EESCAPE;
2402       return NULL;
2403     default:
2404       /* Must not happen?  */
2405 #ifdef DEBUG
2406       assert (0);
2407 #endif
2408       return NULL;
2409     }
2410   fetch_token (token, regexp, syntax);
2411 
2412   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2413 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2414     {
2415       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2416       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2417 	return NULL;
2418       /* In BRE consecutive duplications are not allowed.  */
2419       if ((syntax & RE_CONTEXT_INVALID_DUP)
2420 	  && (token->type == OP_DUP_ASTERISK
2421 	      || token->type == OP_OPEN_DUP_NUM))
2422 	{
2423 	  *err = REG_BADRPT;
2424 	  return NULL;
2425 	}
2426     }
2427 
2428   return tree;
2429 }
2430 
2431 /* This function build the following tree, from regular expression
2432    (<reg_exp>):
2433 	 SUBEXP
2434 	    |
2435 	<reg_exp>
2436 */
2437 
2438 static bin_tree_t *
2439 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2440 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2441 {
2442   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2443   bin_tree_t *tree;
2444   size_t cur_nsub;
2445   cur_nsub = preg->re_nsub++;
2446 
2447   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2448 
2449   /* The subexpression may be a null string.  */
2450   if (token->type == OP_CLOSE_SUBEXP)
2451     tree = NULL;
2452   else
2453     {
2454       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2455       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2456 	*err = REG_EPAREN;
2457       if (BE (*err != REG_NOERROR, 0))
2458 	return NULL;
2459     }
2460 
2461   if (cur_nsub <= '9' - '1')
2462     dfa->completed_bkref_map |= 1 << cur_nsub;
2463 
2464   tree = create_tree (dfa, tree, NULL, SUBEXP);
2465   if (BE (tree == NULL, 0))
2466     {
2467       *err = REG_ESPACE;
2468       return NULL;
2469     }
2470   tree->token.opr.idx = cur_nsub;
2471   return tree;
2472 }
2473 
2474 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2475 
2476 static bin_tree_t *
2477 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2478 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2479 {
2480   bin_tree_t *tree = NULL, *old_tree = NULL;
2481   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2482   re_token_t start_token = *token;
2483 
2484   if (token->type == OP_OPEN_DUP_NUM)
2485     {
2486       end = 0;
2487       start = fetch_number (regexp, token, syntax);
2488       if (start == REG_MISSING)
2489 	{
2490 	  if (token->type == CHARACTER && token->opr.c == ',')
2491 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2492 	  else
2493 	    {
2494 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2495 	      return NULL;
2496 	    }
2497 	}
2498       if (BE (start != REG_ERROR, 1))
2499 	{
2500 	  /* We treat "{n}" as "{n,n}".  */
2501 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2502 		 : ((token->type == CHARACTER && token->opr.c == ',')
2503 		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2504 	}
2505       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2506 	{
2507 	  /* Invalid sequence.  */
2508 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2509 	    {
2510 	      if (token->type == END_OF_RE)
2511 		*err = REG_EBRACE;
2512 	      else
2513 		*err = REG_BADBR;
2514 
2515 	      return NULL;
2516 	    }
2517 
2518 	  /* If the syntax bit is set, rollback.  */
2519 	  re_string_set_index (regexp, start_idx);
2520 	  *token = start_token;
2521 	  token->type = CHARACTER;
2522 	  /* mb_partial and word_char bits should be already initialized by
2523 	     peek_token.  */
2524 	  return elem;
2525 	}
2526 
2527       if (BE ((end != REG_MISSING && start > end)
2528 	      || token->type != OP_CLOSE_DUP_NUM, 0))
2529 	{
2530 	  /* First number greater than second.  */
2531 	  *err = REG_BADBR;
2532 	  return NULL;
2533 	}
2534     }
2535   else
2536     {
2537       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2538       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2539     }
2540 
2541   fetch_token (token, regexp, syntax);
2542 
2543   if (BE (elem == NULL, 0))
2544     return NULL;
2545   if (BE (start == 0 && end == 0, 0))
2546     {
2547       postorder (elem, free_tree, NULL);
2548       return NULL;
2549     }
2550 
2551   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2552   if (BE (start > 0, 0))
2553     {
2554       tree = elem;
2555       for (i = 2; i <= start; ++i)
2556 	{
2557 	  elem = duplicate_tree (elem, dfa);
2558 	  tree = create_tree (dfa, tree, elem, CONCAT);
2559 	  if (BE (elem == NULL || tree == NULL, 0))
2560 	    goto parse_dup_op_espace;
2561 	}
2562 
2563       if (start == end)
2564 	return tree;
2565 
2566       /* Duplicate ELEM before it is marked optional.  */
2567       elem = duplicate_tree (elem, dfa);
2568       old_tree = tree;
2569     }
2570   else
2571     old_tree = NULL;
2572 
2573   if (elem->token.type == SUBEXP)
2574     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2575 
2576   tree = create_tree (dfa, elem, NULL,
2577 		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2578   if (BE (tree == NULL, 0))
2579     goto parse_dup_op_espace;
2580 
2581 /* From gnulib's "intprops.h":
2582    True if the arithmetic type T is signed.  */
2583 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2584 
2585   /* This loop is actually executed only when end != REG_MISSING,
2586      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2587      already created the start+1-th copy.  */
2588   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2589     for (i = start + 2; i <= end; ++i)
2590       {
2591 	elem = duplicate_tree (elem, dfa);
2592 	tree = create_tree (dfa, tree, elem, CONCAT);
2593 	if (BE (elem == NULL || tree == NULL, 0))
2594 	  goto parse_dup_op_espace;
2595 
2596 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2597 	if (BE (tree == NULL, 0))
2598 	  goto parse_dup_op_espace;
2599       }
2600 
2601   if (old_tree)
2602     tree = create_tree (dfa, old_tree, tree, CONCAT);
2603 
2604   return tree;
2605 
2606  parse_dup_op_espace:
2607   *err = REG_ESPACE;
2608   return NULL;
2609 }
2610 
2611 /* Size of the names for collating symbol/equivalence_class/character_class.
2612    I'm not sure, but maybe enough.  */
2613 #define BRACKET_NAME_BUF_SIZE 32
2614 
2615 #ifndef _LIBC
2616   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2617      Build the range expression which starts from START_ELEM, and ends
2618      at END_ELEM.  The result are written to MBCSET and SBCSET.
2619      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2620      mbcset->range_ends, is a pointer argument sinse we may
2621      update it.  */
2622 
2623 static reg_errcode_t
2624 internal_function
2625 # ifdef RE_ENABLE_I18N
2626 build_range_exp (const reg_syntax_t syntax,
2627                  bitset_t sbcset,
2628                  re_charset_t *mbcset,
2629                  Idx *range_alloc,
2630                  const bracket_elem_t *start_elem,
2631                  const bracket_elem_t *end_elem)
2632 # else /* not RE_ENABLE_I18N */
2633 build_range_exp (const reg_syntax_t syntax,
2634                  bitset_t sbcset,
2635                  const bracket_elem_t *start_elem,
2636                  const bracket_elem_t *end_elem)
2637 # endif /* not RE_ENABLE_I18N */
2638 {
2639   unsigned int start_ch, end_ch;
2640   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2641   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2642 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2643 	  0))
2644     return REG_ERANGE;
2645 
2646   /* We can handle no multi character collating elements without libc
2647      support.  */
2648   if (BE ((start_elem->type == COLL_SYM
2649 	   && strlen ((char *) start_elem->opr.name) > 1)
2650 	  || (end_elem->type == COLL_SYM
2651 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2652     return REG_ECOLLATE;
2653 
2654 # ifdef RE_ENABLE_I18N
2655   {
2656     wchar_t wc;
2657     wint_t start_wc;
2658     wint_t end_wc;
2659     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2660 
2661     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2662 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2663 		   : 0));
2664     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2665 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2666 		 : 0));
2667     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2668 		? __btowc (start_ch) : start_elem->opr.wch);
2669     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2670 	      ? __btowc (end_ch) : end_elem->opr.wch);
2671     if (start_wc == WEOF || end_wc == WEOF)
2672       return REG_ECOLLATE;
2673     cmp_buf[0] = start_wc;
2674     cmp_buf[4] = end_wc;
2675 
2676     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2677             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2678       return REG_ERANGE;
2679 
2680     /* Got valid collation sequence values, add them as a new entry.
2681        However, for !_LIBC we have no collation elements: if the
2682        character set is single byte, the single byte character set
2683        that we build below suffices.  parse_bracket_exp passes
2684        no MBCSET if dfa->mb_cur_max == 1.  */
2685     if (mbcset)
2686       {
2687 	/* Check the space of the arrays.  */
2688 	if (BE (*range_alloc == mbcset->nranges, 0))
2689 	  {
2690 	    /* There is not enough space, need realloc.  */
2691 	    wchar_t *new_array_start, *new_array_end;
2692 	    Idx new_nranges;
2693 
2694 	    /* +1 in case of mbcset->nranges is 0.  */
2695 	    new_nranges = 2 * mbcset->nranges + 1;
2696 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2697 	       are NULL if *range_alloc == 0.  */
2698 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2699 					  new_nranges);
2700 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2701 					new_nranges);
2702 
2703 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2704 	      return REG_ESPACE;
2705 
2706 	    mbcset->range_starts = new_array_start;
2707 	    mbcset->range_ends = new_array_end;
2708 	    *range_alloc = new_nranges;
2709 	  }
2710 
2711 	mbcset->range_starts[mbcset->nranges] = start_wc;
2712 	mbcset->range_ends[mbcset->nranges++] = end_wc;
2713       }
2714 
2715     /* Build the table for single byte characters.  */
2716     for (wc = 0; wc < SBC_MAX; ++wc)
2717       {
2718 	cmp_buf[2] = wc;
2719 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2720 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2721 	  bitset_set (sbcset, wc);
2722       }
2723   }
2724 # else /* not RE_ENABLE_I18N */
2725   {
2726     unsigned int ch;
2727     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2728 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2729 		   : 0));
2730     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2731 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2732 		 : 0));
2733     if (start_ch > end_ch)
2734       return REG_ERANGE;
2735     /* Build the table for single byte characters.  */
2736     for (ch = 0; ch < SBC_MAX; ++ch)
2737       if (start_ch <= ch  && ch <= end_ch)
2738 	bitset_set (sbcset, ch);
2739   }
2740 # endif /* not RE_ENABLE_I18N */
2741   return REG_NOERROR;
2742 }
2743 #endif /* not _LIBC */
2744 
2745 #ifndef _LIBC
2746 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2747    Build the collating element which is represented by NAME.
2748    The result are written to MBCSET and SBCSET.
2749    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2750    pointer argument since we may update it.  */
2751 
2752 static reg_errcode_t
2753 internal_function
2754 build_collating_symbol (bitset_t sbcset,
2755 # ifdef RE_ENABLE_I18N
2756 			re_charset_t *mbcset _UNUSED_PARAMETER_,
2757 			Idx *coll_sym_alloc _UNUSED_PARAMETER_,
2758 # endif
2759 			const unsigned char *name)
2760 {
2761   size_t name_len = strlen ((const char *) name);
2762   if (BE (name_len != 1, 0))
2763     return REG_ECOLLATE;
2764   else
2765     {
2766       bitset_set (sbcset, name[0]);
2767       return REG_NOERROR;
2768     }
2769 }
2770 #endif /* not _LIBC */
2771 
2772 /* This function parse bracket expression like "[abc]", "[a-c]",
2773    "[[.a-a.]]" etc.  */
2774 
2775 static bin_tree_t *
2776 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2777 		   reg_syntax_t syntax, reg_errcode_t *err)
2778 {
2779 #ifdef _LIBC
2780   const unsigned char *collseqmb;
2781   const char *collseqwc;
2782   uint32_t nrules;
2783   int32_t table_size;
2784   const int32_t *symb_table;
2785   const unsigned char *extra;
2786 
2787   /* Local function for parse_bracket_exp used in _LIBC environement.
2788      Seek the collating symbol entry correspondings to NAME.
2789      Return the index of the symbol in the SYMB_TABLE.  */
2790 
2791   auto inline int32_t
2792   __attribute ((always_inline))
2793   seek_collating_symbol_entry (name, name_len)
2794 	 const unsigned char *name;
2795 	 size_t name_len;
2796     {
2797       int32_t hash = elem_hash ((const char *) name, name_len);
2798       int32_t elem = hash % table_size;
2799       if (symb_table[2 * elem] != 0)
2800 	{
2801 	  int32_t second = hash % (table_size - 2) + 1;
2802 
2803 	  do
2804 	    {
2805 	      /* First compare the hashing value.  */
2806 	      if (symb_table[2 * elem] == hash
2807 		  /* Compare the length of the name.  */
2808 		  && name_len == extra[symb_table[2 * elem + 1]]
2809 		  /* Compare the name.  */
2810 		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2811 			     name_len) == 0)
2812 		{
2813 		  /* Yep, this is the entry.  */
2814 		  break;
2815 		}
2816 
2817 	      /* Next entry.  */
2818 	      elem += second;
2819 	    }
2820 	  while (symb_table[2 * elem] != 0);
2821 	}
2822       return elem;
2823     }
2824 
2825   /* Local function for parse_bracket_exp used in _LIBC environment.
2826      Look up the collation sequence value of BR_ELEM.
2827      Return the value if succeeded, UINT_MAX otherwise.  */
2828 
2829   auto inline unsigned int
2830   __attribute ((always_inline))
2831   lookup_collation_sequence_value (br_elem)
2832 	 bracket_elem_t *br_elem;
2833     {
2834       if (br_elem->type == SB_CHAR)
2835 	{
2836 	  /*
2837 	  if (MB_CUR_MAX == 1)
2838 	  */
2839 	  if (nrules == 0)
2840 	    return collseqmb[br_elem->opr.ch];
2841 	  else
2842 	    {
2843 	      wint_t wc = __btowc (br_elem->opr.ch);
2844 	      return __collseq_table_lookup (collseqwc, wc);
2845 	    }
2846 	}
2847       else if (br_elem->type == MB_CHAR)
2848 	{
2849 	  if (nrules != 0)
2850 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2851 	}
2852       else if (br_elem->type == COLL_SYM)
2853 	{
2854 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2855 	  if (nrules != 0)
2856 	    {
2857 	      int32_t elem, idx;
2858 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2859 						  sym_name_len);
2860 	      if (symb_table[2 * elem] != 0)
2861 		{
2862 		  /* We found the entry.  */
2863 		  idx = symb_table[2 * elem + 1];
2864 		  /* Skip the name of collating element name.  */
2865 		  idx += 1 + extra[idx];
2866 		  /* Skip the byte sequence of the collating element.  */
2867 		  idx += 1 + extra[idx];
2868 		  /* Adjust for the alignment.  */
2869 		  idx = (idx + 3) & ~3;
2870 		  /* Skip the multibyte collation sequence value.  */
2871 		  idx += sizeof (unsigned int);
2872 		  /* Skip the wide char sequence of the collating element.  */
2873 		  idx += sizeof (unsigned int) *
2874 		    (1 + *(unsigned int *) (extra + idx));
2875 		  /* Return the collation sequence value.  */
2876 		  return *(unsigned int *) (extra + idx);
2877 		}
2878 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2879 		{
2880 		  /* No valid character.  Match it as a single byte
2881 		     character.  */
2882 		  return collseqmb[br_elem->opr.name[0]];
2883 		}
2884 	    }
2885 	  else if (sym_name_len == 1)
2886 	    return collseqmb[br_elem->opr.name[0]];
2887 	}
2888       return UINT_MAX;
2889     }
2890 
2891   /* Local function for parse_bracket_exp used in _LIBC environement.
2892      Build the range expression which starts from START_ELEM, and ends
2893      at END_ELEM.  The result are written to MBCSET and SBCSET.
2894      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2895      mbcset->range_ends, is a pointer argument sinse we may
2896      update it.  */
2897 
2898   auto inline reg_errcode_t
2899   __attribute ((always_inline))
2900   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2901 	 re_charset_t *mbcset;
2902 	 Idx *range_alloc;
2903 	 bitset_t sbcset;
2904 	 bracket_elem_t *start_elem, *end_elem;
2905     {
2906       unsigned int ch;
2907       uint32_t start_collseq;
2908       uint32_t end_collseq;
2909 
2910       /* Equivalence Classes and Character Classes can't be a range
2911 	 start/end.  */
2912       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2913 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2914 	      0))
2915 	return REG_ERANGE;
2916 
2917       start_collseq = lookup_collation_sequence_value (start_elem);
2918       end_collseq = lookup_collation_sequence_value (end_elem);
2919       /* Check start/end collation sequence values.  */
2920       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2921 	return REG_ECOLLATE;
2922       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2923 	return REG_ERANGE;
2924 
2925       /* Got valid collation sequence values, add them as a new entry.
2926 	 However, if we have no collation elements, and the character set
2927 	 is single byte, the single byte character set that we
2928 	 build below suffices. */
2929       if (nrules > 0 || dfa->mb_cur_max > 1)
2930 	{
2931 	  /* Check the space of the arrays.  */
2932 	  if (BE (*range_alloc == mbcset->nranges, 0))
2933 	    {
2934 	      /* There is not enough space, need realloc.  */
2935 	      uint32_t *new_array_start;
2936 	      uint32_t *new_array_end;
2937 	      Idx new_nranges;
2938 
2939 	      /* +1 in case of mbcset->nranges is 0.  */
2940 	      new_nranges = 2 * mbcset->nranges + 1;
2941 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2942 					    new_nranges);
2943 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2944 					  new_nranges);
2945 
2946 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2947 		return REG_ESPACE;
2948 
2949 	      mbcset->range_starts = new_array_start;
2950 	      mbcset->range_ends = new_array_end;
2951 	      *range_alloc = new_nranges;
2952 	    }
2953 
2954 	  mbcset->range_starts[mbcset->nranges] = start_collseq;
2955 	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
2956 	}
2957 
2958       /* Build the table for single byte characters.  */
2959       for (ch = 0; ch < SBC_MAX; ch++)
2960 	{
2961 	  uint32_t ch_collseq;
2962 	  /*
2963 	  if (MB_CUR_MAX == 1)
2964 	  */
2965 	  if (nrules == 0)
2966 	    ch_collseq = collseqmb[ch];
2967 	  else
2968 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2969 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2970 	    bitset_set (sbcset, ch);
2971 	}
2972       return REG_NOERROR;
2973     }
2974 
2975   /* Local function for parse_bracket_exp used in _LIBC environement.
2976      Build the collating element which is represented by NAME.
2977      The result are written to MBCSET and SBCSET.
2978      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2979      pointer argument sinse we may update it.  */
2980 
2981   auto inline reg_errcode_t
2982   __attribute ((always_inline))
2983   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2984 	 re_charset_t *mbcset;
2985 	 Idx *coll_sym_alloc;
2986 	 bitset_t sbcset;
2987 	 const unsigned char *name;
2988     {
2989       int32_t elem, idx;
2990       size_t name_len = strlen ((const char *) name);
2991       if (nrules != 0)
2992 	{
2993 	  elem = seek_collating_symbol_entry (name, name_len);
2994 	  if (symb_table[2 * elem] != 0)
2995 	    {
2996 	      /* We found the entry.  */
2997 	      idx = symb_table[2 * elem + 1];
2998 	      /* Skip the name of collating element name.  */
2999 	      idx += 1 + extra[idx];
3000 	    }
3001 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
3002 	    {
3003 	      /* No valid character, treat it as a normal
3004 		 character.  */
3005 	      bitset_set (sbcset, name[0]);
3006 	      return REG_NOERROR;
3007 	    }
3008 	  else
3009 	    return REG_ECOLLATE;
3010 
3011 	  /* Got valid collation sequence, add it as a new entry.  */
3012 	  /* Check the space of the arrays.  */
3013 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3014 	    {
3015 	      /* Not enough, realloc it.  */
3016 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3017 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3018 	      /* Use realloc since mbcset->coll_syms is NULL
3019 		 if *alloc == 0.  */
3020 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3021 						   new_coll_sym_alloc);
3022 	      if (BE (new_coll_syms == NULL, 0))
3023 		return REG_ESPACE;
3024 	      mbcset->coll_syms = new_coll_syms;
3025 	      *coll_sym_alloc = new_coll_sym_alloc;
3026 	    }
3027 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3028 	  return REG_NOERROR;
3029 	}
3030       else
3031 	{
3032 	  if (BE (name_len != 1, 0))
3033 	    return REG_ECOLLATE;
3034 	  else
3035 	    {
3036 	      bitset_set (sbcset, name[0]);
3037 	      return REG_NOERROR;
3038 	    }
3039 	}
3040     }
3041 #endif
3042 
3043   re_token_t br_token;
3044   re_bitset_ptr_t sbcset;
3045 #ifdef RE_ENABLE_I18N
3046   re_charset_t *mbcset;
3047   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3048   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3049 #endif /* not RE_ENABLE_I18N */
3050   bool non_match = false;
3051   bin_tree_t *work_tree;
3052   int token_len;
3053   bool first_round = true;
3054 #ifdef _LIBC
3055   collseqmb = (const unsigned char *)
3056     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3057   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3058   if (nrules)
3059     {
3060       /*
3061       if (MB_CUR_MAX > 1)
3062       */
3063       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3064       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3065       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3066 						  _NL_COLLATE_SYMB_TABLEMB);
3067       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3068 						   _NL_COLLATE_SYMB_EXTRAMB);
3069     }
3070 #endif
3071   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3072 #ifdef RE_ENABLE_I18N
3073   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3074 #endif /* RE_ENABLE_I18N */
3075 #ifdef RE_ENABLE_I18N
3076   if (BE (sbcset == NULL || mbcset == NULL, 0))
3077 #else
3078   if (BE (sbcset == NULL, 0))
3079 #endif /* RE_ENABLE_I18N */
3080     {
3081       *err = REG_ESPACE;
3082       return NULL;
3083     }
3084 
3085   token_len = peek_token_bracket (token, regexp, syntax);
3086   if (BE (token->type == END_OF_RE, 0))
3087     {
3088       *err = REG_BADPAT;
3089       goto parse_bracket_exp_free_return;
3090     }
3091   if (token->type == OP_NON_MATCH_LIST)
3092     {
3093 #ifdef RE_ENABLE_I18N
3094       mbcset->non_match = 1;
3095 #endif /* not RE_ENABLE_I18N */
3096       non_match = true;
3097       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3098 	bitset_set (sbcset, '\n');
3099       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3100       token_len = peek_token_bracket (token, regexp, syntax);
3101       if (BE (token->type == END_OF_RE, 0))
3102 	{
3103 	  *err = REG_BADPAT;
3104 	  goto parse_bracket_exp_free_return;
3105 	}
3106     }
3107 
3108   /* We treat the first ']' as a normal character.  */
3109   if (token->type == OP_CLOSE_BRACKET)
3110     token->type = CHARACTER;
3111 
3112   while (1)
3113     {
3114       bracket_elem_t start_elem, end_elem;
3115       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3116       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3117       reg_errcode_t ret;
3118       int token_len2 = 0;
3119       bool is_range_exp = false;
3120       re_token_t token2;
3121 
3122       start_elem.opr.name = start_name_buf;
3123       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3124 				   syntax, first_round);
3125       if (BE (ret != REG_NOERROR, 0))
3126 	{
3127 	  *err = ret;
3128 	  goto parse_bracket_exp_free_return;
3129 	}
3130       first_round = false;
3131 
3132       /* Get information about the next token.  We need it in any case.  */
3133       token_len = peek_token_bracket (token, regexp, syntax);
3134 
3135       /* Do not check for ranges if we know they are not allowed.  */
3136       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3137 	{
3138 	  if (BE (token->type == END_OF_RE, 0))
3139 	    {
3140 	      *err = REG_EBRACK;
3141 	      goto parse_bracket_exp_free_return;
3142 	    }
3143 	  if (token->type == OP_CHARSET_RANGE)
3144 	    {
3145 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3146 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3147 	      if (BE (token2.type == END_OF_RE, 0))
3148 		{
3149 		  *err = REG_EBRACK;
3150 		  goto parse_bracket_exp_free_return;
3151 		}
3152 	      if (token2.type == OP_CLOSE_BRACKET)
3153 		{
3154 		  /* We treat the last '-' as a normal character.  */
3155 		  re_string_skip_bytes (regexp, -token_len);
3156 		  token->type = CHARACTER;
3157 		}
3158 	      else
3159 		is_range_exp = true;
3160 	    }
3161 	}
3162 
3163       if (is_range_exp == true)
3164 	{
3165 	  end_elem.opr.name = end_name_buf;
3166 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3167 				       dfa, syntax, true);
3168 	  if (BE (ret != REG_NOERROR, 0))
3169 	    {
3170 	      *err = ret;
3171 	      goto parse_bracket_exp_free_return;
3172 	    }
3173 
3174 	  token_len = peek_token_bracket (token, regexp, syntax);
3175 
3176 #ifdef _LIBC
3177 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3178 				  &start_elem, &end_elem);
3179 #else
3180 # ifdef RE_ENABLE_I18N
3181 	  *err = build_range_exp (syntax, sbcset,
3182 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3183 				  &range_alloc, &start_elem, &end_elem);
3184 # else
3185 	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3186 # endif
3187 #endif /* RE_ENABLE_I18N */
3188 	  if (BE (*err != REG_NOERROR, 0))
3189 	    goto parse_bracket_exp_free_return;
3190 	}
3191       else
3192 	{
3193 	  switch (start_elem.type)
3194 	    {
3195 	    case SB_CHAR:
3196 	      bitset_set (sbcset, start_elem.opr.ch);
3197 	      break;
3198 #ifdef RE_ENABLE_I18N
3199 	    case MB_CHAR:
3200 	      /* Check whether the array has enough space.  */
3201 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3202 		{
3203 		  wchar_t *new_mbchars;
3204 		  /* Not enough, realloc it.  */
3205 		  /* +1 in case of mbcset->nmbchars is 0.  */
3206 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3207 		  /* Use realloc since array is NULL if *alloc == 0.  */
3208 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3209 					    mbchar_alloc);
3210 		  if (BE (new_mbchars == NULL, 0))
3211 		    goto parse_bracket_exp_espace;
3212 		  mbcset->mbchars = new_mbchars;
3213 		}
3214 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3215 	      break;
3216 #endif /* RE_ENABLE_I18N */
3217 	    case EQUIV_CLASS:
3218 	      *err = build_equiv_class (sbcset,
3219 #ifdef RE_ENABLE_I18N
3220 					mbcset, &equiv_class_alloc,
3221 #endif /* RE_ENABLE_I18N */
3222 					start_elem.opr.name);
3223 	      if (BE (*err != REG_NOERROR, 0))
3224 		goto parse_bracket_exp_free_return;
3225 	      break;
3226 	    case COLL_SYM:
3227 	      *err = build_collating_symbol (sbcset,
3228 #ifdef RE_ENABLE_I18N
3229 					     mbcset, &coll_sym_alloc,
3230 #endif /* RE_ENABLE_I18N */
3231 					     start_elem.opr.name);
3232 	      if (BE (*err != REG_NOERROR, 0))
3233 		goto parse_bracket_exp_free_return;
3234 	      break;
3235 	    case CHAR_CLASS:
3236 	      *err = build_charclass (regexp->trans, sbcset,
3237 #ifdef RE_ENABLE_I18N
3238 				      mbcset, &char_class_alloc,
3239 #endif /* RE_ENABLE_I18N */
3240 				      start_elem.opr.name, syntax);
3241 	      if (BE (*err != REG_NOERROR, 0))
3242 	       goto parse_bracket_exp_free_return;
3243 	      break;
3244 	    default:
3245 	      assert (0);
3246 	      break;
3247 	    }
3248 	}
3249       if (BE (token->type == END_OF_RE, 0))
3250 	{
3251 	  *err = REG_EBRACK;
3252 	  goto parse_bracket_exp_free_return;
3253 	}
3254       if (token->type == OP_CLOSE_BRACKET)
3255 	break;
3256     }
3257 
3258   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3259 
3260   /* If it is non-matching list.  */
3261   if (non_match)
3262     bitset_not (sbcset);
3263 
3264 #ifdef RE_ENABLE_I18N
3265   /* Ensure only single byte characters are set.  */
3266   if (dfa->mb_cur_max > 1)
3267     bitset_mask (sbcset, dfa->sb_char);
3268 
3269   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3270       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3271 						     || mbcset->non_match)))
3272     {
3273       bin_tree_t *mbc_tree;
3274       int sbc_idx;
3275       /* Build a tree for complex bracket.  */
3276       dfa->has_mb_node = 1;
3277       br_token.type = COMPLEX_BRACKET;
3278       br_token.opr.mbcset = mbcset;
3279       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3280       if (BE (mbc_tree == NULL, 0))
3281 	goto parse_bracket_exp_espace;
3282       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3283 	if (sbcset[sbc_idx])
3284 	  break;
3285       /* If there are no bits set in sbcset, there is no point
3286 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3287       if (sbc_idx < BITSET_WORDS)
3288 	{
3289 	  /* Build a tree for simple bracket.  */
3290 	  br_token.type = SIMPLE_BRACKET;
3291 	  br_token.opr.sbcset = sbcset;
3292 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3293 	  if (BE (work_tree == NULL, 0))
3294 	    goto parse_bracket_exp_espace;
3295 
3296 	  /* Then join them by ALT node.  */
3297 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3298 	  if (BE (work_tree == NULL, 0))
3299 	    goto parse_bracket_exp_espace;
3300 	}
3301       else
3302 	{
3303 	  re_free (sbcset);
3304 	  work_tree = mbc_tree;
3305 	}
3306     }
3307   else
3308 #endif /* not RE_ENABLE_I18N */
3309     {
3310 #ifdef RE_ENABLE_I18N
3311       free_charset (mbcset);
3312 #endif
3313       /* Build a tree for simple bracket.  */
3314       br_token.type = SIMPLE_BRACKET;
3315       br_token.opr.sbcset = sbcset;
3316       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3317       if (BE (work_tree == NULL, 0))
3318 	goto parse_bracket_exp_espace;
3319     }
3320   return work_tree;
3321 
3322  parse_bracket_exp_espace:
3323   *err = REG_ESPACE;
3324  parse_bracket_exp_free_return:
3325   re_free (sbcset);
3326 #ifdef RE_ENABLE_I18N
3327   free_charset (mbcset);
3328 #endif /* RE_ENABLE_I18N */
3329   return NULL;
3330 }
3331 
3332 /* Parse an element in the bracket expression.  */
3333 
3334 static reg_errcode_t
3335 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3336 		       re_token_t *token, int token_len,
3337 		       re_dfa_t *dfa _UNUSED_PARAMETER_,
3338 		       reg_syntax_t syntax, bool accept_hyphen)
3339 {
3340 #ifdef RE_ENABLE_I18N
3341   int cur_char_size;
3342   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3343   if (cur_char_size > 1)
3344     {
3345       elem->type = MB_CHAR;
3346       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3347       re_string_skip_bytes (regexp, cur_char_size);
3348       return REG_NOERROR;
3349     }
3350 #endif /* RE_ENABLE_I18N */
3351   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3352   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3353       || token->type == OP_OPEN_EQUIV_CLASS)
3354     return parse_bracket_symbol (elem, regexp, token);
3355   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3356     {
3357       /* A '-' must only appear as anything but a range indicator before
3358 	 the closing bracket.  Everything else is an error.  */
3359       re_token_t token2;
3360       (void) peek_token_bracket (&token2, regexp, syntax);
3361       if (token2.type != OP_CLOSE_BRACKET)
3362 	/* The actual error value is not standardized since this whole
3363 	   case is undefined.  But ERANGE makes good sense.  */
3364 	return REG_ERANGE;
3365     }
3366   elem->type = SB_CHAR;
3367   elem->opr.ch = token->opr.c;
3368   return REG_NOERROR;
3369 }
3370 
3371 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3372    such as [:<character_class>:], [.<collating_element>.], and
3373    [=<equivalent_class>=].  */
3374 
3375 static reg_errcode_t
3376 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3377 		      re_token_t *token)
3378 {
3379   unsigned char ch, delim = token->opr.c;
3380   int i = 0;
3381   if (re_string_eoi(regexp))
3382     return REG_EBRACK;
3383   for (;; ++i)
3384     {
3385       if (i >= BRACKET_NAME_BUF_SIZE)
3386 	return REG_EBRACK;
3387       if (token->type == OP_OPEN_CHAR_CLASS)
3388 	ch = re_string_fetch_byte_case (regexp);
3389       else
3390 	ch = re_string_fetch_byte (regexp);
3391       if (re_string_eoi(regexp))
3392 	return REG_EBRACK;
3393       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3394 	break;
3395       elem->opr.name[i] = ch;
3396     }
3397   re_string_skip_bytes (regexp, 1);
3398   elem->opr.name[i] = '\0';
3399   switch (token->type)
3400     {
3401     case OP_OPEN_COLL_ELEM:
3402       elem->type = COLL_SYM;
3403       break;
3404     case OP_OPEN_EQUIV_CLASS:
3405       elem->type = EQUIV_CLASS;
3406       break;
3407     case OP_OPEN_CHAR_CLASS:
3408       elem->type = CHAR_CLASS;
3409       break;
3410     default:
3411       break;
3412     }
3413   return REG_NOERROR;
3414 }
3415 
3416   /* Helper function for parse_bracket_exp.
3417      Build the equivalence class which is represented by NAME.
3418      The result are written to MBCSET and SBCSET.
3419      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3420      is a pointer argument sinse we may update it.  */
3421 
3422 static reg_errcode_t
3423 #ifdef RE_ENABLE_I18N
3424 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset _UNUSED_PARAMETER_,
3425 		   Idx *equiv_class_alloc _UNUSED_PARAMETER_,
3426 		   const unsigned char *name)
3427 #else /* not RE_ENABLE_I18N */
3428 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3429 #endif /* not RE_ENABLE_I18N */
3430 {
3431 #ifdef _LIBC
3432   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3433   if (nrules != 0)
3434     {
3435       const int32_t *table, *indirect;
3436       const unsigned char *weights, *extra, *cp;
3437       unsigned char char_buf[2];
3438       int32_t idx1, idx2;
3439       unsigned int ch;
3440       size_t len;
3441       /* This #include defines a local function!  */
3442 # include <locale/weight.h>
3443       /* Calculate the index for equivalence class.  */
3444       cp = name;
3445       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3446       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3447 					       _NL_COLLATE_WEIGHTMB);
3448       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3449 						   _NL_COLLATE_EXTRAMB);
3450       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3451 						_NL_COLLATE_INDIRECTMB);
3452       idx1 = findidx (&cp);
3453       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3454 	/* This isn't a valid character.  */
3455 	return REG_ECOLLATE;
3456 
3457       /* Build single byte matcing table for this equivalence class.  */
3458       char_buf[1] = (unsigned char) '\0';
3459       len = weights[idx1 & 0xffffff];
3460       for (ch = 0; ch < SBC_MAX; ++ch)
3461 	{
3462 	  char_buf[0] = ch;
3463 	  cp = char_buf;
3464 	  idx2 = findidx (&cp);
3465 /*
3466 	  idx2 = table[ch];
3467 */
3468 	  if (idx2 == 0)
3469 	    /* This isn't a valid character.  */
3470 	    continue;
3471 	  /* Compare only if the length matches and the collation rule
3472 	     index is the same.  */
3473 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3474 	    {
3475 	      int cnt = 0;
3476 
3477 	      while (cnt <= len &&
3478 		     weights[(idx1 & 0xffffff) + 1 + cnt]
3479 		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3480 		++cnt;
3481 
3482 	      if (cnt > len)
3483 		bitset_set (sbcset, ch);
3484 	    }
3485 	}
3486       /* Check whether the array has enough space.  */
3487       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3488 	{
3489 	  /* Not enough, realloc it.  */
3490 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3491 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3492 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3493 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3494 						   int32_t,
3495 						   new_equiv_class_alloc);
3496 	  if (BE (new_equiv_classes == NULL, 0))
3497 	    return REG_ESPACE;
3498 	  mbcset->equiv_classes = new_equiv_classes;
3499 	  *equiv_class_alloc = new_equiv_class_alloc;
3500 	}
3501       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3502     }
3503   else
3504 #endif /* _LIBC */
3505     {
3506       if (BE (strlen ((const char *) name) != 1, 0))
3507 	return REG_ECOLLATE;
3508       bitset_set (sbcset, *name);
3509     }
3510   return REG_NOERROR;
3511 }
3512 
3513   /* Helper function for parse_bracket_exp.
3514      Build the character class which is represented by NAME.
3515      The result are written to MBCSET and SBCSET.
3516      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3517      is a pointer argument sinse we may update it.  */
3518 
3519 static reg_errcode_t
3520 #ifdef RE_ENABLE_I18N
3521 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3522 		 re_charset_t *mbcset, Idx *char_class_alloc,
3523 		 const unsigned char *class_name, reg_syntax_t syntax)
3524 #else /* not RE_ENABLE_I18N */
3525 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3526 		 const unsigned char *class_name, reg_syntax_t syntax)
3527 #endif /* not RE_ENABLE_I18N */
3528 {
3529   int i;
3530   const char *name = (const char *) class_name;
3531 
3532   /* In case of REG_ICASE "upper" and "lower" match the both of
3533      upper and lower cases.  */
3534   if ((syntax & RE_ICASE)
3535       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3536     name = "alpha";
3537 
3538 #ifdef RE_ENABLE_I18N
3539   /* Check the space of the arrays.  */
3540   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3541     {
3542       /* Not enough, realloc it.  */
3543       /* +1 in case of mbcset->nchar_classes is 0.  */
3544       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3545       /* Use realloc since array is NULL if *alloc == 0.  */
3546       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3547 					       new_char_class_alloc);
3548       if (BE (new_char_classes == NULL, 0))
3549 	return REG_ESPACE;
3550       mbcset->char_classes = new_char_classes;
3551       *char_class_alloc = new_char_class_alloc;
3552     }
3553   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3554 #endif /* RE_ENABLE_I18N */
3555 
3556 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3557   do {						\
3558     if (BE (trans != NULL, 0))			\
3559       {						\
3560 	for (i = 0; i < SBC_MAX; ++i)		\
3561 	  if (ctype_func (i))			\
3562 	    bitset_set (sbcset, trans[i]);	\
3563       }						\
3564     else					\
3565       {						\
3566 	for (i = 0; i < SBC_MAX; ++i)		\
3567 	  if (ctype_func (i))			\
3568 	    bitset_set (sbcset, i);		\
3569       }						\
3570   } while (0)
3571 
3572   if (strcmp (name, "alnum") == 0)
3573     BUILD_CHARCLASS_LOOP (isalnum);
3574   else if (strcmp (name, "cntrl") == 0)
3575     BUILD_CHARCLASS_LOOP (iscntrl);
3576   else if (strcmp (name, "lower") == 0)
3577     BUILD_CHARCLASS_LOOP (islower);
3578   else if (strcmp (name, "space") == 0)
3579     BUILD_CHARCLASS_LOOP (isspace);
3580   else if (strcmp (name, "alpha") == 0)
3581     BUILD_CHARCLASS_LOOP (isalpha);
3582   else if (strcmp (name, "digit") == 0)
3583     BUILD_CHARCLASS_LOOP (isdigit);
3584   else if (strcmp (name, "print") == 0)
3585     BUILD_CHARCLASS_LOOP (isprint);
3586   else if (strcmp (name, "upper") == 0)
3587     BUILD_CHARCLASS_LOOP (isupper);
3588   else if (strcmp (name, "blank") == 0)
3589     BUILD_CHARCLASS_LOOP (isblank);
3590   else if (strcmp (name, "graph") == 0)
3591     BUILD_CHARCLASS_LOOP (isgraph);
3592   else if (strcmp (name, "punct") == 0)
3593     BUILD_CHARCLASS_LOOP (ispunct);
3594   else if (strcmp (name, "xdigit") == 0)
3595     BUILD_CHARCLASS_LOOP (isxdigit);
3596   else
3597     return REG_ECTYPE;
3598 
3599   return REG_NOERROR;
3600 }
3601 
3602 static bin_tree_t *
3603 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3604 		    const unsigned char *class_name,
3605 		    const unsigned char *extra, bool non_match,
3606 		    reg_errcode_t *err)
3607 {
3608   re_bitset_ptr_t sbcset;
3609 #ifdef RE_ENABLE_I18N
3610   re_charset_t *mbcset;
3611   Idx alloc = 0;
3612 #endif /* not RE_ENABLE_I18N */
3613   reg_errcode_t ret;
3614   re_token_t br_token;
3615   bin_tree_t *tree;
3616 
3617   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3618 #ifdef RE_ENABLE_I18N
3619   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3620 #endif /* RE_ENABLE_I18N */
3621 
3622 #ifdef RE_ENABLE_I18N
3623   if (BE (sbcset == NULL || mbcset == NULL, 0))
3624 #else /* not RE_ENABLE_I18N */
3625   if (BE (sbcset == NULL, 0))
3626 #endif /* not RE_ENABLE_I18N */
3627     {
3628       *err = REG_ESPACE;
3629       return NULL;
3630     }
3631 
3632   if (non_match)
3633     {
3634 #ifdef RE_ENABLE_I18N
3635       mbcset->non_match = 1;
3636 #endif /* not RE_ENABLE_I18N */
3637     }
3638 
3639   /* We don't care the syntax in this case.  */
3640   ret = build_charclass (trans, sbcset,
3641 #ifdef RE_ENABLE_I18N
3642 			 mbcset, &alloc,
3643 #endif /* RE_ENABLE_I18N */
3644 			 class_name, 0);
3645 
3646   if (BE (ret != REG_NOERROR, 0))
3647     {
3648       re_free (sbcset);
3649 #ifdef RE_ENABLE_I18N
3650       free_charset (mbcset);
3651 #endif /* RE_ENABLE_I18N */
3652       *err = ret;
3653       return NULL;
3654     }
3655   /* \w match '_' also.  */
3656   for (; *extra; extra++)
3657     bitset_set (sbcset, *extra);
3658 
3659   /* If it is non-matching list.  */
3660   if (non_match)
3661     bitset_not (sbcset);
3662 
3663 #ifdef RE_ENABLE_I18N
3664   /* Ensure only single byte characters are set.  */
3665   if (dfa->mb_cur_max > 1)
3666     bitset_mask (sbcset, dfa->sb_char);
3667 #endif
3668 
3669   /* Build a tree for simple bracket.  */
3670   br_token.type = SIMPLE_BRACKET;
3671   br_token.opr.sbcset = sbcset;
3672   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3673   if (BE (tree == NULL, 0))
3674     goto build_word_op_espace;
3675 
3676 #ifdef RE_ENABLE_I18N
3677   if (dfa->mb_cur_max > 1)
3678     {
3679       bin_tree_t *mbc_tree;
3680       /* Build a tree for complex bracket.  */
3681       br_token.type = COMPLEX_BRACKET;
3682       br_token.opr.mbcset = mbcset;
3683       dfa->has_mb_node = 1;
3684       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3685       if (BE (mbc_tree == NULL, 0))
3686 	goto build_word_op_espace;
3687       /* Then join them by ALT node.  */
3688       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3689       if (BE (mbc_tree != NULL, 1))
3690 	return tree;
3691     }
3692   else
3693     {
3694       free_charset (mbcset);
3695       return tree;
3696     }
3697 #else /* not RE_ENABLE_I18N */
3698   return tree;
3699 #endif /* not RE_ENABLE_I18N */
3700 
3701  build_word_op_espace:
3702   re_free (sbcset);
3703 #ifdef RE_ENABLE_I18N
3704   free_charset (mbcset);
3705 #endif /* RE_ENABLE_I18N */
3706   *err = REG_ESPACE;
3707   return NULL;
3708 }
3709 
3710 /* This is intended for the expressions like "a{1,3}".
3711    Fetch a number from `input', and return the number.
3712    Return REG_MISSING if the number field is empty like "{,1}".
3713    Return REG_ERROR if an error occurred.  */
3714 
3715 static Idx
3716 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3717 {
3718   Idx num = REG_MISSING;
3719   unsigned char c;
3720   while (1)
3721     {
3722       fetch_token (token, input, syntax);
3723       c = token->opr.c;
3724       if (BE (token->type == END_OF_RE, 0))
3725 	return REG_ERROR;
3726       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3727 	break;
3728       num = ((token->type != CHARACTER || c < '0' || '9' < c
3729 	      || num == REG_ERROR)
3730 	     ? REG_ERROR
3731 	     : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3732       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3733     }
3734   return num;
3735 }
3736 
3737 #ifdef RE_ENABLE_I18N
3738 static void
3739 free_charset (re_charset_t *cset)
3740 {
3741   re_free (cset->mbchars);
3742 # ifdef _LIBC
3743   re_free (cset->coll_syms);
3744   re_free (cset->equiv_classes);
3745   re_free (cset->range_starts);
3746   re_free (cset->range_ends);
3747 # endif
3748   re_free (cset->char_classes);
3749   re_free (cset);
3750 }
3751 #endif /* RE_ENABLE_I18N */
3752 
3753 /* Functions for binary tree operation.  */
3754 
3755 /* Create a tree node.  */
3756 
3757 static bin_tree_t *
3758 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3759 	     re_token_type_t type)
3760 {
3761   re_token_t t;
3762   t.type = type;
3763   return create_token_tree (dfa, left, right, &t);
3764 }
3765 
3766 static bin_tree_t *
3767 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3768 		   const re_token_t *token)
3769 {
3770   bin_tree_t *tree;
3771   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3772     {
3773       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3774 
3775       if (storage == NULL)
3776 	return NULL;
3777       storage->next = dfa->str_tree_storage;
3778       dfa->str_tree_storage = storage;
3779       dfa->str_tree_storage_idx = 0;
3780     }
3781   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3782 
3783   tree->parent = NULL;
3784   tree->left = left;
3785   tree->right = right;
3786   tree->token = *token;
3787   tree->token.duplicated = 0;
3788   tree->token.opt_subexp = 0;
3789   tree->first = NULL;
3790   tree->next = NULL;
3791   tree->node_idx = REG_MISSING;
3792 
3793   if (left != NULL)
3794     left->parent = tree;
3795   if (right != NULL)
3796     right->parent = tree;
3797   return tree;
3798 }
3799 
3800 /* Mark the tree SRC as an optional subexpression.
3801    To be called from preorder or postorder.  */
3802 
3803 static reg_errcode_t
3804 mark_opt_subexp (void *extra, bin_tree_t *node)
3805 {
3806   Idx idx = (Idx) (long) extra;
3807   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3808     node->token.opt_subexp = 1;
3809 
3810   return REG_NOERROR;
3811 }
3812 
3813 /* Free the allocated memory inside NODE. */
3814 
3815 static void
3816 free_token (re_token_t *node)
3817 {
3818 #ifdef RE_ENABLE_I18N
3819   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3820     free_charset (node->opr.mbcset);
3821   else
3822 #endif /* RE_ENABLE_I18N */
3823     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3824       re_free (node->opr.sbcset);
3825 }
3826 
3827 /* Worker function for tree walking.  Free the allocated memory inside NODE
3828    and its children. */
3829 
3830 static reg_errcode_t
3831 free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
3832 {
3833   free_token (&node->token);
3834   return REG_NOERROR;
3835 }
3836 
3837 
3838 /* Duplicate the node SRC, and return new node.  This is a preorder
3839    visit similar to the one implemented by the generic visitor, but
3840    we need more infrastructure to maintain two parallel trees --- so,
3841    it's easier to duplicate.  */
3842 
3843 static bin_tree_t *
3844 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3845 {
3846   const bin_tree_t *node;
3847   bin_tree_t *dup_root;
3848   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3849 
3850   for (node = root; ; )
3851     {
3852       /* Create a new tree and link it back to the current parent.  */
3853       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3854       if (*p_new == NULL)
3855 	return NULL;
3856       (*p_new)->parent = dup_node;
3857       (*p_new)->token.duplicated = 1;
3858       dup_node = *p_new;
3859 
3860       /* Go to the left node, or up and to the right.  */
3861       if (node->left)
3862 	{
3863 	  node = node->left;
3864 	  p_new = &dup_node->left;
3865 	}
3866       else
3867 	{
3868 	  const bin_tree_t *prev = NULL;
3869 	  while (node->right == prev || node->right == NULL)
3870 	    {
3871 	      prev = node;
3872 	      node = node->parent;
3873 	      dup_node = dup_node->parent;
3874 	      if (!node)
3875 		return dup_root;
3876 	    }
3877 	  node = node->right;
3878 	  p_new = &dup_node->right;
3879 	}
3880     }
3881 }
3882