xref: /dragonfly/contrib/grep/lib/regcomp.c (revision f7df6c8e)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2014 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU General Public
8    License as published by the Free Software Foundation; either
9    version 3 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    General Public License for more details.
15 
16    You should have received a copy of the GNU General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 					  size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 				     const re_dfastate_t *init_state,
24 				     char *fastmap);
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 #ifdef RE_ENABLE_I18N
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 #ifdef RE_ENABLE_I18N
32 static void optimize_utf8 (re_dfa_t *dfa);
33 #endif
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 			       reg_errcode_t (fn (void *, bin_tree_t *)),
37 			       void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 				reg_errcode_t (fn (void *, bin_tree_t *)),
40 				void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 				 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 				   unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 					 Idx node, bool root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 			 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 			reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 			  reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 				  re_token_t *token, reg_syntax_t syntax,
63 				  Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 				 re_token_t *token, reg_syntax_t syntax,
66 				 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 				     re_token_t *token, reg_syntax_t syntax,
69 				     Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 				  re_token_t *token, reg_syntax_t syntax,
72 				  Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 				 re_dfa_t *dfa, re_token_t *token,
75 				 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 				      re_token_t *token, reg_syntax_t syntax,
78 				      reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 					    re_string_t *regexp,
81 					    re_token_t *token, int token_len,
82 					    re_dfa_t *dfa,
83 					    reg_syntax_t syntax,
84 					    bool accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 					  re_string_t *regexp,
87 					  re_token_t *token);
88 #ifdef RE_ENABLE_I18N
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 					re_charset_t *mbcset,
91 					Idx *equiv_class_alloc,
92 					const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94 				      bitset_t sbcset,
95 				      re_charset_t *mbcset,
96 				      Idx *char_class_alloc,
97 				      const char *class_name,
98 				      reg_syntax_t syntax);
99 #else  /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 					const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 				      bitset_t sbcset,
104 				      const char *class_name,
105 				      reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 				       RE_TRANSLATE_TYPE trans,
109 				       const char *class_name,
110 				       const char *extra,
111 				       bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 				bin_tree_t *left, bin_tree_t *right,
114 				re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 				      bin_tree_t *left, bin_tree_t *right,
117 				      const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122 
123 /* This table gives an error message for each of the error codes listed
124    in regex.h.  Obviously the order here has to be same as there.
125    POSIX doesn't require that we do anything for REG_NOERROR,
126    but why not be nice?  */
127 
128 static const char __re_error_msgid[] =
129   {
130 #define REG_NOERROR_IDX	0
131     gettext_noop ("Success")	/* REG_NOERROR */
132     "\0"
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134     gettext_noop ("No match")	/* REG_NOMATCH */
135     "\0"
136 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138     "\0"
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141     "\0"
142 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144     "\0"
145 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147     "\0"
148 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150     "\0"
151 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
152     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
153     "\0"
154 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156     "\0"
157 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159     "\0"
160 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162     "\0"
163 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164     gettext_noop ("Invalid range end")	/* REG_ERANGE */
165     "\0"
166 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
168     "\0"
169 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171     "\0"
172 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
174     "\0"
175 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
177     "\0"
178 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180   };
181 
182 static const size_t __re_error_msgid_idx[] =
183   {
184     REG_NOERROR_IDX,
185     REG_NOMATCH_IDX,
186     REG_BADPAT_IDX,
187     REG_ECOLLATE_IDX,
188     REG_ECTYPE_IDX,
189     REG_EESCAPE_IDX,
190     REG_ESUBREG_IDX,
191     REG_EBRACK_IDX,
192     REG_EPAREN_IDX,
193     REG_EBRACE_IDX,
194     REG_BADBR_IDX,
195     REG_ERANGE_IDX,
196     REG_ESPACE_IDX,
197     REG_BADRPT_IDX,
198     REG_EEND_IDX,
199     REG_ESIZE_IDX,
200     REG_ERPAREN_IDX
201   };
202 
203 /* Entry points for GNU code.  */
204 
205 /* re_compile_pattern is the GNU regular expression compiler: it
206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207    Returns 0 if the pattern was valid, otherwise an error string.
208 
209    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
210    are set in BUFP on entry.  */
211 
212 #ifdef _LIBC
213 const char *
214 re_compile_pattern (pattern, length, bufp)
215     const char *pattern;
216     size_t length;
217     struct re_pattern_buffer *bufp;
218 #else /* size_t might promote */
219 const char *
220 re_compile_pattern (const char *pattern, size_t length,
221 		    struct re_pattern_buffer *bufp)
222 #endif
223 {
224   reg_errcode_t ret;
225 
226   /* And GNU code determines whether or not to get register information
227      by passing null for the REGS argument to re_match, etc., not by
228      setting no_sub, unless RE_NO_SUB is set.  */
229   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230 
231   /* Match anchors at newline.  */
232   bufp->newline_anchor = 1;
233 
234   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
235 
236   if (!ret)
237     return NULL;
238   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
239 }
240 #ifdef _LIBC
241 weak_alias (__re_compile_pattern, re_compile_pattern)
242 #endif
243 
244 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
245    also be assigned to arbitrarily: each pattern buffer stores its own
246    syntax, so it can be changed between regex compilations.  */
247 /* This has no initializer because initialized variables in Emacs
248    become read-only after dumping.  */
249 reg_syntax_t re_syntax_options;
250 
251 
252 /* Specify the precise syntax of regexps for compilation.  This provides
253    for compatibility for various utilities which historically have
254    different, incompatible syntaxes.
255 
256    The argument SYNTAX is a bit mask comprised of the various bits
257    defined in regex.h.  We return the old syntax.  */
258 
259 reg_syntax_t
260 re_set_syntax (syntax)
261     reg_syntax_t syntax;
262 {
263   reg_syntax_t ret = re_syntax_options;
264 
265   re_syntax_options = syntax;
266   return ret;
267 }
268 #ifdef _LIBC
269 weak_alias (__re_set_syntax, re_set_syntax)
270 #endif
271 
272 int
273 re_compile_fastmap (bufp)
274     struct re_pattern_buffer *bufp;
275 {
276   re_dfa_t *dfa = bufp->buffer;
277   char *fastmap = bufp->fastmap;
278 
279   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281   if (dfa->init_state != dfa->init_state_word)
282     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283   if (dfa->init_state != dfa->init_state_nl)
284     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285   if (dfa->init_state != dfa->init_state_begbuf)
286     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287   bufp->fastmap_accurate = 1;
288   return 0;
289 }
290 #ifdef _LIBC
291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
292 #endif
293 
294 static inline void
295 __attribute__ ((always_inline))
296 re_set_fastmap (char *fastmap, bool icase, int ch)
297 {
298   fastmap[ch] = 1;
299   if (icase)
300     fastmap[tolower (ch)] = 1;
301 }
302 
303 /* Helper function for re_compile_fastmap.
304    Compile fastmap for the initial_state INIT_STATE.  */
305 
306 static void
307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308 			 char *fastmap)
309 {
310   re_dfa_t *dfa = bufp->buffer;
311   Idx node_cnt;
312   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314     {
315       Idx node = init_state->nodes.elems[node_cnt];
316       re_token_type_t type = dfa->nodes[node].type;
317 
318       if (type == CHARACTER)
319 	{
320 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 #ifdef RE_ENABLE_I18N
322 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323 	    {
324 	      unsigned char buf[MB_LEN_MAX];
325 	      unsigned char *p;
326 	      wchar_t wc;
327 	      mbstate_t state;
328 
329 	      p = buf;
330 	      *p++ = dfa->nodes[node].opr.c;
331 	      while (++node < dfa->nodes_len
332 		     &&	dfa->nodes[node].type == CHARACTER
333 		     && dfa->nodes[node].mb_partial)
334 		*p++ = dfa->nodes[node].opr.c;
335 	      memset (&state, '\0', sizeof (state));
336 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
337 			     &state) == p - buf
338 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
339 		      != (size_t) -1))
340 		re_set_fastmap (fastmap, false, buf[0]);
341 	    }
342 #endif
343 	}
344       else if (type == SIMPLE_BRACKET)
345 	{
346 	  int i, ch;
347 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
348 	    {
349 	      int j;
350 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 		if (w & ((bitset_word_t) 1 << j))
353 		  re_set_fastmap (fastmap, icase, ch);
354 	    }
355 	}
356 #ifdef RE_ENABLE_I18N
357       else if (type == COMPLEX_BRACKET)
358 	{
359 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
360 	  Idx i;
361 
362 # ifdef _LIBC
363 	  /* See if we have to try all bytes which start multiple collation
364 	     elements.
365 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
366 		  collation element, and don't catch 'b' since 'b' is
367 		  the only collation element which starts from 'b' (and
368 		  it is caught by SIMPLE_BRACKET).  */
369 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
370 		  && (cset->ncoll_syms || cset->nranges))
371 		{
372 		  const int32_t *table = (const int32_t *)
373 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
374 		  for (i = 0; i < SBC_MAX; ++i)
375 		    if (table[i] < 0)
376 		      re_set_fastmap (fastmap, icase, i);
377 		}
378 # endif /* _LIBC */
379 
380 	  /* See if we have to start the match at all multibyte characters,
381 	     i.e. where we would not find an invalid sequence.  This only
382 	     applies to multibyte character sets; for single byte character
383 	     sets, the SIMPLE_BRACKET again suffices.  */
384 	  if (dfa->mb_cur_max > 1
385 	      && (cset->nchar_classes || cset->non_match || cset->nranges
386 # ifdef _LIBC
387 		  || cset->nequiv_classes
388 # endif /* _LIBC */
389 		 ))
390 	    {
391 	      unsigned char c = 0;
392 	      do
393 		{
394 		  mbstate_t mbs;
395 		  memset (&mbs, 0, sizeof (mbs));
396 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
397 		    re_set_fastmap (fastmap, false, (int) c);
398 		}
399 	      while (++c != 0);
400 	    }
401 
402 	  else
403 	    {
404 	      /* ... Else catch all bytes which can start the mbchars.  */
405 	      for (i = 0; i < cset->nmbchars; ++i)
406 		{
407 		  char buf[256];
408 		  mbstate_t state;
409 		  memset (&state, '\0', sizeof (state));
410 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
411 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
413 		    {
414 		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
415 			  != (size_t) -1)
416 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
417 		    }
418 		}
419 	    }
420 	}
421 #endif /* RE_ENABLE_I18N */
422       else if (type == OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 	       || type == OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 	       || type == END_OF_RE)
427 	{
428 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429 	  if (type == END_OF_RE)
430 	    bufp->can_be_null = 1;
431 	  return;
432 	}
433     }
434 }
435 
436 /* Entry point for POSIX code.  */
437 /* regcomp takes a regular expression as a string and compiles it.
438 
439    PREG is a regex_t *.  We do not expect any fields to be initialized,
440    since POSIX says we shouldn't.  Thus, we set
441 
442      'buffer' to the compiled pattern;
443      'used' to the length of the compiled pattern;
444      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445        REG_EXTENDED bit in CFLAGS is set; otherwise, to
446        RE_SYNTAX_POSIX_BASIC;
447      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
448      'fastmap' to an allocated space for the fastmap;
449      'fastmap_accurate' to zero;
450      're_nsub' to the number of subexpressions in PATTERN.
451 
452    PATTERN is the address of the pattern string.
453 
454    CFLAGS is a series of bits which affect compilation.
455 
456      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457      use POSIX basic syntax.
458 
459      If REG_NEWLINE is set, then . and [^...] don't match newline.
460      Also, regexec will try a match beginning after every newline.
461 
462      If REG_ICASE is set, then we considers upper- and lowercase
463      versions of letters to be equivalent when matching.
464 
465      If REG_NOSUB is set, then when PREG is passed to regexec, that
466      routine will report only success or failure, and nothing about the
467      registers.
468 
469    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
470    the return codes and their meanings.)  */
471 
472 int
473 regcomp (preg, pattern, cflags)
474     regex_t *_Restrict_ preg;
475     const char *_Restrict_ pattern;
476     int cflags;
477 {
478   reg_errcode_t ret;
479   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480 			 : RE_SYNTAX_POSIX_BASIC);
481 
482   preg->buffer = NULL;
483   preg->allocated = 0;
484   preg->used = 0;
485 
486   /* Try to allocate space for the fastmap.  */
487   preg->fastmap = re_malloc (char, SBC_MAX);
488   if (BE (preg->fastmap == NULL, 0))
489     return REG_ESPACE;
490 
491   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492 
493   /* If REG_NEWLINE is set, newlines are treated differently.  */
494   if (cflags & REG_NEWLINE)
495     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
496       syntax &= ~RE_DOT_NEWLINE;
497       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498       /* It also changes the matching behavior.  */
499       preg->newline_anchor = 1;
500     }
501   else
502     preg->newline_anchor = 0;
503   preg->no_sub = !!(cflags & REG_NOSUB);
504   preg->translate = NULL;
505 
506   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507 
508   /* POSIX doesn't distinguish between an unmatched open-group and an
509      unmatched close-group: both are REG_EPAREN.  */
510   if (ret == REG_ERPAREN)
511     ret = REG_EPAREN;
512 
513   /* We have already checked preg->fastmap != NULL.  */
514   if (BE (ret == REG_NOERROR, 1))
515     /* Compute the fastmap now, since regexec cannot modify the pattern
516        buffer.  This function never fails in this implementation.  */
517     (void) re_compile_fastmap (preg);
518   else
519     {
520       /* Some error occurred while compiling the expression.  */
521       re_free (preg->fastmap);
522       preg->fastmap = NULL;
523     }
524 
525   return (int) ret;
526 }
527 #ifdef _LIBC
528 weak_alias (__regcomp, regcomp)
529 #endif
530 
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532    from either regcomp or regexec.   We don't use PREG here.  */
533 
534 #ifdef _LIBC
535 size_t
536 regerror (errcode, preg, errbuf, errbuf_size)
537     int errcode;
538     const regex_t *_Restrict_ preg;
539     char *_Restrict_ errbuf;
540     size_t errbuf_size;
541 #else /* size_t might promote */
542 size_t
543 regerror (int errcode, const regex_t *_Restrict_ preg,
544 	  char *_Restrict_ errbuf, size_t errbuf_size)
545 #endif
546 {
547   const char *msg;
548   size_t msg_size;
549 
550   if (BE (errcode < 0
551 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
552 			       / sizeof (__re_error_msgid_idx[0])), 0))
553     /* Only error codes returned by the rest of the code should be passed
554        to this routine.  If we are given anything else, or if other regex
555        code generates an invalid error code, then the program has a bug.
556        Dump core so we can fix it.  */
557     abort ();
558 
559   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
560 
561   msg_size = strlen (msg) + 1; /* Includes the null.  */
562 
563   if (BE (errbuf_size != 0, 1))
564     {
565       size_t cpy_size = msg_size;
566       if (BE (msg_size > errbuf_size, 0))
567 	{
568 	  cpy_size = errbuf_size - 1;
569 	  errbuf[cpy_size] = '\0';
570 	}
571       memcpy (errbuf, msg, cpy_size);
572     }
573 
574   return msg_size;
575 }
576 #ifdef _LIBC
577 weak_alias (__regerror, regerror)
578 #endif
579 
580 
581 #ifdef RE_ENABLE_I18N
582 /* This static array is used for the map to single-byte characters when
583    UTF-8 is used.  Otherwise we would allocate memory just to initialize
584    it the same all the time.  UTF-8 is the preferred encoding so this is
585    a worthwhile optimization.  */
586 static const bitset_t utf8_sb_map =
587 {
588   /* Set the first 128 bits.  */
589 # if defined __GNUC__ && !defined __STRICT_ANSI__
590   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
591 # else
592 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
593 #   error "bitset_word_t is narrower than 32 bits"
594 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
595   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
596 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
597   BITSET_WORD_MAX, BITSET_WORD_MAX,
598 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
599   BITSET_WORD_MAX,
600 #  endif
601   (BITSET_WORD_MAX
602    >> (SBC_MAX % BITSET_WORD_BITS == 0
603        ? 0
604        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
605 # endif
606 };
607 #endif
608 
609 
610 static void
611 free_dfa_content (re_dfa_t *dfa)
612 {
613   Idx i, j;
614 
615   if (dfa->nodes)
616     for (i = 0; i < dfa->nodes_len; ++i)
617       free_token (dfa->nodes + i);
618   re_free (dfa->nexts);
619   for (i = 0; i < dfa->nodes_len; ++i)
620     {
621       if (dfa->eclosures != NULL)
622 	re_node_set_free (dfa->eclosures + i);
623       if (dfa->inveclosures != NULL)
624 	re_node_set_free (dfa->inveclosures + i);
625       if (dfa->edests != NULL)
626 	re_node_set_free (dfa->edests + i);
627     }
628   re_free (dfa->edests);
629   re_free (dfa->eclosures);
630   re_free (dfa->inveclosures);
631   re_free (dfa->nodes);
632 
633   if (dfa->state_table)
634     for (i = 0; i <= dfa->state_hash_mask; ++i)
635       {
636 	struct re_state_table_entry *entry = dfa->state_table + i;
637 	for (j = 0; j < entry->num; ++j)
638 	  {
639 	    re_dfastate_t *state = entry->array[j];
640 	    free_state (state);
641 	  }
642 	re_free (entry->array);
643       }
644   re_free (dfa->state_table);
645 #ifdef RE_ENABLE_I18N
646   if (dfa->sb_char != utf8_sb_map)
647     re_free (dfa->sb_char);
648 #endif
649   re_free (dfa->subexp_map);
650 #ifdef DEBUG
651   re_free (dfa->re_str);
652 #endif
653 
654   re_free (dfa);
655 }
656 
657 
658 /* Free dynamically allocated space used by PREG.  */
659 
660 void
661 regfree (preg)
662     regex_t *preg;
663 {
664   re_dfa_t *dfa = preg->buffer;
665   if (BE (dfa != NULL, 1))
666     {
667       lock_fini (dfa->lock);
668       free_dfa_content (dfa);
669     }
670   preg->buffer = NULL;
671   preg->allocated = 0;
672 
673   re_free (preg->fastmap);
674   preg->fastmap = NULL;
675 
676   re_free (preg->translate);
677   preg->translate = NULL;
678 }
679 #ifdef _LIBC
680 weak_alias (__regfree, regfree)
681 #endif
682 
683 /* Entry points compatible with 4.2 BSD regex library.  We don't define
684    them unless specifically requested.  */
685 
686 #if defined _REGEX_RE_COMP || defined _LIBC
687 
688 /* BSD has one and only one pattern buffer.  */
689 static struct re_pattern_buffer re_comp_buf;
690 
691 char *
692 # ifdef _LIBC
693 /* Make these definitions weak in libc, so POSIX programs can redefine
694    these names if they don't use our functions, and still use
695    regcomp/regexec above without link errors.  */
696 weak_function
697 # endif
698 re_comp (s)
699      const char *s;
700 {
701   reg_errcode_t ret;
702   char *fastmap;
703 
704   if (!s)
705     {
706       if (!re_comp_buf.buffer)
707 	return gettext ("No previous regular expression");
708       return 0;
709     }
710 
711   if (re_comp_buf.buffer)
712     {
713       fastmap = re_comp_buf.fastmap;
714       re_comp_buf.fastmap = NULL;
715       __regfree (&re_comp_buf);
716       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
717       re_comp_buf.fastmap = fastmap;
718     }
719 
720   if (re_comp_buf.fastmap == NULL)
721     {
722       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
723       if (re_comp_buf.fastmap == NULL)
724 	return (char *) gettext (__re_error_msgid
725 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
726     }
727 
728   /* Since 're_exec' always passes NULL for the 'regs' argument, we
729      don't need to initialize the pattern buffer fields which affect it.  */
730 
731   /* Match anchors at newlines.  */
732   re_comp_buf.newline_anchor = 1;
733 
734   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
735 
736   if (!ret)
737     return NULL;
738 
739   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
740   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
741 }
742 
743 #ifdef _LIBC
744 libc_freeres_fn (free_mem)
745 {
746   __regfree (&re_comp_buf);
747 }
748 #endif
749 
750 #endif /* _REGEX_RE_COMP */
751 
752 /* Internal entry point.
753    Compile the regular expression PATTERN, whose length is LENGTH.
754    SYNTAX indicate regular expression's syntax.  */
755 
756 static reg_errcode_t
757 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
758 		     reg_syntax_t syntax)
759 {
760   reg_errcode_t err = REG_NOERROR;
761   re_dfa_t *dfa;
762   re_string_t regexp;
763 
764   /* Initialize the pattern buffer.  */
765   preg->fastmap_accurate = 0;
766   preg->syntax = syntax;
767   preg->not_bol = preg->not_eol = 0;
768   preg->used = 0;
769   preg->re_nsub = 0;
770   preg->can_be_null = 0;
771   preg->regs_allocated = REGS_UNALLOCATED;
772 
773   /* Initialize the dfa.  */
774   dfa = preg->buffer;
775   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
776     {
777       /* If zero allocated, but buffer is non-null, try to realloc
778 	 enough space.  This loses if buffer's address is bogus, but
779 	 that is the user's responsibility.  If ->buffer is NULL this
780 	 is a simple allocation.  */
781       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
782       if (dfa == NULL)
783 	return REG_ESPACE;
784       preg->allocated = sizeof (re_dfa_t);
785       preg->buffer = dfa;
786     }
787   preg->used = sizeof (re_dfa_t);
788 
789   err = init_dfa (dfa, length);
790   if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
791     err = REG_ESPACE;
792   if (BE (err != REG_NOERROR, 0))
793     {
794       free_dfa_content (dfa);
795       preg->buffer = NULL;
796       preg->allocated = 0;
797       return err;
798     }
799 #ifdef DEBUG
800   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
801   dfa->re_str = re_malloc (char, length + 1);
802   strncpy (dfa->re_str, pattern, length + 1);
803 #endif
804 
805   err = re_string_construct (&regexp, pattern, length, preg->translate,
806 			     (syntax & RE_ICASE) != 0, dfa);
807   if (BE (err != REG_NOERROR, 0))
808     {
809     re_compile_internal_free_return:
810       free_workarea_compile (preg);
811       re_string_destruct (&regexp);
812       lock_fini (dfa->lock);
813       free_dfa_content (dfa);
814       preg->buffer = NULL;
815       preg->allocated = 0;
816       return err;
817     }
818 
819   /* Parse the regular expression, and build a structure tree.  */
820   preg->re_nsub = 0;
821   dfa->str_tree = parse (&regexp, preg, syntax, &err);
822   if (BE (dfa->str_tree == NULL, 0))
823     goto re_compile_internal_free_return;
824 
825   /* Analyze the tree and create the nfa.  */
826   err = analyze (preg);
827   if (BE (err != REG_NOERROR, 0))
828     goto re_compile_internal_free_return;
829 
830 #ifdef RE_ENABLE_I18N
831   /* If possible, do searching in single byte encoding to speed things up.  */
832   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
833     optimize_utf8 (dfa);
834 #endif
835 
836   /* Then create the initial state of the dfa.  */
837   err = create_initial_state (dfa);
838 
839   /* Release work areas.  */
840   free_workarea_compile (preg);
841   re_string_destruct (&regexp);
842 
843   if (BE (err != REG_NOERROR, 0))
844     {
845       lock_fini (dfa->lock);
846       free_dfa_content (dfa);
847       preg->buffer = NULL;
848       preg->allocated = 0;
849     }
850 
851   return err;
852 }
853 
854 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
855    as the initial length of some arrays.  */
856 
857 static reg_errcode_t
858 init_dfa (re_dfa_t *dfa, size_t pat_len)
859 {
860   __re_size_t table_size;
861 #ifndef _LIBC
862   const char *codeset_name;
863 #endif
864 #ifdef RE_ENABLE_I18N
865   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
866 #else
867   size_t max_i18n_object_size = 0;
868 #endif
869   size_t max_object_size =
870     MAX (sizeof (struct re_state_table_entry),
871 	 MAX (sizeof (re_token_t),
872 	      MAX (sizeof (re_node_set),
873 		   MAX (sizeof (regmatch_t),
874 			max_i18n_object_size))));
875 
876   memset (dfa, '\0', sizeof (re_dfa_t));
877 
878   /* Force allocation of str_tree_storage the first time.  */
879   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
880 
881   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
882      calculation below, and for similar doubling calculations
883      elsewhere.  And it's <= rather than <, because some of the
884      doubling calculations add 1 afterwards.  */
885   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
886     return REG_ESPACE;
887 
888   dfa->nodes_alloc = pat_len + 1;
889   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
890 
891   /*  table_size = 2 ^ ceil(log pat_len) */
892   for (table_size = 1; ; table_size <<= 1)
893     if (table_size > pat_len)
894       break;
895 
896   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
897   dfa->state_hash_mask = table_size - 1;
898 
899   dfa->mb_cur_max = MB_CUR_MAX;
900 #ifdef _LIBC
901   if (dfa->mb_cur_max == 6
902       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
903     dfa->is_utf8 = 1;
904   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
905 		       != 0);
906 #else
907   codeset_name = nl_langinfo (CODESET);
908   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
909       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
910       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
911       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
912     dfa->is_utf8 = 1;
913 
914   /* We check exhaustively in the loop below if this charset is a
915      superset of ASCII.  */
916   dfa->map_notascii = 0;
917 #endif
918 
919 #ifdef RE_ENABLE_I18N
920   if (dfa->mb_cur_max > 1)
921     {
922       if (dfa->is_utf8)
923 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
924       else
925 	{
926 	  int i, j, ch;
927 
928 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
929 	  if (BE (dfa->sb_char == NULL, 0))
930 	    return REG_ESPACE;
931 
932 	  /* Set the bits corresponding to single byte chars.  */
933 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
934 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
935 	      {
936 		wint_t wch = __btowc (ch);
937 		if (wch != WEOF)
938 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
939 # ifndef _LIBC
940 		if (isascii (ch) && wch != ch)
941 		  dfa->map_notascii = 1;
942 # endif
943 	      }
944 	}
945     }
946 #endif
947 
948   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
949     return REG_ESPACE;
950   return REG_NOERROR;
951 }
952 
953 /* Initialize WORD_CHAR table, which indicate which character is
954    "word".  In this case "word" means that it is the word construction
955    character used by some operators like "\<", "\>", etc.  */
956 
957 static void
958 internal_function
959 init_word_char (re_dfa_t *dfa)
960 {
961   int i = 0;
962   int j;
963   int ch = 0;
964   dfa->word_ops_used = 1;
965   if (BE (dfa->map_notascii == 0, 1))
966     {
967       bitset_word_t bits0 = 0x00000000;
968       bitset_word_t bits1 = 0x03ff0000;
969       bitset_word_t bits2 = 0x87fffffe;
970       bitset_word_t bits3 = 0x07fffffe;
971       if (BITSET_WORD_BITS == 64)
972 	{
973 	  dfa->word_char[0] = bits1 << 31 << 1 | bits0;
974 	  dfa->word_char[1] = bits3 << 31 << 1 | bits2;
975 	  i = 2;
976 	}
977       else if (BITSET_WORD_BITS == 32)
978 	{
979 	  dfa->word_char[0] = bits0;
980 	  dfa->word_char[1] = bits1;
981 	  dfa->word_char[2] = bits2;
982 	  dfa->word_char[3] = bits3;
983 	  i = 4;
984 	}
985       else
986         goto general_case;
987       ch = 128;
988 
989       if (BE (dfa->is_utf8, 1))
990 	{
991 	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
992 	  return;
993 	}
994     }
995 
996  general_case:
997   for (; i < BITSET_WORDS; ++i)
998     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
999       if (isalnum (ch) || ch == '_')
1000 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
1001 }
1002 
1003 /* Free the work area which are only used while compiling.  */
1004 
1005 static void
1006 free_workarea_compile (regex_t *preg)
1007 {
1008   re_dfa_t *dfa = preg->buffer;
1009   bin_tree_storage_t *storage, *next;
1010   for (storage = dfa->str_tree_storage; storage; storage = next)
1011     {
1012       next = storage->next;
1013       re_free (storage);
1014     }
1015   dfa->str_tree_storage = NULL;
1016   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1017   dfa->str_tree = NULL;
1018   re_free (dfa->org_indices);
1019   dfa->org_indices = NULL;
1020 }
1021 
1022 /* Create initial states for all contexts.  */
1023 
1024 static reg_errcode_t
1025 create_initial_state (re_dfa_t *dfa)
1026 {
1027   Idx first, i;
1028   reg_errcode_t err;
1029   re_node_set init_nodes;
1030 
1031   /* Initial states have the epsilon closure of the node which is
1032      the first node of the regular expression.  */
1033   first = dfa->str_tree->first->node_idx;
1034   dfa->init_node = first;
1035   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1036   if (BE (err != REG_NOERROR, 0))
1037     return err;
1038 
1039   /* The back-references which are in initial states can epsilon transit,
1040      since in this case all of the subexpressions can be null.
1041      Then we add epsilon closures of the nodes which are the next nodes of
1042      the back-references.  */
1043   if (dfa->nbackref > 0)
1044     for (i = 0; i < init_nodes.nelem; ++i)
1045       {
1046 	Idx node_idx = init_nodes.elems[i];
1047 	re_token_type_t type = dfa->nodes[node_idx].type;
1048 
1049 	Idx clexp_idx;
1050 	if (type != OP_BACK_REF)
1051 	  continue;
1052 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1053 	  {
1054 	    re_token_t *clexp_node;
1055 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1056 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1057 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1058 	      break;
1059 	  }
1060 	if (clexp_idx == init_nodes.nelem)
1061 	  continue;
1062 
1063 	if (type == OP_BACK_REF)
1064 	  {
1065 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1066 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1067 	      {
1068 		reg_errcode_t merge_err
1069                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1070 		if (merge_err != REG_NOERROR)
1071 		  return merge_err;
1072 		i = 0;
1073 	      }
1074 	  }
1075       }
1076 
1077   /* It must be the first time to invoke acquire_state.  */
1078   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1079   /* We don't check ERR here, since the initial state must not be NULL.  */
1080   if (BE (dfa->init_state == NULL, 0))
1081     return err;
1082   if (dfa->init_state->has_constraint)
1083     {
1084       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1085 						       CONTEXT_WORD);
1086       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1087 						     CONTEXT_NEWLINE);
1088       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1089 							 &init_nodes,
1090 							 CONTEXT_NEWLINE
1091 							 | CONTEXT_BEGBUF);
1092       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1093 	      || dfa->init_state_begbuf == NULL, 0))
1094 	return err;
1095     }
1096   else
1097     dfa->init_state_word = dfa->init_state_nl
1098       = dfa->init_state_begbuf = dfa->init_state;
1099 
1100   re_node_set_free (&init_nodes);
1101   return REG_NOERROR;
1102 }
1103 
1104 #ifdef RE_ENABLE_I18N
1105 /* If it is possible to do searching in single byte encoding instead of UTF-8
1106    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1107    DFA nodes where needed.  */
1108 
1109 static void
1110 optimize_utf8 (re_dfa_t *dfa)
1111 {
1112   Idx node;
1113   int i;
1114   bool mb_chars = false;
1115   bool has_period = false;
1116 
1117   for (node = 0; node < dfa->nodes_len; ++node)
1118     switch (dfa->nodes[node].type)
1119       {
1120       case CHARACTER:
1121 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1122 	  mb_chars = true;
1123 	break;
1124       case ANCHOR:
1125 	switch (dfa->nodes[node].opr.ctx_type)
1126 	  {
1127 	  case LINE_FIRST:
1128 	  case LINE_LAST:
1129 	  case BUF_FIRST:
1130 	  case BUF_LAST:
1131 	    break;
1132 	  default:
1133 	    /* Word anchors etc. cannot be handled.  It's okay to test
1134 	       opr.ctx_type since constraints (for all DFA nodes) are
1135 	       created by ORing one or more opr.ctx_type values.  */
1136 	    return;
1137 	  }
1138 	break;
1139       case OP_PERIOD:
1140 	has_period = true;
1141 	break;
1142       case OP_BACK_REF:
1143       case OP_ALT:
1144       case END_OF_RE:
1145       case OP_DUP_ASTERISK:
1146       case OP_OPEN_SUBEXP:
1147       case OP_CLOSE_SUBEXP:
1148 	break;
1149       case COMPLEX_BRACKET:
1150 	return;
1151       case SIMPLE_BRACKET:
1152 	/* Just double check.  */
1153 	{
1154 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1155 			? 0
1156 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1157 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1158 	    {
1159 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1160 		return;
1161 	      rshift = 0;
1162 	    }
1163 	}
1164 	break;
1165       default:
1166 	abort ();
1167       }
1168 
1169   if (mb_chars || has_period)
1170     for (node = 0; node < dfa->nodes_len; ++node)
1171       {
1172 	if (dfa->nodes[node].type == CHARACTER
1173 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1174 	  dfa->nodes[node].mb_partial = 0;
1175 	else if (dfa->nodes[node].type == OP_PERIOD)
1176 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1177       }
1178 
1179   /* The search can be in single byte locale.  */
1180   dfa->mb_cur_max = 1;
1181   dfa->is_utf8 = 0;
1182   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1183 }
1184 #endif
1185 
1186 /* Analyze the structure tree, and calculate "first", "next", "edest",
1187    "eclosure", and "inveclosure".  */
1188 
1189 static reg_errcode_t
1190 analyze (regex_t *preg)
1191 {
1192   re_dfa_t *dfa = preg->buffer;
1193   reg_errcode_t ret;
1194 
1195   /* Allocate arrays.  */
1196   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1197   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1198   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1199   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1200   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1201 	  || dfa->eclosures == NULL, 0))
1202     return REG_ESPACE;
1203 
1204   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1205   if (dfa->subexp_map != NULL)
1206     {
1207       Idx i;
1208       for (i = 0; i < preg->re_nsub; i++)
1209 	dfa->subexp_map[i] = i;
1210       preorder (dfa->str_tree, optimize_subexps, dfa);
1211       for (i = 0; i < preg->re_nsub; i++)
1212 	if (dfa->subexp_map[i] != i)
1213 	  break;
1214       if (i == preg->re_nsub)
1215 	{
1216 	  free (dfa->subexp_map);
1217 	  dfa->subexp_map = NULL;
1218 	}
1219     }
1220 
1221   ret = postorder (dfa->str_tree, lower_subexps, preg);
1222   if (BE (ret != REG_NOERROR, 0))
1223     return ret;
1224   ret = postorder (dfa->str_tree, calc_first, dfa);
1225   if (BE (ret != REG_NOERROR, 0))
1226     return ret;
1227   preorder (dfa->str_tree, calc_next, dfa);
1228   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1229   if (BE (ret != REG_NOERROR, 0))
1230     return ret;
1231   ret = calc_eclosure (dfa);
1232   if (BE (ret != REG_NOERROR, 0))
1233     return ret;
1234 
1235   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1236      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1237   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1238       || dfa->nbackref)
1239     {
1240       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1241       if (BE (dfa->inveclosures == NULL, 0))
1242 	return REG_ESPACE;
1243       ret = calc_inveclosure (dfa);
1244     }
1245 
1246   return ret;
1247 }
1248 
1249 /* Our parse trees are very unbalanced, so we cannot use a stack to
1250    implement parse tree visits.  Instead, we use parent pointers and
1251    some hairy code in these two functions.  */
1252 static reg_errcode_t
1253 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1254 	   void *extra)
1255 {
1256   bin_tree_t *node, *prev;
1257 
1258   for (node = root; ; )
1259     {
1260       /* Descend down the tree, preferably to the left (or to the right
1261 	 if that's the only child).  */
1262       while (node->left || node->right)
1263 	if (node->left)
1264 	  node = node->left;
1265 	else
1266 	  node = node->right;
1267 
1268       do
1269 	{
1270 	  reg_errcode_t err = fn (extra, node);
1271 	  if (BE (err != REG_NOERROR, 0))
1272 	    return err;
1273 	  if (node->parent == NULL)
1274 	    return REG_NOERROR;
1275 	  prev = node;
1276 	  node = node->parent;
1277 	}
1278       /* Go up while we have a node that is reached from the right.  */
1279       while (node->right == prev || node->right == NULL);
1280       node = node->right;
1281     }
1282 }
1283 
1284 static reg_errcode_t
1285 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1286 	  void *extra)
1287 {
1288   bin_tree_t *node;
1289 
1290   for (node = root; ; )
1291     {
1292       reg_errcode_t err = fn (extra, node);
1293       if (BE (err != REG_NOERROR, 0))
1294 	return err;
1295 
1296       /* Go to the left node, or up and to the right.  */
1297       if (node->left)
1298 	node = node->left;
1299       else
1300 	{
1301 	  bin_tree_t *prev = NULL;
1302 	  while (node->right == prev || node->right == NULL)
1303 	    {
1304 	      prev = node;
1305 	      node = node->parent;
1306 	      if (!node)
1307 		return REG_NOERROR;
1308 	    }
1309 	  node = node->right;
1310 	}
1311     }
1312 }
1313 
1314 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1315    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1316    backreferences as well.  Requires a preorder visit.  */
1317 static reg_errcode_t
1318 optimize_subexps (void *extra, bin_tree_t *node)
1319 {
1320   re_dfa_t *dfa = (re_dfa_t *) extra;
1321 
1322   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1323     {
1324       int idx = node->token.opr.idx;
1325       node->token.opr.idx = dfa->subexp_map[idx];
1326       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1327     }
1328 
1329   else if (node->token.type == SUBEXP
1330 	   && node->left && node->left->token.type == SUBEXP)
1331     {
1332       Idx other_idx = node->left->token.opr.idx;
1333 
1334       node->left = node->left->left;
1335       if (node->left)
1336 	node->left->parent = node;
1337 
1338       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1339       if (other_idx < BITSET_WORD_BITS)
1340 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1341     }
1342 
1343   return REG_NOERROR;
1344 }
1345 
1346 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1347    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1348 static reg_errcode_t
1349 lower_subexps (void *extra, bin_tree_t *node)
1350 {
1351   regex_t *preg = (regex_t *) extra;
1352   reg_errcode_t err = REG_NOERROR;
1353 
1354   if (node->left && node->left->token.type == SUBEXP)
1355     {
1356       node->left = lower_subexp (&err, preg, node->left);
1357       if (node->left)
1358 	node->left->parent = node;
1359     }
1360   if (node->right && node->right->token.type == SUBEXP)
1361     {
1362       node->right = lower_subexp (&err, preg, node->right);
1363       if (node->right)
1364 	node->right->parent = node;
1365     }
1366 
1367   return err;
1368 }
1369 
1370 static bin_tree_t *
1371 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1372 {
1373   re_dfa_t *dfa = preg->buffer;
1374   bin_tree_t *body = node->left;
1375   bin_tree_t *op, *cls, *tree1, *tree;
1376 
1377   if (preg->no_sub
1378       /* We do not optimize empty subexpressions, because otherwise we may
1379 	 have bad CONCAT nodes with NULL children.  This is obviously not
1380 	 very common, so we do not lose much.  An example that triggers
1381 	 this case is the sed "script" /\(\)/x.  */
1382       && node->left != NULL
1383       && (node->token.opr.idx >= BITSET_WORD_BITS
1384 	  || !(dfa->used_bkref_map
1385 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1386     return node->left;
1387 
1388   /* Convert the SUBEXP node to the concatenation of an
1389      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1390   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1391   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1392   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1393   tree = create_tree (dfa, op, tree1, CONCAT);
1394   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1395     {
1396       *err = REG_ESPACE;
1397       return NULL;
1398     }
1399 
1400   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1401   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1402   return tree;
1403 }
1404 
1405 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1406    nodes.  Requires a postorder visit.  */
1407 static reg_errcode_t
1408 calc_first (void *extra, bin_tree_t *node)
1409 {
1410   re_dfa_t *dfa = (re_dfa_t *) extra;
1411   if (node->token.type == CONCAT)
1412     {
1413       node->first = node->left->first;
1414       node->node_idx = node->left->node_idx;
1415     }
1416   else
1417     {
1418       node->first = node;
1419       node->node_idx = re_dfa_add_node (dfa, node->token);
1420       if (BE (node->node_idx == REG_MISSING, 0))
1421 	return REG_ESPACE;
1422       if (node->token.type == ANCHOR)
1423 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1424     }
1425   return REG_NOERROR;
1426 }
1427 
1428 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1429 static reg_errcode_t
1430 calc_next (void *extra, bin_tree_t *node)
1431 {
1432   switch (node->token.type)
1433     {
1434     case OP_DUP_ASTERISK:
1435       node->left->next = node;
1436       break;
1437     case CONCAT:
1438       node->left->next = node->right->first;
1439       node->right->next = node->next;
1440       break;
1441     default:
1442       if (node->left)
1443 	node->left->next = node->next;
1444       if (node->right)
1445 	node->right->next = node->next;
1446       break;
1447     }
1448   return REG_NOERROR;
1449 }
1450 
1451 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1452 static reg_errcode_t
1453 link_nfa_nodes (void *extra, bin_tree_t *node)
1454 {
1455   re_dfa_t *dfa = (re_dfa_t *) extra;
1456   Idx idx = node->node_idx;
1457   reg_errcode_t err = REG_NOERROR;
1458 
1459   switch (node->token.type)
1460     {
1461     case CONCAT:
1462       break;
1463 
1464     case END_OF_RE:
1465       assert (node->next == NULL);
1466       break;
1467 
1468     case OP_DUP_ASTERISK:
1469     case OP_ALT:
1470       {
1471 	Idx left, right;
1472 	dfa->has_plural_match = 1;
1473 	if (node->left != NULL)
1474 	  left = node->left->first->node_idx;
1475 	else
1476 	  left = node->next->node_idx;
1477 	if (node->right != NULL)
1478 	  right = node->right->first->node_idx;
1479 	else
1480 	  right = node->next->node_idx;
1481 	assert (REG_VALID_INDEX (left));
1482 	assert (REG_VALID_INDEX (right));
1483 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1484       }
1485       break;
1486 
1487     case ANCHOR:
1488     case OP_OPEN_SUBEXP:
1489     case OP_CLOSE_SUBEXP:
1490       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1491       break;
1492 
1493     case OP_BACK_REF:
1494       dfa->nexts[idx] = node->next->node_idx;
1495       if (node->token.type == OP_BACK_REF)
1496 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1497       break;
1498 
1499     default:
1500       assert (!IS_EPSILON_NODE (node->token.type));
1501       dfa->nexts[idx] = node->next->node_idx;
1502       break;
1503     }
1504 
1505   return err;
1506 }
1507 
1508 /* Duplicate the epsilon closure of the node ROOT_NODE.
1509    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1510    to their own constraint.  */
1511 
1512 static reg_errcode_t
1513 internal_function
1514 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1515 			Idx root_node, unsigned int init_constraint)
1516 {
1517   Idx org_node, clone_node;
1518   bool ok;
1519   unsigned int constraint = init_constraint;
1520   for (org_node = top_org_node, clone_node = top_clone_node;;)
1521     {
1522       Idx org_dest, clone_dest;
1523       if (dfa->nodes[org_node].type == OP_BACK_REF)
1524 	{
1525 	  /* If the back reference epsilon-transit, its destination must
1526 	     also have the constraint.  Then duplicate the epsilon closure
1527 	     of the destination of the back reference, and store it in
1528 	     edests of the back reference.  */
1529 	  org_dest = dfa->nexts[org_node];
1530 	  re_node_set_empty (dfa->edests + clone_node);
1531 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1532 	  if (BE (clone_dest == REG_MISSING, 0))
1533 	    return REG_ESPACE;
1534 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1535 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536 	  if (BE (! ok, 0))
1537 	    return REG_ESPACE;
1538 	}
1539       else if (dfa->edests[org_node].nelem == 0)
1540 	{
1541 	  /* In case of the node can't epsilon-transit, don't duplicate the
1542 	     destination and store the original destination as the
1543 	     destination of the node.  */
1544 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1545 	  break;
1546 	}
1547       else if (dfa->edests[org_node].nelem == 1)
1548 	{
1549 	  /* In case of the node can epsilon-transit, and it has only one
1550 	     destination.  */
1551 	  org_dest = dfa->edests[org_node].elems[0];
1552 	  re_node_set_empty (dfa->edests + clone_node);
1553 	  /* If the node is root_node itself, it means the epsilon closure
1554 	     has a loop.  Then tie it to the destination of the root_node.  */
1555 	  if (org_node == root_node && clone_node != org_node)
1556 	    {
1557 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1558 	      if (BE (! ok, 0))
1559 	        return REG_ESPACE;
1560 	      break;
1561 	    }
1562 	  /* In case the node has another constraint, append it.  */
1563 	  constraint |= dfa->nodes[org_node].constraint;
1564 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1565 	  if (BE (clone_dest == REG_MISSING, 0))
1566 	    return REG_ESPACE;
1567 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568 	  if (BE (! ok, 0))
1569 	    return REG_ESPACE;
1570 	}
1571       else /* dfa->edests[org_node].nelem == 2 */
1572 	{
1573 	  /* In case of the node can epsilon-transit, and it has two
1574 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1575 	  org_dest = dfa->edests[org_node].elems[0];
1576 	  re_node_set_empty (dfa->edests + clone_node);
1577 	  /* Search for a duplicated node which satisfies the constraint.  */
1578 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1579 	  if (clone_dest == REG_MISSING)
1580 	    {
1581 	      /* There is no such duplicated node, create a new one.  */
1582 	      reg_errcode_t err;
1583 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1584 	      if (BE (clone_dest == REG_MISSING, 0))
1585 		return REG_ESPACE;
1586 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1587 	      if (BE (! ok, 0))
1588 		return REG_ESPACE;
1589 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1590 					    root_node, constraint);
1591 	      if (BE (err != REG_NOERROR, 0))
1592 		return err;
1593 	    }
1594 	  else
1595 	    {
1596 	      /* There is a duplicated node which satisfies the constraint,
1597 		 use it to avoid infinite loop.  */
1598 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1599 	      if (BE (! ok, 0))
1600 		return REG_ESPACE;
1601 	    }
1602 
1603 	  org_dest = dfa->edests[org_node].elems[1];
1604 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1605 	  if (BE (clone_dest == REG_MISSING, 0))
1606 	    return REG_ESPACE;
1607 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1608 	  if (BE (! ok, 0))
1609 	    return REG_ESPACE;
1610 	}
1611       org_node = org_dest;
1612       clone_node = clone_dest;
1613     }
1614   return REG_NOERROR;
1615 }
1616 
1617 /* Search for a node which is duplicated from the node ORG_NODE, and
1618    satisfies the constraint CONSTRAINT.  */
1619 
1620 static Idx
1621 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1622 			unsigned int constraint)
1623 {
1624   Idx idx;
1625   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1626     {
1627       if (org_node == dfa->org_indices[idx]
1628 	  && constraint == dfa->nodes[idx].constraint)
1629 	return idx; /* Found.  */
1630     }
1631   return REG_MISSING; /* Not found.  */
1632 }
1633 
1634 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1635    Return the index of the new node, or REG_MISSING if insufficient storage is
1636    available.  */
1637 
1638 static Idx
1639 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1640 {
1641   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1642   if (BE (dup_idx != REG_MISSING, 1))
1643     {
1644       dfa->nodes[dup_idx].constraint = constraint;
1645       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1646       dfa->nodes[dup_idx].duplicated = 1;
1647 
1648       /* Store the index of the original node.  */
1649       dfa->org_indices[dup_idx] = org_idx;
1650     }
1651   return dup_idx;
1652 }
1653 
1654 static reg_errcode_t
1655 calc_inveclosure (re_dfa_t *dfa)
1656 {
1657   Idx src, idx;
1658   bool ok;
1659   for (idx = 0; idx < dfa->nodes_len; ++idx)
1660     re_node_set_init_empty (dfa->inveclosures + idx);
1661 
1662   for (src = 0; src < dfa->nodes_len; ++src)
1663     {
1664       Idx *elems = dfa->eclosures[src].elems;
1665       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1666 	{
1667 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1668 	  if (BE (! ok, 0))
1669 	    return REG_ESPACE;
1670 	}
1671     }
1672 
1673   return REG_NOERROR;
1674 }
1675 
1676 /* Calculate "eclosure" for all the node in DFA.  */
1677 
1678 static reg_errcode_t
1679 calc_eclosure (re_dfa_t *dfa)
1680 {
1681   Idx node_idx;
1682   bool incomplete;
1683 #ifdef DEBUG
1684   assert (dfa->nodes_len > 0);
1685 #endif
1686   incomplete = false;
1687   /* For each nodes, calculate epsilon closure.  */
1688   for (node_idx = 0; ; ++node_idx)
1689     {
1690       reg_errcode_t err;
1691       re_node_set eclosure_elem;
1692       if (node_idx == dfa->nodes_len)
1693 	{
1694 	  if (!incomplete)
1695 	    break;
1696 	  incomplete = false;
1697 	  node_idx = 0;
1698 	}
1699 
1700 #ifdef DEBUG
1701       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1702 #endif
1703 
1704       /* If we have already calculated, skip it.  */
1705       if (dfa->eclosures[node_idx].nelem != 0)
1706 	continue;
1707       /* Calculate epsilon closure of 'node_idx'.  */
1708       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1709       if (BE (err != REG_NOERROR, 0))
1710 	return err;
1711 
1712       if (dfa->eclosures[node_idx].nelem == 0)
1713 	{
1714 	  incomplete = true;
1715 	  re_node_set_free (&eclosure_elem);
1716 	}
1717     }
1718   return REG_NOERROR;
1719 }
1720 
1721 /* Calculate epsilon closure of NODE.  */
1722 
1723 static reg_errcode_t
1724 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1725 {
1726   reg_errcode_t err;
1727   Idx i;
1728   re_node_set eclosure;
1729   bool ok;
1730   bool incomplete = false;
1731   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1732   if (BE (err != REG_NOERROR, 0))
1733     return err;
1734 
1735   /* This indicates that we are calculating this node now.
1736      We reference this value to avoid infinite loop.  */
1737   dfa->eclosures[node].nelem = REG_MISSING;
1738 
1739   /* If the current node has constraints, duplicate all nodes
1740      since they must inherit the constraints.  */
1741   if (dfa->nodes[node].constraint
1742       && dfa->edests[node].nelem
1743       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1744     {
1745       err = duplicate_node_closure (dfa, node, node, node,
1746 				    dfa->nodes[node].constraint);
1747       if (BE (err != REG_NOERROR, 0))
1748 	return err;
1749     }
1750 
1751   /* Expand each epsilon destination nodes.  */
1752   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1753     for (i = 0; i < dfa->edests[node].nelem; ++i)
1754       {
1755 	re_node_set eclosure_elem;
1756 	Idx edest = dfa->edests[node].elems[i];
1757 	/* If calculating the epsilon closure of 'edest' is in progress,
1758 	   return intermediate result.  */
1759 	if (dfa->eclosures[edest].nelem == REG_MISSING)
1760 	  {
1761 	    incomplete = true;
1762 	    continue;
1763 	  }
1764 	/* If we haven't calculated the epsilon closure of 'edest' yet,
1765 	   calculate now. Otherwise use calculated epsilon closure.  */
1766 	if (dfa->eclosures[edest].nelem == 0)
1767 	  {
1768 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1769 	    if (BE (err != REG_NOERROR, 0))
1770 	      return err;
1771 	  }
1772 	else
1773 	  eclosure_elem = dfa->eclosures[edest];
1774 	/* Merge the epsilon closure of 'edest'.  */
1775 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1776 	if (BE (err != REG_NOERROR, 0))
1777 	  return err;
1778 	/* If the epsilon closure of 'edest' is incomplete,
1779 	   the epsilon closure of this node is also incomplete.  */
1780 	if (dfa->eclosures[edest].nelem == 0)
1781 	  {
1782 	    incomplete = true;
1783 	    re_node_set_free (&eclosure_elem);
1784 	  }
1785       }
1786 
1787   /* An epsilon closure includes itself.  */
1788   ok = re_node_set_insert (&eclosure, node);
1789   if (BE (! ok, 0))
1790     return REG_ESPACE;
1791   if (incomplete && !root)
1792     dfa->eclosures[node].nelem = 0;
1793   else
1794     dfa->eclosures[node] = eclosure;
1795   *new_set = eclosure;
1796   return REG_NOERROR;
1797 }
1798 
1799 /* Functions for token which are used in the parser.  */
1800 
1801 /* Fetch a token from INPUT.
1802    We must not use this function inside bracket expressions.  */
1803 
1804 static void
1805 internal_function
1806 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1807 {
1808   re_string_skip_bytes (input, peek_token (result, input, syntax));
1809 }
1810 
1811 /* Peek a token from INPUT, and return the length of the token.
1812    We must not use this function inside bracket expressions.  */
1813 
1814 static int
1815 internal_function
1816 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1817 {
1818   unsigned char c;
1819 
1820   if (re_string_eoi (input))
1821     {
1822       token->type = END_OF_RE;
1823       return 0;
1824     }
1825 
1826   c = re_string_peek_byte (input, 0);
1827   token->opr.c = c;
1828 
1829   token->word_char = 0;
1830 #ifdef RE_ENABLE_I18N
1831   token->mb_partial = 0;
1832   if (input->mb_cur_max > 1 &&
1833       !re_string_first_byte (input, re_string_cur_idx (input)))
1834     {
1835       token->type = CHARACTER;
1836       token->mb_partial = 1;
1837       return 1;
1838     }
1839 #endif
1840   if (c == '\\')
1841     {
1842       unsigned char c2;
1843       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1844 	{
1845 	  token->type = BACK_SLASH;
1846 	  return 1;
1847 	}
1848 
1849       c2 = re_string_peek_byte_case (input, 1);
1850       token->opr.c = c2;
1851       token->type = CHARACTER;
1852 #ifdef RE_ENABLE_I18N
1853       if (input->mb_cur_max > 1)
1854 	{
1855 	  wint_t wc = re_string_wchar_at (input,
1856 					  re_string_cur_idx (input) + 1);
1857 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1858 	}
1859       else
1860 #endif
1861 	token->word_char = IS_WORD_CHAR (c2) != 0;
1862 
1863       switch (c2)
1864 	{
1865 	case '|':
1866 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1867 	    token->type = OP_ALT;
1868 	  break;
1869 	case '1': case '2': case '3': case '4': case '5':
1870 	case '6': case '7': case '8': case '9':
1871 	  if (!(syntax & RE_NO_BK_REFS))
1872 	    {
1873 	      token->type = OP_BACK_REF;
1874 	      token->opr.idx = c2 - '1';
1875 	    }
1876 	  break;
1877 	case '<':
1878 	  if (!(syntax & RE_NO_GNU_OPS))
1879 	    {
1880 	      token->type = ANCHOR;
1881 	      token->opr.ctx_type = WORD_FIRST;
1882 	    }
1883 	  break;
1884 	case '>':
1885 	  if (!(syntax & RE_NO_GNU_OPS))
1886 	    {
1887 	      token->type = ANCHOR;
1888 	      token->opr.ctx_type = WORD_LAST;
1889 	    }
1890 	  break;
1891 	case 'b':
1892 	  if (!(syntax & RE_NO_GNU_OPS))
1893 	    {
1894 	      token->type = ANCHOR;
1895 	      token->opr.ctx_type = WORD_DELIM;
1896 	    }
1897 	  break;
1898 	case 'B':
1899 	  if (!(syntax & RE_NO_GNU_OPS))
1900 	    {
1901 	      token->type = ANCHOR;
1902 	      token->opr.ctx_type = NOT_WORD_DELIM;
1903 	    }
1904 	  break;
1905 	case 'w':
1906 	  if (!(syntax & RE_NO_GNU_OPS))
1907 	    token->type = OP_WORD;
1908 	  break;
1909 	case 'W':
1910 	  if (!(syntax & RE_NO_GNU_OPS))
1911 	    token->type = OP_NOTWORD;
1912 	  break;
1913 	case 's':
1914 	  if (!(syntax & RE_NO_GNU_OPS))
1915 	    token->type = OP_SPACE;
1916 	  break;
1917 	case 'S':
1918 	  if (!(syntax & RE_NO_GNU_OPS))
1919 	    token->type = OP_NOTSPACE;
1920 	  break;
1921 	case '`':
1922 	  if (!(syntax & RE_NO_GNU_OPS))
1923 	    {
1924 	      token->type = ANCHOR;
1925 	      token->opr.ctx_type = BUF_FIRST;
1926 	    }
1927 	  break;
1928 	case '\'':
1929 	  if (!(syntax & RE_NO_GNU_OPS))
1930 	    {
1931 	      token->type = ANCHOR;
1932 	      token->opr.ctx_type = BUF_LAST;
1933 	    }
1934 	  break;
1935 	case '(':
1936 	  if (!(syntax & RE_NO_BK_PARENS))
1937 	    token->type = OP_OPEN_SUBEXP;
1938 	  break;
1939 	case ')':
1940 	  if (!(syntax & RE_NO_BK_PARENS))
1941 	    token->type = OP_CLOSE_SUBEXP;
1942 	  break;
1943 	case '+':
1944 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1945 	    token->type = OP_DUP_PLUS;
1946 	  break;
1947 	case '?':
1948 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1949 	    token->type = OP_DUP_QUESTION;
1950 	  break;
1951 	case '{':
1952 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1953 	    token->type = OP_OPEN_DUP_NUM;
1954 	  break;
1955 	case '}':
1956 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1957 	    token->type = OP_CLOSE_DUP_NUM;
1958 	  break;
1959 	default:
1960 	  break;
1961 	}
1962       return 2;
1963     }
1964 
1965   token->type = CHARACTER;
1966 #ifdef RE_ENABLE_I18N
1967   if (input->mb_cur_max > 1)
1968     {
1969       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1970       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1971     }
1972   else
1973 #endif
1974     token->word_char = IS_WORD_CHAR (token->opr.c);
1975 
1976   switch (c)
1977     {
1978     case '\n':
1979       if (syntax & RE_NEWLINE_ALT)
1980 	token->type = OP_ALT;
1981       break;
1982     case '|':
1983       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1984 	token->type = OP_ALT;
1985       break;
1986     case '*':
1987       token->type = OP_DUP_ASTERISK;
1988       break;
1989     case '+':
1990       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1991 	token->type = OP_DUP_PLUS;
1992       break;
1993     case '?':
1994       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1995 	token->type = OP_DUP_QUESTION;
1996       break;
1997     case '{':
1998       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1999 	token->type = OP_OPEN_DUP_NUM;
2000       break;
2001     case '}':
2002       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2003 	token->type = OP_CLOSE_DUP_NUM;
2004       break;
2005     case '(':
2006       if (syntax & RE_NO_BK_PARENS)
2007 	token->type = OP_OPEN_SUBEXP;
2008       break;
2009     case ')':
2010       if (syntax & RE_NO_BK_PARENS)
2011 	token->type = OP_CLOSE_SUBEXP;
2012       break;
2013     case '[':
2014       token->type = OP_OPEN_BRACKET;
2015       break;
2016     case '.':
2017       token->type = OP_PERIOD;
2018       break;
2019     case '^':
2020       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2021 	  re_string_cur_idx (input) != 0)
2022 	{
2023 	  char prev = re_string_peek_byte (input, -1);
2024 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2025 	    break;
2026 	}
2027       token->type = ANCHOR;
2028       token->opr.ctx_type = LINE_FIRST;
2029       break;
2030     case '$':
2031       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2032 	  re_string_cur_idx (input) + 1 != re_string_length (input))
2033 	{
2034 	  re_token_t next;
2035 	  re_string_skip_bytes (input, 1);
2036 	  peek_token (&next, input, syntax);
2037 	  re_string_skip_bytes (input, -1);
2038 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2039 	    break;
2040 	}
2041       token->type = ANCHOR;
2042       token->opr.ctx_type = LINE_LAST;
2043       break;
2044     default:
2045       break;
2046     }
2047   return 1;
2048 }
2049 
2050 /* Peek a token from INPUT, and return the length of the token.
2051    We must not use this function out of bracket expressions.  */
2052 
2053 static int
2054 internal_function
2055 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2056 {
2057   unsigned char c;
2058   if (re_string_eoi (input))
2059     {
2060       token->type = END_OF_RE;
2061       return 0;
2062     }
2063   c = re_string_peek_byte (input, 0);
2064   token->opr.c = c;
2065 
2066 #ifdef RE_ENABLE_I18N
2067   if (input->mb_cur_max > 1 &&
2068       !re_string_first_byte (input, re_string_cur_idx (input)))
2069     {
2070       token->type = CHARACTER;
2071       return 1;
2072     }
2073 #endif /* RE_ENABLE_I18N */
2074 
2075   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2076       && re_string_cur_idx (input) + 1 < re_string_length (input))
2077     {
2078       /* In this case, '\' escape a character.  */
2079       unsigned char c2;
2080       re_string_skip_bytes (input, 1);
2081       c2 = re_string_peek_byte (input, 0);
2082       token->opr.c = c2;
2083       token->type = CHARACTER;
2084       return 1;
2085     }
2086   if (c == '[') /* '[' is a special char in a bracket exps.  */
2087     {
2088       unsigned char c2;
2089       int token_len;
2090       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2091 	c2 = re_string_peek_byte (input, 1);
2092       else
2093 	c2 = 0;
2094       token->opr.c = c2;
2095       token_len = 2;
2096       switch (c2)
2097 	{
2098 	case '.':
2099 	  token->type = OP_OPEN_COLL_ELEM;
2100 	  break;
2101 	case '=':
2102 	  token->type = OP_OPEN_EQUIV_CLASS;
2103 	  break;
2104 	case ':':
2105 	  if (syntax & RE_CHAR_CLASSES)
2106 	    {
2107 	      token->type = OP_OPEN_CHAR_CLASS;
2108 	      break;
2109 	    }
2110 	  /* else fall through.  */
2111 	default:
2112 	  token->type = CHARACTER;
2113 	  token->opr.c = c;
2114 	  token_len = 1;
2115 	  break;
2116 	}
2117       return token_len;
2118     }
2119   switch (c)
2120     {
2121     case '-':
2122       token->type = OP_CHARSET_RANGE;
2123       break;
2124     case ']':
2125       token->type = OP_CLOSE_BRACKET;
2126       break;
2127     case '^':
2128       token->type = OP_NON_MATCH_LIST;
2129       break;
2130     default:
2131       token->type = CHARACTER;
2132     }
2133   return 1;
2134 }
2135 
2136 /* Functions for parser.  */
2137 
2138 /* Entry point of the parser.
2139    Parse the regular expression REGEXP and return the structure tree.
2140    If an error occurs, ERR is set by error code, and return NULL.
2141    This function build the following tree, from regular expression <reg_exp>:
2142 	   CAT
2143 	   / \
2144 	  /   \
2145    <reg_exp>  EOR
2146 
2147    CAT means concatenation.
2148    EOR means end of regular expression.  */
2149 
2150 static bin_tree_t *
2151 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2152        reg_errcode_t *err)
2153 {
2154   re_dfa_t *dfa = preg->buffer;
2155   bin_tree_t *tree, *eor, *root;
2156   re_token_t current_token;
2157   dfa->syntax = syntax;
2158   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2159   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2160   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2161     return NULL;
2162   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2163   if (tree != NULL)
2164     root = create_tree (dfa, tree, eor, CONCAT);
2165   else
2166     root = eor;
2167   if (BE (eor == NULL || root == NULL, 0))
2168     {
2169       *err = REG_ESPACE;
2170       return NULL;
2171     }
2172   return root;
2173 }
2174 
2175 /* This function build the following tree, from regular expression
2176    <branch1>|<branch2>:
2177 	   ALT
2178 	   / \
2179 	  /   \
2180    <branch1> <branch2>
2181 
2182    ALT means alternative, which represents the operator '|'.  */
2183 
2184 static bin_tree_t *
2185 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2186 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2187 {
2188   re_dfa_t *dfa = preg->buffer;
2189   bin_tree_t *tree, *branch = NULL;
2190   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2191   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2192     return NULL;
2193 
2194   while (token->type == OP_ALT)
2195     {
2196       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2197       if (token->type != OP_ALT && token->type != END_OF_RE
2198 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2199 	{
2200 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2201 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2202 	    return NULL;
2203 	}
2204       else
2205 	branch = NULL;
2206       tree = create_tree (dfa, tree, branch, OP_ALT);
2207       if (BE (tree == NULL, 0))
2208 	{
2209 	  *err = REG_ESPACE;
2210 	  return NULL;
2211 	}
2212     }
2213   return tree;
2214 }
2215 
2216 /* This function build the following tree, from regular expression
2217    <exp1><exp2>:
2218 	CAT
2219 	/ \
2220        /   \
2221    <exp1> <exp2>
2222 
2223    CAT means concatenation.  */
2224 
2225 static bin_tree_t *
2226 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2227 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2228 {
2229   bin_tree_t *tree, *expr;
2230   re_dfa_t *dfa = preg->buffer;
2231   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2232   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2233     return NULL;
2234 
2235   while (token->type != OP_ALT && token->type != END_OF_RE
2236 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2237     {
2238       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2239       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2240 	{
2241 	  if (tree != NULL)
2242 	    postorder (tree, free_tree, NULL);
2243 	  return NULL;
2244 	}
2245       if (tree != NULL && expr != NULL)
2246 	{
2247 	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2248 	  if (newtree == NULL)
2249 	    {
2250 	      postorder (expr, free_tree, NULL);
2251 	      postorder (tree, free_tree, NULL);
2252 	      *err = REG_ESPACE;
2253 	      return NULL;
2254 	    }
2255 	  tree = newtree;
2256 	}
2257       else if (tree == NULL)
2258 	tree = expr;
2259       /* Otherwise expr == NULL, we don't need to create new tree.  */
2260     }
2261   return tree;
2262 }
2263 
2264 /* This function build the following tree, from regular expression a*:
2265 	 *
2266 	 |
2267 	 a
2268 */
2269 
2270 static bin_tree_t *
2271 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2272 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2273 {
2274   re_dfa_t *dfa = preg->buffer;
2275   bin_tree_t *tree;
2276   switch (token->type)
2277     {
2278     case CHARACTER:
2279       tree = create_token_tree (dfa, NULL, NULL, token);
2280       if (BE (tree == NULL, 0))
2281 	{
2282 	  *err = REG_ESPACE;
2283 	  return NULL;
2284 	}
2285 #ifdef RE_ENABLE_I18N
2286       if (dfa->mb_cur_max > 1)
2287 	{
2288 	  while (!re_string_eoi (regexp)
2289 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2290 	    {
2291 	      bin_tree_t *mbc_remain;
2292 	      fetch_token (token, regexp, syntax);
2293 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2294 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2295 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2296 		{
2297 		  *err = REG_ESPACE;
2298 		  return NULL;
2299 		}
2300 	    }
2301 	}
2302 #endif
2303       break;
2304     case OP_OPEN_SUBEXP:
2305       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2306       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2307 	return NULL;
2308       break;
2309     case OP_OPEN_BRACKET:
2310       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2311       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2312 	return NULL;
2313       break;
2314     case OP_BACK_REF:
2315       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2316 	{
2317 	  *err = REG_ESUBREG;
2318 	  return NULL;
2319 	}
2320       dfa->used_bkref_map |= 1 << token->opr.idx;
2321       tree = create_token_tree (dfa, NULL, NULL, token);
2322       if (BE (tree == NULL, 0))
2323 	{
2324 	  *err = REG_ESPACE;
2325 	  return NULL;
2326 	}
2327       ++dfa->nbackref;
2328       dfa->has_mb_node = 1;
2329       break;
2330     case OP_OPEN_DUP_NUM:
2331       if (syntax & RE_CONTEXT_INVALID_DUP)
2332 	{
2333 	  *err = REG_BADRPT;
2334 	  return NULL;
2335 	}
2336       /* FALLTHROUGH */
2337     case OP_DUP_ASTERISK:
2338     case OP_DUP_PLUS:
2339     case OP_DUP_QUESTION:
2340       if (syntax & RE_CONTEXT_INVALID_OPS)
2341 	{
2342 	  *err = REG_BADRPT;
2343 	  return NULL;
2344 	}
2345       else if (syntax & RE_CONTEXT_INDEP_OPS)
2346 	{
2347 	  fetch_token (token, regexp, syntax);
2348 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2349 	}
2350       /* else fall through  */
2351     case OP_CLOSE_SUBEXP:
2352       if ((token->type == OP_CLOSE_SUBEXP) &&
2353 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2354 	{
2355 	  *err = REG_ERPAREN;
2356 	  return NULL;
2357 	}
2358       /* else fall through  */
2359     case OP_CLOSE_DUP_NUM:
2360       /* We treat it as a normal character.  */
2361 
2362       /* Then we can these characters as normal characters.  */
2363       token->type = CHARACTER;
2364       /* mb_partial and word_char bits should be initialized already
2365 	 by peek_token.  */
2366       tree = create_token_tree (dfa, NULL, NULL, token);
2367       if (BE (tree == NULL, 0))
2368 	{
2369 	  *err = REG_ESPACE;
2370 	  return NULL;
2371 	}
2372       break;
2373     case ANCHOR:
2374       if ((token->opr.ctx_type
2375 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2376 	  && dfa->word_ops_used == 0)
2377 	init_word_char (dfa);
2378       if (token->opr.ctx_type == WORD_DELIM
2379 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2380 	{
2381 	  bin_tree_t *tree_first, *tree_last;
2382 	  if (token->opr.ctx_type == WORD_DELIM)
2383 	    {
2384 	      token->opr.ctx_type = WORD_FIRST;
2385 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2386 	      token->opr.ctx_type = WORD_LAST;
2387 	    }
2388 	  else
2389 	    {
2390 	      token->opr.ctx_type = INSIDE_WORD;
2391 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2392 	      token->opr.ctx_type = INSIDE_NOTWORD;
2393 	    }
2394 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2395 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2396 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2397 	    {
2398 	      *err = REG_ESPACE;
2399 	      return NULL;
2400 	    }
2401 	}
2402       else
2403 	{
2404 	  tree = create_token_tree (dfa, NULL, NULL, token);
2405 	  if (BE (tree == NULL, 0))
2406 	    {
2407 	      *err = REG_ESPACE;
2408 	      return NULL;
2409 	    }
2410 	}
2411       /* We must return here, since ANCHORs can't be followed
2412 	 by repetition operators.
2413 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2414 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2415       fetch_token (token, regexp, syntax);
2416       return tree;
2417     case OP_PERIOD:
2418       tree = create_token_tree (dfa, NULL, NULL, token);
2419       if (BE (tree == NULL, 0))
2420 	{
2421 	  *err = REG_ESPACE;
2422 	  return NULL;
2423 	}
2424       if (dfa->mb_cur_max > 1)
2425 	dfa->has_mb_node = 1;
2426       break;
2427     case OP_WORD:
2428     case OP_NOTWORD:
2429       tree = build_charclass_op (dfa, regexp->trans,
2430 				 "alnum",
2431 				 "_",
2432 				 token->type == OP_NOTWORD, err);
2433       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2434 	return NULL;
2435       break;
2436     case OP_SPACE:
2437     case OP_NOTSPACE:
2438       tree = build_charclass_op (dfa, regexp->trans,
2439 				 "space",
2440 				 "",
2441 				 token->type == OP_NOTSPACE, err);
2442       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2443 	return NULL;
2444       break;
2445     case OP_ALT:
2446     case END_OF_RE:
2447       return NULL;
2448     case BACK_SLASH:
2449       *err = REG_EESCAPE;
2450       return NULL;
2451     default:
2452       /* Must not happen?  */
2453 #ifdef DEBUG
2454       assert (0);
2455 #endif
2456       return NULL;
2457     }
2458   fetch_token (token, regexp, syntax);
2459 
2460   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2461 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2462     {
2463       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2464       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2465 	return NULL;
2466       /* In BRE consecutive duplications are not allowed.  */
2467       if ((syntax & RE_CONTEXT_INVALID_DUP)
2468 	  && (token->type == OP_DUP_ASTERISK
2469 	      || token->type == OP_OPEN_DUP_NUM))
2470 	{
2471 	  *err = REG_BADRPT;
2472 	  return NULL;
2473 	}
2474     }
2475 
2476   return tree;
2477 }
2478 
2479 /* This function build the following tree, from regular expression
2480    (<reg_exp>):
2481 	 SUBEXP
2482 	    |
2483 	<reg_exp>
2484 */
2485 
2486 static bin_tree_t *
2487 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2488 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2489 {
2490   re_dfa_t *dfa = preg->buffer;
2491   bin_tree_t *tree;
2492   size_t cur_nsub;
2493   cur_nsub = preg->re_nsub++;
2494 
2495   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2496 
2497   /* The subexpression may be a null string.  */
2498   if (token->type == OP_CLOSE_SUBEXP)
2499     tree = NULL;
2500   else
2501     {
2502       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2503       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2504 	{
2505 	  if (tree != NULL)
2506 	    postorder (tree, free_tree, NULL);
2507 	  *err = REG_EPAREN;
2508 	}
2509       if (BE (*err != REG_NOERROR, 0))
2510 	return NULL;
2511     }
2512 
2513   if (cur_nsub <= '9' - '1')
2514     dfa->completed_bkref_map |= 1 << cur_nsub;
2515 
2516   tree = create_tree (dfa, tree, NULL, SUBEXP);
2517   if (BE (tree == NULL, 0))
2518     {
2519       *err = REG_ESPACE;
2520       return NULL;
2521     }
2522   tree->token.opr.idx = cur_nsub;
2523   return tree;
2524 }
2525 
2526 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2527 
2528 static bin_tree_t *
2529 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2530 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2531 {
2532   bin_tree_t *tree = NULL, *old_tree = NULL;
2533   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2534   re_token_t start_token = *token;
2535 
2536   if (token->type == OP_OPEN_DUP_NUM)
2537     {
2538       end = 0;
2539       start = fetch_number (regexp, token, syntax);
2540       if (start == REG_MISSING)
2541 	{
2542 	  if (token->type == CHARACTER && token->opr.c == ',')
2543 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2544 	  else
2545 	    {
2546 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2547 	      return NULL;
2548 	    }
2549 	}
2550       if (BE (start != REG_ERROR, 1))
2551 	{
2552 	  /* We treat "{n}" as "{n,n}".  */
2553 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2554 		 : ((token->type == CHARACTER && token->opr.c == ',')
2555 		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2556 	}
2557       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2558 	{
2559 	  /* Invalid sequence.  */
2560 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2561 	    {
2562 	      if (token->type == END_OF_RE)
2563 		*err = REG_EBRACE;
2564 	      else
2565 		*err = REG_BADBR;
2566 
2567 	      return NULL;
2568 	    }
2569 
2570 	  /* If the syntax bit is set, rollback.  */
2571 	  re_string_set_index (regexp, start_idx);
2572 	  *token = start_token;
2573 	  token->type = CHARACTER;
2574 	  /* mb_partial and word_char bits should be already initialized by
2575 	     peek_token.  */
2576 	  return elem;
2577 	}
2578 
2579       if (BE ((end != REG_MISSING && start > end)
2580 	      || token->type != OP_CLOSE_DUP_NUM, 0))
2581 	{
2582 	  /* First number greater than second.  */
2583 	  *err = REG_BADBR;
2584 	  return NULL;
2585 	}
2586 
2587       if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2588 	{
2589 	  *err = REG_ESIZE;
2590 	  return NULL;
2591 	}
2592     }
2593   else
2594     {
2595       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2596       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2597     }
2598 
2599   fetch_token (token, regexp, syntax);
2600 
2601   if (BE (elem == NULL, 0))
2602     return NULL;
2603   if (BE (start == 0 && end == 0, 0))
2604     {
2605       postorder (elem, free_tree, NULL);
2606       return NULL;
2607     }
2608 
2609   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2610   if (BE (start > 0, 0))
2611     {
2612       tree = elem;
2613       for (i = 2; i <= start; ++i)
2614 	{
2615 	  elem = duplicate_tree (elem, dfa);
2616 	  tree = create_tree (dfa, tree, elem, CONCAT);
2617 	  if (BE (elem == NULL || tree == NULL, 0))
2618 	    goto parse_dup_op_espace;
2619 	}
2620 
2621       if (start == end)
2622 	return tree;
2623 
2624       /* Duplicate ELEM before it is marked optional.  */
2625       elem = duplicate_tree (elem, dfa);
2626       old_tree = tree;
2627     }
2628   else
2629     old_tree = NULL;
2630 
2631   if (elem->token.type == SUBEXP)
2632     {
2633       uintptr_t subidx = elem->token.opr.idx;
2634       postorder (elem, mark_opt_subexp, (void *) subidx);
2635     }
2636 
2637   tree = create_tree (dfa, elem, NULL,
2638 		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2639   if (BE (tree == NULL, 0))
2640     goto parse_dup_op_espace;
2641 
2642 /* From gnulib's "intprops.h":
2643    True if the arithmetic type T is signed.  */
2644 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2645 
2646   /* This loop is actually executed only when end != REG_MISSING,
2647      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2648      already created the start+1-th copy.  */
2649   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2650     for (i = start + 2; i <= end; ++i)
2651       {
2652 	elem = duplicate_tree (elem, dfa);
2653 	tree = create_tree (dfa, tree, elem, CONCAT);
2654 	if (BE (elem == NULL || tree == NULL, 0))
2655 	  goto parse_dup_op_espace;
2656 
2657 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2658 	if (BE (tree == NULL, 0))
2659 	  goto parse_dup_op_espace;
2660       }
2661 
2662   if (old_tree)
2663     tree = create_tree (dfa, old_tree, tree, CONCAT);
2664 
2665   return tree;
2666 
2667  parse_dup_op_espace:
2668   *err = REG_ESPACE;
2669   return NULL;
2670 }
2671 
2672 /* Size of the names for collating symbol/equivalence_class/character_class.
2673    I'm not sure, but maybe enough.  */
2674 #define BRACKET_NAME_BUF_SIZE 32
2675 
2676 #ifndef _LIBC
2677   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2678      Build the range expression which starts from START_ELEM, and ends
2679      at END_ELEM.  The result are written to MBCSET and SBCSET.
2680      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2681      mbcset->range_ends, is a pointer argument since we may
2682      update it.  */
2683 
2684 static reg_errcode_t
2685 internal_function
2686 # ifdef RE_ENABLE_I18N
2687 build_range_exp (const reg_syntax_t syntax,
2688                  bitset_t sbcset,
2689                  re_charset_t *mbcset,
2690                  Idx *range_alloc,
2691                  const bracket_elem_t *start_elem,
2692                  const bracket_elem_t *end_elem)
2693 # else /* not RE_ENABLE_I18N */
2694 build_range_exp (const reg_syntax_t syntax,
2695                  bitset_t sbcset,
2696                  const bracket_elem_t *start_elem,
2697                  const bracket_elem_t *end_elem)
2698 # endif /* not RE_ENABLE_I18N */
2699 {
2700   unsigned int start_ch, end_ch;
2701   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2702   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2703 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2704 	  0))
2705     return REG_ERANGE;
2706 
2707   /* We can handle no multi character collating elements without libc
2708      support.  */
2709   if (BE ((start_elem->type == COLL_SYM
2710 	   && strlen ((char *) start_elem->opr.name) > 1)
2711 	  || (end_elem->type == COLL_SYM
2712 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2713     return REG_ECOLLATE;
2714 
2715 # ifdef RE_ENABLE_I18N
2716   {
2717     wchar_t wc;
2718     wint_t start_wc;
2719     wint_t end_wc;
2720 
2721     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2722 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2723 		   : 0));
2724     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2725 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2726 		 : 0));
2727     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2728 		? __btowc (start_ch) : start_elem->opr.wch);
2729     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2730 	      ? __btowc (end_ch) : end_elem->opr.wch);
2731     if (start_wc == WEOF || end_wc == WEOF)
2732       return REG_ECOLLATE;
2733     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2734       return REG_ERANGE;
2735 
2736     /* Got valid collation sequence values, add them as a new entry.
2737        However, for !_LIBC we have no collation elements: if the
2738        character set is single byte, the single byte character set
2739        that we build below suffices.  parse_bracket_exp passes
2740        no MBCSET if dfa->mb_cur_max == 1.  */
2741     if (mbcset)
2742       {
2743 	/* Check the space of the arrays.  */
2744 	if (BE (*range_alloc == mbcset->nranges, 0))
2745 	  {
2746 	    /* There is not enough space, need realloc.  */
2747 	    wchar_t *new_array_start, *new_array_end;
2748 	    Idx new_nranges;
2749 
2750 	    /* +1 in case of mbcset->nranges is 0.  */
2751 	    new_nranges = 2 * mbcset->nranges + 1;
2752 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2753 	       are NULL if *range_alloc == 0.  */
2754 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2755 					  new_nranges);
2756 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2757 					new_nranges);
2758 
2759 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2760 	      return REG_ESPACE;
2761 
2762 	    mbcset->range_starts = new_array_start;
2763 	    mbcset->range_ends = new_array_end;
2764 	    *range_alloc = new_nranges;
2765 	  }
2766 
2767 	mbcset->range_starts[mbcset->nranges] = start_wc;
2768 	mbcset->range_ends[mbcset->nranges++] = end_wc;
2769       }
2770 
2771     /* Build the table for single byte characters.  */
2772     for (wc = 0; wc < SBC_MAX; ++wc)
2773       {
2774 	if (start_wc <= wc && wc <= end_wc)
2775 	  bitset_set (sbcset, wc);
2776       }
2777   }
2778 # else /* not RE_ENABLE_I18N */
2779   {
2780     unsigned int ch;
2781     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2782 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2783 		   : 0));
2784     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2785 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2786 		 : 0));
2787     if (start_ch > end_ch)
2788       return REG_ERANGE;
2789     /* Build the table for single byte characters.  */
2790     for (ch = 0; ch < SBC_MAX; ++ch)
2791       if (start_ch <= ch  && ch <= end_ch)
2792 	bitset_set (sbcset, ch);
2793   }
2794 # endif /* not RE_ENABLE_I18N */
2795   return REG_NOERROR;
2796 }
2797 #endif /* not _LIBC */
2798 
2799 #ifndef _LIBC
2800 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2801    Build the collating element which is represented by NAME.
2802    The result are written to MBCSET and SBCSET.
2803    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2804    pointer argument since we may update it.  */
2805 
2806 static reg_errcode_t
2807 internal_function
2808 # ifdef RE_ENABLE_I18N
2809 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2810 			Idx *coll_sym_alloc, const unsigned char *name)
2811 # else /* not RE_ENABLE_I18N */
2812 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2813 # endif /* not RE_ENABLE_I18N */
2814 {
2815   size_t name_len = strlen ((const char *) name);
2816   if (BE (name_len != 1, 0))
2817     return REG_ECOLLATE;
2818   else
2819     {
2820       bitset_set (sbcset, name[0]);
2821       return REG_NOERROR;
2822     }
2823 }
2824 #endif /* not _LIBC */
2825 
2826 /* This function parse bracket expression like "[abc]", "[a-c]",
2827    "[[.a-a.]]" etc.  */
2828 
2829 static bin_tree_t *
2830 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2831 		   reg_syntax_t syntax, reg_errcode_t *err)
2832 {
2833 #ifdef _LIBC
2834   const unsigned char *collseqmb;
2835   const char *collseqwc;
2836   uint32_t nrules;
2837   int32_t table_size;
2838   const int32_t *symb_table;
2839   const unsigned char *extra;
2840 
2841   /* Local function for parse_bracket_exp used in _LIBC environment.
2842      Seek the collating symbol entry corresponding to NAME.
2843      Return the index of the symbol in the SYMB_TABLE,
2844      or -1 if not found.  */
2845 
2846   auto inline int32_t
2847   __attribute__ ((always_inline))
2848   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2849     {
2850       int32_t elem;
2851 
2852       for (elem = 0; elem < table_size; elem++)
2853 	if (symb_table[2 * elem] != 0)
2854 	  {
2855 	    int32_t idx = symb_table[2 * elem + 1];
2856 	    /* Skip the name of collating element name.  */
2857 	    idx += 1 + extra[idx];
2858 	    if (/* Compare the length of the name.  */
2859 		name_len == extra[idx]
2860 		/* Compare the name.  */
2861 		&& memcmp (name, &extra[idx + 1], name_len) == 0)
2862 	      /* Yep, this is the entry.  */
2863 	      return elem;
2864 	  }
2865       return -1;
2866     }
2867 
2868   /* Local function for parse_bracket_exp used in _LIBC environment.
2869      Look up the collation sequence value of BR_ELEM.
2870      Return the value if succeeded, UINT_MAX otherwise.  */
2871 
2872   auto inline unsigned int
2873   __attribute__ ((always_inline))
2874   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2875     {
2876       if (br_elem->type == SB_CHAR)
2877 	{
2878 	  /*
2879 	  if (MB_CUR_MAX == 1)
2880 	  */
2881 	  if (nrules == 0)
2882 	    return collseqmb[br_elem->opr.ch];
2883 	  else
2884 	    {
2885 	      wint_t wc = __btowc (br_elem->opr.ch);
2886 	      return __collseq_table_lookup (collseqwc, wc);
2887 	    }
2888 	}
2889       else if (br_elem->type == MB_CHAR)
2890 	{
2891 	  if (nrules != 0)
2892 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2893 	}
2894       else if (br_elem->type == COLL_SYM)
2895 	{
2896 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2897 	  if (nrules != 0)
2898 	    {
2899 	      int32_t elem, idx;
2900 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2901 						  sym_name_len);
2902 	      if (elem != -1)
2903 		{
2904 		  /* We found the entry.  */
2905 		  idx = symb_table[2 * elem + 1];
2906 		  /* Skip the name of collating element name.  */
2907 		  idx += 1 + extra[idx];
2908 		  /* Skip the byte sequence of the collating element.  */
2909 		  idx += 1 + extra[idx];
2910 		  /* Adjust for the alignment.  */
2911 		  idx = (idx + 3) & ~3;
2912 		  /* Skip the multibyte collation sequence value.  */
2913 		  idx += sizeof (unsigned int);
2914 		  /* Skip the wide char sequence of the collating element.  */
2915 		  idx += sizeof (unsigned int) *
2916 		    (1 + *(unsigned int *) (extra + idx));
2917 		  /* Return the collation sequence value.  */
2918 		  return *(unsigned int *) (extra + idx);
2919 		}
2920 	      else if (sym_name_len == 1)
2921 		{
2922 		  /* No valid character.  Match it as a single byte
2923 		     character.  */
2924 		  return collseqmb[br_elem->opr.name[0]];
2925 		}
2926 	    }
2927 	  else if (sym_name_len == 1)
2928 	    return collseqmb[br_elem->opr.name[0]];
2929 	}
2930       return UINT_MAX;
2931     }
2932 
2933   /* Local function for parse_bracket_exp used in _LIBC environment.
2934      Build the range expression which starts from START_ELEM, and ends
2935      at END_ELEM.  The result are written to MBCSET and SBCSET.
2936      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2937      mbcset->range_ends, is a pointer argument since we may
2938      update it.  */
2939 
2940   auto inline reg_errcode_t
2941   __attribute__ ((always_inline))
2942   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2943 		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2944     {
2945       unsigned int ch;
2946       uint32_t start_collseq;
2947       uint32_t end_collseq;
2948 
2949       /* Equivalence Classes and Character Classes can't be a range
2950 	 start/end.  */
2951       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2952 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2953 	      0))
2954 	return REG_ERANGE;
2955 
2956       /* FIXME: Implement rational ranges here, too.  */
2957       start_collseq = lookup_collation_sequence_value (start_elem);
2958       end_collseq = lookup_collation_sequence_value (end_elem);
2959       /* Check start/end collation sequence values.  */
2960       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2961 	return REG_ECOLLATE;
2962       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2963 	return REG_ERANGE;
2964 
2965       /* Got valid collation sequence values, add them as a new entry.
2966 	 However, if we have no collation elements, and the character set
2967 	 is single byte, the single byte character set that we
2968 	 build below suffices. */
2969       if (nrules > 0 || dfa->mb_cur_max > 1)
2970 	{
2971 	  /* Check the space of the arrays.  */
2972 	  if (BE (*range_alloc == mbcset->nranges, 0))
2973 	    {
2974 	      /* There is not enough space, need realloc.  */
2975 	      uint32_t *new_array_start;
2976 	      uint32_t *new_array_end;
2977 	      Idx new_nranges;
2978 
2979 	      /* +1 in case of mbcset->nranges is 0.  */
2980 	      new_nranges = 2 * mbcset->nranges + 1;
2981 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2982 					    new_nranges);
2983 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2984 					  new_nranges);
2985 
2986 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2987 		return REG_ESPACE;
2988 
2989 	      mbcset->range_starts = new_array_start;
2990 	      mbcset->range_ends = new_array_end;
2991 	      *range_alloc = new_nranges;
2992 	    }
2993 
2994 	  mbcset->range_starts[mbcset->nranges] = start_collseq;
2995 	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
2996 	}
2997 
2998       /* Build the table for single byte characters.  */
2999       for (ch = 0; ch < SBC_MAX; ch++)
3000 	{
3001 	  uint32_t ch_collseq;
3002 	  /*
3003 	  if (MB_CUR_MAX == 1)
3004 	  */
3005 	  if (nrules == 0)
3006 	    ch_collseq = collseqmb[ch];
3007 	  else
3008 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3009 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3010 	    bitset_set (sbcset, ch);
3011 	}
3012       return REG_NOERROR;
3013     }
3014 
3015   /* Local function for parse_bracket_exp used in _LIBC environment.
3016      Build the collating element which is represented by NAME.
3017      The result are written to MBCSET and SBCSET.
3018      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3019      pointer argument since we may update it.  */
3020 
3021   auto inline reg_errcode_t
3022   __attribute__ ((always_inline))
3023   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3024 			  Idx *coll_sym_alloc, const unsigned char *name)
3025     {
3026       int32_t elem, idx;
3027       size_t name_len = strlen ((const char *) name);
3028       if (nrules != 0)
3029 	{
3030 	  elem = seek_collating_symbol_entry (name, name_len);
3031 	  if (elem != -1)
3032 	    {
3033 	      /* We found the entry.  */
3034 	      idx = symb_table[2 * elem + 1];
3035 	      /* Skip the name of collating element name.  */
3036 	      idx += 1 + extra[idx];
3037 	    }
3038 	  else if (name_len == 1)
3039 	    {
3040 	      /* No valid character, treat it as a normal
3041 		 character.  */
3042 	      bitset_set (sbcset, name[0]);
3043 	      return REG_NOERROR;
3044 	    }
3045 	  else
3046 	    return REG_ECOLLATE;
3047 
3048 	  /* Got valid collation sequence, add it as a new entry.  */
3049 	  /* Check the space of the arrays.  */
3050 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3051 	    {
3052 	      /* Not enough, realloc it.  */
3053 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3054 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3055 	      /* Use realloc since mbcset->coll_syms is NULL
3056 		 if *alloc == 0.  */
3057 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3058 						   new_coll_sym_alloc);
3059 	      if (BE (new_coll_syms == NULL, 0))
3060 		return REG_ESPACE;
3061 	      mbcset->coll_syms = new_coll_syms;
3062 	      *coll_sym_alloc = new_coll_sym_alloc;
3063 	    }
3064 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3065 	  return REG_NOERROR;
3066 	}
3067       else
3068 	{
3069 	  if (BE (name_len != 1, 0))
3070 	    return REG_ECOLLATE;
3071 	  else
3072 	    {
3073 	      bitset_set (sbcset, name[0]);
3074 	      return REG_NOERROR;
3075 	    }
3076 	}
3077     }
3078 #endif
3079 
3080   re_token_t br_token;
3081   re_bitset_ptr_t sbcset;
3082 #ifdef RE_ENABLE_I18N
3083   re_charset_t *mbcset;
3084   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3085   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3086 #endif /* not RE_ENABLE_I18N */
3087   bool non_match = false;
3088   bin_tree_t *work_tree;
3089   int token_len;
3090   bool first_round = true;
3091 #ifdef _LIBC
3092   collseqmb = (const unsigned char *)
3093     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3094   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3095   if (nrules)
3096     {
3097       /*
3098       if (MB_CUR_MAX > 1)
3099       */
3100       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3101       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3102       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3103 						  _NL_COLLATE_SYMB_TABLEMB);
3104       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3105 						   _NL_COLLATE_SYMB_EXTRAMB);
3106     }
3107 #endif
3108   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3109 #ifdef RE_ENABLE_I18N
3110   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3111 #endif /* RE_ENABLE_I18N */
3112 #ifdef RE_ENABLE_I18N
3113   if (BE (sbcset == NULL || mbcset == NULL, 0))
3114 #else
3115   if (BE (sbcset == NULL, 0))
3116 #endif /* RE_ENABLE_I18N */
3117     {
3118       re_free (sbcset);
3119 #ifdef RE_ENABLE_I18N
3120       re_free (mbcset);
3121 #endif
3122       *err = REG_ESPACE;
3123       return NULL;
3124     }
3125 
3126   token_len = peek_token_bracket (token, regexp, syntax);
3127   if (BE (token->type == END_OF_RE, 0))
3128     {
3129       *err = REG_BADPAT;
3130       goto parse_bracket_exp_free_return;
3131     }
3132   if (token->type == OP_NON_MATCH_LIST)
3133     {
3134 #ifdef RE_ENABLE_I18N
3135       mbcset->non_match = 1;
3136 #endif /* not RE_ENABLE_I18N */
3137       non_match = true;
3138       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3139 	bitset_set (sbcset, '\n');
3140       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3141       token_len = peek_token_bracket (token, regexp, syntax);
3142       if (BE (token->type == END_OF_RE, 0))
3143 	{
3144 	  *err = REG_BADPAT;
3145 	  goto parse_bracket_exp_free_return;
3146 	}
3147     }
3148 
3149   /* We treat the first ']' as a normal character.  */
3150   if (token->type == OP_CLOSE_BRACKET)
3151     token->type = CHARACTER;
3152 
3153   while (1)
3154     {
3155       bracket_elem_t start_elem, end_elem;
3156       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3157       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3158       reg_errcode_t ret;
3159       int token_len2 = 0;
3160       bool is_range_exp = false;
3161       re_token_t token2;
3162 
3163       start_elem.opr.name = start_name_buf;
3164       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3165 				   syntax, first_round);
3166       if (BE (ret != REG_NOERROR, 0))
3167 	{
3168 	  *err = ret;
3169 	  goto parse_bracket_exp_free_return;
3170 	}
3171       first_round = false;
3172 
3173       /* Get information about the next token.  We need it in any case.  */
3174       token_len = peek_token_bracket (token, regexp, syntax);
3175 
3176       /* Do not check for ranges if we know they are not allowed.  */
3177       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3178 	{
3179 	  if (BE (token->type == END_OF_RE, 0))
3180 	    {
3181 	      *err = REG_EBRACK;
3182 	      goto parse_bracket_exp_free_return;
3183 	    }
3184 	  if (token->type == OP_CHARSET_RANGE)
3185 	    {
3186 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3187 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3188 	      if (BE (token2.type == END_OF_RE, 0))
3189 		{
3190 		  *err = REG_EBRACK;
3191 		  goto parse_bracket_exp_free_return;
3192 		}
3193 	      if (token2.type == OP_CLOSE_BRACKET)
3194 		{
3195 		  /* We treat the last '-' as a normal character.  */
3196 		  re_string_skip_bytes (regexp, -token_len);
3197 		  token->type = CHARACTER;
3198 		}
3199 	      else
3200 		is_range_exp = true;
3201 	    }
3202 	}
3203 
3204       if (is_range_exp == true)
3205 	{
3206 	  end_elem.opr.name = end_name_buf;
3207 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3208 				       dfa, syntax, true);
3209 	  if (BE (ret != REG_NOERROR, 0))
3210 	    {
3211 	      *err = ret;
3212 	      goto parse_bracket_exp_free_return;
3213 	    }
3214 
3215 	  token_len = peek_token_bracket (token, regexp, syntax);
3216 
3217 #ifdef _LIBC
3218 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3219 				  &start_elem, &end_elem);
3220 #else
3221 # ifdef RE_ENABLE_I18N
3222 	  *err = build_range_exp (syntax, sbcset,
3223 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3224 				  &range_alloc, &start_elem, &end_elem);
3225 # else
3226 	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3227 # endif
3228 #endif /* RE_ENABLE_I18N */
3229 	  if (BE (*err != REG_NOERROR, 0))
3230 	    goto parse_bracket_exp_free_return;
3231 	}
3232       else
3233 	{
3234 	  switch (start_elem.type)
3235 	    {
3236 	    case SB_CHAR:
3237 	      bitset_set (sbcset, start_elem.opr.ch);
3238 	      break;
3239 #ifdef RE_ENABLE_I18N
3240 	    case MB_CHAR:
3241 	      /* Check whether the array has enough space.  */
3242 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3243 		{
3244 		  wchar_t *new_mbchars;
3245 		  /* Not enough, realloc it.  */
3246 		  /* +1 in case of mbcset->nmbchars is 0.  */
3247 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3248 		  /* Use realloc since array is NULL if *alloc == 0.  */
3249 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3250 					    mbchar_alloc);
3251 		  if (BE (new_mbchars == NULL, 0))
3252 		    goto parse_bracket_exp_espace;
3253 		  mbcset->mbchars = new_mbchars;
3254 		}
3255 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3256 	      break;
3257 #endif /* RE_ENABLE_I18N */
3258 	    case EQUIV_CLASS:
3259 	      *err = build_equiv_class (sbcset,
3260 #ifdef RE_ENABLE_I18N
3261 					mbcset, &equiv_class_alloc,
3262 #endif /* RE_ENABLE_I18N */
3263 					start_elem.opr.name);
3264 	      if (BE (*err != REG_NOERROR, 0))
3265 		goto parse_bracket_exp_free_return;
3266 	      break;
3267 	    case COLL_SYM:
3268 	      *err = build_collating_symbol (sbcset,
3269 #ifdef RE_ENABLE_I18N
3270 					     mbcset, &coll_sym_alloc,
3271 #endif /* RE_ENABLE_I18N */
3272 					     start_elem.opr.name);
3273 	      if (BE (*err != REG_NOERROR, 0))
3274 		goto parse_bracket_exp_free_return;
3275 	      break;
3276 	    case CHAR_CLASS:
3277 	      *err = build_charclass (regexp->trans, sbcset,
3278 #ifdef RE_ENABLE_I18N
3279 				      mbcset, &char_class_alloc,
3280 #endif /* RE_ENABLE_I18N */
3281 				      (const char *) start_elem.opr.name,
3282 				      syntax);
3283 	      if (BE (*err != REG_NOERROR, 0))
3284 	       goto parse_bracket_exp_free_return;
3285 	      break;
3286 	    default:
3287 	      assert (0);
3288 	      break;
3289 	    }
3290 	}
3291       if (BE (token->type == END_OF_RE, 0))
3292 	{
3293 	  *err = REG_EBRACK;
3294 	  goto parse_bracket_exp_free_return;
3295 	}
3296       if (token->type == OP_CLOSE_BRACKET)
3297 	break;
3298     }
3299 
3300   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3301 
3302   /* If it is non-matching list.  */
3303   if (non_match)
3304     bitset_not (sbcset);
3305 
3306 #ifdef RE_ENABLE_I18N
3307   /* Ensure only single byte characters are set.  */
3308   if (dfa->mb_cur_max > 1)
3309     bitset_mask (sbcset, dfa->sb_char);
3310 
3311   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3312       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3313 						     || mbcset->non_match)))
3314     {
3315       bin_tree_t *mbc_tree;
3316       int sbc_idx;
3317       /* Build a tree for complex bracket.  */
3318       dfa->has_mb_node = 1;
3319       br_token.type = COMPLEX_BRACKET;
3320       br_token.opr.mbcset = mbcset;
3321       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3322       if (BE (mbc_tree == NULL, 0))
3323 	goto parse_bracket_exp_espace;
3324       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3325 	if (sbcset[sbc_idx])
3326 	  break;
3327       /* If there are no bits set in sbcset, there is no point
3328 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3329       if (sbc_idx < BITSET_WORDS)
3330 	{
3331 	  /* Build a tree for simple bracket.  */
3332 	  br_token.type = SIMPLE_BRACKET;
3333 	  br_token.opr.sbcset = sbcset;
3334 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3335 	  if (BE (work_tree == NULL, 0))
3336 	    goto parse_bracket_exp_espace;
3337 
3338 	  /* Then join them by ALT node.  */
3339 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3340 	  if (BE (work_tree == NULL, 0))
3341 	    goto parse_bracket_exp_espace;
3342 	}
3343       else
3344 	{
3345 	  re_free (sbcset);
3346 	  work_tree = mbc_tree;
3347 	}
3348     }
3349   else
3350 #endif /* not RE_ENABLE_I18N */
3351     {
3352 #ifdef RE_ENABLE_I18N
3353       free_charset (mbcset);
3354 #endif
3355       /* Build a tree for simple bracket.  */
3356       br_token.type = SIMPLE_BRACKET;
3357       br_token.opr.sbcset = sbcset;
3358       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3359       if (BE (work_tree == NULL, 0))
3360 	goto parse_bracket_exp_espace;
3361     }
3362   return work_tree;
3363 
3364  parse_bracket_exp_espace:
3365   *err = REG_ESPACE;
3366  parse_bracket_exp_free_return:
3367   re_free (sbcset);
3368 #ifdef RE_ENABLE_I18N
3369   free_charset (mbcset);
3370 #endif /* RE_ENABLE_I18N */
3371   return NULL;
3372 }
3373 
3374 /* Parse an element in the bracket expression.  */
3375 
3376 static reg_errcode_t
3377 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3378 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3379 		       reg_syntax_t syntax, bool accept_hyphen)
3380 {
3381 #ifdef RE_ENABLE_I18N
3382   int cur_char_size;
3383   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3384   if (cur_char_size > 1)
3385     {
3386       elem->type = MB_CHAR;
3387       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3388       re_string_skip_bytes (regexp, cur_char_size);
3389       return REG_NOERROR;
3390     }
3391 #endif /* RE_ENABLE_I18N */
3392   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3393   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3394       || token->type == OP_OPEN_EQUIV_CLASS)
3395     return parse_bracket_symbol (elem, regexp, token);
3396   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3397     {
3398       /* A '-' must only appear as anything but a range indicator before
3399 	 the closing bracket.  Everything else is an error.  */
3400       re_token_t token2;
3401       (void) peek_token_bracket (&token2, regexp, syntax);
3402       if (token2.type != OP_CLOSE_BRACKET)
3403 	/* The actual error value is not standardized since this whole
3404 	   case is undefined.  But ERANGE makes good sense.  */
3405 	return REG_ERANGE;
3406     }
3407   elem->type = SB_CHAR;
3408   elem->opr.ch = token->opr.c;
3409   return REG_NOERROR;
3410 }
3411 
3412 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3413    such as [:<character_class>:], [.<collating_element>.], and
3414    [=<equivalent_class>=].  */
3415 
3416 static reg_errcode_t
3417 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3418 		      re_token_t *token)
3419 {
3420   unsigned char ch, delim = token->opr.c;
3421   int i = 0;
3422   if (re_string_eoi(regexp))
3423     return REG_EBRACK;
3424   for (;; ++i)
3425     {
3426       if (i >= BRACKET_NAME_BUF_SIZE)
3427 	return REG_EBRACK;
3428       if (token->type == OP_OPEN_CHAR_CLASS)
3429 	ch = re_string_fetch_byte_case (regexp);
3430       else
3431 	ch = re_string_fetch_byte (regexp);
3432       if (re_string_eoi(regexp))
3433 	return REG_EBRACK;
3434       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3435 	break;
3436       elem->opr.name[i] = ch;
3437     }
3438   re_string_skip_bytes (regexp, 1);
3439   elem->opr.name[i] = '\0';
3440   switch (token->type)
3441     {
3442     case OP_OPEN_COLL_ELEM:
3443       elem->type = COLL_SYM;
3444       break;
3445     case OP_OPEN_EQUIV_CLASS:
3446       elem->type = EQUIV_CLASS;
3447       break;
3448     case OP_OPEN_CHAR_CLASS:
3449       elem->type = CHAR_CLASS;
3450       break;
3451     default:
3452       break;
3453     }
3454   return REG_NOERROR;
3455 }
3456 
3457   /* Helper function for parse_bracket_exp.
3458      Build the equivalence class which is represented by NAME.
3459      The result are written to MBCSET and SBCSET.
3460      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3461      is a pointer argument since we may update it.  */
3462 
3463 static reg_errcode_t
3464 #ifdef RE_ENABLE_I18N
3465 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3466 		   Idx *equiv_class_alloc, const unsigned char *name)
3467 #else /* not RE_ENABLE_I18N */
3468 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3469 #endif /* not RE_ENABLE_I18N */
3470 {
3471 #ifdef _LIBC
3472   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3473   if (nrules != 0)
3474     {
3475       const int32_t *table, *indirect;
3476       const unsigned char *weights, *extra, *cp;
3477       unsigned char char_buf[2];
3478       int32_t idx1, idx2;
3479       unsigned int ch;
3480       size_t len;
3481       /* This #include defines a local function!  */
3482 # include <locale/weight.h>
3483       /* Calculate the index for equivalence class.  */
3484       cp = name;
3485       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3486       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3487 					       _NL_COLLATE_WEIGHTMB);
3488       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3489 						   _NL_COLLATE_EXTRAMB);
3490       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3491 						_NL_COLLATE_INDIRECTMB);
3492       idx1 = findidx (&cp, -1);
3493       if (BE (idx1 == 0 || *cp != '\0', 0))
3494 	/* This isn't a valid character.  */
3495 	return REG_ECOLLATE;
3496 
3497       /* Build single byte matching table for this equivalence class.  */
3498       len = weights[idx1 & 0xffffff];
3499       for (ch = 0; ch < SBC_MAX; ++ch)
3500 	{
3501 	  char_buf[0] = ch;
3502 	  cp = char_buf;
3503 	  idx2 = findidx (&cp, 1);
3504 /*
3505 	  idx2 = table[ch];
3506 */
3507 	  if (idx2 == 0)
3508 	    /* This isn't a valid character.  */
3509 	    continue;
3510 	  /* Compare only if the length matches and the collation rule
3511 	     index is the same.  */
3512 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3513 	    {
3514 	      int cnt = 0;
3515 
3516 	      while (cnt <= len &&
3517 		     weights[(idx1 & 0xffffff) + 1 + cnt]
3518 		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3519 		++cnt;
3520 
3521 	      if (cnt > len)
3522 		bitset_set (sbcset, ch);
3523 	    }
3524 	}
3525       /* Check whether the array has enough space.  */
3526       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3527 	{
3528 	  /* Not enough, realloc it.  */
3529 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3530 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3531 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3532 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3533 						   int32_t,
3534 						   new_equiv_class_alloc);
3535 	  if (BE (new_equiv_classes == NULL, 0))
3536 	    return REG_ESPACE;
3537 	  mbcset->equiv_classes = new_equiv_classes;
3538 	  *equiv_class_alloc = new_equiv_class_alloc;
3539 	}
3540       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3541     }
3542   else
3543 #endif /* _LIBC */
3544     {
3545       if (BE (strlen ((const char *) name) != 1, 0))
3546 	return REG_ECOLLATE;
3547       bitset_set (sbcset, *name);
3548     }
3549   return REG_NOERROR;
3550 }
3551 
3552   /* Helper function for parse_bracket_exp.
3553      Build the character class which is represented by NAME.
3554      The result are written to MBCSET and SBCSET.
3555      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3556      is a pointer argument since we may update it.  */
3557 
3558 static reg_errcode_t
3559 #ifdef RE_ENABLE_I18N
3560 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3561 		 re_charset_t *mbcset, Idx *char_class_alloc,
3562 		 const char *class_name, reg_syntax_t syntax)
3563 #else /* not RE_ENABLE_I18N */
3564 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3565 		 const char *class_name, reg_syntax_t syntax)
3566 #endif /* not RE_ENABLE_I18N */
3567 {
3568   int i;
3569   const char *name = class_name;
3570 
3571   /* In case of REG_ICASE "upper" and "lower" match the both of
3572      upper and lower cases.  */
3573   if ((syntax & RE_ICASE)
3574       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3575     name = "alpha";
3576 
3577 #ifdef RE_ENABLE_I18N
3578   /* Check the space of the arrays.  */
3579   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3580     {
3581       /* Not enough, realloc it.  */
3582       /* +1 in case of mbcset->nchar_classes is 0.  */
3583       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3584       /* Use realloc since array is NULL if *alloc == 0.  */
3585       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3586 					       new_char_class_alloc);
3587       if (BE (new_char_classes == NULL, 0))
3588 	return REG_ESPACE;
3589       mbcset->char_classes = new_char_classes;
3590       *char_class_alloc = new_char_class_alloc;
3591     }
3592   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3593 #endif /* RE_ENABLE_I18N */
3594 
3595 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3596   do {						\
3597     if (BE (trans != NULL, 0))			\
3598       {						\
3599 	for (i = 0; i < SBC_MAX; ++i)		\
3600 	  if (ctype_func (i))			\
3601 	    bitset_set (sbcset, trans[i]);	\
3602       }						\
3603     else					\
3604       {						\
3605 	for (i = 0; i < SBC_MAX; ++i)		\
3606 	  if (ctype_func (i))			\
3607 	    bitset_set (sbcset, i);		\
3608       }						\
3609   } while (0)
3610 
3611   if (strcmp (name, "alnum") == 0)
3612     BUILD_CHARCLASS_LOOP (isalnum);
3613   else if (strcmp (name, "cntrl") == 0)
3614     BUILD_CHARCLASS_LOOP (iscntrl);
3615   else if (strcmp (name, "lower") == 0)
3616     BUILD_CHARCLASS_LOOP (islower);
3617   else if (strcmp (name, "space") == 0)
3618     BUILD_CHARCLASS_LOOP (isspace);
3619   else if (strcmp (name, "alpha") == 0)
3620     BUILD_CHARCLASS_LOOP (isalpha);
3621   else if (strcmp (name, "digit") == 0)
3622     BUILD_CHARCLASS_LOOP (isdigit);
3623   else if (strcmp (name, "print") == 0)
3624     BUILD_CHARCLASS_LOOP (isprint);
3625   else if (strcmp (name, "upper") == 0)
3626     BUILD_CHARCLASS_LOOP (isupper);
3627   else if (strcmp (name, "blank") == 0)
3628     BUILD_CHARCLASS_LOOP (isblank);
3629   else if (strcmp (name, "graph") == 0)
3630     BUILD_CHARCLASS_LOOP (isgraph);
3631   else if (strcmp (name, "punct") == 0)
3632     BUILD_CHARCLASS_LOOP (ispunct);
3633   else if (strcmp (name, "xdigit") == 0)
3634     BUILD_CHARCLASS_LOOP (isxdigit);
3635   else
3636     return REG_ECTYPE;
3637 
3638   return REG_NOERROR;
3639 }
3640 
3641 static bin_tree_t *
3642 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3643 		    const char *class_name,
3644 		    const char *extra, bool non_match,
3645 		    reg_errcode_t *err)
3646 {
3647   re_bitset_ptr_t sbcset;
3648 #ifdef RE_ENABLE_I18N
3649   re_charset_t *mbcset;
3650   Idx alloc = 0;
3651 #endif /* not RE_ENABLE_I18N */
3652   reg_errcode_t ret;
3653   re_token_t br_token;
3654   bin_tree_t *tree;
3655 
3656   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3657 #ifdef RE_ENABLE_I18N
3658   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3659 #endif /* RE_ENABLE_I18N */
3660 
3661 #ifdef RE_ENABLE_I18N
3662   if (BE (sbcset == NULL || mbcset == NULL, 0))
3663 #else /* not RE_ENABLE_I18N */
3664   if (BE (sbcset == NULL, 0))
3665 #endif /* not RE_ENABLE_I18N */
3666     {
3667       *err = REG_ESPACE;
3668       return NULL;
3669     }
3670 
3671   if (non_match)
3672     {
3673 #ifdef RE_ENABLE_I18N
3674       mbcset->non_match = 1;
3675 #endif /* not RE_ENABLE_I18N */
3676     }
3677 
3678   /* We don't care the syntax in this case.  */
3679   ret = build_charclass (trans, sbcset,
3680 #ifdef RE_ENABLE_I18N
3681 			 mbcset, &alloc,
3682 #endif /* RE_ENABLE_I18N */
3683 			 class_name, 0);
3684 
3685   if (BE (ret != REG_NOERROR, 0))
3686     {
3687       re_free (sbcset);
3688 #ifdef RE_ENABLE_I18N
3689       free_charset (mbcset);
3690 #endif /* RE_ENABLE_I18N */
3691       *err = ret;
3692       return NULL;
3693     }
3694   /* \w match '_' also.  */
3695   for (; *extra; extra++)
3696     bitset_set (sbcset, *extra);
3697 
3698   /* If it is non-matching list.  */
3699   if (non_match)
3700     bitset_not (sbcset);
3701 
3702 #ifdef RE_ENABLE_I18N
3703   /* Ensure only single byte characters are set.  */
3704   if (dfa->mb_cur_max > 1)
3705     bitset_mask (sbcset, dfa->sb_char);
3706 #endif
3707 
3708   /* Build a tree for simple bracket.  */
3709   br_token.type = SIMPLE_BRACKET;
3710   br_token.opr.sbcset = sbcset;
3711   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3712   if (BE (tree == NULL, 0))
3713     goto build_word_op_espace;
3714 
3715 #ifdef RE_ENABLE_I18N
3716   if (dfa->mb_cur_max > 1)
3717     {
3718       bin_tree_t *mbc_tree;
3719       /* Build a tree for complex bracket.  */
3720       br_token.type = COMPLEX_BRACKET;
3721       br_token.opr.mbcset = mbcset;
3722       dfa->has_mb_node = 1;
3723       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3724       if (BE (mbc_tree == NULL, 0))
3725 	goto build_word_op_espace;
3726       /* Then join them by ALT node.  */
3727       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3728       if (BE (mbc_tree != NULL, 1))
3729 	return tree;
3730     }
3731   else
3732     {
3733       free_charset (mbcset);
3734       return tree;
3735     }
3736 #else /* not RE_ENABLE_I18N */
3737   return tree;
3738 #endif /* not RE_ENABLE_I18N */
3739 
3740  build_word_op_espace:
3741   re_free (sbcset);
3742 #ifdef RE_ENABLE_I18N
3743   free_charset (mbcset);
3744 #endif /* RE_ENABLE_I18N */
3745   *err = REG_ESPACE;
3746   return NULL;
3747 }
3748 
3749 /* This is intended for the expressions like "a{1,3}".
3750    Fetch a number from 'input', and return the number.
3751    Return REG_MISSING if the number field is empty like "{,1}".
3752    Return RE_DUP_MAX + 1 if the number field is too large.
3753    Return REG_ERROR if an error occurred.  */
3754 
3755 static Idx
3756 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3757 {
3758   Idx num = REG_MISSING;
3759   unsigned char c;
3760   while (1)
3761     {
3762       fetch_token (token, input, syntax);
3763       c = token->opr.c;
3764       if (BE (token->type == END_OF_RE, 0))
3765 	return REG_ERROR;
3766       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3767 	break;
3768       num = ((token->type != CHARACTER || c < '0' || '9' < c
3769 	      || num == REG_ERROR)
3770 	     ? REG_ERROR
3771 	     : num == REG_MISSING
3772 	     ? c - '0'
3773 	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3774     }
3775   return num;
3776 }
3777 
3778 #ifdef RE_ENABLE_I18N
3779 static void
3780 free_charset (re_charset_t *cset)
3781 {
3782   re_free (cset->mbchars);
3783 # ifdef _LIBC
3784   re_free (cset->coll_syms);
3785   re_free (cset->equiv_classes);
3786   re_free (cset->range_starts);
3787   re_free (cset->range_ends);
3788 # endif
3789   re_free (cset->char_classes);
3790   re_free (cset);
3791 }
3792 #endif /* RE_ENABLE_I18N */
3793 
3794 /* Functions for binary tree operation.  */
3795 
3796 /* Create a tree node.  */
3797 
3798 static bin_tree_t *
3799 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3800 	     re_token_type_t type)
3801 {
3802   re_token_t t;
3803   t.type = type;
3804   return create_token_tree (dfa, left, right, &t);
3805 }
3806 
3807 static bin_tree_t *
3808 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3809 		   const re_token_t *token)
3810 {
3811   bin_tree_t *tree;
3812   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3813     {
3814       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3815 
3816       if (storage == NULL)
3817 	return NULL;
3818       storage->next = dfa->str_tree_storage;
3819       dfa->str_tree_storage = storage;
3820       dfa->str_tree_storage_idx = 0;
3821     }
3822   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3823 
3824   tree->parent = NULL;
3825   tree->left = left;
3826   tree->right = right;
3827   tree->token = *token;
3828   tree->token.duplicated = 0;
3829   tree->token.opt_subexp = 0;
3830   tree->first = NULL;
3831   tree->next = NULL;
3832   tree->node_idx = REG_MISSING;
3833 
3834   if (left != NULL)
3835     left->parent = tree;
3836   if (right != NULL)
3837     right->parent = tree;
3838   return tree;
3839 }
3840 
3841 /* Mark the tree SRC as an optional subexpression.
3842    To be called from preorder or postorder.  */
3843 
3844 static reg_errcode_t
3845 mark_opt_subexp (void *extra, bin_tree_t *node)
3846 {
3847   Idx idx = (uintptr_t) extra;
3848   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3849     node->token.opt_subexp = 1;
3850 
3851   return REG_NOERROR;
3852 }
3853 
3854 /* Free the allocated memory inside NODE. */
3855 
3856 static void
3857 free_token (re_token_t *node)
3858 {
3859 #ifdef RE_ENABLE_I18N
3860   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3861     free_charset (node->opr.mbcset);
3862   else
3863 #endif /* RE_ENABLE_I18N */
3864     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3865       re_free (node->opr.sbcset);
3866 }
3867 
3868 /* Worker function for tree walking.  Free the allocated memory inside NODE
3869    and its children. */
3870 
3871 static reg_errcode_t
3872 free_tree (void *extra, bin_tree_t *node)
3873 {
3874   free_token (&node->token);
3875   return REG_NOERROR;
3876 }
3877 
3878 
3879 /* Duplicate the node SRC, and return new node.  This is a preorder
3880    visit similar to the one implemented by the generic visitor, but
3881    we need more infrastructure to maintain two parallel trees --- so,
3882    it's easier to duplicate.  */
3883 
3884 static bin_tree_t *
3885 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3886 {
3887   const bin_tree_t *node;
3888   bin_tree_t *dup_root;
3889   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3890 
3891   for (node = root; ; )
3892     {
3893       /* Create a new tree and link it back to the current parent.  */
3894       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3895       if (*p_new == NULL)
3896 	return NULL;
3897       (*p_new)->parent = dup_node;
3898       (*p_new)->token.duplicated = 1;
3899       dup_node = *p_new;
3900 
3901       /* Go to the left node, or up and to the right.  */
3902       if (node->left)
3903 	{
3904 	  node = node->left;
3905 	  p_new = &dup_node->left;
3906 	}
3907       else
3908 	{
3909 	  const bin_tree_t *prev = NULL;
3910 	  while (node->right == prev || node->right == NULL)
3911 	    {
3912 	      prev = node;
3913 	      node = node->parent;
3914 	      dup_node = dup_node->parent;
3915 	      if (!node)
3916 		return dup_root;
3917 	    }
3918 	  node = node->right;
3919 	  p_new = &dup_node->right;
3920 	}
3921     }
3922 }
3923