xref: /dragonfly/contrib/diffutils/lib/regcomp.c (revision 9348a738)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU General Public
8    License as published by the Free Software Foundation; either
9    version 3 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    General Public License for more details.
15 
16    You should have received a copy of the GNU General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 					  size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 				     const re_dfastate_t *init_state,
24 				     char *fastmap);
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 #ifdef RE_ENABLE_I18N
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 #ifdef RE_ENABLE_I18N
32 static void optimize_utf8 (re_dfa_t *dfa);
33 #endif
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 			       reg_errcode_t (fn (void *, bin_tree_t *)),
37 			       void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 				reg_errcode_t (fn (void *, bin_tree_t *)),
40 				void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 				 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 				   unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 					 Idx node, bool root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 			 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 			reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 			  reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 				  re_token_t *token, reg_syntax_t syntax,
63 				  Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 				 re_token_t *token, reg_syntax_t syntax,
66 				 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 				     re_token_t *token, reg_syntax_t syntax,
69 				     Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 				  re_token_t *token, reg_syntax_t syntax,
72 				  Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 				 re_dfa_t *dfa, re_token_t *token,
75 				 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 				      re_token_t *token, reg_syntax_t syntax,
78 				      reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 					    re_string_t *regexp,
81 					    re_token_t *token, int token_len,
82 					    re_dfa_t *dfa,
83 					    reg_syntax_t syntax,
84 					    bool accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 					  re_string_t *regexp,
87 					  re_token_t *token);
88 #ifdef RE_ENABLE_I18N
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 					re_charset_t *mbcset,
91 					Idx *equiv_class_alloc,
92 					const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94 				      bitset_t sbcset,
95 				      re_charset_t *mbcset,
96 				      Idx *char_class_alloc,
97 				      const char *class_name,
98 				      reg_syntax_t syntax);
99 #else  /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 					const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 				      bitset_t sbcset,
104 				      const char *class_name,
105 				      reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 				       RE_TRANSLATE_TYPE trans,
109 				       const char *class_name,
110 				       const char *extra,
111 				       bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 				bin_tree_t *left, bin_tree_t *right,
114 				re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 				      bin_tree_t *left, bin_tree_t *right,
117 				      const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122 
123 /* This table gives an error message for each of the error codes listed
124    in regex.h.  Obviously the order here has to be same as there.
125    POSIX doesn't require that we do anything for REG_NOERROR,
126    but why not be nice?  */
127 
128 static const char __re_error_msgid[] =
129   {
130 #define REG_NOERROR_IDX	0
131     gettext_noop ("Success")	/* REG_NOERROR */
132     "\0"
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134     gettext_noop ("No match")	/* REG_NOMATCH */
135     "\0"
136 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138     "\0"
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141     "\0"
142 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144     "\0"
145 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147     "\0"
148 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150     "\0"
151 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
152     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
153     "\0"
154 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156     "\0"
157 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159     "\0"
160 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162     "\0"
163 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164     gettext_noop ("Invalid range end")	/* REG_ERANGE */
165     "\0"
166 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
168     "\0"
169 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171     "\0"
172 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
174     "\0"
175 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
177     "\0"
178 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180   };
181 
182 static const size_t __re_error_msgid_idx[] =
183   {
184     REG_NOERROR_IDX,
185     REG_NOMATCH_IDX,
186     REG_BADPAT_IDX,
187     REG_ECOLLATE_IDX,
188     REG_ECTYPE_IDX,
189     REG_EESCAPE_IDX,
190     REG_ESUBREG_IDX,
191     REG_EBRACK_IDX,
192     REG_EPAREN_IDX,
193     REG_EBRACE_IDX,
194     REG_BADBR_IDX,
195     REG_ERANGE_IDX,
196     REG_ESPACE_IDX,
197     REG_BADRPT_IDX,
198     REG_EEND_IDX,
199     REG_ESIZE_IDX,
200     REG_ERPAREN_IDX
201   };
202 
203 /* Entry points for GNU code.  */
204 
205 /* re_compile_pattern is the GNU regular expression compiler: it
206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207    Returns 0 if the pattern was valid, otherwise an error string.
208 
209    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
210    are set in BUFP on entry.  */
211 
212 #ifdef _LIBC
213 const char *
214 re_compile_pattern (pattern, length, bufp)
215     const char *pattern;
216     size_t length;
217     struct re_pattern_buffer *bufp;
218 #else /* size_t might promote */
219 const char *
220 re_compile_pattern (const char *pattern, size_t length,
221 		    struct re_pattern_buffer *bufp)
222 #endif
223 {
224   reg_errcode_t ret;
225 
226   /* And GNU code determines whether or not to get register information
227      by passing null for the REGS argument to re_match, etc., not by
228      setting no_sub, unless RE_NO_SUB is set.  */
229   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230 
231   /* Match anchors at newline.  */
232   bufp->newline_anchor = 1;
233 
234   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
235 
236   if (!ret)
237     return NULL;
238   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
239 }
240 #ifdef _LIBC
241 weak_alias (__re_compile_pattern, re_compile_pattern)
242 #endif
243 
244 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
245    also be assigned to arbitrarily: each pattern buffer stores its own
246    syntax, so it can be changed between regex compilations.  */
247 /* This has no initializer because initialized variables in Emacs
248    become read-only after dumping.  */
249 reg_syntax_t re_syntax_options;
250 
251 
252 /* Specify the precise syntax of regexps for compilation.  This provides
253    for compatibility for various utilities which historically have
254    different, incompatible syntaxes.
255 
256    The argument SYNTAX is a bit mask comprised of the various bits
257    defined in regex.h.  We return the old syntax.  */
258 
259 reg_syntax_t
260 re_set_syntax (syntax)
261     reg_syntax_t syntax;
262 {
263   reg_syntax_t ret = re_syntax_options;
264 
265   re_syntax_options = syntax;
266   return ret;
267 }
268 #ifdef _LIBC
269 weak_alias (__re_set_syntax, re_set_syntax)
270 #endif
271 
272 int
273 re_compile_fastmap (bufp)
274     struct re_pattern_buffer *bufp;
275 {
276   re_dfa_t *dfa = bufp->buffer;
277   char *fastmap = bufp->fastmap;
278 
279   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281   if (dfa->init_state != dfa->init_state_word)
282     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283   if (dfa->init_state != dfa->init_state_nl)
284     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285   if (dfa->init_state != dfa->init_state_begbuf)
286     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287   bufp->fastmap_accurate = 1;
288   return 0;
289 }
290 #ifdef _LIBC
291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
292 #endif
293 
294 static inline void
295 __attribute__ ((always_inline))
296 re_set_fastmap (char *fastmap, bool icase, int ch)
297 {
298   fastmap[ch] = 1;
299   if (icase)
300     fastmap[tolower (ch)] = 1;
301 }
302 
303 /* Helper function for re_compile_fastmap.
304    Compile fastmap for the initial_state INIT_STATE.  */
305 
306 static void
307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308 			 char *fastmap)
309 {
310   re_dfa_t *dfa = bufp->buffer;
311   Idx node_cnt;
312   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314     {
315       Idx node = init_state->nodes.elems[node_cnt];
316       re_token_type_t type = dfa->nodes[node].type;
317 
318       if (type == CHARACTER)
319 	{
320 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 #ifdef RE_ENABLE_I18N
322 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323 	    {
324 	      unsigned char buf[MB_LEN_MAX];
325 	      unsigned char *p;
326 	      wchar_t wc;
327 	      mbstate_t state;
328 
329 	      p = buf;
330 	      *p++ = dfa->nodes[node].opr.c;
331 	      while (++node < dfa->nodes_len
332 		     &&	dfa->nodes[node].type == CHARACTER
333 		     && dfa->nodes[node].mb_partial)
334 		*p++ = dfa->nodes[node].opr.c;
335 	      memset (&state, '\0', sizeof (state));
336 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
337 			     &state) == p - buf
338 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
339 		      != (size_t) -1))
340 		re_set_fastmap (fastmap, false, buf[0]);
341 	    }
342 #endif
343 	}
344       else if (type == SIMPLE_BRACKET)
345 	{
346 	  int i, ch;
347 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
348 	    {
349 	      int j;
350 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 		if (w & ((bitset_word_t) 1 << j))
353 		  re_set_fastmap (fastmap, icase, ch);
354 	    }
355 	}
356 #ifdef RE_ENABLE_I18N
357       else if (type == COMPLEX_BRACKET)
358 	{
359 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
360 	  Idx i;
361 
362 # ifdef _LIBC
363 	  /* See if we have to try all bytes which start multiple collation
364 	     elements.
365 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
366 		  collation element, and don't catch 'b' since 'b' is
367 		  the only collation element which starts from 'b' (and
368 		  it is caught by SIMPLE_BRACKET).  */
369 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
370 		  && (cset->ncoll_syms || cset->nranges))
371 		{
372 		  const int32_t *table = (const int32_t *)
373 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
374 		  for (i = 0; i < SBC_MAX; ++i)
375 		    if (table[i] < 0)
376 		      re_set_fastmap (fastmap, icase, i);
377 		}
378 # endif /* _LIBC */
379 
380 	  /* See if we have to start the match at all multibyte characters,
381 	     i.e. where we would not find an invalid sequence.  This only
382 	     applies to multibyte character sets; for single byte character
383 	     sets, the SIMPLE_BRACKET again suffices.  */
384 	  if (dfa->mb_cur_max > 1
385 	      && (cset->nchar_classes || cset->non_match || cset->nranges
386 # ifdef _LIBC
387 		  || cset->nequiv_classes
388 # endif /* _LIBC */
389 		 ))
390 	    {
391 	      unsigned char c = 0;
392 	      do
393 		{
394 		  mbstate_t mbs;
395 		  memset (&mbs, 0, sizeof (mbs));
396 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
397 		    re_set_fastmap (fastmap, false, (int) c);
398 		}
399 	      while (++c != 0);
400 	    }
401 
402 	  else
403 	    {
404 	      /* ... Else catch all bytes which can start the mbchars.  */
405 	      for (i = 0; i < cset->nmbchars; ++i)
406 		{
407 		  char buf[256];
408 		  mbstate_t state;
409 		  memset (&state, '\0', sizeof (state));
410 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
411 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
413 		    {
414 		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
415 			  != (size_t) -1)
416 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
417 		    }
418 		}
419 	    }
420 	}
421 #endif /* RE_ENABLE_I18N */
422       else if (type == OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 	       || type == OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 	       || type == END_OF_RE)
427 	{
428 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429 	  if (type == END_OF_RE)
430 	    bufp->can_be_null = 1;
431 	  return;
432 	}
433     }
434 }
435 
436 /* Entry point for POSIX code.  */
437 /* regcomp takes a regular expression as a string and compiles it.
438 
439    PREG is a regex_t *.  We do not expect any fields to be initialized,
440    since POSIX says we shouldn't.  Thus, we set
441 
442      'buffer' to the compiled pattern;
443      'used' to the length of the compiled pattern;
444      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445        REG_EXTENDED bit in CFLAGS is set; otherwise, to
446        RE_SYNTAX_POSIX_BASIC;
447      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
448      'fastmap' to an allocated space for the fastmap;
449      'fastmap_accurate' to zero;
450      're_nsub' to the number of subexpressions in PATTERN.
451 
452    PATTERN is the address of the pattern string.
453 
454    CFLAGS is a series of bits which affect compilation.
455 
456      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457      use POSIX basic syntax.
458 
459      If REG_NEWLINE is set, then . and [^...] don't match newline.
460      Also, regexec will try a match beginning after every newline.
461 
462      If REG_ICASE is set, then we considers upper- and lowercase
463      versions of letters to be equivalent when matching.
464 
465      If REG_NOSUB is set, then when PREG is passed to regexec, that
466      routine will report only success or failure, and nothing about the
467      registers.
468 
469    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
470    the return codes and their meanings.)  */
471 
472 int
473 regcomp (preg, pattern, cflags)
474     regex_t *_Restrict_ preg;
475     const char *_Restrict_ pattern;
476     int cflags;
477 {
478   reg_errcode_t ret;
479   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480 			 : RE_SYNTAX_POSIX_BASIC);
481 
482   preg->buffer = NULL;
483   preg->allocated = 0;
484   preg->used = 0;
485 
486   /* Try to allocate space for the fastmap.  */
487   preg->fastmap = re_malloc (char, SBC_MAX);
488   if (BE (preg->fastmap == NULL, 0))
489     return REG_ESPACE;
490 
491   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492 
493   /* If REG_NEWLINE is set, newlines are treated differently.  */
494   if (cflags & REG_NEWLINE)
495     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
496       syntax &= ~RE_DOT_NEWLINE;
497       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498       /* It also changes the matching behavior.  */
499       preg->newline_anchor = 1;
500     }
501   else
502     preg->newline_anchor = 0;
503   preg->no_sub = !!(cflags & REG_NOSUB);
504   preg->translate = NULL;
505 
506   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507 
508   /* POSIX doesn't distinguish between an unmatched open-group and an
509      unmatched close-group: both are REG_EPAREN.  */
510   if (ret == REG_ERPAREN)
511     ret = REG_EPAREN;
512 
513   /* We have already checked preg->fastmap != NULL.  */
514   if (BE (ret == REG_NOERROR, 1))
515     /* Compute the fastmap now, since regexec cannot modify the pattern
516        buffer.  This function never fails in this implementation.  */
517     (void) re_compile_fastmap (preg);
518   else
519     {
520       /* Some error occurred while compiling the expression.  */
521       re_free (preg->fastmap);
522       preg->fastmap = NULL;
523     }
524 
525   return (int) ret;
526 }
527 #ifdef _LIBC
528 weak_alias (__regcomp, regcomp)
529 #endif
530 
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532    from either regcomp or regexec.   We don't use PREG here.  */
533 
534 #ifdef _LIBC
535 size_t
536 regerror (errcode, preg, errbuf, errbuf_size)
537     int errcode;
538     const regex_t *_Restrict_ preg;
539     char *_Restrict_ errbuf;
540     size_t errbuf_size;
541 #else /* size_t might promote */
542 size_t
543 regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_,
544 	  char *_Restrict_ errbuf, size_t errbuf_size)
545 #endif
546 {
547   const char *msg;
548   size_t msg_size;
549 
550   if (BE (errcode < 0
551 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
552 			       / sizeof (__re_error_msgid_idx[0])), 0))
553     /* Only error codes returned by the rest of the code should be passed
554        to this routine.  If we are given anything else, or if other regex
555        code generates an invalid error code, then the program has a bug.
556        Dump core so we can fix it.  */
557     abort ();
558 
559   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
560 
561   msg_size = strlen (msg) + 1; /* Includes the null.  */
562 
563   if (BE (errbuf_size != 0, 1))
564     {
565       size_t cpy_size = msg_size;
566       if (BE (msg_size > errbuf_size, 0))
567 	{
568 	  cpy_size = errbuf_size - 1;
569 	  errbuf[cpy_size] = '\0';
570 	}
571       memcpy (errbuf, msg, cpy_size);
572     }
573 
574   return msg_size;
575 }
576 #ifdef _LIBC
577 weak_alias (__regerror, regerror)
578 #endif
579 
580 
581 #ifdef RE_ENABLE_I18N
582 /* This static array is used for the map to single-byte characters when
583    UTF-8 is used.  Otherwise we would allocate memory just to initialize
584    it the same all the time.  UTF-8 is the preferred encoding so this is
585    a worthwhile optimization.  */
586 static const bitset_t utf8_sb_map =
587 {
588   /* Set the first 128 bits.  */
589 # ifdef __GNUC__
590   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
591 # else
592 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
593 #   error "bitset_word_t is narrower than 32 bits"
594 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
595   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
596 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
597   BITSET_WORD_MAX, BITSET_WORD_MAX,
598 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
599   BITSET_WORD_MAX,
600 #  endif
601   (BITSET_WORD_MAX
602    >> (SBC_MAX % BITSET_WORD_BITS == 0
603        ? 0
604        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
605 # endif
606 };
607 #endif
608 
609 
610 static void
611 free_dfa_content (re_dfa_t *dfa)
612 {
613   Idx i, j;
614 
615   if (dfa->nodes)
616     for (i = 0; i < dfa->nodes_len; ++i)
617       free_token (dfa->nodes + i);
618   re_free (dfa->nexts);
619   for (i = 0; i < dfa->nodes_len; ++i)
620     {
621       if (dfa->eclosures != NULL)
622 	re_node_set_free (dfa->eclosures + i);
623       if (dfa->inveclosures != NULL)
624 	re_node_set_free (dfa->inveclosures + i);
625       if (dfa->edests != NULL)
626 	re_node_set_free (dfa->edests + i);
627     }
628   re_free (dfa->edests);
629   re_free (dfa->eclosures);
630   re_free (dfa->inveclosures);
631   re_free (dfa->nodes);
632 
633   if (dfa->state_table)
634     for (i = 0; i <= dfa->state_hash_mask; ++i)
635       {
636 	struct re_state_table_entry *entry = dfa->state_table + i;
637 	for (j = 0; j < entry->num; ++j)
638 	  {
639 	    re_dfastate_t *state = entry->array[j];
640 	    free_state (state);
641 	  }
642 	re_free (entry->array);
643       }
644   re_free (dfa->state_table);
645 #ifdef RE_ENABLE_I18N
646   if (dfa->sb_char != utf8_sb_map)
647     re_free (dfa->sb_char);
648 #endif
649   re_free (dfa->subexp_map);
650 #ifdef DEBUG
651   re_free (dfa->re_str);
652 #endif
653 
654   re_free (dfa);
655 }
656 
657 
658 /* Free dynamically allocated space used by PREG.  */
659 
660 void
661 regfree (preg)
662     regex_t *preg;
663 {
664   re_dfa_t *dfa = preg->buffer;
665   if (BE (dfa != NULL, 1))
666     free_dfa_content (dfa);
667   preg->buffer = NULL;
668   preg->allocated = 0;
669 
670   re_free (preg->fastmap);
671   preg->fastmap = NULL;
672 
673   re_free (preg->translate);
674   preg->translate = NULL;
675 }
676 #ifdef _LIBC
677 weak_alias (__regfree, regfree)
678 #endif
679 
680 /* Entry points compatible with 4.2 BSD regex library.  We don't define
681    them unless specifically requested.  */
682 
683 #if defined _REGEX_RE_COMP || defined _LIBC
684 
685 /* BSD has one and only one pattern buffer.  */
686 static struct re_pattern_buffer re_comp_buf;
687 
688 char *
689 # ifdef _LIBC
690 /* Make these definitions weak in libc, so POSIX programs can redefine
691    these names if they don't use our functions, and still use
692    regcomp/regexec above without link errors.  */
693 weak_function
694 # endif
695 re_comp (s)
696      const char *s;
697 {
698   reg_errcode_t ret;
699   char *fastmap;
700 
701   if (!s)
702     {
703       if (!re_comp_buf.buffer)
704 	return gettext ("No previous regular expression");
705       return 0;
706     }
707 
708   if (re_comp_buf.buffer)
709     {
710       fastmap = re_comp_buf.fastmap;
711       re_comp_buf.fastmap = NULL;
712       __regfree (&re_comp_buf);
713       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
714       re_comp_buf.fastmap = fastmap;
715     }
716 
717   if (re_comp_buf.fastmap == NULL)
718     {
719       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
720       if (re_comp_buf.fastmap == NULL)
721 	return (char *) gettext (__re_error_msgid
722 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
723     }
724 
725   /* Since 're_exec' always passes NULL for the 'regs' argument, we
726      don't need to initialize the pattern buffer fields which affect it.  */
727 
728   /* Match anchors at newlines.  */
729   re_comp_buf.newline_anchor = 1;
730 
731   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
732 
733   if (!ret)
734     return NULL;
735 
736   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
737   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
738 }
739 
740 #ifdef _LIBC
741 libc_freeres_fn (free_mem)
742 {
743   __regfree (&re_comp_buf);
744 }
745 #endif
746 
747 #endif /* _REGEX_RE_COMP */
748 
749 /* Internal entry point.
750    Compile the regular expression PATTERN, whose length is LENGTH.
751    SYNTAX indicate regular expression's syntax.  */
752 
753 static reg_errcode_t
754 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
755 		     reg_syntax_t syntax)
756 {
757   reg_errcode_t err = REG_NOERROR;
758   re_dfa_t *dfa;
759   re_string_t regexp;
760 
761   /* Initialize the pattern buffer.  */
762   preg->fastmap_accurate = 0;
763   preg->syntax = syntax;
764   preg->not_bol = preg->not_eol = 0;
765   preg->used = 0;
766   preg->re_nsub = 0;
767   preg->can_be_null = 0;
768   preg->regs_allocated = REGS_UNALLOCATED;
769 
770   /* Initialize the dfa.  */
771   dfa = preg->buffer;
772   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
773     {
774       /* If zero allocated, but buffer is non-null, try to realloc
775 	 enough space.  This loses if buffer's address is bogus, but
776 	 that is the user's responsibility.  If ->buffer is NULL this
777 	 is a simple allocation.  */
778       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
779       if (dfa == NULL)
780 	return REG_ESPACE;
781       preg->allocated = sizeof (re_dfa_t);
782       preg->buffer = dfa;
783     }
784   preg->used = sizeof (re_dfa_t);
785 
786   err = init_dfa (dfa, length);
787   if (BE (err != REG_NOERROR, 0))
788     {
789       free_dfa_content (dfa);
790       preg->buffer = NULL;
791       preg->allocated = 0;
792       return err;
793     }
794 #ifdef DEBUG
795   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
796   dfa->re_str = re_malloc (char, length + 1);
797   strncpy (dfa->re_str, pattern, length + 1);
798 #endif
799 
800   __libc_lock_init (dfa->lock);
801 
802   err = re_string_construct (&regexp, pattern, length, preg->translate,
803 			     (syntax & RE_ICASE) != 0, dfa);
804   if (BE (err != REG_NOERROR, 0))
805     {
806     re_compile_internal_free_return:
807       free_workarea_compile (preg);
808       re_string_destruct (&regexp);
809       free_dfa_content (dfa);
810       preg->buffer = NULL;
811       preg->allocated = 0;
812       return err;
813     }
814 
815   /* Parse the regular expression, and build a structure tree.  */
816   preg->re_nsub = 0;
817   dfa->str_tree = parse (&regexp, preg, syntax, &err);
818   if (BE (dfa->str_tree == NULL, 0))
819     goto re_compile_internal_free_return;
820 
821   /* Analyze the tree and create the nfa.  */
822   err = analyze (preg);
823   if (BE (err != REG_NOERROR, 0))
824     goto re_compile_internal_free_return;
825 
826 #ifdef RE_ENABLE_I18N
827   /* If possible, do searching in single byte encoding to speed things up.  */
828   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
829     optimize_utf8 (dfa);
830 #endif
831 
832   /* Then create the initial state of the dfa.  */
833   err = create_initial_state (dfa);
834 
835   /* Release work areas.  */
836   free_workarea_compile (preg);
837   re_string_destruct (&regexp);
838 
839   if (BE (err != REG_NOERROR, 0))
840     {
841       free_dfa_content (dfa);
842       preg->buffer = NULL;
843       preg->allocated = 0;
844     }
845 
846   return err;
847 }
848 
849 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
850    as the initial length of some arrays.  */
851 
852 static reg_errcode_t
853 init_dfa (re_dfa_t *dfa, size_t pat_len)
854 {
855   __re_size_t table_size;
856 #ifndef _LIBC
857   const char *codeset_name;
858 #endif
859 #ifdef RE_ENABLE_I18N
860   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
861 #else
862   size_t max_i18n_object_size = 0;
863 #endif
864   size_t max_object_size =
865     MAX (sizeof (struct re_state_table_entry),
866 	 MAX (sizeof (re_token_t),
867 	      MAX (sizeof (re_node_set),
868 		   MAX (sizeof (regmatch_t),
869 			max_i18n_object_size))));
870 
871   memset (dfa, '\0', sizeof (re_dfa_t));
872 
873   /* Force allocation of str_tree_storage the first time.  */
874   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
875 
876   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
877      calculation below, and for similar doubling calculations
878      elsewhere.  And it's <= rather than <, because some of the
879      doubling calculations add 1 afterwards.  */
880   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
881     return REG_ESPACE;
882 
883   dfa->nodes_alloc = pat_len + 1;
884   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
885 
886   /*  table_size = 2 ^ ceil(log pat_len) */
887   for (table_size = 1; ; table_size <<= 1)
888     if (table_size > pat_len)
889       break;
890 
891   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
892   dfa->state_hash_mask = table_size - 1;
893 
894   dfa->mb_cur_max = MB_CUR_MAX;
895 #ifdef _LIBC
896   if (dfa->mb_cur_max == 6
897       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
898     dfa->is_utf8 = 1;
899   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
900 		       != 0);
901 #else
902   codeset_name = nl_langinfo (CODESET);
903   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
904       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
905       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
906       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
907     dfa->is_utf8 = 1;
908 
909   /* We check exhaustively in the loop below if this charset is a
910      superset of ASCII.  */
911   dfa->map_notascii = 0;
912 #endif
913 
914 #ifdef RE_ENABLE_I18N
915   if (dfa->mb_cur_max > 1)
916     {
917       if (dfa->is_utf8)
918 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
919       else
920 	{
921 	  int i, j, ch;
922 
923 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
924 	  if (BE (dfa->sb_char == NULL, 0))
925 	    return REG_ESPACE;
926 
927 	  /* Set the bits corresponding to single byte chars.  */
928 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
929 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
930 	      {
931 		wint_t wch = __btowc (ch);
932 		if (wch != WEOF)
933 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
934 # ifndef _LIBC
935 		if (isascii (ch) && wch != ch)
936 		  dfa->map_notascii = 1;
937 # endif
938 	      }
939 	}
940     }
941 #endif
942 
943   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
944     return REG_ESPACE;
945   return REG_NOERROR;
946 }
947 
948 /* Initialize WORD_CHAR table, which indicate which character is
949    "word".  In this case "word" means that it is the word construction
950    character used by some operators like "\<", "\>", etc.  */
951 
952 static void
953 internal_function
954 init_word_char (re_dfa_t *dfa)
955 {
956   int i = 0;
957   int j;
958   int ch = 0;
959   dfa->word_ops_used = 1;
960   if (BE (dfa->map_notascii == 0, 1))
961     {
962       bitset_word_t bits0 = 0x00000000;
963       bitset_word_t bits1 = 0x03ff0000;
964       bitset_word_t bits2 = 0x87fffffe;
965       bitset_word_t bits3 = 0x07fffffe;
966       if (BITSET_WORD_BITS == 64)
967 	{
968 	  dfa->word_char[0] = bits1 << 31 << 1 | bits0;
969 	  dfa->word_char[1] = bits3 << 31 << 1 | bits2;
970 	  i = 2;
971 	}
972       else if (BITSET_WORD_BITS == 32)
973 	{
974 	  dfa->word_char[0] = bits0;
975 	  dfa->word_char[1] = bits1;
976 	  dfa->word_char[2] = bits2;
977 	  dfa->word_char[3] = bits3;
978 	  i = 4;
979 	}
980       else
981         goto general_case;
982       ch = 128;
983 
984       if (BE (dfa->is_utf8, 1))
985 	{
986 	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
987 	  return;
988 	}
989     }
990 
991  general_case:
992   for (; i < BITSET_WORDS; ++i)
993     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
994       if (isalnum (ch) || ch == '_')
995 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
996 }
997 
998 /* Free the work area which are only used while compiling.  */
999 
1000 static void
1001 free_workarea_compile (regex_t *preg)
1002 {
1003   re_dfa_t *dfa = preg->buffer;
1004   bin_tree_storage_t *storage, *next;
1005   for (storage = dfa->str_tree_storage; storage; storage = next)
1006     {
1007       next = storage->next;
1008       re_free (storage);
1009     }
1010   dfa->str_tree_storage = NULL;
1011   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1012   dfa->str_tree = NULL;
1013   re_free (dfa->org_indices);
1014   dfa->org_indices = NULL;
1015 }
1016 
1017 /* Create initial states for all contexts.  */
1018 
1019 static reg_errcode_t
1020 create_initial_state (re_dfa_t *dfa)
1021 {
1022   Idx first, i;
1023   reg_errcode_t err;
1024   re_node_set init_nodes;
1025 
1026   /* Initial states have the epsilon closure of the node which is
1027      the first node of the regular expression.  */
1028   first = dfa->str_tree->first->node_idx;
1029   dfa->init_node = first;
1030   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1031   if (BE (err != REG_NOERROR, 0))
1032     return err;
1033 
1034   /* The back-references which are in initial states can epsilon transit,
1035      since in this case all of the subexpressions can be null.
1036      Then we add epsilon closures of the nodes which are the next nodes of
1037      the back-references.  */
1038   if (dfa->nbackref > 0)
1039     for (i = 0; i < init_nodes.nelem; ++i)
1040       {
1041 	Idx node_idx = init_nodes.elems[i];
1042 	re_token_type_t type = dfa->nodes[node_idx].type;
1043 
1044 	Idx clexp_idx;
1045 	if (type != OP_BACK_REF)
1046 	  continue;
1047 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1048 	  {
1049 	    re_token_t *clexp_node;
1050 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1051 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1052 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1053 	      break;
1054 	  }
1055 	if (clexp_idx == init_nodes.nelem)
1056 	  continue;
1057 
1058 	if (type == OP_BACK_REF)
1059 	  {
1060 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1061 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1062 	      {
1063 		reg_errcode_t merge_err
1064                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1065 		if (merge_err != REG_NOERROR)
1066 		  return merge_err;
1067 		i = 0;
1068 	      }
1069 	  }
1070       }
1071 
1072   /* It must be the first time to invoke acquire_state.  */
1073   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1074   /* We don't check ERR here, since the initial state must not be NULL.  */
1075   if (BE (dfa->init_state == NULL, 0))
1076     return err;
1077   if (dfa->init_state->has_constraint)
1078     {
1079       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1080 						       CONTEXT_WORD);
1081       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1082 						     CONTEXT_NEWLINE);
1083       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1084 							 &init_nodes,
1085 							 CONTEXT_NEWLINE
1086 							 | CONTEXT_BEGBUF);
1087       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1088 	      || dfa->init_state_begbuf == NULL, 0))
1089 	return err;
1090     }
1091   else
1092     dfa->init_state_word = dfa->init_state_nl
1093       = dfa->init_state_begbuf = dfa->init_state;
1094 
1095   re_node_set_free (&init_nodes);
1096   return REG_NOERROR;
1097 }
1098 
1099 #ifdef RE_ENABLE_I18N
1100 /* If it is possible to do searching in single byte encoding instead of UTF-8
1101    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1102    DFA nodes where needed.  */
1103 
1104 static void
1105 optimize_utf8 (re_dfa_t *dfa)
1106 {
1107   Idx node;
1108   int i;
1109   bool mb_chars = false;
1110   bool has_period = false;
1111 
1112   for (node = 0; node < dfa->nodes_len; ++node)
1113     switch (dfa->nodes[node].type)
1114       {
1115       case CHARACTER:
1116 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1117 	  mb_chars = true;
1118 	break;
1119       case ANCHOR:
1120 	switch (dfa->nodes[node].opr.ctx_type)
1121 	  {
1122 	  case LINE_FIRST:
1123 	  case LINE_LAST:
1124 	  case BUF_FIRST:
1125 	  case BUF_LAST:
1126 	    break;
1127 	  default:
1128 	    /* Word anchors etc. cannot be handled.  It's okay to test
1129 	       opr.ctx_type since constraints (for all DFA nodes) are
1130 	       created by ORing one or more opr.ctx_type values.  */
1131 	    return;
1132 	  }
1133 	break;
1134       case OP_PERIOD:
1135 	has_period = true;
1136 	break;
1137       case OP_BACK_REF:
1138       case OP_ALT:
1139       case END_OF_RE:
1140       case OP_DUP_ASTERISK:
1141       case OP_OPEN_SUBEXP:
1142       case OP_CLOSE_SUBEXP:
1143 	break;
1144       case COMPLEX_BRACKET:
1145 	return;
1146       case SIMPLE_BRACKET:
1147 	/* Just double check.  */
1148 	{
1149 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1150 			? 0
1151 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1152 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1153 	    {
1154 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1155 		return;
1156 	      rshift = 0;
1157 	    }
1158 	}
1159 	break;
1160       default:
1161 	abort ();
1162       }
1163 
1164   if (mb_chars || has_period)
1165     for (node = 0; node < dfa->nodes_len; ++node)
1166       {
1167 	if (dfa->nodes[node].type == CHARACTER
1168 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1169 	  dfa->nodes[node].mb_partial = 0;
1170 	else if (dfa->nodes[node].type == OP_PERIOD)
1171 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1172       }
1173 
1174   /* The search can be in single byte locale.  */
1175   dfa->mb_cur_max = 1;
1176   dfa->is_utf8 = 0;
1177   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1178 }
1179 #endif
1180 
1181 /* Analyze the structure tree, and calculate "first", "next", "edest",
1182    "eclosure", and "inveclosure".  */
1183 
1184 static reg_errcode_t
1185 analyze (regex_t *preg)
1186 {
1187   re_dfa_t *dfa = preg->buffer;
1188   reg_errcode_t ret;
1189 
1190   /* Allocate arrays.  */
1191   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1192   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1193   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1194   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1195   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1196 	  || dfa->eclosures == NULL, 0))
1197     return REG_ESPACE;
1198 
1199   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1200   if (dfa->subexp_map != NULL)
1201     {
1202       Idx i;
1203       for (i = 0; i < preg->re_nsub; i++)
1204 	dfa->subexp_map[i] = i;
1205       preorder (dfa->str_tree, optimize_subexps, dfa);
1206       for (i = 0; i < preg->re_nsub; i++)
1207 	if (dfa->subexp_map[i] != i)
1208 	  break;
1209       if (i == preg->re_nsub)
1210 	{
1211 	  free (dfa->subexp_map);
1212 	  dfa->subexp_map = NULL;
1213 	}
1214     }
1215 
1216   ret = postorder (dfa->str_tree, lower_subexps, preg);
1217   if (BE (ret != REG_NOERROR, 0))
1218     return ret;
1219   ret = postorder (dfa->str_tree, calc_first, dfa);
1220   if (BE (ret != REG_NOERROR, 0))
1221     return ret;
1222   preorder (dfa->str_tree, calc_next, dfa);
1223   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1224   if (BE (ret != REG_NOERROR, 0))
1225     return ret;
1226   ret = calc_eclosure (dfa);
1227   if (BE (ret != REG_NOERROR, 0))
1228     return ret;
1229 
1230   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1231      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1232   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1233       || dfa->nbackref)
1234     {
1235       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1236       if (BE (dfa->inveclosures == NULL, 0))
1237 	return REG_ESPACE;
1238       ret = calc_inveclosure (dfa);
1239     }
1240 
1241   return ret;
1242 }
1243 
1244 /* Our parse trees are very unbalanced, so we cannot use a stack to
1245    implement parse tree visits.  Instead, we use parent pointers and
1246    some hairy code in these two functions.  */
1247 static reg_errcode_t
1248 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1249 	   void *extra)
1250 {
1251   bin_tree_t *node, *prev;
1252 
1253   for (node = root; ; )
1254     {
1255       /* Descend down the tree, preferably to the left (or to the right
1256 	 if that's the only child).  */
1257       while (node->left || node->right)
1258 	if (node->left)
1259 	  node = node->left;
1260 	else
1261 	  node = node->right;
1262 
1263       do
1264 	{
1265 	  reg_errcode_t err = fn (extra, node);
1266 	  if (BE (err != REG_NOERROR, 0))
1267 	    return err;
1268 	  if (node->parent == NULL)
1269 	    return REG_NOERROR;
1270 	  prev = node;
1271 	  node = node->parent;
1272 	}
1273       /* Go up while we have a node that is reached from the right.  */
1274       while (node->right == prev || node->right == NULL);
1275       node = node->right;
1276     }
1277 }
1278 
1279 static reg_errcode_t
1280 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1281 	  void *extra)
1282 {
1283   bin_tree_t *node;
1284 
1285   for (node = root; ; )
1286     {
1287       reg_errcode_t err = fn (extra, node);
1288       if (BE (err != REG_NOERROR, 0))
1289 	return err;
1290 
1291       /* Go to the left node, or up and to the right.  */
1292       if (node->left)
1293 	node = node->left;
1294       else
1295 	{
1296 	  bin_tree_t *prev = NULL;
1297 	  while (node->right == prev || node->right == NULL)
1298 	    {
1299 	      prev = node;
1300 	      node = node->parent;
1301 	      if (!node)
1302 		return REG_NOERROR;
1303 	    }
1304 	  node = node->right;
1305 	}
1306     }
1307 }
1308 
1309 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1310    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1311    backreferences as well.  Requires a preorder visit.  */
1312 static reg_errcode_t
1313 optimize_subexps (void *extra, bin_tree_t *node)
1314 {
1315   re_dfa_t *dfa = (re_dfa_t *) extra;
1316 
1317   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1318     {
1319       int idx = node->token.opr.idx;
1320       node->token.opr.idx = dfa->subexp_map[idx];
1321       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1322     }
1323 
1324   else if (node->token.type == SUBEXP
1325 	   && node->left && node->left->token.type == SUBEXP)
1326     {
1327       Idx other_idx = node->left->token.opr.idx;
1328 
1329       node->left = node->left->left;
1330       if (node->left)
1331 	node->left->parent = node;
1332 
1333       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1334       if (other_idx < BITSET_WORD_BITS)
1335 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1336     }
1337 
1338   return REG_NOERROR;
1339 }
1340 
1341 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1342    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1343 static reg_errcode_t
1344 lower_subexps (void *extra, bin_tree_t *node)
1345 {
1346   regex_t *preg = (regex_t *) extra;
1347   reg_errcode_t err = REG_NOERROR;
1348 
1349   if (node->left && node->left->token.type == SUBEXP)
1350     {
1351       node->left = lower_subexp (&err, preg, node->left);
1352       if (node->left)
1353 	node->left->parent = node;
1354     }
1355   if (node->right && node->right->token.type == SUBEXP)
1356     {
1357       node->right = lower_subexp (&err, preg, node->right);
1358       if (node->right)
1359 	node->right->parent = node;
1360     }
1361 
1362   return err;
1363 }
1364 
1365 static bin_tree_t *
1366 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1367 {
1368   re_dfa_t *dfa = preg->buffer;
1369   bin_tree_t *body = node->left;
1370   bin_tree_t *op, *cls, *tree1, *tree;
1371 
1372   if (preg->no_sub
1373       /* We do not optimize empty subexpressions, because otherwise we may
1374 	 have bad CONCAT nodes with NULL children.  This is obviously not
1375 	 very common, so we do not lose much.  An example that triggers
1376 	 this case is the sed "script" /\(\)/x.  */
1377       && node->left != NULL
1378       && (node->token.opr.idx >= BITSET_WORD_BITS
1379 	  || !(dfa->used_bkref_map
1380 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1381     return node->left;
1382 
1383   /* Convert the SUBEXP node to the concatenation of an
1384      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1385   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1386   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1387   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1388   tree = create_tree (dfa, op, tree1, CONCAT);
1389   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1390     {
1391       *err = REG_ESPACE;
1392       return NULL;
1393     }
1394 
1395   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1396   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1397   return tree;
1398 }
1399 
1400 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1401    nodes.  Requires a postorder visit.  */
1402 static reg_errcode_t
1403 calc_first (void *extra, bin_tree_t *node)
1404 {
1405   re_dfa_t *dfa = (re_dfa_t *) extra;
1406   if (node->token.type == CONCAT)
1407     {
1408       node->first = node->left->first;
1409       node->node_idx = node->left->node_idx;
1410     }
1411   else
1412     {
1413       node->first = node;
1414       node->node_idx = re_dfa_add_node (dfa, node->token);
1415       if (BE (node->node_idx == REG_MISSING, 0))
1416 	return REG_ESPACE;
1417       if (node->token.type == ANCHOR)
1418 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1419     }
1420   return REG_NOERROR;
1421 }
1422 
1423 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1424 static reg_errcode_t
1425 calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
1426 {
1427   switch (node->token.type)
1428     {
1429     case OP_DUP_ASTERISK:
1430       node->left->next = node;
1431       break;
1432     case CONCAT:
1433       node->left->next = node->right->first;
1434       node->right->next = node->next;
1435       break;
1436     default:
1437       if (node->left)
1438 	node->left->next = node->next;
1439       if (node->right)
1440 	node->right->next = node->next;
1441       break;
1442     }
1443   return REG_NOERROR;
1444 }
1445 
1446 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1447 static reg_errcode_t
1448 link_nfa_nodes (void *extra, bin_tree_t *node)
1449 {
1450   re_dfa_t *dfa = (re_dfa_t *) extra;
1451   Idx idx = node->node_idx;
1452   reg_errcode_t err = REG_NOERROR;
1453 
1454   switch (node->token.type)
1455     {
1456     case CONCAT:
1457       break;
1458 
1459     case END_OF_RE:
1460       assert (node->next == NULL);
1461       break;
1462 
1463     case OP_DUP_ASTERISK:
1464     case OP_ALT:
1465       {
1466 	Idx left, right;
1467 	dfa->has_plural_match = 1;
1468 	if (node->left != NULL)
1469 	  left = node->left->first->node_idx;
1470 	else
1471 	  left = node->next->node_idx;
1472 	if (node->right != NULL)
1473 	  right = node->right->first->node_idx;
1474 	else
1475 	  right = node->next->node_idx;
1476 	assert (REG_VALID_INDEX (left));
1477 	assert (REG_VALID_INDEX (right));
1478 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1479       }
1480       break;
1481 
1482     case ANCHOR:
1483     case OP_OPEN_SUBEXP:
1484     case OP_CLOSE_SUBEXP:
1485       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1486       break;
1487 
1488     case OP_BACK_REF:
1489       dfa->nexts[idx] = node->next->node_idx;
1490       if (node->token.type == OP_BACK_REF)
1491 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1492       break;
1493 
1494     default:
1495       assert (!IS_EPSILON_NODE (node->token.type));
1496       dfa->nexts[idx] = node->next->node_idx;
1497       break;
1498     }
1499 
1500   return err;
1501 }
1502 
1503 /* Duplicate the epsilon closure of the node ROOT_NODE.
1504    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1505    to their own constraint.  */
1506 
1507 static reg_errcode_t
1508 internal_function
1509 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1510 			Idx root_node, unsigned int init_constraint)
1511 {
1512   Idx org_node, clone_node;
1513   bool ok;
1514   unsigned int constraint = init_constraint;
1515   for (org_node = top_org_node, clone_node = top_clone_node;;)
1516     {
1517       Idx org_dest, clone_dest;
1518       if (dfa->nodes[org_node].type == OP_BACK_REF)
1519 	{
1520 	  /* If the back reference epsilon-transit, its destination must
1521 	     also have the constraint.  Then duplicate the epsilon closure
1522 	     of the destination of the back reference, and store it in
1523 	     edests of the back reference.  */
1524 	  org_dest = dfa->nexts[org_node];
1525 	  re_node_set_empty (dfa->edests + clone_node);
1526 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1527 	  if (BE (clone_dest == REG_MISSING, 0))
1528 	    return REG_ESPACE;
1529 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1530 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1531 	  if (BE (! ok, 0))
1532 	    return REG_ESPACE;
1533 	}
1534       else if (dfa->edests[org_node].nelem == 0)
1535 	{
1536 	  /* In case of the node can't epsilon-transit, don't duplicate the
1537 	     destination and store the original destination as the
1538 	     destination of the node.  */
1539 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1540 	  break;
1541 	}
1542       else if (dfa->edests[org_node].nelem == 1)
1543 	{
1544 	  /* In case of the node can epsilon-transit, and it has only one
1545 	     destination.  */
1546 	  org_dest = dfa->edests[org_node].elems[0];
1547 	  re_node_set_empty (dfa->edests + clone_node);
1548 	  /* If the node is root_node itself, it means the epsilon closure
1549 	     has a loop.  Then tie it to the destination of the root_node.  */
1550 	  if (org_node == root_node && clone_node != org_node)
1551 	    {
1552 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1553 	      if (BE (! ok, 0))
1554 	        return REG_ESPACE;
1555 	      break;
1556 	    }
1557 	  /* In case the node has another constraint, append it.  */
1558 	  constraint |= dfa->nodes[org_node].constraint;
1559 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1560 	  if (BE (clone_dest == REG_MISSING, 0))
1561 	    return REG_ESPACE;
1562 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563 	  if (BE (! ok, 0))
1564 	    return REG_ESPACE;
1565 	}
1566       else /* dfa->edests[org_node].nelem == 2 */
1567 	{
1568 	  /* In case of the node can epsilon-transit, and it has two
1569 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1570 	  org_dest = dfa->edests[org_node].elems[0];
1571 	  re_node_set_empty (dfa->edests + clone_node);
1572 	  /* Search for a duplicated node which satisfies the constraint.  */
1573 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1574 	  if (clone_dest == REG_MISSING)
1575 	    {
1576 	      /* There is no such duplicated node, create a new one.  */
1577 	      reg_errcode_t err;
1578 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1579 	      if (BE (clone_dest == REG_MISSING, 0))
1580 		return REG_ESPACE;
1581 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582 	      if (BE (! ok, 0))
1583 		return REG_ESPACE;
1584 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1585 					    root_node, constraint);
1586 	      if (BE (err != REG_NOERROR, 0))
1587 		return err;
1588 	    }
1589 	  else
1590 	    {
1591 	      /* There is a duplicated node which satisfies the constraint,
1592 		 use it to avoid infinite loop.  */
1593 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1594 	      if (BE (! ok, 0))
1595 		return REG_ESPACE;
1596 	    }
1597 
1598 	  org_dest = dfa->edests[org_node].elems[1];
1599 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1600 	  if (BE (clone_dest == REG_MISSING, 0))
1601 	    return REG_ESPACE;
1602 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1603 	  if (BE (! ok, 0))
1604 	    return REG_ESPACE;
1605 	}
1606       org_node = org_dest;
1607       clone_node = clone_dest;
1608     }
1609   return REG_NOERROR;
1610 }
1611 
1612 /* Search for a node which is duplicated from the node ORG_NODE, and
1613    satisfies the constraint CONSTRAINT.  */
1614 
1615 static Idx
1616 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1617 			unsigned int constraint)
1618 {
1619   Idx idx;
1620   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1621     {
1622       if (org_node == dfa->org_indices[idx]
1623 	  && constraint == dfa->nodes[idx].constraint)
1624 	return idx; /* Found.  */
1625     }
1626   return REG_MISSING; /* Not found.  */
1627 }
1628 
1629 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1630    Return the index of the new node, or REG_MISSING if insufficient storage is
1631    available.  */
1632 
1633 static Idx
1634 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1635 {
1636   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1637   if (BE (dup_idx != REG_MISSING, 1))
1638     {
1639       dfa->nodes[dup_idx].constraint = constraint;
1640       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1641       dfa->nodes[dup_idx].duplicated = 1;
1642 
1643       /* Store the index of the original node.  */
1644       dfa->org_indices[dup_idx] = org_idx;
1645     }
1646   return dup_idx;
1647 }
1648 
1649 static reg_errcode_t
1650 calc_inveclosure (re_dfa_t *dfa)
1651 {
1652   Idx src, idx;
1653   bool ok;
1654   for (idx = 0; idx < dfa->nodes_len; ++idx)
1655     re_node_set_init_empty (dfa->inveclosures + idx);
1656 
1657   for (src = 0; src < dfa->nodes_len; ++src)
1658     {
1659       Idx *elems = dfa->eclosures[src].elems;
1660       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1661 	{
1662 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1663 	  if (BE (! ok, 0))
1664 	    return REG_ESPACE;
1665 	}
1666     }
1667 
1668   return REG_NOERROR;
1669 }
1670 
1671 /* Calculate "eclosure" for all the node in DFA.  */
1672 
1673 static reg_errcode_t
1674 calc_eclosure (re_dfa_t *dfa)
1675 {
1676   Idx node_idx;
1677   bool incomplete;
1678 #ifdef DEBUG
1679   assert (dfa->nodes_len > 0);
1680 #endif
1681   incomplete = false;
1682   /* For each nodes, calculate epsilon closure.  */
1683   for (node_idx = 0; ; ++node_idx)
1684     {
1685       reg_errcode_t err;
1686       re_node_set eclosure_elem;
1687       if (node_idx == dfa->nodes_len)
1688 	{
1689 	  if (!incomplete)
1690 	    break;
1691 	  incomplete = false;
1692 	  node_idx = 0;
1693 	}
1694 
1695 #ifdef DEBUG
1696       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1697 #endif
1698 
1699       /* If we have already calculated, skip it.  */
1700       if (dfa->eclosures[node_idx].nelem != 0)
1701 	continue;
1702       /* Calculate epsilon closure of 'node_idx'.  */
1703       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1704       if (BE (err != REG_NOERROR, 0))
1705 	return err;
1706 
1707       if (dfa->eclosures[node_idx].nelem == 0)
1708 	{
1709 	  incomplete = true;
1710 	  re_node_set_free (&eclosure_elem);
1711 	}
1712     }
1713   return REG_NOERROR;
1714 }
1715 
1716 /* Calculate epsilon closure of NODE.  */
1717 
1718 static reg_errcode_t
1719 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1720 {
1721   reg_errcode_t err;
1722   Idx i;
1723   re_node_set eclosure;
1724   bool ok;
1725   bool incomplete = false;
1726   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1727   if (BE (err != REG_NOERROR, 0))
1728     return err;
1729 
1730   /* This indicates that we are calculating this node now.
1731      We reference this value to avoid infinite loop.  */
1732   dfa->eclosures[node].nelem = REG_MISSING;
1733 
1734   /* If the current node has constraints, duplicate all nodes
1735      since they must inherit the constraints.  */
1736   if (dfa->nodes[node].constraint
1737       && dfa->edests[node].nelem
1738       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1739     {
1740       err = duplicate_node_closure (dfa, node, node, node,
1741 				    dfa->nodes[node].constraint);
1742       if (BE (err != REG_NOERROR, 0))
1743 	return err;
1744     }
1745 
1746   /* Expand each epsilon destination nodes.  */
1747   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1748     for (i = 0; i < dfa->edests[node].nelem; ++i)
1749       {
1750 	re_node_set eclosure_elem;
1751 	Idx edest = dfa->edests[node].elems[i];
1752 	/* If calculating the epsilon closure of 'edest' is in progress,
1753 	   return intermediate result.  */
1754 	if (dfa->eclosures[edest].nelem == REG_MISSING)
1755 	  {
1756 	    incomplete = true;
1757 	    continue;
1758 	  }
1759 	/* If we haven't calculated the epsilon closure of 'edest' yet,
1760 	   calculate now. Otherwise use calculated epsilon closure.  */
1761 	if (dfa->eclosures[edest].nelem == 0)
1762 	  {
1763 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1764 	    if (BE (err != REG_NOERROR, 0))
1765 	      return err;
1766 	  }
1767 	else
1768 	  eclosure_elem = dfa->eclosures[edest];
1769 	/* Merge the epsilon closure of 'edest'.  */
1770 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1771 	if (BE (err != REG_NOERROR, 0))
1772 	  return err;
1773 	/* If the epsilon closure of 'edest' is incomplete,
1774 	   the epsilon closure of this node is also incomplete.  */
1775 	if (dfa->eclosures[edest].nelem == 0)
1776 	  {
1777 	    incomplete = true;
1778 	    re_node_set_free (&eclosure_elem);
1779 	  }
1780       }
1781 
1782   /* An epsilon closure includes itself.  */
1783   ok = re_node_set_insert (&eclosure, node);
1784   if (BE (! ok, 0))
1785     return REG_ESPACE;
1786   if (incomplete && !root)
1787     dfa->eclosures[node].nelem = 0;
1788   else
1789     dfa->eclosures[node] = eclosure;
1790   *new_set = eclosure;
1791   return REG_NOERROR;
1792 }
1793 
1794 /* Functions for token which are used in the parser.  */
1795 
1796 /* Fetch a token from INPUT.
1797    We must not use this function inside bracket expressions.  */
1798 
1799 static void
1800 internal_function
1801 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1802 {
1803   re_string_skip_bytes (input, peek_token (result, input, syntax));
1804 }
1805 
1806 /* Peek a token from INPUT, and return the length of the token.
1807    We must not use this function inside bracket expressions.  */
1808 
1809 static int
1810 internal_function
1811 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1812 {
1813   unsigned char c;
1814 
1815   if (re_string_eoi (input))
1816     {
1817       token->type = END_OF_RE;
1818       return 0;
1819     }
1820 
1821   c = re_string_peek_byte (input, 0);
1822   token->opr.c = c;
1823 
1824   token->word_char = 0;
1825 #ifdef RE_ENABLE_I18N
1826   token->mb_partial = 0;
1827   if (input->mb_cur_max > 1 &&
1828       !re_string_first_byte (input, re_string_cur_idx (input)))
1829     {
1830       token->type = CHARACTER;
1831       token->mb_partial = 1;
1832       return 1;
1833     }
1834 #endif
1835   if (c == '\\')
1836     {
1837       unsigned char c2;
1838       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1839 	{
1840 	  token->type = BACK_SLASH;
1841 	  return 1;
1842 	}
1843 
1844       c2 = re_string_peek_byte_case (input, 1);
1845       token->opr.c = c2;
1846       token->type = CHARACTER;
1847 #ifdef RE_ENABLE_I18N
1848       if (input->mb_cur_max > 1)
1849 	{
1850 	  wint_t wc = re_string_wchar_at (input,
1851 					  re_string_cur_idx (input) + 1);
1852 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1853 	}
1854       else
1855 #endif
1856 	token->word_char = IS_WORD_CHAR (c2) != 0;
1857 
1858       switch (c2)
1859 	{
1860 	case '|':
1861 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1862 	    token->type = OP_ALT;
1863 	  break;
1864 	case '1': case '2': case '3': case '4': case '5':
1865 	case '6': case '7': case '8': case '9':
1866 	  if (!(syntax & RE_NO_BK_REFS))
1867 	    {
1868 	      token->type = OP_BACK_REF;
1869 	      token->opr.idx = c2 - '1';
1870 	    }
1871 	  break;
1872 	case '<':
1873 	  if (!(syntax & RE_NO_GNU_OPS))
1874 	    {
1875 	      token->type = ANCHOR;
1876 	      token->opr.ctx_type = WORD_FIRST;
1877 	    }
1878 	  break;
1879 	case '>':
1880 	  if (!(syntax & RE_NO_GNU_OPS))
1881 	    {
1882 	      token->type = ANCHOR;
1883 	      token->opr.ctx_type = WORD_LAST;
1884 	    }
1885 	  break;
1886 	case 'b':
1887 	  if (!(syntax & RE_NO_GNU_OPS))
1888 	    {
1889 	      token->type = ANCHOR;
1890 	      token->opr.ctx_type = WORD_DELIM;
1891 	    }
1892 	  break;
1893 	case 'B':
1894 	  if (!(syntax & RE_NO_GNU_OPS))
1895 	    {
1896 	      token->type = ANCHOR;
1897 	      token->opr.ctx_type = NOT_WORD_DELIM;
1898 	    }
1899 	  break;
1900 	case 'w':
1901 	  if (!(syntax & RE_NO_GNU_OPS))
1902 	    token->type = OP_WORD;
1903 	  break;
1904 	case 'W':
1905 	  if (!(syntax & RE_NO_GNU_OPS))
1906 	    token->type = OP_NOTWORD;
1907 	  break;
1908 	case 's':
1909 	  if (!(syntax & RE_NO_GNU_OPS))
1910 	    token->type = OP_SPACE;
1911 	  break;
1912 	case 'S':
1913 	  if (!(syntax & RE_NO_GNU_OPS))
1914 	    token->type = OP_NOTSPACE;
1915 	  break;
1916 	case '`':
1917 	  if (!(syntax & RE_NO_GNU_OPS))
1918 	    {
1919 	      token->type = ANCHOR;
1920 	      token->opr.ctx_type = BUF_FIRST;
1921 	    }
1922 	  break;
1923 	case '\'':
1924 	  if (!(syntax & RE_NO_GNU_OPS))
1925 	    {
1926 	      token->type = ANCHOR;
1927 	      token->opr.ctx_type = BUF_LAST;
1928 	    }
1929 	  break;
1930 	case '(':
1931 	  if (!(syntax & RE_NO_BK_PARENS))
1932 	    token->type = OP_OPEN_SUBEXP;
1933 	  break;
1934 	case ')':
1935 	  if (!(syntax & RE_NO_BK_PARENS))
1936 	    token->type = OP_CLOSE_SUBEXP;
1937 	  break;
1938 	case '+':
1939 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1940 	    token->type = OP_DUP_PLUS;
1941 	  break;
1942 	case '?':
1943 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1944 	    token->type = OP_DUP_QUESTION;
1945 	  break;
1946 	case '{':
1947 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1948 	    token->type = OP_OPEN_DUP_NUM;
1949 	  break;
1950 	case '}':
1951 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1952 	    token->type = OP_CLOSE_DUP_NUM;
1953 	  break;
1954 	default:
1955 	  break;
1956 	}
1957       return 2;
1958     }
1959 
1960   token->type = CHARACTER;
1961 #ifdef RE_ENABLE_I18N
1962   if (input->mb_cur_max > 1)
1963     {
1964       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1965       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1966     }
1967   else
1968 #endif
1969     token->word_char = IS_WORD_CHAR (token->opr.c);
1970 
1971   switch (c)
1972     {
1973     case '\n':
1974       if (syntax & RE_NEWLINE_ALT)
1975 	token->type = OP_ALT;
1976       break;
1977     case '|':
1978       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1979 	token->type = OP_ALT;
1980       break;
1981     case '*':
1982       token->type = OP_DUP_ASTERISK;
1983       break;
1984     case '+':
1985       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1986 	token->type = OP_DUP_PLUS;
1987       break;
1988     case '?':
1989       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1990 	token->type = OP_DUP_QUESTION;
1991       break;
1992     case '{':
1993       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1994 	token->type = OP_OPEN_DUP_NUM;
1995       break;
1996     case '}':
1997       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1998 	token->type = OP_CLOSE_DUP_NUM;
1999       break;
2000     case '(':
2001       if (syntax & RE_NO_BK_PARENS)
2002 	token->type = OP_OPEN_SUBEXP;
2003       break;
2004     case ')':
2005       if (syntax & RE_NO_BK_PARENS)
2006 	token->type = OP_CLOSE_SUBEXP;
2007       break;
2008     case '[':
2009       token->type = OP_OPEN_BRACKET;
2010       break;
2011     case '.':
2012       token->type = OP_PERIOD;
2013       break;
2014     case '^':
2015       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2016 	  re_string_cur_idx (input) != 0)
2017 	{
2018 	  char prev = re_string_peek_byte (input, -1);
2019 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2020 	    break;
2021 	}
2022       token->type = ANCHOR;
2023       token->opr.ctx_type = LINE_FIRST;
2024       break;
2025     case '$':
2026       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2027 	  re_string_cur_idx (input) + 1 != re_string_length (input))
2028 	{
2029 	  re_token_t next;
2030 	  re_string_skip_bytes (input, 1);
2031 	  peek_token (&next, input, syntax);
2032 	  re_string_skip_bytes (input, -1);
2033 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2034 	    break;
2035 	}
2036       token->type = ANCHOR;
2037       token->opr.ctx_type = LINE_LAST;
2038       break;
2039     default:
2040       break;
2041     }
2042   return 1;
2043 }
2044 
2045 /* Peek a token from INPUT, and return the length of the token.
2046    We must not use this function out of bracket expressions.  */
2047 
2048 static int
2049 internal_function
2050 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2051 {
2052   unsigned char c;
2053   if (re_string_eoi (input))
2054     {
2055       token->type = END_OF_RE;
2056       return 0;
2057     }
2058   c = re_string_peek_byte (input, 0);
2059   token->opr.c = c;
2060 
2061 #ifdef RE_ENABLE_I18N
2062   if (input->mb_cur_max > 1 &&
2063       !re_string_first_byte (input, re_string_cur_idx (input)))
2064     {
2065       token->type = CHARACTER;
2066       return 1;
2067     }
2068 #endif /* RE_ENABLE_I18N */
2069 
2070   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2071       && re_string_cur_idx (input) + 1 < re_string_length (input))
2072     {
2073       /* In this case, '\' escape a character.  */
2074       unsigned char c2;
2075       re_string_skip_bytes (input, 1);
2076       c2 = re_string_peek_byte (input, 0);
2077       token->opr.c = c2;
2078       token->type = CHARACTER;
2079       return 1;
2080     }
2081   if (c == '[') /* '[' is a special char in a bracket exps.  */
2082     {
2083       unsigned char c2;
2084       int token_len;
2085       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2086 	c2 = re_string_peek_byte (input, 1);
2087       else
2088 	c2 = 0;
2089       token->opr.c = c2;
2090       token_len = 2;
2091       switch (c2)
2092 	{
2093 	case '.':
2094 	  token->type = OP_OPEN_COLL_ELEM;
2095 	  break;
2096 	case '=':
2097 	  token->type = OP_OPEN_EQUIV_CLASS;
2098 	  break;
2099 	case ':':
2100 	  if (syntax & RE_CHAR_CLASSES)
2101 	    {
2102 	      token->type = OP_OPEN_CHAR_CLASS;
2103 	      break;
2104 	    }
2105 	  /* else fall through.  */
2106 	default:
2107 	  token->type = CHARACTER;
2108 	  token->opr.c = c;
2109 	  token_len = 1;
2110 	  break;
2111 	}
2112       return token_len;
2113     }
2114   switch (c)
2115     {
2116     case '-':
2117       token->type = OP_CHARSET_RANGE;
2118       break;
2119     case ']':
2120       token->type = OP_CLOSE_BRACKET;
2121       break;
2122     case '^':
2123       token->type = OP_NON_MATCH_LIST;
2124       break;
2125     default:
2126       token->type = CHARACTER;
2127     }
2128   return 1;
2129 }
2130 
2131 /* Functions for parser.  */
2132 
2133 /* Entry point of the parser.
2134    Parse the regular expression REGEXP and return the structure tree.
2135    If an error occurs, ERR is set by error code, and return NULL.
2136    This function build the following tree, from regular expression <reg_exp>:
2137 	   CAT
2138 	   / \
2139 	  /   \
2140    <reg_exp>  EOR
2141 
2142    CAT means concatenation.
2143    EOR means end of regular expression.  */
2144 
2145 static bin_tree_t *
2146 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2147        reg_errcode_t *err)
2148 {
2149   re_dfa_t *dfa = preg->buffer;
2150   bin_tree_t *tree, *eor, *root;
2151   re_token_t current_token;
2152   dfa->syntax = syntax;
2153   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2154   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2155   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2156     return NULL;
2157   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2158   if (tree != NULL)
2159     root = create_tree (dfa, tree, eor, CONCAT);
2160   else
2161     root = eor;
2162   if (BE (eor == NULL || root == NULL, 0))
2163     {
2164       *err = REG_ESPACE;
2165       return NULL;
2166     }
2167   return root;
2168 }
2169 
2170 /* This function build the following tree, from regular expression
2171    <branch1>|<branch2>:
2172 	   ALT
2173 	   / \
2174 	  /   \
2175    <branch1> <branch2>
2176 
2177    ALT means alternative, which represents the operator '|'.  */
2178 
2179 static bin_tree_t *
2180 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2181 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2182 {
2183   re_dfa_t *dfa = preg->buffer;
2184   bin_tree_t *tree, *branch = NULL;
2185   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2186   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2187     return NULL;
2188 
2189   while (token->type == OP_ALT)
2190     {
2191       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2192       if (token->type != OP_ALT && token->type != END_OF_RE
2193 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194 	{
2195 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2196 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2197 	    return NULL;
2198 	}
2199       else
2200 	branch = NULL;
2201       tree = create_tree (dfa, tree, branch, OP_ALT);
2202       if (BE (tree == NULL, 0))
2203 	{
2204 	  *err = REG_ESPACE;
2205 	  return NULL;
2206 	}
2207     }
2208   return tree;
2209 }
2210 
2211 /* This function build the following tree, from regular expression
2212    <exp1><exp2>:
2213 	CAT
2214 	/ \
2215        /   \
2216    <exp1> <exp2>
2217 
2218    CAT means concatenation.  */
2219 
2220 static bin_tree_t *
2221 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2222 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2223 {
2224   bin_tree_t *tree, *expr;
2225   re_dfa_t *dfa = preg->buffer;
2226   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2227   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2228     return NULL;
2229 
2230   while (token->type != OP_ALT && token->type != END_OF_RE
2231 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2232     {
2233       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2234       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2235 	{
2236 	  if (tree != NULL)
2237 	    postorder (tree, free_tree, NULL);
2238 	  return NULL;
2239 	}
2240       if (tree != NULL && expr != NULL)
2241 	{
2242 	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2243 	  if (newtree == NULL)
2244 	    {
2245 	      postorder (expr, free_tree, NULL);
2246 	      postorder (tree, free_tree, NULL);
2247 	      *err = REG_ESPACE;
2248 	      return NULL;
2249 	    }
2250 	  tree = newtree;
2251 	}
2252       else if (tree == NULL)
2253 	tree = expr;
2254       /* Otherwise expr == NULL, we don't need to create new tree.  */
2255     }
2256   return tree;
2257 }
2258 
2259 /* This function build the following tree, from regular expression a*:
2260 	 *
2261 	 |
2262 	 a
2263 */
2264 
2265 static bin_tree_t *
2266 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2267 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2268 {
2269   re_dfa_t *dfa = preg->buffer;
2270   bin_tree_t *tree;
2271   switch (token->type)
2272     {
2273     case CHARACTER:
2274       tree = create_token_tree (dfa, NULL, NULL, token);
2275       if (BE (tree == NULL, 0))
2276 	{
2277 	  *err = REG_ESPACE;
2278 	  return NULL;
2279 	}
2280 #ifdef RE_ENABLE_I18N
2281       if (dfa->mb_cur_max > 1)
2282 	{
2283 	  while (!re_string_eoi (regexp)
2284 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2285 	    {
2286 	      bin_tree_t *mbc_remain;
2287 	      fetch_token (token, regexp, syntax);
2288 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2289 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2290 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2291 		{
2292 		  *err = REG_ESPACE;
2293 		  return NULL;
2294 		}
2295 	    }
2296 	}
2297 #endif
2298       break;
2299     case OP_OPEN_SUBEXP:
2300       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2301       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2302 	return NULL;
2303       break;
2304     case OP_OPEN_BRACKET:
2305       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2306       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2307 	return NULL;
2308       break;
2309     case OP_BACK_REF:
2310       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2311 	{
2312 	  *err = REG_ESUBREG;
2313 	  return NULL;
2314 	}
2315       dfa->used_bkref_map |= 1 << token->opr.idx;
2316       tree = create_token_tree (dfa, NULL, NULL, token);
2317       if (BE (tree == NULL, 0))
2318 	{
2319 	  *err = REG_ESPACE;
2320 	  return NULL;
2321 	}
2322       ++dfa->nbackref;
2323       dfa->has_mb_node = 1;
2324       break;
2325     case OP_OPEN_DUP_NUM:
2326       if (syntax & RE_CONTEXT_INVALID_DUP)
2327 	{
2328 	  *err = REG_BADRPT;
2329 	  return NULL;
2330 	}
2331       /* FALLTHROUGH */
2332     case OP_DUP_ASTERISK:
2333     case OP_DUP_PLUS:
2334     case OP_DUP_QUESTION:
2335       if (syntax & RE_CONTEXT_INVALID_OPS)
2336 	{
2337 	  *err = REG_BADRPT;
2338 	  return NULL;
2339 	}
2340       else if (syntax & RE_CONTEXT_INDEP_OPS)
2341 	{
2342 	  fetch_token (token, regexp, syntax);
2343 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2344 	}
2345       /* else fall through  */
2346     case OP_CLOSE_SUBEXP:
2347       if ((token->type == OP_CLOSE_SUBEXP) &&
2348 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2349 	{
2350 	  *err = REG_ERPAREN;
2351 	  return NULL;
2352 	}
2353       /* else fall through  */
2354     case OP_CLOSE_DUP_NUM:
2355       /* We treat it as a normal character.  */
2356 
2357       /* Then we can these characters as normal characters.  */
2358       token->type = CHARACTER;
2359       /* mb_partial and word_char bits should be initialized already
2360 	 by peek_token.  */
2361       tree = create_token_tree (dfa, NULL, NULL, token);
2362       if (BE (tree == NULL, 0))
2363 	{
2364 	  *err = REG_ESPACE;
2365 	  return NULL;
2366 	}
2367       break;
2368     case ANCHOR:
2369       if ((token->opr.ctx_type
2370 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2371 	  && dfa->word_ops_used == 0)
2372 	init_word_char (dfa);
2373       if (token->opr.ctx_type == WORD_DELIM
2374 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2375 	{
2376 	  bin_tree_t *tree_first, *tree_last;
2377 	  if (token->opr.ctx_type == WORD_DELIM)
2378 	    {
2379 	      token->opr.ctx_type = WORD_FIRST;
2380 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2381 	      token->opr.ctx_type = WORD_LAST;
2382 	    }
2383 	  else
2384 	    {
2385 	      token->opr.ctx_type = INSIDE_WORD;
2386 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2387 	      token->opr.ctx_type = INSIDE_NOTWORD;
2388 	    }
2389 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2390 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2391 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2392 	    {
2393 	      *err = REG_ESPACE;
2394 	      return NULL;
2395 	    }
2396 	}
2397       else
2398 	{
2399 	  tree = create_token_tree (dfa, NULL, NULL, token);
2400 	  if (BE (tree == NULL, 0))
2401 	    {
2402 	      *err = REG_ESPACE;
2403 	      return NULL;
2404 	    }
2405 	}
2406       /* We must return here, since ANCHORs can't be followed
2407 	 by repetition operators.
2408 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2409 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2410       fetch_token (token, regexp, syntax);
2411       return tree;
2412     case OP_PERIOD:
2413       tree = create_token_tree (dfa, NULL, NULL, token);
2414       if (BE (tree == NULL, 0))
2415 	{
2416 	  *err = REG_ESPACE;
2417 	  return NULL;
2418 	}
2419       if (dfa->mb_cur_max > 1)
2420 	dfa->has_mb_node = 1;
2421       break;
2422     case OP_WORD:
2423     case OP_NOTWORD:
2424       tree = build_charclass_op (dfa, regexp->trans,
2425 				 "alnum",
2426 				 "_",
2427 				 token->type == OP_NOTWORD, err);
2428       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2429 	return NULL;
2430       break;
2431     case OP_SPACE:
2432     case OP_NOTSPACE:
2433       tree = build_charclass_op (dfa, regexp->trans,
2434 				 "space",
2435 				 "",
2436 				 token->type == OP_NOTSPACE, err);
2437       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2438 	return NULL;
2439       break;
2440     case OP_ALT:
2441     case END_OF_RE:
2442       return NULL;
2443     case BACK_SLASH:
2444       *err = REG_EESCAPE;
2445       return NULL;
2446     default:
2447       /* Must not happen?  */
2448 #ifdef DEBUG
2449       assert (0);
2450 #endif
2451       return NULL;
2452     }
2453   fetch_token (token, regexp, syntax);
2454 
2455   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2456 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2457     {
2458       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2459       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2460 	return NULL;
2461       /* In BRE consecutive duplications are not allowed.  */
2462       if ((syntax & RE_CONTEXT_INVALID_DUP)
2463 	  && (token->type == OP_DUP_ASTERISK
2464 	      || token->type == OP_OPEN_DUP_NUM))
2465 	{
2466 	  *err = REG_BADRPT;
2467 	  return NULL;
2468 	}
2469     }
2470 
2471   return tree;
2472 }
2473 
2474 /* This function build the following tree, from regular expression
2475    (<reg_exp>):
2476 	 SUBEXP
2477 	    |
2478 	<reg_exp>
2479 */
2480 
2481 static bin_tree_t *
2482 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2483 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2484 {
2485   re_dfa_t *dfa = preg->buffer;
2486   bin_tree_t *tree;
2487   size_t cur_nsub;
2488   cur_nsub = preg->re_nsub++;
2489 
2490   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2491 
2492   /* The subexpression may be a null string.  */
2493   if (token->type == OP_CLOSE_SUBEXP)
2494     tree = NULL;
2495   else
2496     {
2497       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2498       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2499 	{
2500 	  if (tree != NULL)
2501 	    postorder (tree, free_tree, NULL);
2502 	  *err = REG_EPAREN;
2503 	}
2504       if (BE (*err != REG_NOERROR, 0))
2505 	return NULL;
2506     }
2507 
2508   if (cur_nsub <= '9' - '1')
2509     dfa->completed_bkref_map |= 1 << cur_nsub;
2510 
2511   tree = create_tree (dfa, tree, NULL, SUBEXP);
2512   if (BE (tree == NULL, 0))
2513     {
2514       *err = REG_ESPACE;
2515       return NULL;
2516     }
2517   tree->token.opr.idx = cur_nsub;
2518   return tree;
2519 }
2520 
2521 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2522 
2523 static bin_tree_t *
2524 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2525 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2526 {
2527   bin_tree_t *tree = NULL, *old_tree = NULL;
2528   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2529   re_token_t start_token = *token;
2530 
2531   if (token->type == OP_OPEN_DUP_NUM)
2532     {
2533       end = 0;
2534       start = fetch_number (regexp, token, syntax);
2535       if (start == REG_MISSING)
2536 	{
2537 	  if (token->type == CHARACTER && token->opr.c == ',')
2538 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2539 	  else
2540 	    {
2541 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2542 	      return NULL;
2543 	    }
2544 	}
2545       if (BE (start != REG_ERROR, 1))
2546 	{
2547 	  /* We treat "{n}" as "{n,n}".  */
2548 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2549 		 : ((token->type == CHARACTER && token->opr.c == ',')
2550 		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2551 	}
2552       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2553 	{
2554 	  /* Invalid sequence.  */
2555 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2556 	    {
2557 	      if (token->type == END_OF_RE)
2558 		*err = REG_EBRACE;
2559 	      else
2560 		*err = REG_BADBR;
2561 
2562 	      return NULL;
2563 	    }
2564 
2565 	  /* If the syntax bit is set, rollback.  */
2566 	  re_string_set_index (regexp, start_idx);
2567 	  *token = start_token;
2568 	  token->type = CHARACTER;
2569 	  /* mb_partial and word_char bits should be already initialized by
2570 	     peek_token.  */
2571 	  return elem;
2572 	}
2573 
2574       if (BE ((end != REG_MISSING && start > end)
2575 	      || token->type != OP_CLOSE_DUP_NUM, 0))
2576 	{
2577 	  /* First number greater than second.  */
2578 	  *err = REG_BADBR;
2579 	  return NULL;
2580 	}
2581 
2582       if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2583 	{
2584 	  *err = REG_ESIZE;
2585 	  return NULL;
2586 	}
2587     }
2588   else
2589     {
2590       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2591       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2592     }
2593 
2594   fetch_token (token, regexp, syntax);
2595 
2596   if (BE (elem == NULL, 0))
2597     return NULL;
2598   if (BE (start == 0 && end == 0, 0))
2599     {
2600       postorder (elem, free_tree, NULL);
2601       return NULL;
2602     }
2603 
2604   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2605   if (BE (start > 0, 0))
2606     {
2607       tree = elem;
2608       for (i = 2; i <= start; ++i)
2609 	{
2610 	  elem = duplicate_tree (elem, dfa);
2611 	  tree = create_tree (dfa, tree, elem, CONCAT);
2612 	  if (BE (elem == NULL || tree == NULL, 0))
2613 	    goto parse_dup_op_espace;
2614 	}
2615 
2616       if (start == end)
2617 	return tree;
2618 
2619       /* Duplicate ELEM before it is marked optional.  */
2620       elem = duplicate_tree (elem, dfa);
2621       old_tree = tree;
2622     }
2623   else
2624     old_tree = NULL;
2625 
2626   if (elem->token.type == SUBEXP)
2627     {
2628       uintptr_t subidx = elem->token.opr.idx;
2629       postorder (elem, mark_opt_subexp, (void *) subidx);
2630     }
2631 
2632   tree = create_tree (dfa, elem, NULL,
2633 		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2634   if (BE (tree == NULL, 0))
2635     goto parse_dup_op_espace;
2636 
2637 /* From gnulib's "intprops.h":
2638    True if the arithmetic type T is signed.  */
2639 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2640 
2641   /* This loop is actually executed only when end != REG_MISSING,
2642      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2643      already created the start+1-th copy.  */
2644   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2645     for (i = start + 2; i <= end; ++i)
2646       {
2647 	elem = duplicate_tree (elem, dfa);
2648 	tree = create_tree (dfa, tree, elem, CONCAT);
2649 	if (BE (elem == NULL || tree == NULL, 0))
2650 	  goto parse_dup_op_espace;
2651 
2652 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2653 	if (BE (tree == NULL, 0))
2654 	  goto parse_dup_op_espace;
2655       }
2656 
2657   if (old_tree)
2658     tree = create_tree (dfa, old_tree, tree, CONCAT);
2659 
2660   return tree;
2661 
2662  parse_dup_op_espace:
2663   *err = REG_ESPACE;
2664   return NULL;
2665 }
2666 
2667 /* Size of the names for collating symbol/equivalence_class/character_class.
2668    I'm not sure, but maybe enough.  */
2669 #define BRACKET_NAME_BUF_SIZE 32
2670 
2671 #ifndef _LIBC
2672   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2673      Build the range expression which starts from START_ELEM, and ends
2674      at END_ELEM.  The result are written to MBCSET and SBCSET.
2675      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2676      mbcset->range_ends, is a pointer argument since we may
2677      update it.  */
2678 
2679 static reg_errcode_t
2680 internal_function
2681 # ifdef RE_ENABLE_I18N
2682 build_range_exp (const reg_syntax_t syntax,
2683                  bitset_t sbcset,
2684                  re_charset_t *mbcset,
2685                  Idx *range_alloc,
2686                  const bracket_elem_t *start_elem,
2687                  const bracket_elem_t *end_elem)
2688 # else /* not RE_ENABLE_I18N */
2689 build_range_exp (const reg_syntax_t syntax,
2690                  bitset_t sbcset,
2691                  const bracket_elem_t *start_elem,
2692                  const bracket_elem_t *end_elem)
2693 # endif /* not RE_ENABLE_I18N */
2694 {
2695   unsigned int start_ch, end_ch;
2696   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2697   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2698 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2699 	  0))
2700     return REG_ERANGE;
2701 
2702   /* We can handle no multi character collating elements without libc
2703      support.  */
2704   if (BE ((start_elem->type == COLL_SYM
2705 	   && strlen ((char *) start_elem->opr.name) > 1)
2706 	  || (end_elem->type == COLL_SYM
2707 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2708     return REG_ECOLLATE;
2709 
2710 # ifdef RE_ENABLE_I18N
2711   {
2712     wchar_t wc;
2713     wint_t start_wc;
2714     wint_t end_wc;
2715 
2716     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2717 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2718 		   : 0));
2719     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2720 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2721 		 : 0));
2722     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2723 		? __btowc (start_ch) : start_elem->opr.wch);
2724     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2725 	      ? __btowc (end_ch) : end_elem->opr.wch);
2726     if (start_wc == WEOF || end_wc == WEOF)
2727       return REG_ECOLLATE;
2728     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2729       return REG_ERANGE;
2730 
2731     /* Got valid collation sequence values, add them as a new entry.
2732        However, for !_LIBC we have no collation elements: if the
2733        character set is single byte, the single byte character set
2734        that we build below suffices.  parse_bracket_exp passes
2735        no MBCSET if dfa->mb_cur_max == 1.  */
2736     if (mbcset)
2737       {
2738 	/* Check the space of the arrays.  */
2739 	if (BE (*range_alloc == mbcset->nranges, 0))
2740 	  {
2741 	    /* There is not enough space, need realloc.  */
2742 	    wchar_t *new_array_start, *new_array_end;
2743 	    Idx new_nranges;
2744 
2745 	    /* +1 in case of mbcset->nranges is 0.  */
2746 	    new_nranges = 2 * mbcset->nranges + 1;
2747 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2748 	       are NULL if *range_alloc == 0.  */
2749 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2750 					  new_nranges);
2751 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2752 					new_nranges);
2753 
2754 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2755 	      return REG_ESPACE;
2756 
2757 	    mbcset->range_starts = new_array_start;
2758 	    mbcset->range_ends = new_array_end;
2759 	    *range_alloc = new_nranges;
2760 	  }
2761 
2762 	mbcset->range_starts[mbcset->nranges] = start_wc;
2763 	mbcset->range_ends[mbcset->nranges++] = end_wc;
2764       }
2765 
2766     /* Build the table for single byte characters.  */
2767     for (wc = 0; wc < SBC_MAX; ++wc)
2768       {
2769 	if (start_wc <= wc && wc <= end_wc)
2770 	  bitset_set (sbcset, wc);
2771       }
2772   }
2773 # else /* not RE_ENABLE_I18N */
2774   {
2775     unsigned int ch;
2776     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2777 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2778 		   : 0));
2779     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2780 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2781 		 : 0));
2782     if (start_ch > end_ch)
2783       return REG_ERANGE;
2784     /* Build the table for single byte characters.  */
2785     for (ch = 0; ch < SBC_MAX; ++ch)
2786       if (start_ch <= ch  && ch <= end_ch)
2787 	bitset_set (sbcset, ch);
2788   }
2789 # endif /* not RE_ENABLE_I18N */
2790   return REG_NOERROR;
2791 }
2792 #endif /* not _LIBC */
2793 
2794 #ifndef _LIBC
2795 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2796    Build the collating element which is represented by NAME.
2797    The result are written to MBCSET and SBCSET.
2798    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2799    pointer argument since we may update it.  */
2800 
2801 static reg_errcode_t
2802 internal_function
2803 # ifdef RE_ENABLE_I18N
2804 build_collating_symbol (bitset_t sbcset,
2805 			re_charset_t *mbcset _UNUSED_PARAMETER_,
2806 			Idx *coll_sym_alloc _UNUSED_PARAMETER_,
2807 			const unsigned char *name)
2808 # else /* not RE_ENABLE_I18N */
2809 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2810 # endif /* not RE_ENABLE_I18N */
2811 {
2812   size_t name_len = strlen ((const char *) name);
2813   if (BE (name_len != 1, 0))
2814     return REG_ECOLLATE;
2815   else
2816     {
2817       bitset_set (sbcset, name[0]);
2818       return REG_NOERROR;
2819     }
2820 }
2821 #endif /* not _LIBC */
2822 
2823 /* This function parse bracket expression like "[abc]", "[a-c]",
2824    "[[.a-a.]]" etc.  */
2825 
2826 static bin_tree_t *
2827 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2828 		   reg_syntax_t syntax, reg_errcode_t *err)
2829 {
2830 #ifdef _LIBC
2831   const unsigned char *collseqmb;
2832   const char *collseqwc;
2833   uint32_t nrules;
2834   int32_t table_size;
2835   const int32_t *symb_table;
2836   const unsigned char *extra;
2837 
2838   /* Local function for parse_bracket_exp used in _LIBC environment.
2839      Seek the collating symbol entry corresponding to NAME.
2840      Return the index of the symbol in the SYMB_TABLE,
2841      or -1 if not found.  */
2842 
2843   auto inline int32_t
2844   __attribute__ ((always_inline))
2845   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2846     {
2847       int32_t elem;
2848 
2849       for (elem = 0; elem < table_size; elem++)
2850 	if (symb_table[2 * elem] != 0)
2851 	  {
2852 	    int32_t idx = symb_table[2 * elem + 1];
2853 	    /* Skip the name of collating element name.  */
2854 	    idx += 1 + extra[idx];
2855 	    if (/* Compare the length of the name.  */
2856 		name_len == extra[idx]
2857 		/* Compare the name.  */
2858 		&& memcmp (name, &extra[idx + 1], name_len) == 0)
2859 	      /* Yep, this is the entry.  */
2860 	      return elem;
2861 	  }
2862       return -1;
2863     }
2864 
2865   /* Local function for parse_bracket_exp used in _LIBC environment.
2866      Look up the collation sequence value of BR_ELEM.
2867      Return the value if succeeded, UINT_MAX otherwise.  */
2868 
2869   auto inline unsigned int
2870   __attribute__ ((always_inline))
2871   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2872     {
2873       if (br_elem->type == SB_CHAR)
2874 	{
2875 	  /*
2876 	  if (MB_CUR_MAX == 1)
2877 	  */
2878 	  if (nrules == 0)
2879 	    return collseqmb[br_elem->opr.ch];
2880 	  else
2881 	    {
2882 	      wint_t wc = __btowc (br_elem->opr.ch);
2883 	      return __collseq_table_lookup (collseqwc, wc);
2884 	    }
2885 	}
2886       else if (br_elem->type == MB_CHAR)
2887 	{
2888 	  if (nrules != 0)
2889 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2890 	}
2891       else if (br_elem->type == COLL_SYM)
2892 	{
2893 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2894 	  if (nrules != 0)
2895 	    {
2896 	      int32_t elem, idx;
2897 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2898 						  sym_name_len);
2899 	      if (elem != -1)
2900 		{
2901 		  /* We found the entry.  */
2902 		  idx = symb_table[2 * elem + 1];
2903 		  /* Skip the name of collating element name.  */
2904 		  idx += 1 + extra[idx];
2905 		  /* Skip the byte sequence of the collating element.  */
2906 		  idx += 1 + extra[idx];
2907 		  /* Adjust for the alignment.  */
2908 		  idx = (idx + 3) & ~3;
2909 		  /* Skip the multibyte collation sequence value.  */
2910 		  idx += sizeof (unsigned int);
2911 		  /* Skip the wide char sequence of the collating element.  */
2912 		  idx += sizeof (unsigned int) *
2913 		    (1 + *(unsigned int *) (extra + idx));
2914 		  /* Return the collation sequence value.  */
2915 		  return *(unsigned int *) (extra + idx);
2916 		}
2917 	      else if (sym_name_len == 1)
2918 		{
2919 		  /* No valid character.  Match it as a single byte
2920 		     character.  */
2921 		  return collseqmb[br_elem->opr.name[0]];
2922 		}
2923 	    }
2924 	  else if (sym_name_len == 1)
2925 	    return collseqmb[br_elem->opr.name[0]];
2926 	}
2927       return UINT_MAX;
2928     }
2929 
2930   /* Local function for parse_bracket_exp used in _LIBC environment.
2931      Build the range expression which starts from START_ELEM, and ends
2932      at END_ELEM.  The result are written to MBCSET and SBCSET.
2933      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2934      mbcset->range_ends, is a pointer argument since we may
2935      update it.  */
2936 
2937   auto inline reg_errcode_t
2938   __attribute__ ((always_inline))
2939   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2940 		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2941     {
2942       unsigned int ch;
2943       uint32_t start_collseq;
2944       uint32_t end_collseq;
2945 
2946       /* Equivalence Classes and Character Classes can't be a range
2947 	 start/end.  */
2948       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2949 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2950 	      0))
2951 	return REG_ERANGE;
2952 
2953       /* FIXME: Implement rational ranges here, too.  */
2954       start_collseq = lookup_collation_sequence_value (start_elem);
2955       end_collseq = lookup_collation_sequence_value (end_elem);
2956       /* Check start/end collation sequence values.  */
2957       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2958 	return REG_ECOLLATE;
2959       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2960 	return REG_ERANGE;
2961 
2962       /* Got valid collation sequence values, add them as a new entry.
2963 	 However, if we have no collation elements, and the character set
2964 	 is single byte, the single byte character set that we
2965 	 build below suffices. */
2966       if (nrules > 0 || dfa->mb_cur_max > 1)
2967 	{
2968 	  /* Check the space of the arrays.  */
2969 	  if (BE (*range_alloc == mbcset->nranges, 0))
2970 	    {
2971 	      /* There is not enough space, need realloc.  */
2972 	      uint32_t *new_array_start;
2973 	      uint32_t *new_array_end;
2974 	      Idx new_nranges;
2975 
2976 	      /* +1 in case of mbcset->nranges is 0.  */
2977 	      new_nranges = 2 * mbcset->nranges + 1;
2978 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2979 					    new_nranges);
2980 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2981 					  new_nranges);
2982 
2983 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2984 		return REG_ESPACE;
2985 
2986 	      mbcset->range_starts = new_array_start;
2987 	      mbcset->range_ends = new_array_end;
2988 	      *range_alloc = new_nranges;
2989 	    }
2990 
2991 	  mbcset->range_starts[mbcset->nranges] = start_collseq;
2992 	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
2993 	}
2994 
2995       /* Build the table for single byte characters.  */
2996       for (ch = 0; ch < SBC_MAX; ch++)
2997 	{
2998 	  uint32_t ch_collseq;
2999 	  /*
3000 	  if (MB_CUR_MAX == 1)
3001 	  */
3002 	  if (nrules == 0)
3003 	    ch_collseq = collseqmb[ch];
3004 	  else
3005 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3006 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3007 	    bitset_set (sbcset, ch);
3008 	}
3009       return REG_NOERROR;
3010     }
3011 
3012   /* Local function for parse_bracket_exp used in _LIBC environment.
3013      Build the collating element which is represented by NAME.
3014      The result are written to MBCSET and SBCSET.
3015      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3016      pointer argument since we may update it.  */
3017 
3018   auto inline reg_errcode_t
3019   __attribute__ ((always_inline))
3020   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3021 			  Idx *coll_sym_alloc, const unsigned char *name)
3022     {
3023       int32_t elem, idx;
3024       size_t name_len = strlen ((const char *) name);
3025       if (nrules != 0)
3026 	{
3027 	  elem = seek_collating_symbol_entry (name, name_len);
3028 	  if (elem != -1)
3029 	    {
3030 	      /* We found the entry.  */
3031 	      idx = symb_table[2 * elem + 1];
3032 	      /* Skip the name of collating element name.  */
3033 	      idx += 1 + extra[idx];
3034 	    }
3035 	  else if (name_len == 1)
3036 	    {
3037 	      /* No valid character, treat it as a normal
3038 		 character.  */
3039 	      bitset_set (sbcset, name[0]);
3040 	      return REG_NOERROR;
3041 	    }
3042 	  else
3043 	    return REG_ECOLLATE;
3044 
3045 	  /* Got valid collation sequence, add it as a new entry.  */
3046 	  /* Check the space of the arrays.  */
3047 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3048 	    {
3049 	      /* Not enough, realloc it.  */
3050 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3051 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3052 	      /* Use realloc since mbcset->coll_syms is NULL
3053 		 if *alloc == 0.  */
3054 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3055 						   new_coll_sym_alloc);
3056 	      if (BE (new_coll_syms == NULL, 0))
3057 		return REG_ESPACE;
3058 	      mbcset->coll_syms = new_coll_syms;
3059 	      *coll_sym_alloc = new_coll_sym_alloc;
3060 	    }
3061 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3062 	  return REG_NOERROR;
3063 	}
3064       else
3065 	{
3066 	  if (BE (name_len != 1, 0))
3067 	    return REG_ECOLLATE;
3068 	  else
3069 	    {
3070 	      bitset_set (sbcset, name[0]);
3071 	      return REG_NOERROR;
3072 	    }
3073 	}
3074     }
3075 #endif
3076 
3077   re_token_t br_token;
3078   re_bitset_ptr_t sbcset;
3079 #ifdef RE_ENABLE_I18N
3080   re_charset_t *mbcset;
3081   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3082   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3083 #endif /* not RE_ENABLE_I18N */
3084   bool non_match = false;
3085   bin_tree_t *work_tree;
3086   int token_len;
3087   bool first_round = true;
3088 #ifdef _LIBC
3089   collseqmb = (const unsigned char *)
3090     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3091   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3092   if (nrules)
3093     {
3094       /*
3095       if (MB_CUR_MAX > 1)
3096       */
3097       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3098       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3099       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3100 						  _NL_COLLATE_SYMB_TABLEMB);
3101       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3102 						   _NL_COLLATE_SYMB_EXTRAMB);
3103     }
3104 #endif
3105   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3106 #ifdef RE_ENABLE_I18N
3107   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3108 #endif /* RE_ENABLE_I18N */
3109 #ifdef RE_ENABLE_I18N
3110   if (BE (sbcset == NULL || mbcset == NULL, 0))
3111 #else
3112   if (BE (sbcset == NULL, 0))
3113 #endif /* RE_ENABLE_I18N */
3114     {
3115       re_free (sbcset);
3116 #ifdef RE_ENABLE_I18N
3117       re_free (mbcset);
3118 #endif
3119       *err = REG_ESPACE;
3120       return NULL;
3121     }
3122 
3123   token_len = peek_token_bracket (token, regexp, syntax);
3124   if (BE (token->type == END_OF_RE, 0))
3125     {
3126       *err = REG_BADPAT;
3127       goto parse_bracket_exp_free_return;
3128     }
3129   if (token->type == OP_NON_MATCH_LIST)
3130     {
3131 #ifdef RE_ENABLE_I18N
3132       mbcset->non_match = 1;
3133 #endif /* not RE_ENABLE_I18N */
3134       non_match = true;
3135       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3136 	bitset_set (sbcset, '\n');
3137       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3138       token_len = peek_token_bracket (token, regexp, syntax);
3139       if (BE (token->type == END_OF_RE, 0))
3140 	{
3141 	  *err = REG_BADPAT;
3142 	  goto parse_bracket_exp_free_return;
3143 	}
3144     }
3145 
3146   /* We treat the first ']' as a normal character.  */
3147   if (token->type == OP_CLOSE_BRACKET)
3148     token->type = CHARACTER;
3149 
3150   while (1)
3151     {
3152       bracket_elem_t start_elem, end_elem;
3153       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3154       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3155       reg_errcode_t ret;
3156       int token_len2 = 0;
3157       bool is_range_exp = false;
3158       re_token_t token2;
3159 
3160       start_elem.opr.name = start_name_buf;
3161       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3162 				   syntax, first_round);
3163       if (BE (ret != REG_NOERROR, 0))
3164 	{
3165 	  *err = ret;
3166 	  goto parse_bracket_exp_free_return;
3167 	}
3168       first_round = false;
3169 
3170       /* Get information about the next token.  We need it in any case.  */
3171       token_len = peek_token_bracket (token, regexp, syntax);
3172 
3173       /* Do not check for ranges if we know they are not allowed.  */
3174       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3175 	{
3176 	  if (BE (token->type == END_OF_RE, 0))
3177 	    {
3178 	      *err = REG_EBRACK;
3179 	      goto parse_bracket_exp_free_return;
3180 	    }
3181 	  if (token->type == OP_CHARSET_RANGE)
3182 	    {
3183 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3184 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3185 	      if (BE (token2.type == END_OF_RE, 0))
3186 		{
3187 		  *err = REG_EBRACK;
3188 		  goto parse_bracket_exp_free_return;
3189 		}
3190 	      if (token2.type == OP_CLOSE_BRACKET)
3191 		{
3192 		  /* We treat the last '-' as a normal character.  */
3193 		  re_string_skip_bytes (regexp, -token_len);
3194 		  token->type = CHARACTER;
3195 		}
3196 	      else
3197 		is_range_exp = true;
3198 	    }
3199 	}
3200 
3201       if (is_range_exp == true)
3202 	{
3203 	  end_elem.opr.name = end_name_buf;
3204 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3205 				       dfa, syntax, true);
3206 	  if (BE (ret != REG_NOERROR, 0))
3207 	    {
3208 	      *err = ret;
3209 	      goto parse_bracket_exp_free_return;
3210 	    }
3211 
3212 	  token_len = peek_token_bracket (token, regexp, syntax);
3213 
3214 #ifdef _LIBC
3215 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3216 				  &start_elem, &end_elem);
3217 #else
3218 # ifdef RE_ENABLE_I18N
3219 	  *err = build_range_exp (syntax, sbcset,
3220 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3221 				  &range_alloc, &start_elem, &end_elem);
3222 # else
3223 	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3224 # endif
3225 #endif /* RE_ENABLE_I18N */
3226 	  if (BE (*err != REG_NOERROR, 0))
3227 	    goto parse_bracket_exp_free_return;
3228 	}
3229       else
3230 	{
3231 	  switch (start_elem.type)
3232 	    {
3233 	    case SB_CHAR:
3234 	      bitset_set (sbcset, start_elem.opr.ch);
3235 	      break;
3236 #ifdef RE_ENABLE_I18N
3237 	    case MB_CHAR:
3238 	      /* Check whether the array has enough space.  */
3239 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3240 		{
3241 		  wchar_t *new_mbchars;
3242 		  /* Not enough, realloc it.  */
3243 		  /* +1 in case of mbcset->nmbchars is 0.  */
3244 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3245 		  /* Use realloc since array is NULL if *alloc == 0.  */
3246 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3247 					    mbchar_alloc);
3248 		  if (BE (new_mbchars == NULL, 0))
3249 		    goto parse_bracket_exp_espace;
3250 		  mbcset->mbchars = new_mbchars;
3251 		}
3252 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3253 	      break;
3254 #endif /* RE_ENABLE_I18N */
3255 	    case EQUIV_CLASS:
3256 	      *err = build_equiv_class (sbcset,
3257 #ifdef RE_ENABLE_I18N
3258 					mbcset, &equiv_class_alloc,
3259 #endif /* RE_ENABLE_I18N */
3260 					start_elem.opr.name);
3261 	      if (BE (*err != REG_NOERROR, 0))
3262 		goto parse_bracket_exp_free_return;
3263 	      break;
3264 	    case COLL_SYM:
3265 	      *err = build_collating_symbol (sbcset,
3266 #ifdef RE_ENABLE_I18N
3267 					     mbcset, &coll_sym_alloc,
3268 #endif /* RE_ENABLE_I18N */
3269 					     start_elem.opr.name);
3270 	      if (BE (*err != REG_NOERROR, 0))
3271 		goto parse_bracket_exp_free_return;
3272 	      break;
3273 	    case CHAR_CLASS:
3274 	      *err = build_charclass (regexp->trans, sbcset,
3275 #ifdef RE_ENABLE_I18N
3276 				      mbcset, &char_class_alloc,
3277 #endif /* RE_ENABLE_I18N */
3278 				      (const char *) start_elem.opr.name,
3279 				      syntax);
3280 	      if (BE (*err != REG_NOERROR, 0))
3281 	       goto parse_bracket_exp_free_return;
3282 	      break;
3283 	    default:
3284 	      assert (0);
3285 	      break;
3286 	    }
3287 	}
3288       if (BE (token->type == END_OF_RE, 0))
3289 	{
3290 	  *err = REG_EBRACK;
3291 	  goto parse_bracket_exp_free_return;
3292 	}
3293       if (token->type == OP_CLOSE_BRACKET)
3294 	break;
3295     }
3296 
3297   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3298 
3299   /* If it is non-matching list.  */
3300   if (non_match)
3301     bitset_not (sbcset);
3302 
3303 #ifdef RE_ENABLE_I18N
3304   /* Ensure only single byte characters are set.  */
3305   if (dfa->mb_cur_max > 1)
3306     bitset_mask (sbcset, dfa->sb_char);
3307 
3308   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3309       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3310 						     || mbcset->non_match)))
3311     {
3312       bin_tree_t *mbc_tree;
3313       int sbc_idx;
3314       /* Build a tree for complex bracket.  */
3315       dfa->has_mb_node = 1;
3316       br_token.type = COMPLEX_BRACKET;
3317       br_token.opr.mbcset = mbcset;
3318       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3319       if (BE (mbc_tree == NULL, 0))
3320 	goto parse_bracket_exp_espace;
3321       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3322 	if (sbcset[sbc_idx])
3323 	  break;
3324       /* If there are no bits set in sbcset, there is no point
3325 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3326       if (sbc_idx < BITSET_WORDS)
3327 	{
3328 	  /* Build a tree for simple bracket.  */
3329 	  br_token.type = SIMPLE_BRACKET;
3330 	  br_token.opr.sbcset = sbcset;
3331 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3332 	  if (BE (work_tree == NULL, 0))
3333 	    goto parse_bracket_exp_espace;
3334 
3335 	  /* Then join them by ALT node.  */
3336 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3337 	  if (BE (work_tree == NULL, 0))
3338 	    goto parse_bracket_exp_espace;
3339 	}
3340       else
3341 	{
3342 	  re_free (sbcset);
3343 	  work_tree = mbc_tree;
3344 	}
3345     }
3346   else
3347 #endif /* not RE_ENABLE_I18N */
3348     {
3349 #ifdef RE_ENABLE_I18N
3350       free_charset (mbcset);
3351 #endif
3352       /* Build a tree for simple bracket.  */
3353       br_token.type = SIMPLE_BRACKET;
3354       br_token.opr.sbcset = sbcset;
3355       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3356       if (BE (work_tree == NULL, 0))
3357 	goto parse_bracket_exp_espace;
3358     }
3359   return work_tree;
3360 
3361  parse_bracket_exp_espace:
3362   *err = REG_ESPACE;
3363  parse_bracket_exp_free_return:
3364   re_free (sbcset);
3365 #ifdef RE_ENABLE_I18N
3366   free_charset (mbcset);
3367 #endif /* RE_ENABLE_I18N */
3368   return NULL;
3369 }
3370 
3371 /* Parse an element in the bracket expression.  */
3372 
3373 static reg_errcode_t
3374 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3375 		       re_token_t *token, int token_len,
3376 		       re_dfa_t *dfa _UNUSED_PARAMETER_,
3377 		       reg_syntax_t syntax, bool accept_hyphen)
3378 {
3379 #ifdef RE_ENABLE_I18N
3380   int cur_char_size;
3381   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3382   if (cur_char_size > 1)
3383     {
3384       elem->type = MB_CHAR;
3385       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3386       re_string_skip_bytes (regexp, cur_char_size);
3387       return REG_NOERROR;
3388     }
3389 #endif /* RE_ENABLE_I18N */
3390   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3391   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3392       || token->type == OP_OPEN_EQUIV_CLASS)
3393     return parse_bracket_symbol (elem, regexp, token);
3394   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3395     {
3396       /* A '-' must only appear as anything but a range indicator before
3397 	 the closing bracket.  Everything else is an error.  */
3398       re_token_t token2;
3399       (void) peek_token_bracket (&token2, regexp, syntax);
3400       if (token2.type != OP_CLOSE_BRACKET)
3401 	/* The actual error value is not standardized since this whole
3402 	   case is undefined.  But ERANGE makes good sense.  */
3403 	return REG_ERANGE;
3404     }
3405   elem->type = SB_CHAR;
3406   elem->opr.ch = token->opr.c;
3407   return REG_NOERROR;
3408 }
3409 
3410 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3411    such as [:<character_class>:], [.<collating_element>.], and
3412    [=<equivalent_class>=].  */
3413 
3414 static reg_errcode_t
3415 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3416 		      re_token_t *token)
3417 {
3418   unsigned char ch, delim = token->opr.c;
3419   int i = 0;
3420   if (re_string_eoi(regexp))
3421     return REG_EBRACK;
3422   for (;; ++i)
3423     {
3424       if (i >= BRACKET_NAME_BUF_SIZE)
3425 	return REG_EBRACK;
3426       if (token->type == OP_OPEN_CHAR_CLASS)
3427 	ch = re_string_fetch_byte_case (regexp);
3428       else
3429 	ch = re_string_fetch_byte (regexp);
3430       if (re_string_eoi(regexp))
3431 	return REG_EBRACK;
3432       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3433 	break;
3434       elem->opr.name[i] = ch;
3435     }
3436   re_string_skip_bytes (regexp, 1);
3437   elem->opr.name[i] = '\0';
3438   switch (token->type)
3439     {
3440     case OP_OPEN_COLL_ELEM:
3441       elem->type = COLL_SYM;
3442       break;
3443     case OP_OPEN_EQUIV_CLASS:
3444       elem->type = EQUIV_CLASS;
3445       break;
3446     case OP_OPEN_CHAR_CLASS:
3447       elem->type = CHAR_CLASS;
3448       break;
3449     default:
3450       break;
3451     }
3452   return REG_NOERROR;
3453 }
3454 
3455   /* Helper function for parse_bracket_exp.
3456      Build the equivalence class which is represented by NAME.
3457      The result are written to MBCSET and SBCSET.
3458      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3459      is a pointer argument since we may update it.  */
3460 
3461 static reg_errcode_t
3462 #ifdef RE_ENABLE_I18N
3463 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset _UNUSED_PARAMETER_,
3464 		   Idx *equiv_class_alloc _UNUSED_PARAMETER_,
3465 		   const unsigned char *name)
3466 #else /* not RE_ENABLE_I18N */
3467 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3468 #endif /* not RE_ENABLE_I18N */
3469 {
3470 #ifdef _LIBC
3471   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3472   if (nrules != 0)
3473     {
3474       const int32_t *table, *indirect;
3475       const unsigned char *weights, *extra, *cp;
3476       unsigned char char_buf[2];
3477       int32_t idx1, idx2;
3478       unsigned int ch;
3479       size_t len;
3480       /* This #include defines a local function!  */
3481 # include <locale/weight.h>
3482       /* Calculate the index for equivalence class.  */
3483       cp = name;
3484       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3485       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3486 					       _NL_COLLATE_WEIGHTMB);
3487       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3488 						   _NL_COLLATE_EXTRAMB);
3489       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3490 						_NL_COLLATE_INDIRECTMB);
3491       idx1 = findidx (&cp, -1);
3492       if (BE (idx1 == 0 || *cp != '\0', 0))
3493 	/* This isn't a valid character.  */
3494 	return REG_ECOLLATE;
3495 
3496       /* Build single byte matching table for this equivalence class.  */
3497       len = weights[idx1 & 0xffffff];
3498       for (ch = 0; ch < SBC_MAX; ++ch)
3499 	{
3500 	  char_buf[0] = ch;
3501 	  cp = char_buf;
3502 	  idx2 = findidx (&cp, 1);
3503 /*
3504 	  idx2 = table[ch];
3505 */
3506 	  if (idx2 == 0)
3507 	    /* This isn't a valid character.  */
3508 	    continue;
3509 	  /* Compare only if the length matches and the collation rule
3510 	     index is the same.  */
3511 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3512 	    {
3513 	      int cnt = 0;
3514 
3515 	      while (cnt <= len &&
3516 		     weights[(idx1 & 0xffffff) + 1 + cnt]
3517 		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3518 		++cnt;
3519 
3520 	      if (cnt > len)
3521 		bitset_set (sbcset, ch);
3522 	    }
3523 	}
3524       /* Check whether the array has enough space.  */
3525       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3526 	{
3527 	  /* Not enough, realloc it.  */
3528 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3529 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3530 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3531 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3532 						   int32_t,
3533 						   new_equiv_class_alloc);
3534 	  if (BE (new_equiv_classes == NULL, 0))
3535 	    return REG_ESPACE;
3536 	  mbcset->equiv_classes = new_equiv_classes;
3537 	  *equiv_class_alloc = new_equiv_class_alloc;
3538 	}
3539       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3540     }
3541   else
3542 #endif /* _LIBC */
3543     {
3544       if (BE (strlen ((const char *) name) != 1, 0))
3545 	return REG_ECOLLATE;
3546       bitset_set (sbcset, *name);
3547     }
3548   return REG_NOERROR;
3549 }
3550 
3551   /* Helper function for parse_bracket_exp.
3552      Build the character class which is represented by NAME.
3553      The result are written to MBCSET and SBCSET.
3554      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3555      is a pointer argument since we may update it.  */
3556 
3557 static reg_errcode_t
3558 #ifdef RE_ENABLE_I18N
3559 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3560 		 re_charset_t *mbcset, Idx *char_class_alloc,
3561 		 const char *class_name, reg_syntax_t syntax)
3562 #else /* not RE_ENABLE_I18N */
3563 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3564 		 const char *class_name, reg_syntax_t syntax)
3565 #endif /* not RE_ENABLE_I18N */
3566 {
3567   int i;
3568   const char *name = class_name;
3569 
3570   /* In case of REG_ICASE "upper" and "lower" match the both of
3571      upper and lower cases.  */
3572   if ((syntax & RE_ICASE)
3573       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3574     name = "alpha";
3575 
3576 #ifdef RE_ENABLE_I18N
3577   /* Check the space of the arrays.  */
3578   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3579     {
3580       /* Not enough, realloc it.  */
3581       /* +1 in case of mbcset->nchar_classes is 0.  */
3582       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3583       /* Use realloc since array is NULL if *alloc == 0.  */
3584       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3585 					       new_char_class_alloc);
3586       if (BE (new_char_classes == NULL, 0))
3587 	return REG_ESPACE;
3588       mbcset->char_classes = new_char_classes;
3589       *char_class_alloc = new_char_class_alloc;
3590     }
3591   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3592 #endif /* RE_ENABLE_I18N */
3593 
3594 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3595   do {						\
3596     if (BE (trans != NULL, 0))			\
3597       {						\
3598 	for (i = 0; i < SBC_MAX; ++i)		\
3599 	  if (ctype_func (i))			\
3600 	    bitset_set (sbcset, trans[i]);	\
3601       }						\
3602     else					\
3603       {						\
3604 	for (i = 0; i < SBC_MAX; ++i)		\
3605 	  if (ctype_func (i))			\
3606 	    bitset_set (sbcset, i);		\
3607       }						\
3608   } while (0)
3609 
3610   if (strcmp (name, "alnum") == 0)
3611     BUILD_CHARCLASS_LOOP (isalnum);
3612   else if (strcmp (name, "cntrl") == 0)
3613     BUILD_CHARCLASS_LOOP (iscntrl);
3614   else if (strcmp (name, "lower") == 0)
3615     BUILD_CHARCLASS_LOOP (islower);
3616   else if (strcmp (name, "space") == 0)
3617     BUILD_CHARCLASS_LOOP (isspace);
3618   else if (strcmp (name, "alpha") == 0)
3619     BUILD_CHARCLASS_LOOP (isalpha);
3620   else if (strcmp (name, "digit") == 0)
3621     BUILD_CHARCLASS_LOOP (isdigit);
3622   else if (strcmp (name, "print") == 0)
3623     BUILD_CHARCLASS_LOOP (isprint);
3624   else if (strcmp (name, "upper") == 0)
3625     BUILD_CHARCLASS_LOOP (isupper);
3626   else if (strcmp (name, "blank") == 0)
3627     BUILD_CHARCLASS_LOOP (isblank);
3628   else if (strcmp (name, "graph") == 0)
3629     BUILD_CHARCLASS_LOOP (isgraph);
3630   else if (strcmp (name, "punct") == 0)
3631     BUILD_CHARCLASS_LOOP (ispunct);
3632   else if (strcmp (name, "xdigit") == 0)
3633     BUILD_CHARCLASS_LOOP (isxdigit);
3634   else
3635     return REG_ECTYPE;
3636 
3637   return REG_NOERROR;
3638 }
3639 
3640 static bin_tree_t *
3641 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3642 		    const char *class_name,
3643 		    const char *extra, bool non_match,
3644 		    reg_errcode_t *err)
3645 {
3646   re_bitset_ptr_t sbcset;
3647 #ifdef RE_ENABLE_I18N
3648   re_charset_t *mbcset;
3649   Idx alloc = 0;
3650 #endif /* not RE_ENABLE_I18N */
3651   reg_errcode_t ret;
3652   re_token_t br_token;
3653   bin_tree_t *tree;
3654 
3655   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3656 #ifdef RE_ENABLE_I18N
3657   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3658 #endif /* RE_ENABLE_I18N */
3659 
3660 #ifdef RE_ENABLE_I18N
3661   if (BE (sbcset == NULL || mbcset == NULL, 0))
3662 #else /* not RE_ENABLE_I18N */
3663   if (BE (sbcset == NULL, 0))
3664 #endif /* not RE_ENABLE_I18N */
3665     {
3666       *err = REG_ESPACE;
3667       return NULL;
3668     }
3669 
3670   if (non_match)
3671     {
3672 #ifdef RE_ENABLE_I18N
3673       mbcset->non_match = 1;
3674 #endif /* not RE_ENABLE_I18N */
3675     }
3676 
3677   /* We don't care the syntax in this case.  */
3678   ret = build_charclass (trans, sbcset,
3679 #ifdef RE_ENABLE_I18N
3680 			 mbcset, &alloc,
3681 #endif /* RE_ENABLE_I18N */
3682 			 class_name, 0);
3683 
3684   if (BE (ret != REG_NOERROR, 0))
3685     {
3686       re_free (sbcset);
3687 #ifdef RE_ENABLE_I18N
3688       free_charset (mbcset);
3689 #endif /* RE_ENABLE_I18N */
3690       *err = ret;
3691       return NULL;
3692     }
3693   /* \w match '_' also.  */
3694   for (; *extra; extra++)
3695     bitset_set (sbcset, *extra);
3696 
3697   /* If it is non-matching list.  */
3698   if (non_match)
3699     bitset_not (sbcset);
3700 
3701 #ifdef RE_ENABLE_I18N
3702   /* Ensure only single byte characters are set.  */
3703   if (dfa->mb_cur_max > 1)
3704     bitset_mask (sbcset, dfa->sb_char);
3705 #endif
3706 
3707   /* Build a tree for simple bracket.  */
3708   br_token.type = SIMPLE_BRACKET;
3709   br_token.opr.sbcset = sbcset;
3710   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3711   if (BE (tree == NULL, 0))
3712     goto build_word_op_espace;
3713 
3714 #ifdef RE_ENABLE_I18N
3715   if (dfa->mb_cur_max > 1)
3716     {
3717       bin_tree_t *mbc_tree;
3718       /* Build a tree for complex bracket.  */
3719       br_token.type = COMPLEX_BRACKET;
3720       br_token.opr.mbcset = mbcset;
3721       dfa->has_mb_node = 1;
3722       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3723       if (BE (mbc_tree == NULL, 0))
3724 	goto build_word_op_espace;
3725       /* Then join them by ALT node.  */
3726       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3727       if (BE (mbc_tree != NULL, 1))
3728 	return tree;
3729     }
3730   else
3731     {
3732       free_charset (mbcset);
3733       return tree;
3734     }
3735 #else /* not RE_ENABLE_I18N */
3736   return tree;
3737 #endif /* not RE_ENABLE_I18N */
3738 
3739  build_word_op_espace:
3740   re_free (sbcset);
3741 #ifdef RE_ENABLE_I18N
3742   free_charset (mbcset);
3743 #endif /* RE_ENABLE_I18N */
3744   *err = REG_ESPACE;
3745   return NULL;
3746 }
3747 
3748 /* This is intended for the expressions like "a{1,3}".
3749    Fetch a number from 'input', and return the number.
3750    Return REG_MISSING if the number field is empty like "{,1}".
3751    Return RE_DUP_MAX + 1 if the number field is too large.
3752    Return REG_ERROR if an error occurred.  */
3753 
3754 static Idx
3755 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3756 {
3757   Idx num = REG_MISSING;
3758   unsigned char c;
3759   while (1)
3760     {
3761       fetch_token (token, input, syntax);
3762       c = token->opr.c;
3763       if (BE (token->type == END_OF_RE, 0))
3764 	return REG_ERROR;
3765       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3766 	break;
3767       num = ((token->type != CHARACTER || c < '0' || '9' < c
3768 	      || num == REG_ERROR)
3769 	     ? REG_ERROR
3770 	     : num == REG_MISSING
3771 	     ? c - '0'
3772 	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3773     }
3774   return num;
3775 }
3776 
3777 #ifdef RE_ENABLE_I18N
3778 static void
3779 free_charset (re_charset_t *cset)
3780 {
3781   re_free (cset->mbchars);
3782 # ifdef _LIBC
3783   re_free (cset->coll_syms);
3784   re_free (cset->equiv_classes);
3785   re_free (cset->range_starts);
3786   re_free (cset->range_ends);
3787 # endif
3788   re_free (cset->char_classes);
3789   re_free (cset);
3790 }
3791 #endif /* RE_ENABLE_I18N */
3792 
3793 /* Functions for binary tree operation.  */
3794 
3795 /* Create a tree node.  */
3796 
3797 static bin_tree_t *
3798 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3799 	     re_token_type_t type)
3800 {
3801   re_token_t t;
3802   t.type = type;
3803   return create_token_tree (dfa, left, right, &t);
3804 }
3805 
3806 static bin_tree_t *
3807 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3808 		   const re_token_t *token)
3809 {
3810   bin_tree_t *tree;
3811   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3812     {
3813       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3814 
3815       if (storage == NULL)
3816 	return NULL;
3817       storage->next = dfa->str_tree_storage;
3818       dfa->str_tree_storage = storage;
3819       dfa->str_tree_storage_idx = 0;
3820     }
3821   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3822 
3823   tree->parent = NULL;
3824   tree->left = left;
3825   tree->right = right;
3826   tree->token = *token;
3827   tree->token.duplicated = 0;
3828   tree->token.opt_subexp = 0;
3829   tree->first = NULL;
3830   tree->next = NULL;
3831   tree->node_idx = REG_MISSING;
3832 
3833   if (left != NULL)
3834     left->parent = tree;
3835   if (right != NULL)
3836     right->parent = tree;
3837   return tree;
3838 }
3839 
3840 /* Mark the tree SRC as an optional subexpression.
3841    To be called from preorder or postorder.  */
3842 
3843 static reg_errcode_t
3844 mark_opt_subexp (void *extra, bin_tree_t *node)
3845 {
3846   Idx idx = (uintptr_t) extra;
3847   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3848     node->token.opt_subexp = 1;
3849 
3850   return REG_NOERROR;
3851 }
3852 
3853 /* Free the allocated memory inside NODE. */
3854 
3855 static void
3856 free_token (re_token_t *node)
3857 {
3858 #ifdef RE_ENABLE_I18N
3859   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3860     free_charset (node->opr.mbcset);
3861   else
3862 #endif /* RE_ENABLE_I18N */
3863     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3864       re_free (node->opr.sbcset);
3865 }
3866 
3867 /* Worker function for tree walking.  Free the allocated memory inside NODE
3868    and its children. */
3869 
3870 static reg_errcode_t
3871 free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node)
3872 {
3873   free_token (&node->token);
3874   return REG_NOERROR;
3875 }
3876 
3877 
3878 /* Duplicate the node SRC, and return new node.  This is a preorder
3879    visit similar to the one implemented by the generic visitor, but
3880    we need more infrastructure to maintain two parallel trees --- so,
3881    it's easier to duplicate.  */
3882 
3883 static bin_tree_t *
3884 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3885 {
3886   const bin_tree_t *node;
3887   bin_tree_t *dup_root;
3888   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3889 
3890   for (node = root; ; )
3891     {
3892       /* Create a new tree and link it back to the current parent.  */
3893       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3894       if (*p_new == NULL)
3895 	return NULL;
3896       (*p_new)->parent = dup_node;
3897       (*p_new)->token.duplicated = 1;
3898       dup_node = *p_new;
3899 
3900       /* Go to the left node, or up and to the right.  */
3901       if (node->left)
3902 	{
3903 	  node = node->left;
3904 	  p_new = &dup_node->left;
3905 	}
3906       else
3907 	{
3908 	  const bin_tree_t *prev = NULL;
3909 	  while (node->right == prev || node->right == NULL)
3910 	    {
3911 	      prev = node;
3912 	      node = node->parent;
3913 	      dup_node = dup_node->parent;
3914 	      if (!node)
3915 		return dup_root;
3916 	    }
3917 	  node = node->right;
3918 	  p_new = &dup_node->right;
3919 	}
3920     }
3921 }
3922