xref: /dragonfly/contrib/grep/lib/regcomp.c (revision e98bdfd3)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2015 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU General Public
8    License as published by the Free Software Foundation; either
9    version 3 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    General Public License for more details.
15 
16    You should have received a copy of the GNU General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
23 
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 					  size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 				     const re_dfastate_t *init_state,
28 				     char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 			       reg_errcode_t (fn (void *, bin_tree_t *)),
41 			       void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 				reg_errcode_t (fn (void *, bin_tree_t *)),
44 				void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 				 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 				   unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 					 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 			 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 			reg_syntax_t syntax) internal_function;
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 			  reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 				  re_token_t *token, reg_syntax_t syntax,
67 				  Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 				 re_token_t *token, reg_syntax_t syntax,
70 				 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 				     re_token_t *token, reg_syntax_t syntax,
73 				     Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 				  re_token_t *token, reg_syntax_t syntax,
76 				  Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 				 re_dfa_t *dfa, re_token_t *token,
79 				 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 				      re_token_t *token, reg_syntax_t syntax,
82 				      reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 					    re_string_t *regexp,
85 					    re_token_t *token, int token_len,
86 					    re_dfa_t *dfa,
87 					    reg_syntax_t syntax,
88 					    bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 					  re_string_t *regexp,
91 					  re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 					re_charset_t *mbcset,
95 					Idx *equiv_class_alloc,
96 					const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 				      bitset_t sbcset,
99 				      re_charset_t *mbcset,
100 				      Idx *char_class_alloc,
101 				      const char *class_name,
102 				      reg_syntax_t syntax);
103 #else  /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 					const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 				      bitset_t sbcset,
108 				      const char *class_name,
109 				      reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 				       RE_TRANSLATE_TYPE trans,
113 				       const char *class_name,
114 				       const char *extra,
115 				       bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 				bin_tree_t *left, bin_tree_t *right,
118 				re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 				      bin_tree_t *left, bin_tree_t *right,
121 				      const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126 
127 /* This table gives an error message for each of the error codes listed
128    in regex.h.  Obviously the order here has to be same as there.
129    POSIX doesn't require that we do anything for REG_NOERROR,
130    but why not be nice?  */
131 
132 static const char __re_error_msgid[] =
133   {
134 #define REG_NOERROR_IDX	0
135     gettext_noop ("Success")	/* REG_NOERROR */
136     "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138     gettext_noop ("No match")	/* REG_NOMATCH */
139     "\0"
140 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142     "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145     "\0"
146 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148     "\0"
149 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151     "\0"
152 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154     "\0"
155 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
156     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
157     "\0"
158 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160     "\0"
161 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163     "\0"
164 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166     "\0"
167 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168     gettext_noop ("Invalid range end")	/* REG_ERANGE */
169     "\0"
170 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
172     "\0"
173 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175     "\0"
176 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
178     "\0"
179 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
181     "\0"
182 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184   };
185 
186 static const size_t __re_error_msgid_idx[] =
187   {
188     REG_NOERROR_IDX,
189     REG_NOMATCH_IDX,
190     REG_BADPAT_IDX,
191     REG_ECOLLATE_IDX,
192     REG_ECTYPE_IDX,
193     REG_EESCAPE_IDX,
194     REG_ESUBREG_IDX,
195     REG_EBRACK_IDX,
196     REG_EPAREN_IDX,
197     REG_EBRACE_IDX,
198     REG_BADBR_IDX,
199     REG_ERANGE_IDX,
200     REG_ESPACE_IDX,
201     REG_BADRPT_IDX,
202     REG_EEND_IDX,
203     REG_ESIZE_IDX,
204     REG_ERPAREN_IDX
205   };
206 
207 /* Entry points for GNU code.  */
208 
209 /* re_compile_pattern is the GNU regular expression compiler: it
210    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211    Returns 0 if the pattern was valid, otherwise an error string.
212 
213    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214    are set in BUFP on entry.  */
215 
216 #ifdef _LIBC
217 const char *
218 re_compile_pattern (pattern, length, bufp)
219     const char *pattern;
220     size_t length;
221     struct re_pattern_buffer *bufp;
222 #else /* size_t might promote */
223 const char *
224 re_compile_pattern (const char *pattern, size_t length,
225 		    struct re_pattern_buffer *bufp)
226 #endif
227 {
228   reg_errcode_t ret;
229 
230   /* And GNU code determines whether or not to get register information
231      by passing null for the REGS argument to re_match, etc., not by
232      setting no_sub, unless RE_NO_SUB is set.  */
233   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
234 
235   /* Match anchors at newline.  */
236   bufp->newline_anchor = 1;
237 
238   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
239 
240   if (!ret)
241     return NULL;
242   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
243 }
244 #ifdef _LIBC
245 weak_alias (__re_compile_pattern, re_compile_pattern)
246 #endif
247 
248 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
249    also be assigned to arbitrarily: each pattern buffer stores its own
250    syntax, so it can be changed between regex compilations.  */
251 /* This has no initializer because initialized variables in Emacs
252    become read-only after dumping.  */
253 reg_syntax_t re_syntax_options;
254 
255 
256 /* Specify the precise syntax of regexps for compilation.  This provides
257    for compatibility for various utilities which historically have
258    different, incompatible syntaxes.
259 
260    The argument SYNTAX is a bit mask comprised of the various bits
261    defined in regex.h.  We return the old syntax.  */
262 
263 reg_syntax_t
264 re_set_syntax (syntax)
265     reg_syntax_t syntax;
266 {
267   reg_syntax_t ret = re_syntax_options;
268 
269   re_syntax_options = syntax;
270   return ret;
271 }
272 #ifdef _LIBC
273 weak_alias (__re_set_syntax, re_set_syntax)
274 #endif
275 
276 int
277 re_compile_fastmap (bufp)
278     struct re_pattern_buffer *bufp;
279 {
280   re_dfa_t *dfa = bufp->buffer;
281   char *fastmap = bufp->fastmap;
282 
283   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
284   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
285   if (dfa->init_state != dfa->init_state_word)
286     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
287   if (dfa->init_state != dfa->init_state_nl)
288     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
289   if (dfa->init_state != dfa->init_state_begbuf)
290     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
291   bufp->fastmap_accurate = 1;
292   return 0;
293 }
294 #ifdef _LIBC
295 weak_alias (__re_compile_fastmap, re_compile_fastmap)
296 #endif
297 
298 static inline void
299 __attribute__ ((always_inline))
300 re_set_fastmap (char *fastmap, bool icase, int ch)
301 {
302   fastmap[ch] = 1;
303   if (icase)
304     fastmap[tolower (ch)] = 1;
305 }
306 
307 /* Helper function for re_compile_fastmap.
308    Compile fastmap for the initial_state INIT_STATE.  */
309 
310 static void
311 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
312 			 char *fastmap)
313 {
314   re_dfa_t *dfa = bufp->buffer;
315   Idx node_cnt;
316   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
317   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
318     {
319       Idx node = init_state->nodes.elems[node_cnt];
320       re_token_type_t type = dfa->nodes[node].type;
321 
322       if (type == CHARACTER)
323 	{
324 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
325 #ifdef RE_ENABLE_I18N
326 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
327 	    {
328 	      unsigned char buf[MB_LEN_MAX];
329 	      unsigned char *p;
330 	      wchar_t wc;
331 	      mbstate_t state;
332 
333 	      p = buf;
334 	      *p++ = dfa->nodes[node].opr.c;
335 	      while (++node < dfa->nodes_len
336 		     &&	dfa->nodes[node].type == CHARACTER
337 		     && dfa->nodes[node].mb_partial)
338 		*p++ = dfa->nodes[node].opr.c;
339 	      memset (&state, '\0', sizeof (state));
340 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
341 			     &state) == p - buf
342 		  && (__wcrtomb ((char *) buf, __towlower (wc), &state)
343 		      != (size_t) -1))
344 		re_set_fastmap (fastmap, false, buf[0]);
345 	    }
346 #endif
347 	}
348       else if (type == SIMPLE_BRACKET)
349 	{
350 	  int i, ch;
351 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
352 	    {
353 	      int j;
354 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
355 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
356 		if (w & ((bitset_word_t) 1 << j))
357 		  re_set_fastmap (fastmap, icase, ch);
358 	    }
359 	}
360 #ifdef RE_ENABLE_I18N
361       else if (type == COMPLEX_BRACKET)
362 	{
363 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
364 	  Idx i;
365 
366 # ifdef _LIBC
367 	  /* See if we have to try all bytes which start multiple collation
368 	     elements.
369 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
370 		  collation element, and don't catch 'b' since 'b' is
371 		  the only collation element which starts from 'b' (and
372 		  it is caught by SIMPLE_BRACKET).  */
373 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
374 		  && (cset->ncoll_syms || cset->nranges))
375 		{
376 		  const int32_t *table = (const int32_t *)
377 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
378 		  for (i = 0; i < SBC_MAX; ++i)
379 		    if (table[i] < 0)
380 		      re_set_fastmap (fastmap, icase, i);
381 		}
382 # endif /* _LIBC */
383 
384 	  /* See if we have to start the match at all multibyte characters,
385 	     i.e. where we would not find an invalid sequence.  This only
386 	     applies to multibyte character sets; for single byte character
387 	     sets, the SIMPLE_BRACKET again suffices.  */
388 	  if (dfa->mb_cur_max > 1
389 	      && (cset->nchar_classes || cset->non_match || cset->nranges
390 # ifdef _LIBC
391 		  || cset->nequiv_classes
392 # endif /* _LIBC */
393 		 ))
394 	    {
395 	      unsigned char c = 0;
396 	      do
397 		{
398 		  mbstate_t mbs;
399 		  memset (&mbs, 0, sizeof (mbs));
400 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
401 		    re_set_fastmap (fastmap, false, (int) c);
402 		}
403 	      while (++c != 0);
404 	    }
405 
406 	  else
407 	    {
408 	      /* ... Else catch all bytes which can start the mbchars.  */
409 	      for (i = 0; i < cset->nmbchars; ++i)
410 		{
411 		  char buf[256];
412 		  mbstate_t state;
413 		  memset (&state, '\0', sizeof (state));
414 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
415 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
416 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
417 		    {
418 		      if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
419 			  != (size_t) -1)
420 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
421 		    }
422 		}
423 	    }
424 	}
425 #endif /* RE_ENABLE_I18N */
426       else if (type == OP_PERIOD
427 #ifdef RE_ENABLE_I18N
428 	       || type == OP_UTF8_PERIOD
429 #endif /* RE_ENABLE_I18N */
430 	       || type == END_OF_RE)
431 	{
432 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
433 	  if (type == END_OF_RE)
434 	    bufp->can_be_null = 1;
435 	  return;
436 	}
437     }
438 }
439 
440 /* Entry point for POSIX code.  */
441 /* regcomp takes a regular expression as a string and compiles it.
442 
443    PREG is a regex_t *.  We do not expect any fields to be initialized,
444    since POSIX says we shouldn't.  Thus, we set
445 
446      'buffer' to the compiled pattern;
447      'used' to the length of the compiled pattern;
448      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
449        REG_EXTENDED bit in CFLAGS is set; otherwise, to
450        RE_SYNTAX_POSIX_BASIC;
451      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
452      'fastmap' to an allocated space for the fastmap;
453      'fastmap_accurate' to zero;
454      're_nsub' to the number of subexpressions in PATTERN.
455 
456    PATTERN is the address of the pattern string.
457 
458    CFLAGS is a series of bits which affect compilation.
459 
460      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
461      use POSIX basic syntax.
462 
463      If REG_NEWLINE is set, then . and [^...] don't match newline.
464      Also, regexec will try a match beginning after every newline.
465 
466      If REG_ICASE is set, then we considers upper- and lowercase
467      versions of letters to be equivalent when matching.
468 
469      If REG_NOSUB is set, then when PREG is passed to regexec, that
470      routine will report only success or failure, and nothing about the
471      registers.
472 
473    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
474    the return codes and their meanings.)  */
475 
476 int
477 regcomp (preg, pattern, cflags)
478     regex_t *_Restrict_ preg;
479     const char *_Restrict_ pattern;
480     int cflags;
481 {
482   reg_errcode_t ret;
483   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
484 			 : RE_SYNTAX_POSIX_BASIC);
485 
486   preg->buffer = NULL;
487   preg->allocated = 0;
488   preg->used = 0;
489 
490   /* Try to allocate space for the fastmap.  */
491   preg->fastmap = re_malloc (char, SBC_MAX);
492   if (BE (preg->fastmap == NULL, 0))
493     return REG_ESPACE;
494 
495   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
496 
497   /* If REG_NEWLINE is set, newlines are treated differently.  */
498   if (cflags & REG_NEWLINE)
499     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
500       syntax &= ~RE_DOT_NEWLINE;
501       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
502       /* It also changes the matching behavior.  */
503       preg->newline_anchor = 1;
504     }
505   else
506     preg->newline_anchor = 0;
507   preg->no_sub = !!(cflags & REG_NOSUB);
508   preg->translate = NULL;
509 
510   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
511 
512   /* POSIX doesn't distinguish between an unmatched open-group and an
513      unmatched close-group: both are REG_EPAREN.  */
514   if (ret == REG_ERPAREN)
515     ret = REG_EPAREN;
516 
517   /* We have already checked preg->fastmap != NULL.  */
518   if (BE (ret == REG_NOERROR, 1))
519     /* Compute the fastmap now, since regexec cannot modify the pattern
520        buffer.  This function never fails in this implementation.  */
521     (void) re_compile_fastmap (preg);
522   else
523     {
524       /* Some error occurred while compiling the expression.  */
525       re_free (preg->fastmap);
526       preg->fastmap = NULL;
527     }
528 
529   return (int) ret;
530 }
531 #ifdef _LIBC
532 weak_alias (__regcomp, regcomp)
533 #endif
534 
535 /* Returns a message corresponding to an error code, ERRCODE, returned
536    from either regcomp or regexec.   We don't use PREG here.  */
537 
538 #ifdef _LIBC
539 size_t
540 regerror (errcode, preg, errbuf, errbuf_size)
541     int errcode;
542     const regex_t *_Restrict_ preg;
543     char *_Restrict_ errbuf;
544     size_t errbuf_size;
545 #else /* size_t might promote */
546 size_t
547 regerror (int errcode, const regex_t *_Restrict_ preg,
548 	  char *_Restrict_ errbuf, size_t errbuf_size)
549 #endif
550 {
551   const char *msg;
552   size_t msg_size;
553 
554   if (BE (errcode < 0
555 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
556 			       / sizeof (__re_error_msgid_idx[0])), 0))
557     /* Only error codes returned by the rest of the code should be passed
558        to this routine.  If we are given anything else, or if other regex
559        code generates an invalid error code, then the program has a bug.
560        Dump core so we can fix it.  */
561     abort ();
562 
563   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
564 
565   msg_size = strlen (msg) + 1; /* Includes the null.  */
566 
567   if (BE (errbuf_size != 0, 1))
568     {
569       size_t cpy_size = msg_size;
570       if (BE (msg_size > errbuf_size, 0))
571 	{
572 	  cpy_size = errbuf_size - 1;
573 	  errbuf[cpy_size] = '\0';
574 	}
575       memcpy (errbuf, msg, cpy_size);
576     }
577 
578   return msg_size;
579 }
580 #ifdef _LIBC
581 weak_alias (__regerror, regerror)
582 #endif
583 
584 
585 #ifdef RE_ENABLE_I18N
586 /* This static array is used for the map to single-byte characters when
587    UTF-8 is used.  Otherwise we would allocate memory just to initialize
588    it the same all the time.  UTF-8 is the preferred encoding so this is
589    a worthwhile optimization.  */
590 static const bitset_t utf8_sb_map =
591 {
592   /* Set the first 128 bits.  */
593 # if defined __GNUC__ && !defined __STRICT_ANSI__
594   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
595 # else
596 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
597 #   error "bitset_word_t is narrower than 32 bits"
598 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
599   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
600 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
601   BITSET_WORD_MAX, BITSET_WORD_MAX,
602 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
603   BITSET_WORD_MAX,
604 #  endif
605   (BITSET_WORD_MAX
606    >> (SBC_MAX % BITSET_WORD_BITS == 0
607        ? 0
608        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
609 # endif
610 };
611 #endif
612 
613 
614 static void
615 free_dfa_content (re_dfa_t *dfa)
616 {
617   Idx i, j;
618 
619   if (dfa->nodes)
620     for (i = 0; i < dfa->nodes_len; ++i)
621       free_token (dfa->nodes + i);
622   re_free (dfa->nexts);
623   for (i = 0; i < dfa->nodes_len; ++i)
624     {
625       if (dfa->eclosures != NULL)
626 	re_node_set_free (dfa->eclosures + i);
627       if (dfa->inveclosures != NULL)
628 	re_node_set_free (dfa->inveclosures + i);
629       if (dfa->edests != NULL)
630 	re_node_set_free (dfa->edests + i);
631     }
632   re_free (dfa->edests);
633   re_free (dfa->eclosures);
634   re_free (dfa->inveclosures);
635   re_free (dfa->nodes);
636 
637   if (dfa->state_table)
638     for (i = 0; i <= dfa->state_hash_mask; ++i)
639       {
640 	struct re_state_table_entry *entry = dfa->state_table + i;
641 	for (j = 0; j < entry->num; ++j)
642 	  {
643 	    re_dfastate_t *state = entry->array[j];
644 	    free_state (state);
645 	  }
646 	re_free (entry->array);
647       }
648   re_free (dfa->state_table);
649 #ifdef RE_ENABLE_I18N
650   if (dfa->sb_char != utf8_sb_map)
651     re_free (dfa->sb_char);
652 #endif
653   re_free (dfa->subexp_map);
654 #ifdef DEBUG
655   re_free (dfa->re_str);
656 #endif
657 
658   re_free (dfa);
659 }
660 
661 
662 /* Free dynamically allocated space used by PREG.  */
663 
664 void
665 regfree (preg)
666     regex_t *preg;
667 {
668   re_dfa_t *dfa = preg->buffer;
669   if (BE (dfa != NULL, 1))
670     {
671       lock_fini (dfa->lock);
672       free_dfa_content (dfa);
673     }
674   preg->buffer = NULL;
675   preg->allocated = 0;
676 
677   re_free (preg->fastmap);
678   preg->fastmap = NULL;
679 
680   re_free (preg->translate);
681   preg->translate = NULL;
682 }
683 #ifdef _LIBC
684 weak_alias (__regfree, regfree)
685 #endif
686 
687 /* Entry points compatible with 4.2 BSD regex library.  We don't define
688    them unless specifically requested.  */
689 
690 #if defined _REGEX_RE_COMP || defined _LIBC
691 
692 /* BSD has one and only one pattern buffer.  */
693 static struct re_pattern_buffer re_comp_buf;
694 
695 char *
696 # ifdef _LIBC
697 /* Make these definitions weak in libc, so POSIX programs can redefine
698    these names if they don't use our functions, and still use
699    regcomp/regexec above without link errors.  */
700 weak_function
701 # endif
702 re_comp (s)
703      const char *s;
704 {
705   reg_errcode_t ret;
706   char *fastmap;
707 
708   if (!s)
709     {
710       if (!re_comp_buf.buffer)
711 	return gettext ("No previous regular expression");
712       return 0;
713     }
714 
715   if (re_comp_buf.buffer)
716     {
717       fastmap = re_comp_buf.fastmap;
718       re_comp_buf.fastmap = NULL;
719       __regfree (&re_comp_buf);
720       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
721       re_comp_buf.fastmap = fastmap;
722     }
723 
724   if (re_comp_buf.fastmap == NULL)
725     {
726       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
727       if (re_comp_buf.fastmap == NULL)
728 	return (char *) gettext (__re_error_msgid
729 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
730     }
731 
732   /* Since 're_exec' always passes NULL for the 'regs' argument, we
733      don't need to initialize the pattern buffer fields which affect it.  */
734 
735   /* Match anchors at newlines.  */
736   re_comp_buf.newline_anchor = 1;
737 
738   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
739 
740   if (!ret)
741     return NULL;
742 
743   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
744   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
745 }
746 
747 #ifdef _LIBC
748 libc_freeres_fn (free_mem)
749 {
750   __regfree (&re_comp_buf);
751 }
752 #endif
753 
754 #endif /* _REGEX_RE_COMP */
755 
756 /* Internal entry point.
757    Compile the regular expression PATTERN, whose length is LENGTH.
758    SYNTAX indicate regular expression's syntax.  */
759 
760 static reg_errcode_t
761 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
762 		     reg_syntax_t syntax)
763 {
764   reg_errcode_t err = REG_NOERROR;
765   re_dfa_t *dfa;
766   re_string_t regexp;
767 
768   /* Initialize the pattern buffer.  */
769   preg->fastmap_accurate = 0;
770   preg->syntax = syntax;
771   preg->not_bol = preg->not_eol = 0;
772   preg->used = 0;
773   preg->re_nsub = 0;
774   preg->can_be_null = 0;
775   preg->regs_allocated = REGS_UNALLOCATED;
776 
777   /* Initialize the dfa.  */
778   dfa = preg->buffer;
779   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
780     {
781       /* If zero allocated, but buffer is non-null, try to realloc
782 	 enough space.  This loses if buffer's address is bogus, but
783 	 that is the user's responsibility.  If ->buffer is NULL this
784 	 is a simple allocation.  */
785       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
786       if (dfa == NULL)
787 	return REG_ESPACE;
788       preg->allocated = sizeof (re_dfa_t);
789       preg->buffer = dfa;
790     }
791   preg->used = sizeof (re_dfa_t);
792 
793   err = init_dfa (dfa, length);
794   if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
795     err = REG_ESPACE;
796   if (BE (err != REG_NOERROR, 0))
797     {
798       free_dfa_content (dfa);
799       preg->buffer = NULL;
800       preg->allocated = 0;
801       return err;
802     }
803 #ifdef DEBUG
804   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
805   dfa->re_str = re_malloc (char, length + 1);
806   strncpy (dfa->re_str, pattern, length + 1);
807 #endif
808 
809   err = re_string_construct (&regexp, pattern, length, preg->translate,
810 			     (syntax & RE_ICASE) != 0, dfa);
811   if (BE (err != REG_NOERROR, 0))
812     {
813     re_compile_internal_free_return:
814       free_workarea_compile (preg);
815       re_string_destruct (&regexp);
816       lock_fini (dfa->lock);
817       free_dfa_content (dfa);
818       preg->buffer = NULL;
819       preg->allocated = 0;
820       return err;
821     }
822 
823   /* Parse the regular expression, and build a structure tree.  */
824   preg->re_nsub = 0;
825   dfa->str_tree = parse (&regexp, preg, syntax, &err);
826   if (BE (dfa->str_tree == NULL, 0))
827     goto re_compile_internal_free_return;
828 
829   /* Analyze the tree and create the nfa.  */
830   err = analyze (preg);
831   if (BE (err != REG_NOERROR, 0))
832     goto re_compile_internal_free_return;
833 
834 #ifdef RE_ENABLE_I18N
835   /* If possible, do searching in single byte encoding to speed things up.  */
836   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
837     optimize_utf8 (dfa);
838 #endif
839 
840   /* Then create the initial state of the dfa.  */
841   err = create_initial_state (dfa);
842 
843   /* Release work areas.  */
844   free_workarea_compile (preg);
845   re_string_destruct (&regexp);
846 
847   if (BE (err != REG_NOERROR, 0))
848     {
849       lock_fini (dfa->lock);
850       free_dfa_content (dfa);
851       preg->buffer = NULL;
852       preg->allocated = 0;
853     }
854 
855   return err;
856 }
857 
858 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
859    as the initial length of some arrays.  */
860 
861 static reg_errcode_t
862 init_dfa (re_dfa_t *dfa, size_t pat_len)
863 {
864   __re_size_t table_size;
865 #ifndef _LIBC
866   const char *codeset_name;
867 #endif
868 #ifdef RE_ENABLE_I18N
869   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
870 #else
871   size_t max_i18n_object_size = 0;
872 #endif
873   size_t max_object_size =
874     MAX (sizeof (struct re_state_table_entry),
875 	 MAX (sizeof (re_token_t),
876 	      MAX (sizeof (re_node_set),
877 		   MAX (sizeof (regmatch_t),
878 			max_i18n_object_size))));
879 
880   memset (dfa, '\0', sizeof (re_dfa_t));
881 
882   /* Force allocation of str_tree_storage the first time.  */
883   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
884 
885   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
886      calculation below, and for similar doubling calculations
887      elsewhere.  And it's <= rather than <, because some of the
888      doubling calculations add 1 afterwards.  */
889   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
890     return REG_ESPACE;
891 
892   dfa->nodes_alloc = pat_len + 1;
893   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
894 
895   /*  table_size = 2 ^ ceil(log pat_len) */
896   for (table_size = 1; ; table_size <<= 1)
897     if (table_size > pat_len)
898       break;
899 
900   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
901   dfa->state_hash_mask = table_size - 1;
902 
903   dfa->mb_cur_max = MB_CUR_MAX;
904 #ifdef _LIBC
905   if (dfa->mb_cur_max == 6
906       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
907     dfa->is_utf8 = 1;
908   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
909 		       != 0);
910 #else
911   codeset_name = nl_langinfo (CODESET);
912   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
913       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
914       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
915       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
916     dfa->is_utf8 = 1;
917 
918   /* We check exhaustively in the loop below if this charset is a
919      superset of ASCII.  */
920   dfa->map_notascii = 0;
921 #endif
922 
923 #ifdef RE_ENABLE_I18N
924   if (dfa->mb_cur_max > 1)
925     {
926       if (dfa->is_utf8)
927 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
928       else
929 	{
930 	  int i, j, ch;
931 
932 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
933 	  if (BE (dfa->sb_char == NULL, 0))
934 	    return REG_ESPACE;
935 
936 	  /* Set the bits corresponding to single byte chars.  */
937 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
938 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
939 	      {
940 		wint_t wch = __btowc (ch);
941 		if (wch != WEOF)
942 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
943 # ifndef _LIBC
944 		if (isascii (ch) && wch != ch)
945 		  dfa->map_notascii = 1;
946 # endif
947 	      }
948 	}
949     }
950 #endif
951 
952   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
953     return REG_ESPACE;
954   return REG_NOERROR;
955 }
956 
957 /* Initialize WORD_CHAR table, which indicate which character is
958    "word".  In this case "word" means that it is the word construction
959    character used by some operators like "\<", "\>", etc.  */
960 
961 static void
962 internal_function
963 init_word_char (re_dfa_t *dfa)
964 {
965   int i = 0;
966   int j;
967   int ch = 0;
968   dfa->word_ops_used = 1;
969   if (BE (dfa->map_notascii == 0, 1))
970     {
971       bitset_word_t bits0 = 0x00000000;
972       bitset_word_t bits1 = 0x03ff0000;
973       bitset_word_t bits2 = 0x87fffffe;
974       bitset_word_t bits3 = 0x07fffffe;
975       if (BITSET_WORD_BITS == 64)
976 	{
977 	  dfa->word_char[0] = bits1 << 31 << 1 | bits0;
978 	  dfa->word_char[1] = bits3 << 31 << 1 | bits2;
979 	  i = 2;
980 	}
981       else if (BITSET_WORD_BITS == 32)
982 	{
983 	  dfa->word_char[0] = bits0;
984 	  dfa->word_char[1] = bits1;
985 	  dfa->word_char[2] = bits2;
986 	  dfa->word_char[3] = bits3;
987 	  i = 4;
988 	}
989       else
990         goto general_case;
991       ch = 128;
992 
993       if (BE (dfa->is_utf8, 1))
994 	{
995 	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
996 	  return;
997 	}
998     }
999 
1000  general_case:
1001   for (; i < BITSET_WORDS; ++i)
1002     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
1003       if (isalnum (ch) || ch == '_')
1004 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
1005 }
1006 
1007 /* Free the work area which are only used while compiling.  */
1008 
1009 static void
1010 free_workarea_compile (regex_t *preg)
1011 {
1012   re_dfa_t *dfa = preg->buffer;
1013   bin_tree_storage_t *storage, *next;
1014   for (storage = dfa->str_tree_storage; storage; storage = next)
1015     {
1016       next = storage->next;
1017       re_free (storage);
1018     }
1019   dfa->str_tree_storage = NULL;
1020   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1021   dfa->str_tree = NULL;
1022   re_free (dfa->org_indices);
1023   dfa->org_indices = NULL;
1024 }
1025 
1026 /* Create initial states for all contexts.  */
1027 
1028 static reg_errcode_t
1029 create_initial_state (re_dfa_t *dfa)
1030 {
1031   Idx first, i;
1032   reg_errcode_t err;
1033   re_node_set init_nodes;
1034 
1035   /* Initial states have the epsilon closure of the node which is
1036      the first node of the regular expression.  */
1037   first = dfa->str_tree->first->node_idx;
1038   dfa->init_node = first;
1039   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1040   if (BE (err != REG_NOERROR, 0))
1041     return err;
1042 
1043   /* The back-references which are in initial states can epsilon transit,
1044      since in this case all of the subexpressions can be null.
1045      Then we add epsilon closures of the nodes which are the next nodes of
1046      the back-references.  */
1047   if (dfa->nbackref > 0)
1048     for (i = 0; i < init_nodes.nelem; ++i)
1049       {
1050 	Idx node_idx = init_nodes.elems[i];
1051 	re_token_type_t type = dfa->nodes[node_idx].type;
1052 
1053 	Idx clexp_idx;
1054 	if (type != OP_BACK_REF)
1055 	  continue;
1056 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1057 	  {
1058 	    re_token_t *clexp_node;
1059 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1060 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1061 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1062 	      break;
1063 	  }
1064 	if (clexp_idx == init_nodes.nelem)
1065 	  continue;
1066 
1067 	if (type == OP_BACK_REF)
1068 	  {
1069 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1070 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1071 	      {
1072 		reg_errcode_t merge_err
1073                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1074 		if (merge_err != REG_NOERROR)
1075 		  return merge_err;
1076 		i = 0;
1077 	      }
1078 	  }
1079       }
1080 
1081   /* It must be the first time to invoke acquire_state.  */
1082   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1083   /* We don't check ERR here, since the initial state must not be NULL.  */
1084   if (BE (dfa->init_state == NULL, 0))
1085     return err;
1086   if (dfa->init_state->has_constraint)
1087     {
1088       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1089 						       CONTEXT_WORD);
1090       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1091 						     CONTEXT_NEWLINE);
1092       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1093 							 &init_nodes,
1094 							 CONTEXT_NEWLINE
1095 							 | CONTEXT_BEGBUF);
1096       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1097 	      || dfa->init_state_begbuf == NULL, 0))
1098 	return err;
1099     }
1100   else
1101     dfa->init_state_word = dfa->init_state_nl
1102       = dfa->init_state_begbuf = dfa->init_state;
1103 
1104   re_node_set_free (&init_nodes);
1105   return REG_NOERROR;
1106 }
1107 
1108 #ifdef RE_ENABLE_I18N
1109 /* If it is possible to do searching in single byte encoding instead of UTF-8
1110    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1111    DFA nodes where needed.  */
1112 
1113 static void
1114 optimize_utf8 (re_dfa_t *dfa)
1115 {
1116   Idx node;
1117   int i;
1118   bool mb_chars = false;
1119   bool has_period = false;
1120 
1121   for (node = 0; node < dfa->nodes_len; ++node)
1122     switch (dfa->nodes[node].type)
1123       {
1124       case CHARACTER:
1125 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1126 	  mb_chars = true;
1127 	break;
1128       case ANCHOR:
1129 	switch (dfa->nodes[node].opr.ctx_type)
1130 	  {
1131 	  case LINE_FIRST:
1132 	  case LINE_LAST:
1133 	  case BUF_FIRST:
1134 	  case BUF_LAST:
1135 	    break;
1136 	  default:
1137 	    /* Word anchors etc. cannot be handled.  It's okay to test
1138 	       opr.ctx_type since constraints (for all DFA nodes) are
1139 	       created by ORing one or more opr.ctx_type values.  */
1140 	    return;
1141 	  }
1142 	break;
1143       case OP_PERIOD:
1144 	has_period = true;
1145 	break;
1146       case OP_BACK_REF:
1147       case OP_ALT:
1148       case END_OF_RE:
1149       case OP_DUP_ASTERISK:
1150       case OP_OPEN_SUBEXP:
1151       case OP_CLOSE_SUBEXP:
1152 	break;
1153       case COMPLEX_BRACKET:
1154 	return;
1155       case SIMPLE_BRACKET:
1156 	/* Just double check.  */
1157 	{
1158 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1159 			? 0
1160 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1161 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1162 	    {
1163 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1164 		return;
1165 	      rshift = 0;
1166 	    }
1167 	}
1168 	break;
1169       default:
1170 	abort ();
1171       }
1172 
1173   if (mb_chars || has_period)
1174     for (node = 0; node < dfa->nodes_len; ++node)
1175       {
1176 	if (dfa->nodes[node].type == CHARACTER
1177 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1178 	  dfa->nodes[node].mb_partial = 0;
1179 	else if (dfa->nodes[node].type == OP_PERIOD)
1180 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1181       }
1182 
1183   /* The search can be in single byte locale.  */
1184   dfa->mb_cur_max = 1;
1185   dfa->is_utf8 = 0;
1186   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1187 }
1188 #endif
1189 
1190 /* Analyze the structure tree, and calculate "first", "next", "edest",
1191    "eclosure", and "inveclosure".  */
1192 
1193 static reg_errcode_t
1194 analyze (regex_t *preg)
1195 {
1196   re_dfa_t *dfa = preg->buffer;
1197   reg_errcode_t ret;
1198 
1199   /* Allocate arrays.  */
1200   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1201   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1202   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1203   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1204   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1205 	  || dfa->eclosures == NULL, 0))
1206     return REG_ESPACE;
1207 
1208   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1209   if (dfa->subexp_map != NULL)
1210     {
1211       Idx i;
1212       for (i = 0; i < preg->re_nsub; i++)
1213 	dfa->subexp_map[i] = i;
1214       preorder (dfa->str_tree, optimize_subexps, dfa);
1215       for (i = 0; i < preg->re_nsub; i++)
1216 	if (dfa->subexp_map[i] != i)
1217 	  break;
1218       if (i == preg->re_nsub)
1219 	{
1220 	  free (dfa->subexp_map);
1221 	  dfa->subexp_map = NULL;
1222 	}
1223     }
1224 
1225   ret = postorder (dfa->str_tree, lower_subexps, preg);
1226   if (BE (ret != REG_NOERROR, 0))
1227     return ret;
1228   ret = postorder (dfa->str_tree, calc_first, dfa);
1229   if (BE (ret != REG_NOERROR, 0))
1230     return ret;
1231   preorder (dfa->str_tree, calc_next, dfa);
1232   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1233   if (BE (ret != REG_NOERROR, 0))
1234     return ret;
1235   ret = calc_eclosure (dfa);
1236   if (BE (ret != REG_NOERROR, 0))
1237     return ret;
1238 
1239   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1240      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1241   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1242       || dfa->nbackref)
1243     {
1244       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1245       if (BE (dfa->inveclosures == NULL, 0))
1246 	return REG_ESPACE;
1247       ret = calc_inveclosure (dfa);
1248     }
1249 
1250   return ret;
1251 }
1252 
1253 /* Our parse trees are very unbalanced, so we cannot use a stack to
1254    implement parse tree visits.  Instead, we use parent pointers and
1255    some hairy code in these two functions.  */
1256 static reg_errcode_t
1257 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1258 	   void *extra)
1259 {
1260   bin_tree_t *node, *prev;
1261 
1262   for (node = root; ; )
1263     {
1264       /* Descend down the tree, preferably to the left (or to the right
1265 	 if that's the only child).  */
1266       while (node->left || node->right)
1267 	if (node->left)
1268 	  node = node->left;
1269 	else
1270 	  node = node->right;
1271 
1272       do
1273 	{
1274 	  reg_errcode_t err = fn (extra, node);
1275 	  if (BE (err != REG_NOERROR, 0))
1276 	    return err;
1277 	  if (node->parent == NULL)
1278 	    return REG_NOERROR;
1279 	  prev = node;
1280 	  node = node->parent;
1281 	}
1282       /* Go up while we have a node that is reached from the right.  */
1283       while (node->right == prev || node->right == NULL);
1284       node = node->right;
1285     }
1286 }
1287 
1288 static reg_errcode_t
1289 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1290 	  void *extra)
1291 {
1292   bin_tree_t *node;
1293 
1294   for (node = root; ; )
1295     {
1296       reg_errcode_t err = fn (extra, node);
1297       if (BE (err != REG_NOERROR, 0))
1298 	return err;
1299 
1300       /* Go to the left node, or up and to the right.  */
1301       if (node->left)
1302 	node = node->left;
1303       else
1304 	{
1305 	  bin_tree_t *prev = NULL;
1306 	  while (node->right == prev || node->right == NULL)
1307 	    {
1308 	      prev = node;
1309 	      node = node->parent;
1310 	      if (!node)
1311 		return REG_NOERROR;
1312 	    }
1313 	  node = node->right;
1314 	}
1315     }
1316 }
1317 
1318 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1319    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1320    backreferences as well.  Requires a preorder visit.  */
1321 static reg_errcode_t
1322 optimize_subexps (void *extra, bin_tree_t *node)
1323 {
1324   re_dfa_t *dfa = (re_dfa_t *) extra;
1325 
1326   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1327     {
1328       int idx = node->token.opr.idx;
1329       node->token.opr.idx = dfa->subexp_map[idx];
1330       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1331     }
1332 
1333   else if (node->token.type == SUBEXP
1334 	   && node->left && node->left->token.type == SUBEXP)
1335     {
1336       Idx other_idx = node->left->token.opr.idx;
1337 
1338       node->left = node->left->left;
1339       if (node->left)
1340 	node->left->parent = node;
1341 
1342       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1343       if (other_idx < BITSET_WORD_BITS)
1344 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1345     }
1346 
1347   return REG_NOERROR;
1348 }
1349 
1350 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1351    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1352 static reg_errcode_t
1353 lower_subexps (void *extra, bin_tree_t *node)
1354 {
1355   regex_t *preg = (regex_t *) extra;
1356   reg_errcode_t err = REG_NOERROR;
1357 
1358   if (node->left && node->left->token.type == SUBEXP)
1359     {
1360       node->left = lower_subexp (&err, preg, node->left);
1361       if (node->left)
1362 	node->left->parent = node;
1363     }
1364   if (node->right && node->right->token.type == SUBEXP)
1365     {
1366       node->right = lower_subexp (&err, preg, node->right);
1367       if (node->right)
1368 	node->right->parent = node;
1369     }
1370 
1371   return err;
1372 }
1373 
1374 static bin_tree_t *
1375 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1376 {
1377   re_dfa_t *dfa = preg->buffer;
1378   bin_tree_t *body = node->left;
1379   bin_tree_t *op, *cls, *tree1, *tree;
1380 
1381   if (preg->no_sub
1382       /* We do not optimize empty subexpressions, because otherwise we may
1383 	 have bad CONCAT nodes with NULL children.  This is obviously not
1384 	 very common, so we do not lose much.  An example that triggers
1385 	 this case is the sed "script" /\(\)/x.  */
1386       && node->left != NULL
1387       && (node->token.opr.idx >= BITSET_WORD_BITS
1388 	  || !(dfa->used_bkref_map
1389 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1390     return node->left;
1391 
1392   /* Convert the SUBEXP node to the concatenation of an
1393      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1394   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1395   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1396   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1397   tree = create_tree (dfa, op, tree1, CONCAT);
1398   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1399     {
1400       *err = REG_ESPACE;
1401       return NULL;
1402     }
1403 
1404   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1405   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1406   return tree;
1407 }
1408 
1409 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1410    nodes.  Requires a postorder visit.  */
1411 static reg_errcode_t
1412 calc_first (void *extra, bin_tree_t *node)
1413 {
1414   re_dfa_t *dfa = (re_dfa_t *) extra;
1415   if (node->token.type == CONCAT)
1416     {
1417       node->first = node->left->first;
1418       node->node_idx = node->left->node_idx;
1419     }
1420   else
1421     {
1422       node->first = node;
1423       node->node_idx = re_dfa_add_node (dfa, node->token);
1424       if (BE (node->node_idx == REG_MISSING, 0))
1425 	return REG_ESPACE;
1426       if (node->token.type == ANCHOR)
1427 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1428     }
1429   return REG_NOERROR;
1430 }
1431 
1432 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1433 static reg_errcode_t
1434 calc_next (void *extra, bin_tree_t *node)
1435 {
1436   switch (node->token.type)
1437     {
1438     case OP_DUP_ASTERISK:
1439       node->left->next = node;
1440       break;
1441     case CONCAT:
1442       node->left->next = node->right->first;
1443       node->right->next = node->next;
1444       break;
1445     default:
1446       if (node->left)
1447 	node->left->next = node->next;
1448       if (node->right)
1449 	node->right->next = node->next;
1450       break;
1451     }
1452   return REG_NOERROR;
1453 }
1454 
1455 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1456 static reg_errcode_t
1457 link_nfa_nodes (void *extra, bin_tree_t *node)
1458 {
1459   re_dfa_t *dfa = (re_dfa_t *) extra;
1460   Idx idx = node->node_idx;
1461   reg_errcode_t err = REG_NOERROR;
1462 
1463   switch (node->token.type)
1464     {
1465     case CONCAT:
1466       break;
1467 
1468     case END_OF_RE:
1469       assert (node->next == NULL);
1470       break;
1471 
1472     case OP_DUP_ASTERISK:
1473     case OP_ALT:
1474       {
1475 	Idx left, right;
1476 	dfa->has_plural_match = 1;
1477 	if (node->left != NULL)
1478 	  left = node->left->first->node_idx;
1479 	else
1480 	  left = node->next->node_idx;
1481 	if (node->right != NULL)
1482 	  right = node->right->first->node_idx;
1483 	else
1484 	  right = node->next->node_idx;
1485 	assert (REG_VALID_INDEX (left));
1486 	assert (REG_VALID_INDEX (right));
1487 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1488       }
1489       break;
1490 
1491     case ANCHOR:
1492     case OP_OPEN_SUBEXP:
1493     case OP_CLOSE_SUBEXP:
1494       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1495       break;
1496 
1497     case OP_BACK_REF:
1498       dfa->nexts[idx] = node->next->node_idx;
1499       if (node->token.type == OP_BACK_REF)
1500 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1501       break;
1502 
1503     default:
1504       assert (!IS_EPSILON_NODE (node->token.type));
1505       dfa->nexts[idx] = node->next->node_idx;
1506       break;
1507     }
1508 
1509   return err;
1510 }
1511 
1512 /* Duplicate the epsilon closure of the node ROOT_NODE.
1513    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1514    to their own constraint.  */
1515 
1516 static reg_errcode_t
1517 internal_function
1518 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1519 			Idx root_node, unsigned int init_constraint)
1520 {
1521   Idx org_node, clone_node;
1522   bool ok;
1523   unsigned int constraint = init_constraint;
1524   for (org_node = top_org_node, clone_node = top_clone_node;;)
1525     {
1526       Idx org_dest, clone_dest;
1527       if (dfa->nodes[org_node].type == OP_BACK_REF)
1528 	{
1529 	  /* If the back reference epsilon-transit, its destination must
1530 	     also have the constraint.  Then duplicate the epsilon closure
1531 	     of the destination of the back reference, and store it in
1532 	     edests of the back reference.  */
1533 	  org_dest = dfa->nexts[org_node];
1534 	  re_node_set_empty (dfa->edests + clone_node);
1535 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1536 	  if (BE (clone_dest == REG_MISSING, 0))
1537 	    return REG_ESPACE;
1538 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1539 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1540 	  if (BE (! ok, 0))
1541 	    return REG_ESPACE;
1542 	}
1543       else if (dfa->edests[org_node].nelem == 0)
1544 	{
1545 	  /* In case of the node can't epsilon-transit, don't duplicate the
1546 	     destination and store the original destination as the
1547 	     destination of the node.  */
1548 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1549 	  break;
1550 	}
1551       else if (dfa->edests[org_node].nelem == 1)
1552 	{
1553 	  /* In case of the node can epsilon-transit, and it has only one
1554 	     destination.  */
1555 	  org_dest = dfa->edests[org_node].elems[0];
1556 	  re_node_set_empty (dfa->edests + clone_node);
1557 	  /* If the node is root_node itself, it means the epsilon closure
1558 	     has a loop.  Then tie it to the destination of the root_node.  */
1559 	  if (org_node == root_node && clone_node != org_node)
1560 	    {
1561 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1562 	      if (BE (! ok, 0))
1563 	        return REG_ESPACE;
1564 	      break;
1565 	    }
1566 	  /* In case the node has another constraint, append it.  */
1567 	  constraint |= dfa->nodes[org_node].constraint;
1568 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1569 	  if (BE (clone_dest == REG_MISSING, 0))
1570 	    return REG_ESPACE;
1571 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572 	  if (BE (! ok, 0))
1573 	    return REG_ESPACE;
1574 	}
1575       else /* dfa->edests[org_node].nelem == 2 */
1576 	{
1577 	  /* In case of the node can epsilon-transit, and it has two
1578 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1579 	  org_dest = dfa->edests[org_node].elems[0];
1580 	  re_node_set_empty (dfa->edests + clone_node);
1581 	  /* Search for a duplicated node which satisfies the constraint.  */
1582 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1583 	  if (clone_dest == REG_MISSING)
1584 	    {
1585 	      /* There is no such duplicated node, create a new one.  */
1586 	      reg_errcode_t err;
1587 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1588 	      if (BE (clone_dest == REG_MISSING, 0))
1589 		return REG_ESPACE;
1590 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1591 	      if (BE (! ok, 0))
1592 		return REG_ESPACE;
1593 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1594 					    root_node, constraint);
1595 	      if (BE (err != REG_NOERROR, 0))
1596 		return err;
1597 	    }
1598 	  else
1599 	    {
1600 	      /* There is a duplicated node which satisfies the constraint,
1601 		 use it to avoid infinite loop.  */
1602 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1603 	      if (BE (! ok, 0))
1604 		return REG_ESPACE;
1605 	    }
1606 
1607 	  org_dest = dfa->edests[org_node].elems[1];
1608 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1609 	  if (BE (clone_dest == REG_MISSING, 0))
1610 	    return REG_ESPACE;
1611 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1612 	  if (BE (! ok, 0))
1613 	    return REG_ESPACE;
1614 	}
1615       org_node = org_dest;
1616       clone_node = clone_dest;
1617     }
1618   return REG_NOERROR;
1619 }
1620 
1621 /* Search for a node which is duplicated from the node ORG_NODE, and
1622    satisfies the constraint CONSTRAINT.  */
1623 
1624 static Idx
1625 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1626 			unsigned int constraint)
1627 {
1628   Idx idx;
1629   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1630     {
1631       if (org_node == dfa->org_indices[idx]
1632 	  && constraint == dfa->nodes[idx].constraint)
1633 	return idx; /* Found.  */
1634     }
1635   return REG_MISSING; /* Not found.  */
1636 }
1637 
1638 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1639    Return the index of the new node, or REG_MISSING if insufficient storage is
1640    available.  */
1641 
1642 static Idx
1643 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1644 {
1645   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1646   if (BE (dup_idx != REG_MISSING, 1))
1647     {
1648       dfa->nodes[dup_idx].constraint = constraint;
1649       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1650       dfa->nodes[dup_idx].duplicated = 1;
1651 
1652       /* Store the index of the original node.  */
1653       dfa->org_indices[dup_idx] = org_idx;
1654     }
1655   return dup_idx;
1656 }
1657 
1658 static reg_errcode_t
1659 calc_inveclosure (re_dfa_t *dfa)
1660 {
1661   Idx src, idx;
1662   bool ok;
1663   for (idx = 0; idx < dfa->nodes_len; ++idx)
1664     re_node_set_init_empty (dfa->inveclosures + idx);
1665 
1666   for (src = 0; src < dfa->nodes_len; ++src)
1667     {
1668       Idx *elems = dfa->eclosures[src].elems;
1669       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1670 	{
1671 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1672 	  if (BE (! ok, 0))
1673 	    return REG_ESPACE;
1674 	}
1675     }
1676 
1677   return REG_NOERROR;
1678 }
1679 
1680 /* Calculate "eclosure" for all the node in DFA.  */
1681 
1682 static reg_errcode_t
1683 calc_eclosure (re_dfa_t *dfa)
1684 {
1685   Idx node_idx;
1686   bool incomplete;
1687 #ifdef DEBUG
1688   assert (dfa->nodes_len > 0);
1689 #endif
1690   incomplete = false;
1691   /* For each nodes, calculate epsilon closure.  */
1692   for (node_idx = 0; ; ++node_idx)
1693     {
1694       reg_errcode_t err;
1695       re_node_set eclosure_elem;
1696       if (node_idx == dfa->nodes_len)
1697 	{
1698 	  if (!incomplete)
1699 	    break;
1700 	  incomplete = false;
1701 	  node_idx = 0;
1702 	}
1703 
1704 #ifdef DEBUG
1705       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1706 #endif
1707 
1708       /* If we have already calculated, skip it.  */
1709       if (dfa->eclosures[node_idx].nelem != 0)
1710 	continue;
1711       /* Calculate epsilon closure of 'node_idx'.  */
1712       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1713       if (BE (err != REG_NOERROR, 0))
1714 	return err;
1715 
1716       if (dfa->eclosures[node_idx].nelem == 0)
1717 	{
1718 	  incomplete = true;
1719 	  re_node_set_free (&eclosure_elem);
1720 	}
1721     }
1722   return REG_NOERROR;
1723 }
1724 
1725 /* Calculate epsilon closure of NODE.  */
1726 
1727 static reg_errcode_t
1728 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1729 {
1730   reg_errcode_t err;
1731   Idx i;
1732   re_node_set eclosure;
1733   bool ok;
1734   bool incomplete = false;
1735   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1736   if (BE (err != REG_NOERROR, 0))
1737     return err;
1738 
1739   /* This indicates that we are calculating this node now.
1740      We reference this value to avoid infinite loop.  */
1741   dfa->eclosures[node].nelem = REG_MISSING;
1742 
1743   /* If the current node has constraints, duplicate all nodes
1744      since they must inherit the constraints.  */
1745   if (dfa->nodes[node].constraint
1746       && dfa->edests[node].nelem
1747       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1748     {
1749       err = duplicate_node_closure (dfa, node, node, node,
1750 				    dfa->nodes[node].constraint);
1751       if (BE (err != REG_NOERROR, 0))
1752 	return err;
1753     }
1754 
1755   /* Expand each epsilon destination nodes.  */
1756   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1757     for (i = 0; i < dfa->edests[node].nelem; ++i)
1758       {
1759 	re_node_set eclosure_elem;
1760 	Idx edest = dfa->edests[node].elems[i];
1761 	/* If calculating the epsilon closure of 'edest' is in progress,
1762 	   return intermediate result.  */
1763 	if (dfa->eclosures[edest].nelem == REG_MISSING)
1764 	  {
1765 	    incomplete = true;
1766 	    continue;
1767 	  }
1768 	/* If we haven't calculated the epsilon closure of 'edest' yet,
1769 	   calculate now. Otherwise use calculated epsilon closure.  */
1770 	if (dfa->eclosures[edest].nelem == 0)
1771 	  {
1772 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1773 	    if (BE (err != REG_NOERROR, 0))
1774 	      return err;
1775 	  }
1776 	else
1777 	  eclosure_elem = dfa->eclosures[edest];
1778 	/* Merge the epsilon closure of 'edest'.  */
1779 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1780 	if (BE (err != REG_NOERROR, 0))
1781 	  return err;
1782 	/* If the epsilon closure of 'edest' is incomplete,
1783 	   the epsilon closure of this node is also incomplete.  */
1784 	if (dfa->eclosures[edest].nelem == 0)
1785 	  {
1786 	    incomplete = true;
1787 	    re_node_set_free (&eclosure_elem);
1788 	  }
1789       }
1790 
1791   /* An epsilon closure includes itself.  */
1792   ok = re_node_set_insert (&eclosure, node);
1793   if (BE (! ok, 0))
1794     return REG_ESPACE;
1795   if (incomplete && !root)
1796     dfa->eclosures[node].nelem = 0;
1797   else
1798     dfa->eclosures[node] = eclosure;
1799   *new_set = eclosure;
1800   return REG_NOERROR;
1801 }
1802 
1803 /* Functions for token which are used in the parser.  */
1804 
1805 /* Fetch a token from INPUT.
1806    We must not use this function inside bracket expressions.  */
1807 
1808 static void
1809 internal_function
1810 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1811 {
1812   re_string_skip_bytes (input, peek_token (result, input, syntax));
1813 }
1814 
1815 /* Peek a token from INPUT, and return the length of the token.
1816    We must not use this function inside bracket expressions.  */
1817 
1818 static int
1819 internal_function
1820 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1821 {
1822   unsigned char c;
1823 
1824   if (re_string_eoi (input))
1825     {
1826       token->type = END_OF_RE;
1827       return 0;
1828     }
1829 
1830   c = re_string_peek_byte (input, 0);
1831   token->opr.c = c;
1832 
1833   token->word_char = 0;
1834 #ifdef RE_ENABLE_I18N
1835   token->mb_partial = 0;
1836   if (input->mb_cur_max > 1 &&
1837       !re_string_first_byte (input, re_string_cur_idx (input)))
1838     {
1839       token->type = CHARACTER;
1840       token->mb_partial = 1;
1841       return 1;
1842     }
1843 #endif
1844   if (c == '\\')
1845     {
1846       unsigned char c2;
1847       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1848 	{
1849 	  token->type = BACK_SLASH;
1850 	  return 1;
1851 	}
1852 
1853       c2 = re_string_peek_byte_case (input, 1);
1854       token->opr.c = c2;
1855       token->type = CHARACTER;
1856 #ifdef RE_ENABLE_I18N
1857       if (input->mb_cur_max > 1)
1858 	{
1859 	  wint_t wc = re_string_wchar_at (input,
1860 					  re_string_cur_idx (input) + 1);
1861 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1862 	}
1863       else
1864 #endif
1865 	token->word_char = IS_WORD_CHAR (c2) != 0;
1866 
1867       switch (c2)
1868 	{
1869 	case '|':
1870 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1871 	    token->type = OP_ALT;
1872 	  break;
1873 	case '1': case '2': case '3': case '4': case '5':
1874 	case '6': case '7': case '8': case '9':
1875 	  if (!(syntax & RE_NO_BK_REFS))
1876 	    {
1877 	      token->type = OP_BACK_REF;
1878 	      token->opr.idx = c2 - '1';
1879 	    }
1880 	  break;
1881 	case '<':
1882 	  if (!(syntax & RE_NO_GNU_OPS))
1883 	    {
1884 	      token->type = ANCHOR;
1885 	      token->opr.ctx_type = WORD_FIRST;
1886 	    }
1887 	  break;
1888 	case '>':
1889 	  if (!(syntax & RE_NO_GNU_OPS))
1890 	    {
1891 	      token->type = ANCHOR;
1892 	      token->opr.ctx_type = WORD_LAST;
1893 	    }
1894 	  break;
1895 	case 'b':
1896 	  if (!(syntax & RE_NO_GNU_OPS))
1897 	    {
1898 	      token->type = ANCHOR;
1899 	      token->opr.ctx_type = WORD_DELIM;
1900 	    }
1901 	  break;
1902 	case 'B':
1903 	  if (!(syntax & RE_NO_GNU_OPS))
1904 	    {
1905 	      token->type = ANCHOR;
1906 	      token->opr.ctx_type = NOT_WORD_DELIM;
1907 	    }
1908 	  break;
1909 	case 'w':
1910 	  if (!(syntax & RE_NO_GNU_OPS))
1911 	    token->type = OP_WORD;
1912 	  break;
1913 	case 'W':
1914 	  if (!(syntax & RE_NO_GNU_OPS))
1915 	    token->type = OP_NOTWORD;
1916 	  break;
1917 	case 's':
1918 	  if (!(syntax & RE_NO_GNU_OPS))
1919 	    token->type = OP_SPACE;
1920 	  break;
1921 	case 'S':
1922 	  if (!(syntax & RE_NO_GNU_OPS))
1923 	    token->type = OP_NOTSPACE;
1924 	  break;
1925 	case '`':
1926 	  if (!(syntax & RE_NO_GNU_OPS))
1927 	    {
1928 	      token->type = ANCHOR;
1929 	      token->opr.ctx_type = BUF_FIRST;
1930 	    }
1931 	  break;
1932 	case '\'':
1933 	  if (!(syntax & RE_NO_GNU_OPS))
1934 	    {
1935 	      token->type = ANCHOR;
1936 	      token->opr.ctx_type = BUF_LAST;
1937 	    }
1938 	  break;
1939 	case '(':
1940 	  if (!(syntax & RE_NO_BK_PARENS))
1941 	    token->type = OP_OPEN_SUBEXP;
1942 	  break;
1943 	case ')':
1944 	  if (!(syntax & RE_NO_BK_PARENS))
1945 	    token->type = OP_CLOSE_SUBEXP;
1946 	  break;
1947 	case '+':
1948 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1949 	    token->type = OP_DUP_PLUS;
1950 	  break;
1951 	case '?':
1952 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1953 	    token->type = OP_DUP_QUESTION;
1954 	  break;
1955 	case '{':
1956 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1957 	    token->type = OP_OPEN_DUP_NUM;
1958 	  break;
1959 	case '}':
1960 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1961 	    token->type = OP_CLOSE_DUP_NUM;
1962 	  break;
1963 	default:
1964 	  break;
1965 	}
1966       return 2;
1967     }
1968 
1969   token->type = CHARACTER;
1970 #ifdef RE_ENABLE_I18N
1971   if (input->mb_cur_max > 1)
1972     {
1973       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1974       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1975     }
1976   else
1977 #endif
1978     token->word_char = IS_WORD_CHAR (token->opr.c);
1979 
1980   switch (c)
1981     {
1982     case '\n':
1983       if (syntax & RE_NEWLINE_ALT)
1984 	token->type = OP_ALT;
1985       break;
1986     case '|':
1987       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1988 	token->type = OP_ALT;
1989       break;
1990     case '*':
1991       token->type = OP_DUP_ASTERISK;
1992       break;
1993     case '+':
1994       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1995 	token->type = OP_DUP_PLUS;
1996       break;
1997     case '?':
1998       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1999 	token->type = OP_DUP_QUESTION;
2000       break;
2001     case '{':
2002       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2003 	token->type = OP_OPEN_DUP_NUM;
2004       break;
2005     case '}':
2006       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2007 	token->type = OP_CLOSE_DUP_NUM;
2008       break;
2009     case '(':
2010       if (syntax & RE_NO_BK_PARENS)
2011 	token->type = OP_OPEN_SUBEXP;
2012       break;
2013     case ')':
2014       if (syntax & RE_NO_BK_PARENS)
2015 	token->type = OP_CLOSE_SUBEXP;
2016       break;
2017     case '[':
2018       token->type = OP_OPEN_BRACKET;
2019       break;
2020     case '.':
2021       token->type = OP_PERIOD;
2022       break;
2023     case '^':
2024       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2025 	  re_string_cur_idx (input) != 0)
2026 	{
2027 	  char prev = re_string_peek_byte (input, -1);
2028 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2029 	    break;
2030 	}
2031       token->type = ANCHOR;
2032       token->opr.ctx_type = LINE_FIRST;
2033       break;
2034     case '$':
2035       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2036 	  re_string_cur_idx (input) + 1 != re_string_length (input))
2037 	{
2038 	  re_token_t next;
2039 	  re_string_skip_bytes (input, 1);
2040 	  peek_token (&next, input, syntax);
2041 	  re_string_skip_bytes (input, -1);
2042 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2043 	    break;
2044 	}
2045       token->type = ANCHOR;
2046       token->opr.ctx_type = LINE_LAST;
2047       break;
2048     default:
2049       break;
2050     }
2051   return 1;
2052 }
2053 
2054 /* Peek a token from INPUT, and return the length of the token.
2055    We must not use this function out of bracket expressions.  */
2056 
2057 static int
2058 internal_function
2059 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2060 {
2061   unsigned char c;
2062   if (re_string_eoi (input))
2063     {
2064       token->type = END_OF_RE;
2065       return 0;
2066     }
2067   c = re_string_peek_byte (input, 0);
2068   token->opr.c = c;
2069 
2070 #ifdef RE_ENABLE_I18N
2071   if (input->mb_cur_max > 1 &&
2072       !re_string_first_byte (input, re_string_cur_idx (input)))
2073     {
2074       token->type = CHARACTER;
2075       return 1;
2076     }
2077 #endif /* RE_ENABLE_I18N */
2078 
2079   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2080       && re_string_cur_idx (input) + 1 < re_string_length (input))
2081     {
2082       /* In this case, '\' escape a character.  */
2083       unsigned char c2;
2084       re_string_skip_bytes (input, 1);
2085       c2 = re_string_peek_byte (input, 0);
2086       token->opr.c = c2;
2087       token->type = CHARACTER;
2088       return 1;
2089     }
2090   if (c == '[') /* '[' is a special char in a bracket exps.  */
2091     {
2092       unsigned char c2;
2093       int token_len;
2094       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2095 	c2 = re_string_peek_byte (input, 1);
2096       else
2097 	c2 = 0;
2098       token->opr.c = c2;
2099       token_len = 2;
2100       switch (c2)
2101 	{
2102 	case '.':
2103 	  token->type = OP_OPEN_COLL_ELEM;
2104 	  break;
2105 	case '=':
2106 	  token->type = OP_OPEN_EQUIV_CLASS;
2107 	  break;
2108 	case ':':
2109 	  if (syntax & RE_CHAR_CLASSES)
2110 	    {
2111 	      token->type = OP_OPEN_CHAR_CLASS;
2112 	      break;
2113 	    }
2114 	  /* else fall through.  */
2115 	default:
2116 	  token->type = CHARACTER;
2117 	  token->opr.c = c;
2118 	  token_len = 1;
2119 	  break;
2120 	}
2121       return token_len;
2122     }
2123   switch (c)
2124     {
2125     case '-':
2126       token->type = OP_CHARSET_RANGE;
2127       break;
2128     case ']':
2129       token->type = OP_CLOSE_BRACKET;
2130       break;
2131     case '^':
2132       token->type = OP_NON_MATCH_LIST;
2133       break;
2134     default:
2135       token->type = CHARACTER;
2136     }
2137   return 1;
2138 }
2139 
2140 /* Functions for parser.  */
2141 
2142 /* Entry point of the parser.
2143    Parse the regular expression REGEXP and return the structure tree.
2144    If an error occurs, ERR is set by error code, and return NULL.
2145    This function build the following tree, from regular expression <reg_exp>:
2146 	   CAT
2147 	   / \
2148 	  /   \
2149    <reg_exp>  EOR
2150 
2151    CAT means concatenation.
2152    EOR means end of regular expression.  */
2153 
2154 static bin_tree_t *
2155 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2156        reg_errcode_t *err)
2157 {
2158   re_dfa_t *dfa = preg->buffer;
2159   bin_tree_t *tree, *eor, *root;
2160   re_token_t current_token;
2161   dfa->syntax = syntax;
2162   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2163   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2164   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2165     return NULL;
2166   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2167   if (tree != NULL)
2168     root = create_tree (dfa, tree, eor, CONCAT);
2169   else
2170     root = eor;
2171   if (BE (eor == NULL || root == NULL, 0))
2172     {
2173       *err = REG_ESPACE;
2174       return NULL;
2175     }
2176   return root;
2177 }
2178 
2179 /* This function build the following tree, from regular expression
2180    <branch1>|<branch2>:
2181 	   ALT
2182 	   / \
2183 	  /   \
2184    <branch1> <branch2>
2185 
2186    ALT means alternative, which represents the operator '|'.  */
2187 
2188 static bin_tree_t *
2189 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2190 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2191 {
2192   re_dfa_t *dfa = preg->buffer;
2193   bin_tree_t *tree, *branch = NULL;
2194   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2195   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2196   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2197     return NULL;
2198 
2199   while (token->type == OP_ALT)
2200     {
2201       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2202       if (token->type != OP_ALT && token->type != END_OF_RE
2203 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2204 	{
2205 	  bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2206 	  dfa->completed_bkref_map = initial_bkref_map;
2207 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2208 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2209 	    {
2210 	      if (tree != NULL)
2211 		postorder (tree, free_tree, NULL);
2212 	      return NULL;
2213 	    }
2214 	  dfa->completed_bkref_map |= accumulated_bkref_map;
2215 	}
2216       else
2217 	branch = NULL;
2218       tree = create_tree (dfa, tree, branch, OP_ALT);
2219       if (BE (tree == NULL, 0))
2220 	{
2221 	  *err = REG_ESPACE;
2222 	  return NULL;
2223 	}
2224     }
2225   return tree;
2226 }
2227 
2228 /* This function build the following tree, from regular expression
2229    <exp1><exp2>:
2230 	CAT
2231 	/ \
2232        /   \
2233    <exp1> <exp2>
2234 
2235    CAT means concatenation.  */
2236 
2237 static bin_tree_t *
2238 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2239 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2240 {
2241   bin_tree_t *tree, *expr;
2242   re_dfa_t *dfa = preg->buffer;
2243   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2244   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2245     return NULL;
2246 
2247   while (token->type != OP_ALT && token->type != END_OF_RE
2248 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2249     {
2250       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2251       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2252 	{
2253 	  if (tree != NULL)
2254 	    postorder (tree, free_tree, NULL);
2255 	  return NULL;
2256 	}
2257       if (tree != NULL && expr != NULL)
2258 	{
2259 	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2260 	  if (newtree == NULL)
2261 	    {
2262 	      postorder (expr, free_tree, NULL);
2263 	      postorder (tree, free_tree, NULL);
2264 	      *err = REG_ESPACE;
2265 	      return NULL;
2266 	    }
2267 	  tree = newtree;
2268 	}
2269       else if (tree == NULL)
2270 	tree = expr;
2271       /* Otherwise expr == NULL, we don't need to create new tree.  */
2272     }
2273   return tree;
2274 }
2275 
2276 /* This function build the following tree, from regular expression a*:
2277 	 *
2278 	 |
2279 	 a
2280 */
2281 
2282 static bin_tree_t *
2283 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2284 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2285 {
2286   re_dfa_t *dfa = preg->buffer;
2287   bin_tree_t *tree;
2288   switch (token->type)
2289     {
2290     case CHARACTER:
2291       tree = create_token_tree (dfa, NULL, NULL, token);
2292       if (BE (tree == NULL, 0))
2293 	{
2294 	  *err = REG_ESPACE;
2295 	  return NULL;
2296 	}
2297 #ifdef RE_ENABLE_I18N
2298       if (dfa->mb_cur_max > 1)
2299 	{
2300 	  while (!re_string_eoi (regexp)
2301 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2302 	    {
2303 	      bin_tree_t *mbc_remain;
2304 	      fetch_token (token, regexp, syntax);
2305 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2306 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2307 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2308 		{
2309 		  *err = REG_ESPACE;
2310 		  return NULL;
2311 		}
2312 	    }
2313 	}
2314 #endif
2315       break;
2316     case OP_OPEN_SUBEXP:
2317       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2318       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2319 	return NULL;
2320       break;
2321     case OP_OPEN_BRACKET:
2322       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2323       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2324 	return NULL;
2325       break;
2326     case OP_BACK_REF:
2327       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2328 	{
2329 	  *err = REG_ESUBREG;
2330 	  return NULL;
2331 	}
2332       dfa->used_bkref_map |= 1 << token->opr.idx;
2333       tree = create_token_tree (dfa, NULL, NULL, token);
2334       if (BE (tree == NULL, 0))
2335 	{
2336 	  *err = REG_ESPACE;
2337 	  return NULL;
2338 	}
2339       ++dfa->nbackref;
2340       dfa->has_mb_node = 1;
2341       break;
2342     case OP_OPEN_DUP_NUM:
2343       if (syntax & RE_CONTEXT_INVALID_DUP)
2344 	{
2345 	  *err = REG_BADRPT;
2346 	  return NULL;
2347 	}
2348       /* FALLTHROUGH */
2349     case OP_DUP_ASTERISK:
2350     case OP_DUP_PLUS:
2351     case OP_DUP_QUESTION:
2352       if (syntax & RE_CONTEXT_INVALID_OPS)
2353 	{
2354 	  *err = REG_BADRPT;
2355 	  return NULL;
2356 	}
2357       else if (syntax & RE_CONTEXT_INDEP_OPS)
2358 	{
2359 	  fetch_token (token, regexp, syntax);
2360 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2361 	}
2362       /* else fall through  */
2363     case OP_CLOSE_SUBEXP:
2364       if ((token->type == OP_CLOSE_SUBEXP) &&
2365 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2366 	{
2367 	  *err = REG_ERPAREN;
2368 	  return NULL;
2369 	}
2370       /* else fall through  */
2371     case OP_CLOSE_DUP_NUM:
2372       /* We treat it as a normal character.  */
2373 
2374       /* Then we can these characters as normal characters.  */
2375       token->type = CHARACTER;
2376       /* mb_partial and word_char bits should be initialized already
2377 	 by peek_token.  */
2378       tree = create_token_tree (dfa, NULL, NULL, token);
2379       if (BE (tree == NULL, 0))
2380 	{
2381 	  *err = REG_ESPACE;
2382 	  return NULL;
2383 	}
2384       break;
2385     case ANCHOR:
2386       if ((token->opr.ctx_type
2387 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2388 	  && dfa->word_ops_used == 0)
2389 	init_word_char (dfa);
2390       if (token->opr.ctx_type == WORD_DELIM
2391 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2392 	{
2393 	  bin_tree_t *tree_first, *tree_last;
2394 	  if (token->opr.ctx_type == WORD_DELIM)
2395 	    {
2396 	      token->opr.ctx_type = WORD_FIRST;
2397 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2398 	      token->opr.ctx_type = WORD_LAST;
2399 	    }
2400 	  else
2401 	    {
2402 	      token->opr.ctx_type = INSIDE_WORD;
2403 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2404 	      token->opr.ctx_type = INSIDE_NOTWORD;
2405 	    }
2406 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2407 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2408 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2409 	    {
2410 	      *err = REG_ESPACE;
2411 	      return NULL;
2412 	    }
2413 	}
2414       else
2415 	{
2416 	  tree = create_token_tree (dfa, NULL, NULL, token);
2417 	  if (BE (tree == NULL, 0))
2418 	    {
2419 	      *err = REG_ESPACE;
2420 	      return NULL;
2421 	    }
2422 	}
2423       /* We must return here, since ANCHORs can't be followed
2424 	 by repetition operators.
2425 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2426 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2427       fetch_token (token, regexp, syntax);
2428       return tree;
2429     case OP_PERIOD:
2430       tree = create_token_tree (dfa, NULL, NULL, token);
2431       if (BE (tree == NULL, 0))
2432 	{
2433 	  *err = REG_ESPACE;
2434 	  return NULL;
2435 	}
2436       if (dfa->mb_cur_max > 1)
2437 	dfa->has_mb_node = 1;
2438       break;
2439     case OP_WORD:
2440     case OP_NOTWORD:
2441       tree = build_charclass_op (dfa, regexp->trans,
2442 				 "alnum",
2443 				 "_",
2444 				 token->type == OP_NOTWORD, err);
2445       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2446 	return NULL;
2447       break;
2448     case OP_SPACE:
2449     case OP_NOTSPACE:
2450       tree = build_charclass_op (dfa, regexp->trans,
2451 				 "space",
2452 				 "",
2453 				 token->type == OP_NOTSPACE, err);
2454       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2455 	return NULL;
2456       break;
2457     case OP_ALT:
2458     case END_OF_RE:
2459       return NULL;
2460     case BACK_SLASH:
2461       *err = REG_EESCAPE;
2462       return NULL;
2463     default:
2464       /* Must not happen?  */
2465 #ifdef DEBUG
2466       assert (0);
2467 #endif
2468       return NULL;
2469     }
2470   fetch_token (token, regexp, syntax);
2471 
2472   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2473 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2474     {
2475       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2476 					   syntax, err);
2477       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2478 	{
2479 	  if (tree != NULL)
2480 	    postorder (tree, free_tree, NULL);
2481 	  return NULL;
2482 	}
2483       tree = dup_tree;
2484       /* In BRE consecutive duplications are not allowed.  */
2485       if ((syntax & RE_CONTEXT_INVALID_DUP)
2486 	  && (token->type == OP_DUP_ASTERISK
2487 	      || token->type == OP_OPEN_DUP_NUM))
2488 	{
2489 	  if (tree != NULL)
2490 	    postorder (tree, free_tree, NULL);
2491 	  *err = REG_BADRPT;
2492 	  return NULL;
2493 	}
2494     }
2495 
2496   return tree;
2497 }
2498 
2499 /* This function build the following tree, from regular expression
2500    (<reg_exp>):
2501 	 SUBEXP
2502 	    |
2503 	<reg_exp>
2504 */
2505 
2506 static bin_tree_t *
2507 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2508 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2509 {
2510   re_dfa_t *dfa = preg->buffer;
2511   bin_tree_t *tree;
2512   size_t cur_nsub;
2513   cur_nsub = preg->re_nsub++;
2514 
2515   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2516 
2517   /* The subexpression may be a null string.  */
2518   if (token->type == OP_CLOSE_SUBEXP)
2519     tree = NULL;
2520   else
2521     {
2522       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2523       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2524 	{
2525 	  if (tree != NULL)
2526 	    postorder (tree, free_tree, NULL);
2527 	  *err = REG_EPAREN;
2528 	}
2529       if (BE (*err != REG_NOERROR, 0))
2530 	return NULL;
2531     }
2532 
2533   if (cur_nsub <= '9' - '1')
2534     dfa->completed_bkref_map |= 1 << cur_nsub;
2535 
2536   tree = create_tree (dfa, tree, NULL, SUBEXP);
2537   if (BE (tree == NULL, 0))
2538     {
2539       *err = REG_ESPACE;
2540       return NULL;
2541     }
2542   tree->token.opr.idx = cur_nsub;
2543   return tree;
2544 }
2545 
2546 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2547 
2548 static bin_tree_t *
2549 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2550 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2551 {
2552   bin_tree_t *tree = NULL, *old_tree = NULL;
2553   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2554   re_token_t start_token = *token;
2555 
2556   if (token->type == OP_OPEN_DUP_NUM)
2557     {
2558       end = 0;
2559       start = fetch_number (regexp, token, syntax);
2560       if (start == REG_MISSING)
2561 	{
2562 	  if (token->type == CHARACTER && token->opr.c == ',')
2563 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2564 	  else
2565 	    {
2566 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2567 	      return NULL;
2568 	    }
2569 	}
2570       if (BE (start != REG_ERROR, 1))
2571 	{
2572 	  /* We treat "{n}" as "{n,n}".  */
2573 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2574 		 : ((token->type == CHARACTER && token->opr.c == ',')
2575 		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2576 	}
2577       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2578 	{
2579 	  /* Invalid sequence.  */
2580 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2581 	    {
2582 	      if (token->type == END_OF_RE)
2583 		*err = REG_EBRACE;
2584 	      else
2585 		*err = REG_BADBR;
2586 
2587 	      return NULL;
2588 	    }
2589 
2590 	  /* If the syntax bit is set, rollback.  */
2591 	  re_string_set_index (regexp, start_idx);
2592 	  *token = start_token;
2593 	  token->type = CHARACTER;
2594 	  /* mb_partial and word_char bits should be already initialized by
2595 	     peek_token.  */
2596 	  return elem;
2597 	}
2598 
2599       if (BE ((end != REG_MISSING && start > end)
2600 	      || token->type != OP_CLOSE_DUP_NUM, 0))
2601 	{
2602 	  /* First number greater than second.  */
2603 	  *err = REG_BADBR;
2604 	  return NULL;
2605 	}
2606 
2607       if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2608 	{
2609 	  *err = REG_ESIZE;
2610 	  return NULL;
2611 	}
2612     }
2613   else
2614     {
2615       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2616       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2617     }
2618 
2619   fetch_token (token, regexp, syntax);
2620 
2621   if (BE (elem == NULL, 0))
2622     return NULL;
2623   if (BE (start == 0 && end == 0, 0))
2624     {
2625       postorder (elem, free_tree, NULL);
2626       return NULL;
2627     }
2628 
2629   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2630   if (BE (start > 0, 0))
2631     {
2632       tree = elem;
2633       for (i = 2; i <= start; ++i)
2634 	{
2635 	  elem = duplicate_tree (elem, dfa);
2636 	  tree = create_tree (dfa, tree, elem, CONCAT);
2637 	  if (BE (elem == NULL || tree == NULL, 0))
2638 	    goto parse_dup_op_espace;
2639 	}
2640 
2641       if (start == end)
2642 	return tree;
2643 
2644       /* Duplicate ELEM before it is marked optional.  */
2645       elem = duplicate_tree (elem, dfa);
2646       if (BE (elem == NULL, 0))
2647         goto parse_dup_op_espace;
2648       old_tree = tree;
2649     }
2650   else
2651     old_tree = NULL;
2652 
2653   if (elem->token.type == SUBEXP)
2654     {
2655       uintptr_t subidx = elem->token.opr.idx;
2656       postorder (elem, mark_opt_subexp, (void *) subidx);
2657     }
2658 
2659   tree = create_tree (dfa, elem, NULL,
2660 		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2661   if (BE (tree == NULL, 0))
2662     goto parse_dup_op_espace;
2663 
2664 /* From gnulib's "intprops.h":
2665    True if the arithmetic type T is signed.  */
2666 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2667 
2668   /* This loop is actually executed only when end != REG_MISSING,
2669      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2670      already created the start+1-th copy.  */
2671   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2672     for (i = start + 2; i <= end; ++i)
2673       {
2674 	elem = duplicate_tree (elem, dfa);
2675 	tree = create_tree (dfa, tree, elem, CONCAT);
2676 	if (BE (elem == NULL || tree == NULL, 0))
2677 	  goto parse_dup_op_espace;
2678 
2679 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2680 	if (BE (tree == NULL, 0))
2681 	  goto parse_dup_op_espace;
2682       }
2683 
2684   if (old_tree)
2685     tree = create_tree (dfa, old_tree, tree, CONCAT);
2686 
2687   return tree;
2688 
2689  parse_dup_op_espace:
2690   *err = REG_ESPACE;
2691   return NULL;
2692 }
2693 
2694 /* Size of the names for collating symbol/equivalence_class/character_class.
2695    I'm not sure, but maybe enough.  */
2696 #define BRACKET_NAME_BUF_SIZE 32
2697 
2698 #ifndef _LIBC
2699   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2700      Build the range expression which starts from START_ELEM, and ends
2701      at END_ELEM.  The result are written to MBCSET and SBCSET.
2702      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2703      mbcset->range_ends, is a pointer argument since we may
2704      update it.  */
2705 
2706 static reg_errcode_t
2707 internal_function
2708 # ifdef RE_ENABLE_I18N
2709 build_range_exp (const reg_syntax_t syntax,
2710                  bitset_t sbcset,
2711                  re_charset_t *mbcset,
2712                  Idx *range_alloc,
2713                  const bracket_elem_t *start_elem,
2714                  const bracket_elem_t *end_elem)
2715 # else /* not RE_ENABLE_I18N */
2716 build_range_exp (const reg_syntax_t syntax,
2717                  bitset_t sbcset,
2718                  const bracket_elem_t *start_elem,
2719                  const bracket_elem_t *end_elem)
2720 # endif /* not RE_ENABLE_I18N */
2721 {
2722   unsigned int start_ch, end_ch;
2723   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2724   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2725 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2726 	  0))
2727     return REG_ERANGE;
2728 
2729   /* We can handle no multi character collating elements without libc
2730      support.  */
2731   if (BE ((start_elem->type == COLL_SYM
2732 	   && strlen ((char *) start_elem->opr.name) > 1)
2733 	  || (end_elem->type == COLL_SYM
2734 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2735     return REG_ECOLLATE;
2736 
2737 # ifdef RE_ENABLE_I18N
2738   {
2739     wchar_t wc;
2740     wint_t start_wc;
2741     wint_t end_wc;
2742 
2743     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2744 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2745 		   : 0));
2746     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2747 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2748 		 : 0));
2749     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2750 		? __btowc (start_ch) : start_elem->opr.wch);
2751     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2752 	      ? __btowc (end_ch) : end_elem->opr.wch);
2753     if (start_wc == WEOF || end_wc == WEOF)
2754       return REG_ECOLLATE;
2755     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2756       return REG_ERANGE;
2757 
2758     /* Got valid collation sequence values, add them as a new entry.
2759        However, for !_LIBC we have no collation elements: if the
2760        character set is single byte, the single byte character set
2761        that we build below suffices.  parse_bracket_exp passes
2762        no MBCSET if dfa->mb_cur_max == 1.  */
2763     if (mbcset)
2764       {
2765 	/* Check the space of the arrays.  */
2766 	if (BE (*range_alloc == mbcset->nranges, 0))
2767 	  {
2768 	    /* There is not enough space, need realloc.  */
2769 	    wchar_t *new_array_start, *new_array_end;
2770 	    Idx new_nranges;
2771 
2772 	    /* +1 in case of mbcset->nranges is 0.  */
2773 	    new_nranges = 2 * mbcset->nranges + 1;
2774 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2775 	       are NULL if *range_alloc == 0.  */
2776 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2777 					  new_nranges);
2778 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2779 					new_nranges);
2780 
2781 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2782 	      return REG_ESPACE;
2783 
2784 	    mbcset->range_starts = new_array_start;
2785 	    mbcset->range_ends = new_array_end;
2786 	    *range_alloc = new_nranges;
2787 	  }
2788 
2789 	mbcset->range_starts[mbcset->nranges] = start_wc;
2790 	mbcset->range_ends[mbcset->nranges++] = end_wc;
2791       }
2792 
2793     /* Build the table for single byte characters.  */
2794     for (wc = 0; wc < SBC_MAX; ++wc)
2795       {
2796 	if (start_wc <= wc && wc <= end_wc)
2797 	  bitset_set (sbcset, wc);
2798       }
2799   }
2800 # else /* not RE_ENABLE_I18N */
2801   {
2802     unsigned int ch;
2803     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2804 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2805 		   : 0));
2806     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2807 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2808 		 : 0));
2809     if (start_ch > end_ch)
2810       return REG_ERANGE;
2811     /* Build the table for single byte characters.  */
2812     for (ch = 0; ch < SBC_MAX; ++ch)
2813       if (start_ch <= ch  && ch <= end_ch)
2814 	bitset_set (sbcset, ch);
2815   }
2816 # endif /* not RE_ENABLE_I18N */
2817   return REG_NOERROR;
2818 }
2819 #endif /* not _LIBC */
2820 
2821 #ifndef _LIBC
2822 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2823    Build the collating element which is represented by NAME.
2824    The result are written to MBCSET and SBCSET.
2825    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2826    pointer argument since we may update it.  */
2827 
2828 static reg_errcode_t
2829 internal_function
2830 # ifdef RE_ENABLE_I18N
2831 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2832 			Idx *coll_sym_alloc, const unsigned char *name)
2833 # else /* not RE_ENABLE_I18N */
2834 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2835 # endif /* not RE_ENABLE_I18N */
2836 {
2837   size_t name_len = strlen ((const char *) name);
2838   if (BE (name_len != 1, 0))
2839     return REG_ECOLLATE;
2840   else
2841     {
2842       bitset_set (sbcset, name[0]);
2843       return REG_NOERROR;
2844     }
2845 }
2846 #endif /* not _LIBC */
2847 
2848 /* This function parse bracket expression like "[abc]", "[a-c]",
2849    "[[.a-a.]]" etc.  */
2850 
2851 static bin_tree_t *
2852 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2853 		   reg_syntax_t syntax, reg_errcode_t *err)
2854 {
2855 #ifdef _LIBC
2856   const unsigned char *collseqmb;
2857   const char *collseqwc;
2858   uint32_t nrules;
2859   int32_t table_size;
2860   const int32_t *symb_table;
2861   const unsigned char *extra;
2862 
2863   /* Local function for parse_bracket_exp used in _LIBC environment.
2864      Seek the collating symbol entry corresponding to NAME.
2865      Return the index of the symbol in the SYMB_TABLE,
2866      or -1 if not found.  */
2867 
2868   auto inline int32_t
2869   __attribute__ ((always_inline))
2870   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2871     {
2872       int32_t elem;
2873 
2874       for (elem = 0; elem < table_size; elem++)
2875 	if (symb_table[2 * elem] != 0)
2876 	  {
2877 	    int32_t idx = symb_table[2 * elem + 1];
2878 	    /* Skip the name of collating element name.  */
2879 	    idx += 1 + extra[idx];
2880 	    if (/* Compare the length of the name.  */
2881 		name_len == extra[idx]
2882 		/* Compare the name.  */
2883 		&& memcmp (name, &extra[idx + 1], name_len) == 0)
2884 	      /* Yep, this is the entry.  */
2885 	      return elem;
2886 	  }
2887       return -1;
2888     }
2889 
2890   /* Local function for parse_bracket_exp used in _LIBC environment.
2891      Look up the collation sequence value of BR_ELEM.
2892      Return the value if succeeded, UINT_MAX otherwise.  */
2893 
2894   auto inline unsigned int
2895   __attribute__ ((always_inline))
2896   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2897     {
2898       if (br_elem->type == SB_CHAR)
2899 	{
2900 	  /*
2901 	  if (MB_CUR_MAX == 1)
2902 	  */
2903 	  if (nrules == 0)
2904 	    return collseqmb[br_elem->opr.ch];
2905 	  else
2906 	    {
2907 	      wint_t wc = __btowc (br_elem->opr.ch);
2908 	      return __collseq_table_lookup (collseqwc, wc);
2909 	    }
2910 	}
2911       else if (br_elem->type == MB_CHAR)
2912 	{
2913 	  if (nrules != 0)
2914 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2915 	}
2916       else if (br_elem->type == COLL_SYM)
2917 	{
2918 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2919 	  if (nrules != 0)
2920 	    {
2921 	      int32_t elem, idx;
2922 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2923 						  sym_name_len);
2924 	      if (elem != -1)
2925 		{
2926 		  /* We found the entry.  */
2927 		  idx = symb_table[2 * elem + 1];
2928 		  /* Skip the name of collating element name.  */
2929 		  idx += 1 + extra[idx];
2930 		  /* Skip the byte sequence of the collating element.  */
2931 		  idx += 1 + extra[idx];
2932 		  /* Adjust for the alignment.  */
2933 		  idx = (idx + 3) & ~3;
2934 		  /* Skip the multibyte collation sequence value.  */
2935 		  idx += sizeof (unsigned int);
2936 		  /* Skip the wide char sequence of the collating element.  */
2937 		  idx += sizeof (unsigned int) *
2938 		    (1 + *(unsigned int *) (extra + idx));
2939 		  /* Return the collation sequence value.  */
2940 		  return *(unsigned int *) (extra + idx);
2941 		}
2942 	      else if (sym_name_len == 1)
2943 		{
2944 		  /* No valid character.  Match it as a single byte
2945 		     character.  */
2946 		  return collseqmb[br_elem->opr.name[0]];
2947 		}
2948 	    }
2949 	  else if (sym_name_len == 1)
2950 	    return collseqmb[br_elem->opr.name[0]];
2951 	}
2952       return UINT_MAX;
2953     }
2954 
2955   /* Local function for parse_bracket_exp used in _LIBC environment.
2956      Build the range expression which starts from START_ELEM, and ends
2957      at END_ELEM.  The result are written to MBCSET and SBCSET.
2958      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2959      mbcset->range_ends, is a pointer argument since we may
2960      update it.  */
2961 
2962   auto inline reg_errcode_t
2963   __attribute__ ((always_inline))
2964   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2965 		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2966     {
2967       unsigned int ch;
2968       uint32_t start_collseq;
2969       uint32_t end_collseq;
2970 
2971       /* Equivalence Classes and Character Classes can't be a range
2972 	 start/end.  */
2973       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2974 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2975 	      0))
2976 	return REG_ERANGE;
2977 
2978       /* FIXME: Implement rational ranges here, too.  */
2979       start_collseq = lookup_collation_sequence_value (start_elem);
2980       end_collseq = lookup_collation_sequence_value (end_elem);
2981       /* Check start/end collation sequence values.  */
2982       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2983 	return REG_ECOLLATE;
2984       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2985 	return REG_ERANGE;
2986 
2987       /* Got valid collation sequence values, add them as a new entry.
2988 	 However, if we have no collation elements, and the character set
2989 	 is single byte, the single byte character set that we
2990 	 build below suffices. */
2991       if (nrules > 0 || dfa->mb_cur_max > 1)
2992 	{
2993 	  /* Check the space of the arrays.  */
2994 	  if (BE (*range_alloc == mbcset->nranges, 0))
2995 	    {
2996 	      /* There is not enough space, need realloc.  */
2997 	      uint32_t *new_array_start;
2998 	      uint32_t *new_array_end;
2999 	      Idx new_nranges;
3000 
3001 	      /* +1 in case of mbcset->nranges is 0.  */
3002 	      new_nranges = 2 * mbcset->nranges + 1;
3003 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3004 					    new_nranges);
3005 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3006 					  new_nranges);
3007 
3008 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3009 		return REG_ESPACE;
3010 
3011 	      mbcset->range_starts = new_array_start;
3012 	      mbcset->range_ends = new_array_end;
3013 	      *range_alloc = new_nranges;
3014 	    }
3015 
3016 	  mbcset->range_starts[mbcset->nranges] = start_collseq;
3017 	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
3018 	}
3019 
3020       /* Build the table for single byte characters.  */
3021       for (ch = 0; ch < SBC_MAX; ch++)
3022 	{
3023 	  uint32_t ch_collseq;
3024 	  /*
3025 	  if (MB_CUR_MAX == 1)
3026 	  */
3027 	  if (nrules == 0)
3028 	    ch_collseq = collseqmb[ch];
3029 	  else
3030 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3031 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3032 	    bitset_set (sbcset, ch);
3033 	}
3034       return REG_NOERROR;
3035     }
3036 
3037   /* Local function for parse_bracket_exp used in _LIBC environment.
3038      Build the collating element which is represented by NAME.
3039      The result are written to MBCSET and SBCSET.
3040      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3041      pointer argument since we may update it.  */
3042 
3043   auto inline reg_errcode_t
3044   __attribute__ ((always_inline))
3045   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3046 			  Idx *coll_sym_alloc, const unsigned char *name)
3047     {
3048       int32_t elem, idx;
3049       size_t name_len = strlen ((const char *) name);
3050       if (nrules != 0)
3051 	{
3052 	  elem = seek_collating_symbol_entry (name, name_len);
3053 	  if (elem != -1)
3054 	    {
3055 	      /* We found the entry.  */
3056 	      idx = symb_table[2 * elem + 1];
3057 	      /* Skip the name of collating element name.  */
3058 	      idx += 1 + extra[idx];
3059 	    }
3060 	  else if (name_len == 1)
3061 	    {
3062 	      /* No valid character, treat it as a normal
3063 		 character.  */
3064 	      bitset_set (sbcset, name[0]);
3065 	      return REG_NOERROR;
3066 	    }
3067 	  else
3068 	    return REG_ECOLLATE;
3069 
3070 	  /* Got valid collation sequence, add it as a new entry.  */
3071 	  /* Check the space of the arrays.  */
3072 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3073 	    {
3074 	      /* Not enough, realloc it.  */
3075 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3076 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3077 	      /* Use realloc since mbcset->coll_syms is NULL
3078 		 if *alloc == 0.  */
3079 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3080 						   new_coll_sym_alloc);
3081 	      if (BE (new_coll_syms == NULL, 0))
3082 		return REG_ESPACE;
3083 	      mbcset->coll_syms = new_coll_syms;
3084 	      *coll_sym_alloc = new_coll_sym_alloc;
3085 	    }
3086 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3087 	  return REG_NOERROR;
3088 	}
3089       else
3090 	{
3091 	  if (BE (name_len != 1, 0))
3092 	    return REG_ECOLLATE;
3093 	  else
3094 	    {
3095 	      bitset_set (sbcset, name[0]);
3096 	      return REG_NOERROR;
3097 	    }
3098 	}
3099     }
3100 #endif
3101 
3102   re_token_t br_token;
3103   re_bitset_ptr_t sbcset;
3104 #ifdef RE_ENABLE_I18N
3105   re_charset_t *mbcset;
3106   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3107   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3108 #endif /* not RE_ENABLE_I18N */
3109   bool non_match = false;
3110   bin_tree_t *work_tree;
3111   int token_len;
3112   bool first_round = true;
3113 #ifdef _LIBC
3114   collseqmb = (const unsigned char *)
3115     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3116   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3117   if (nrules)
3118     {
3119       /*
3120       if (MB_CUR_MAX > 1)
3121       */
3122       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3123       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3124       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3125 						  _NL_COLLATE_SYMB_TABLEMB);
3126       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3127 						   _NL_COLLATE_SYMB_EXTRAMB);
3128     }
3129 #endif
3130   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3131 #ifdef RE_ENABLE_I18N
3132   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3133 #endif /* RE_ENABLE_I18N */
3134 #ifdef RE_ENABLE_I18N
3135   if (BE (sbcset == NULL || mbcset == NULL, 0))
3136 #else
3137   if (BE (sbcset == NULL, 0))
3138 #endif /* RE_ENABLE_I18N */
3139     {
3140       re_free (sbcset);
3141 #ifdef RE_ENABLE_I18N
3142       re_free (mbcset);
3143 #endif
3144       *err = REG_ESPACE;
3145       return NULL;
3146     }
3147 
3148   token_len = peek_token_bracket (token, regexp, syntax);
3149   if (BE (token->type == END_OF_RE, 0))
3150     {
3151       *err = REG_BADPAT;
3152       goto parse_bracket_exp_free_return;
3153     }
3154   if (token->type == OP_NON_MATCH_LIST)
3155     {
3156 #ifdef RE_ENABLE_I18N
3157       mbcset->non_match = 1;
3158 #endif /* not RE_ENABLE_I18N */
3159       non_match = true;
3160       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3161 	bitset_set (sbcset, '\n');
3162       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3163       token_len = peek_token_bracket (token, regexp, syntax);
3164       if (BE (token->type == END_OF_RE, 0))
3165 	{
3166 	  *err = REG_BADPAT;
3167 	  goto parse_bracket_exp_free_return;
3168 	}
3169     }
3170 
3171   /* We treat the first ']' as a normal character.  */
3172   if (token->type == OP_CLOSE_BRACKET)
3173     token->type = CHARACTER;
3174 
3175   while (1)
3176     {
3177       bracket_elem_t start_elem, end_elem;
3178       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3179       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3180       reg_errcode_t ret;
3181       int token_len2 = 0;
3182       bool is_range_exp = false;
3183       re_token_t token2;
3184 
3185       start_elem.opr.name = start_name_buf;
3186       start_elem.type = COLL_SYM;
3187       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3188 				   syntax, first_round);
3189       if (BE (ret != REG_NOERROR, 0))
3190 	{
3191 	  *err = ret;
3192 	  goto parse_bracket_exp_free_return;
3193 	}
3194       first_round = false;
3195 
3196       /* Get information about the next token.  We need it in any case.  */
3197       token_len = peek_token_bracket (token, regexp, syntax);
3198 
3199       /* Do not check for ranges if we know they are not allowed.  */
3200       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3201 	{
3202 	  if (BE (token->type == END_OF_RE, 0))
3203 	    {
3204 	      *err = REG_EBRACK;
3205 	      goto parse_bracket_exp_free_return;
3206 	    }
3207 	  if (token->type == OP_CHARSET_RANGE)
3208 	    {
3209 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3210 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3211 	      if (BE (token2.type == END_OF_RE, 0))
3212 		{
3213 		  *err = REG_EBRACK;
3214 		  goto parse_bracket_exp_free_return;
3215 		}
3216 	      if (token2.type == OP_CLOSE_BRACKET)
3217 		{
3218 		  /* We treat the last '-' as a normal character.  */
3219 		  re_string_skip_bytes (regexp, -token_len);
3220 		  token->type = CHARACTER;
3221 		}
3222 	      else
3223 		is_range_exp = true;
3224 	    }
3225 	}
3226 
3227       if (is_range_exp == true)
3228 	{
3229 	  end_elem.opr.name = end_name_buf;
3230 	  end_elem.type = COLL_SYM;
3231 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3232 				       dfa, syntax, true);
3233 	  if (BE (ret != REG_NOERROR, 0))
3234 	    {
3235 	      *err = ret;
3236 	      goto parse_bracket_exp_free_return;
3237 	    }
3238 
3239 	  token_len = peek_token_bracket (token, regexp, syntax);
3240 
3241 #ifdef _LIBC
3242 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3243 				  &start_elem, &end_elem);
3244 #else
3245 # ifdef RE_ENABLE_I18N
3246 	  *err = build_range_exp (syntax, sbcset,
3247 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3248 				  &range_alloc, &start_elem, &end_elem);
3249 # else
3250 	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3251 # endif
3252 #endif /* RE_ENABLE_I18N */
3253 	  if (BE (*err != REG_NOERROR, 0))
3254 	    goto parse_bracket_exp_free_return;
3255 	}
3256       else
3257 	{
3258 	  switch (start_elem.type)
3259 	    {
3260 	    case SB_CHAR:
3261 	      bitset_set (sbcset, start_elem.opr.ch);
3262 	      break;
3263 #ifdef RE_ENABLE_I18N
3264 	    case MB_CHAR:
3265 	      /* Check whether the array has enough space.  */
3266 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3267 		{
3268 		  wchar_t *new_mbchars;
3269 		  /* Not enough, realloc it.  */
3270 		  /* +1 in case of mbcset->nmbchars is 0.  */
3271 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3272 		  /* Use realloc since array is NULL if *alloc == 0.  */
3273 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3274 					    mbchar_alloc);
3275 		  if (BE (new_mbchars == NULL, 0))
3276 		    goto parse_bracket_exp_espace;
3277 		  mbcset->mbchars = new_mbchars;
3278 		}
3279 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3280 	      break;
3281 #endif /* RE_ENABLE_I18N */
3282 	    case EQUIV_CLASS:
3283 	      *err = build_equiv_class (sbcset,
3284 #ifdef RE_ENABLE_I18N
3285 					mbcset, &equiv_class_alloc,
3286 #endif /* RE_ENABLE_I18N */
3287 					start_elem.opr.name);
3288 	      if (BE (*err != REG_NOERROR, 0))
3289 		goto parse_bracket_exp_free_return;
3290 	      break;
3291 	    case COLL_SYM:
3292 	      *err = build_collating_symbol (sbcset,
3293 #ifdef RE_ENABLE_I18N
3294 					     mbcset, &coll_sym_alloc,
3295 #endif /* RE_ENABLE_I18N */
3296 					     start_elem.opr.name);
3297 	      if (BE (*err != REG_NOERROR, 0))
3298 		goto parse_bracket_exp_free_return;
3299 	      break;
3300 	    case CHAR_CLASS:
3301 	      *err = build_charclass (regexp->trans, sbcset,
3302 #ifdef RE_ENABLE_I18N
3303 				      mbcset, &char_class_alloc,
3304 #endif /* RE_ENABLE_I18N */
3305 				      (const char *) start_elem.opr.name,
3306 				      syntax);
3307 	      if (BE (*err != REG_NOERROR, 0))
3308 	       goto parse_bracket_exp_free_return;
3309 	      break;
3310 	    default:
3311 	      assert (0);
3312 	      break;
3313 	    }
3314 	}
3315       if (BE (token->type == END_OF_RE, 0))
3316 	{
3317 	  *err = REG_EBRACK;
3318 	  goto parse_bracket_exp_free_return;
3319 	}
3320       if (token->type == OP_CLOSE_BRACKET)
3321 	break;
3322     }
3323 
3324   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3325 
3326   /* If it is non-matching list.  */
3327   if (non_match)
3328     bitset_not (sbcset);
3329 
3330 #ifdef RE_ENABLE_I18N
3331   /* Ensure only single byte characters are set.  */
3332   if (dfa->mb_cur_max > 1)
3333     bitset_mask (sbcset, dfa->sb_char);
3334 
3335   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3336       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3337 						     || mbcset->non_match)))
3338     {
3339       bin_tree_t *mbc_tree;
3340       int sbc_idx;
3341       /* Build a tree for complex bracket.  */
3342       dfa->has_mb_node = 1;
3343       br_token.type = COMPLEX_BRACKET;
3344       br_token.opr.mbcset = mbcset;
3345       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3346       if (BE (mbc_tree == NULL, 0))
3347 	goto parse_bracket_exp_espace;
3348       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3349 	if (sbcset[sbc_idx])
3350 	  break;
3351       /* If there are no bits set in sbcset, there is no point
3352 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3353       if (sbc_idx < BITSET_WORDS)
3354 	{
3355 	  /* Build a tree for simple bracket.  */
3356 	  br_token.type = SIMPLE_BRACKET;
3357 	  br_token.opr.sbcset = sbcset;
3358 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3359 	  if (BE (work_tree == NULL, 0))
3360 	    goto parse_bracket_exp_espace;
3361 
3362 	  /* Then join them by ALT node.  */
3363 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3364 	  if (BE (work_tree == NULL, 0))
3365 	    goto parse_bracket_exp_espace;
3366 	}
3367       else
3368 	{
3369 	  re_free (sbcset);
3370 	  work_tree = mbc_tree;
3371 	}
3372     }
3373   else
3374 #endif /* not RE_ENABLE_I18N */
3375     {
3376 #ifdef RE_ENABLE_I18N
3377       free_charset (mbcset);
3378 #endif
3379       /* Build a tree for simple bracket.  */
3380       br_token.type = SIMPLE_BRACKET;
3381       br_token.opr.sbcset = sbcset;
3382       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3383       if (BE (work_tree == NULL, 0))
3384 	goto parse_bracket_exp_espace;
3385     }
3386   return work_tree;
3387 
3388  parse_bracket_exp_espace:
3389   *err = REG_ESPACE;
3390  parse_bracket_exp_free_return:
3391   re_free (sbcset);
3392 #ifdef RE_ENABLE_I18N
3393   free_charset (mbcset);
3394 #endif /* RE_ENABLE_I18N */
3395   return NULL;
3396 }
3397 
3398 /* Parse an element in the bracket expression.  */
3399 
3400 static reg_errcode_t
3401 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3402 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3403 		       reg_syntax_t syntax, bool accept_hyphen)
3404 {
3405 #ifdef RE_ENABLE_I18N
3406   int cur_char_size;
3407   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3408   if (cur_char_size > 1)
3409     {
3410       elem->type = MB_CHAR;
3411       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3412       re_string_skip_bytes (regexp, cur_char_size);
3413       return REG_NOERROR;
3414     }
3415 #endif /* RE_ENABLE_I18N */
3416   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3417   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3418       || token->type == OP_OPEN_EQUIV_CLASS)
3419     return parse_bracket_symbol (elem, regexp, token);
3420   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3421     {
3422       /* A '-' must only appear as anything but a range indicator before
3423 	 the closing bracket.  Everything else is an error.  */
3424       re_token_t token2;
3425       (void) peek_token_bracket (&token2, regexp, syntax);
3426       if (token2.type != OP_CLOSE_BRACKET)
3427 	/* The actual error value is not standardized since this whole
3428 	   case is undefined.  But ERANGE makes good sense.  */
3429 	return REG_ERANGE;
3430     }
3431   elem->type = SB_CHAR;
3432   elem->opr.ch = token->opr.c;
3433   return REG_NOERROR;
3434 }
3435 
3436 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3437    such as [:<character_class>:], [.<collating_element>.], and
3438    [=<equivalent_class>=].  */
3439 
3440 static reg_errcode_t
3441 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3442 		      re_token_t *token)
3443 {
3444   unsigned char ch, delim = token->opr.c;
3445   int i = 0;
3446   if (re_string_eoi(regexp))
3447     return REG_EBRACK;
3448   for (;; ++i)
3449     {
3450       if (i >= BRACKET_NAME_BUF_SIZE)
3451 	return REG_EBRACK;
3452       if (token->type == OP_OPEN_CHAR_CLASS)
3453 	ch = re_string_fetch_byte_case (regexp);
3454       else
3455 	ch = re_string_fetch_byte (regexp);
3456       if (re_string_eoi(regexp))
3457 	return REG_EBRACK;
3458       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3459 	break;
3460       elem->opr.name[i] = ch;
3461     }
3462   re_string_skip_bytes (regexp, 1);
3463   elem->opr.name[i] = '\0';
3464   switch (token->type)
3465     {
3466     case OP_OPEN_COLL_ELEM:
3467       elem->type = COLL_SYM;
3468       break;
3469     case OP_OPEN_EQUIV_CLASS:
3470       elem->type = EQUIV_CLASS;
3471       break;
3472     case OP_OPEN_CHAR_CLASS:
3473       elem->type = CHAR_CLASS;
3474       break;
3475     default:
3476       break;
3477     }
3478   return REG_NOERROR;
3479 }
3480 
3481   /* Helper function for parse_bracket_exp.
3482      Build the equivalence class which is represented by NAME.
3483      The result are written to MBCSET and SBCSET.
3484      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3485      is a pointer argument since we may update it.  */
3486 
3487 static reg_errcode_t
3488 #ifdef RE_ENABLE_I18N
3489 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3490 		   Idx *equiv_class_alloc, const unsigned char *name)
3491 #else /* not RE_ENABLE_I18N */
3492 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3493 #endif /* not RE_ENABLE_I18N */
3494 {
3495 #ifdef _LIBC
3496   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3497   if (nrules != 0)
3498     {
3499       const int32_t *table, *indirect;
3500       const unsigned char *weights, *extra, *cp;
3501       unsigned char char_buf[2];
3502       int32_t idx1, idx2;
3503       unsigned int ch;
3504       size_t len;
3505       /* Calculate the index for equivalence class.  */
3506       cp = name;
3507       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3508       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3509 					       _NL_COLLATE_WEIGHTMB);
3510       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3511 						   _NL_COLLATE_EXTRAMB);
3512       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3513 						_NL_COLLATE_INDIRECTMB);
3514       idx1 = findidx (table, indirect, extra, &cp, -1);
3515       if (BE (idx1 == 0 || *cp != '\0', 0))
3516 	/* This isn't a valid character.  */
3517 	return REG_ECOLLATE;
3518 
3519       /* Build single byte matching table for this equivalence class.  */
3520       len = weights[idx1 & 0xffffff];
3521       for (ch = 0; ch < SBC_MAX; ++ch)
3522 	{
3523 	  char_buf[0] = ch;
3524 	  cp = char_buf;
3525 	  idx2 = findidx (table, indirect, extra, &cp, 1);
3526 /*
3527 	  idx2 = table[ch];
3528 */
3529 	  if (idx2 == 0)
3530 	    /* This isn't a valid character.  */
3531 	    continue;
3532 	  /* Compare only if the length matches and the collation rule
3533 	     index is the same.  */
3534 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3535 	    {
3536 	      int cnt = 0;
3537 
3538 	      while (cnt <= len &&
3539 		     weights[(idx1 & 0xffffff) + 1 + cnt]
3540 		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3541 		++cnt;
3542 
3543 	      if (cnt > len)
3544 		bitset_set (sbcset, ch);
3545 	    }
3546 	}
3547       /* Check whether the array has enough space.  */
3548       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3549 	{
3550 	  /* Not enough, realloc it.  */
3551 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3552 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3553 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3554 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3555 						   int32_t,
3556 						   new_equiv_class_alloc);
3557 	  if (BE (new_equiv_classes == NULL, 0))
3558 	    return REG_ESPACE;
3559 	  mbcset->equiv_classes = new_equiv_classes;
3560 	  *equiv_class_alloc = new_equiv_class_alloc;
3561 	}
3562       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3563     }
3564   else
3565 #endif /* _LIBC */
3566     {
3567       if (BE (strlen ((const char *) name) != 1, 0))
3568 	return REG_ECOLLATE;
3569       bitset_set (sbcset, *name);
3570     }
3571   return REG_NOERROR;
3572 }
3573 
3574   /* Helper function for parse_bracket_exp.
3575      Build the character class which is represented by NAME.
3576      The result are written to MBCSET and SBCSET.
3577      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3578      is a pointer argument since we may update it.  */
3579 
3580 static reg_errcode_t
3581 #ifdef RE_ENABLE_I18N
3582 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3583 		 re_charset_t *mbcset, Idx *char_class_alloc,
3584 		 const char *class_name, reg_syntax_t syntax)
3585 #else /* not RE_ENABLE_I18N */
3586 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3587 		 const char *class_name, reg_syntax_t syntax)
3588 #endif /* not RE_ENABLE_I18N */
3589 {
3590   int i;
3591   const char *name = class_name;
3592 
3593   /* In case of REG_ICASE "upper" and "lower" match the both of
3594      upper and lower cases.  */
3595   if ((syntax & RE_ICASE)
3596       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3597     name = "alpha";
3598 
3599 #ifdef RE_ENABLE_I18N
3600   /* Check the space of the arrays.  */
3601   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3602     {
3603       /* Not enough, realloc it.  */
3604       /* +1 in case of mbcset->nchar_classes is 0.  */
3605       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3606       /* Use realloc since array is NULL if *alloc == 0.  */
3607       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3608 					       new_char_class_alloc);
3609       if (BE (new_char_classes == NULL, 0))
3610 	return REG_ESPACE;
3611       mbcset->char_classes = new_char_classes;
3612       *char_class_alloc = new_char_class_alloc;
3613     }
3614   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3615 #endif /* RE_ENABLE_I18N */
3616 
3617 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3618   do {						\
3619     if (BE (trans != NULL, 0))			\
3620       {						\
3621 	for (i = 0; i < SBC_MAX; ++i)		\
3622 	  if (ctype_func (i))			\
3623 	    bitset_set (sbcset, trans[i]);	\
3624       }						\
3625     else					\
3626       {						\
3627 	for (i = 0; i < SBC_MAX; ++i)		\
3628 	  if (ctype_func (i))			\
3629 	    bitset_set (sbcset, i);		\
3630       }						\
3631   } while (0)
3632 
3633   if (strcmp (name, "alnum") == 0)
3634     BUILD_CHARCLASS_LOOP (isalnum);
3635   else if (strcmp (name, "cntrl") == 0)
3636     BUILD_CHARCLASS_LOOP (iscntrl);
3637   else if (strcmp (name, "lower") == 0)
3638     BUILD_CHARCLASS_LOOP (islower);
3639   else if (strcmp (name, "space") == 0)
3640     BUILD_CHARCLASS_LOOP (isspace);
3641   else if (strcmp (name, "alpha") == 0)
3642     BUILD_CHARCLASS_LOOP (isalpha);
3643   else if (strcmp (name, "digit") == 0)
3644     BUILD_CHARCLASS_LOOP (isdigit);
3645   else if (strcmp (name, "print") == 0)
3646     BUILD_CHARCLASS_LOOP (isprint);
3647   else if (strcmp (name, "upper") == 0)
3648     BUILD_CHARCLASS_LOOP (isupper);
3649   else if (strcmp (name, "blank") == 0)
3650     BUILD_CHARCLASS_LOOP (isblank);
3651   else if (strcmp (name, "graph") == 0)
3652     BUILD_CHARCLASS_LOOP (isgraph);
3653   else if (strcmp (name, "punct") == 0)
3654     BUILD_CHARCLASS_LOOP (ispunct);
3655   else if (strcmp (name, "xdigit") == 0)
3656     BUILD_CHARCLASS_LOOP (isxdigit);
3657   else
3658     return REG_ECTYPE;
3659 
3660   return REG_NOERROR;
3661 }
3662 
3663 static bin_tree_t *
3664 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3665 		    const char *class_name,
3666 		    const char *extra, bool non_match,
3667 		    reg_errcode_t *err)
3668 {
3669   re_bitset_ptr_t sbcset;
3670 #ifdef RE_ENABLE_I18N
3671   re_charset_t *mbcset;
3672   Idx alloc = 0;
3673 #endif /* not RE_ENABLE_I18N */
3674   reg_errcode_t ret;
3675   re_token_t br_token;
3676   bin_tree_t *tree;
3677 
3678   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3679 #ifdef RE_ENABLE_I18N
3680   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3681 #endif /* RE_ENABLE_I18N */
3682 
3683 #ifdef RE_ENABLE_I18N
3684   if (BE (sbcset == NULL || mbcset == NULL, 0))
3685 #else /* not RE_ENABLE_I18N */
3686   if (BE (sbcset == NULL, 0))
3687 #endif /* not RE_ENABLE_I18N */
3688     {
3689       *err = REG_ESPACE;
3690       return NULL;
3691     }
3692 
3693   if (non_match)
3694     {
3695 #ifdef RE_ENABLE_I18N
3696       mbcset->non_match = 1;
3697 #endif /* not RE_ENABLE_I18N */
3698     }
3699 
3700   /* We don't care the syntax in this case.  */
3701   ret = build_charclass (trans, sbcset,
3702 #ifdef RE_ENABLE_I18N
3703 			 mbcset, &alloc,
3704 #endif /* RE_ENABLE_I18N */
3705 			 class_name, 0);
3706 
3707   if (BE (ret != REG_NOERROR, 0))
3708     {
3709       re_free (sbcset);
3710 #ifdef RE_ENABLE_I18N
3711       free_charset (mbcset);
3712 #endif /* RE_ENABLE_I18N */
3713       *err = ret;
3714       return NULL;
3715     }
3716   /* \w match '_' also.  */
3717   for (; *extra; extra++)
3718     bitset_set (sbcset, *extra);
3719 
3720   /* If it is non-matching list.  */
3721   if (non_match)
3722     bitset_not (sbcset);
3723 
3724 #ifdef RE_ENABLE_I18N
3725   /* Ensure only single byte characters are set.  */
3726   if (dfa->mb_cur_max > 1)
3727     bitset_mask (sbcset, dfa->sb_char);
3728 #endif
3729 
3730   /* Build a tree for simple bracket.  */
3731   br_token.type = SIMPLE_BRACKET;
3732   br_token.opr.sbcset = sbcset;
3733   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3734   if (BE (tree == NULL, 0))
3735     goto build_word_op_espace;
3736 
3737 #ifdef RE_ENABLE_I18N
3738   if (dfa->mb_cur_max > 1)
3739     {
3740       bin_tree_t *mbc_tree;
3741       /* Build a tree for complex bracket.  */
3742       br_token.type = COMPLEX_BRACKET;
3743       br_token.opr.mbcset = mbcset;
3744       dfa->has_mb_node = 1;
3745       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3746       if (BE (mbc_tree == NULL, 0))
3747 	goto build_word_op_espace;
3748       /* Then join them by ALT node.  */
3749       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3750       if (BE (mbc_tree != NULL, 1))
3751 	return tree;
3752     }
3753   else
3754     {
3755       free_charset (mbcset);
3756       return tree;
3757     }
3758 #else /* not RE_ENABLE_I18N */
3759   return tree;
3760 #endif /* not RE_ENABLE_I18N */
3761 
3762  build_word_op_espace:
3763   re_free (sbcset);
3764 #ifdef RE_ENABLE_I18N
3765   free_charset (mbcset);
3766 #endif /* RE_ENABLE_I18N */
3767   *err = REG_ESPACE;
3768   return NULL;
3769 }
3770 
3771 /* This is intended for the expressions like "a{1,3}".
3772    Fetch a number from 'input', and return the number.
3773    Return REG_MISSING if the number field is empty like "{,1}".
3774    Return RE_DUP_MAX + 1 if the number field is too large.
3775    Return REG_ERROR if an error occurred.  */
3776 
3777 static Idx
3778 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3779 {
3780   Idx num = REG_MISSING;
3781   unsigned char c;
3782   while (1)
3783     {
3784       fetch_token (token, input, syntax);
3785       c = token->opr.c;
3786       if (BE (token->type == END_OF_RE, 0))
3787 	return REG_ERROR;
3788       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3789 	break;
3790       num = ((token->type != CHARACTER || c < '0' || '9' < c
3791 	      || num == REG_ERROR)
3792 	     ? REG_ERROR
3793 	     : num == REG_MISSING
3794 	     ? c - '0'
3795 	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3796     }
3797   return num;
3798 }
3799 
3800 #ifdef RE_ENABLE_I18N
3801 static void
3802 free_charset (re_charset_t *cset)
3803 {
3804   re_free (cset->mbchars);
3805 # ifdef _LIBC
3806   re_free (cset->coll_syms);
3807   re_free (cset->equiv_classes);
3808   re_free (cset->range_starts);
3809   re_free (cset->range_ends);
3810 # endif
3811   re_free (cset->char_classes);
3812   re_free (cset);
3813 }
3814 #endif /* RE_ENABLE_I18N */
3815 
3816 /* Functions for binary tree operation.  */
3817 
3818 /* Create a tree node.  */
3819 
3820 static bin_tree_t *
3821 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3822 	     re_token_type_t type)
3823 {
3824   re_token_t t;
3825   t.type = type;
3826   return create_token_tree (dfa, left, right, &t);
3827 }
3828 
3829 static bin_tree_t *
3830 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3831 		   const re_token_t *token)
3832 {
3833   bin_tree_t *tree;
3834   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3835     {
3836       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3837 
3838       if (storage == NULL)
3839 	return NULL;
3840       storage->next = dfa->str_tree_storage;
3841       dfa->str_tree_storage = storage;
3842       dfa->str_tree_storage_idx = 0;
3843     }
3844   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3845 
3846   tree->parent = NULL;
3847   tree->left = left;
3848   tree->right = right;
3849   tree->token = *token;
3850   tree->token.duplicated = 0;
3851   tree->token.opt_subexp = 0;
3852   tree->first = NULL;
3853   tree->next = NULL;
3854   tree->node_idx = REG_MISSING;
3855 
3856   if (left != NULL)
3857     left->parent = tree;
3858   if (right != NULL)
3859     right->parent = tree;
3860   return tree;
3861 }
3862 
3863 /* Mark the tree SRC as an optional subexpression.
3864    To be called from preorder or postorder.  */
3865 
3866 static reg_errcode_t
3867 mark_opt_subexp (void *extra, bin_tree_t *node)
3868 {
3869   Idx idx = (uintptr_t) extra;
3870   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3871     node->token.opt_subexp = 1;
3872 
3873   return REG_NOERROR;
3874 }
3875 
3876 /* Free the allocated memory inside NODE. */
3877 
3878 static void
3879 free_token (re_token_t *node)
3880 {
3881 #ifdef RE_ENABLE_I18N
3882   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3883     free_charset (node->opr.mbcset);
3884   else
3885 #endif /* RE_ENABLE_I18N */
3886     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3887       re_free (node->opr.sbcset);
3888 }
3889 
3890 /* Worker function for tree walking.  Free the allocated memory inside NODE
3891    and its children. */
3892 
3893 static reg_errcode_t
3894 free_tree (void *extra, bin_tree_t *node)
3895 {
3896   free_token (&node->token);
3897   return REG_NOERROR;
3898 }
3899 
3900 
3901 /* Duplicate the node SRC, and return new node.  This is a preorder
3902    visit similar to the one implemented by the generic visitor, but
3903    we need more infrastructure to maintain two parallel trees --- so,
3904    it's easier to duplicate.  */
3905 
3906 static bin_tree_t *
3907 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3908 {
3909   const bin_tree_t *node;
3910   bin_tree_t *dup_root;
3911   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3912 
3913   for (node = root; ; )
3914     {
3915       /* Create a new tree and link it back to the current parent.  */
3916       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3917       if (*p_new == NULL)
3918 	return NULL;
3919       (*p_new)->parent = dup_node;
3920       (*p_new)->token.duplicated = 1;
3921       dup_node = *p_new;
3922 
3923       /* Go to the left node, or up and to the right.  */
3924       if (node->left)
3925 	{
3926 	  node = node->left;
3927 	  p_new = &dup_node->left;
3928 	}
3929       else
3930 	{
3931 	  const bin_tree_t *prev = NULL;
3932 	  while (node->right == prev || node->right == NULL)
3933 	    {
3934 	      prev = node;
3935 	      node = node->parent;
3936 	      dup_node = dup_node->parent;
3937 	      if (!node)
3938 		return dup_root;
3939 	    }
3940 	  node = node->right;
3941 	  p_new = &dup_node->right;
3942 	}
3943     }
3944 }
3945