1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, write to the Free
18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20 
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 					  size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 				     const re_dfastate_t *init_state,
25 				     char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 			       reg_errcode_t (fn (void *, bin_tree_t *)),
38 			       void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 				reg_errcode_t (fn (void *, bin_tree_t *)),
41 				void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 				 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 				   unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 					 int node, int root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
57 			 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 			reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 			  reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 				  re_token_t *token, reg_syntax_t syntax,
64 				  int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 				 re_token_t *token, reg_syntax_t syntax,
67 				 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 				     re_token_t *token, reg_syntax_t syntax,
70 				     int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 				  re_token_t *token, reg_syntax_t syntax,
73 				  int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 				 re_dfa_t *dfa, re_token_t *token,
76 				 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 				      re_token_t *token, reg_syntax_t syntax,
79 				      reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 					    re_string_t *regexp,
82 					    re_token_t *token, int token_len,
83 					    re_dfa_t *dfa,
84 					    reg_syntax_t syntax,
85 					    int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 					  re_string_t *regexp,
88 					  re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 					re_charset_t *mbcset,
92 					int *equiv_class_alloc,
93 					const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 				      bitset_t sbcset,
96 				      re_charset_t *mbcset,
97 				      int *char_class_alloc,
98 				      const unsigned char *class_name,
99 				      reg_syntax_t syntax);
100 #else  /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 					const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 				      bitset_t sbcset,
105 				      const unsigned char *class_name,
106 				      reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 				       RE_TRANSLATE_TYPE trans,
110 				       const unsigned char *class_name,
111 				       const unsigned char *extra,
112 				       int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 				bin_tree_t *left, bin_tree_t *right,
115 				re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 				      bin_tree_t *left, bin_tree_t *right,
118 				      const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 
124 /* This table gives an error message for each of the error codes listed
125    in regex.h.  Obviously the order here has to be same as there.
126    POSIX doesn't require that we do anything for REG_NOERROR,
127    but why not be nice?  */
128 
129 const char __re_error_msgid[] attribute_hidden =
130   {
131 #define REG_NOERROR_IDX	0
132     gettext_noop ("Success")	/* REG_NOERROR */
133     "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135     gettext_noop ("No match")	/* REG_NOMATCH */
136     "\0"
137 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139     "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142     "\0"
143 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145     "\0"
146 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148     "\0"
149 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151     "\0"
152 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
153     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
154     "\0"
155 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157     "\0"
158 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160     "\0"
161 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163     "\0"
164 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165     gettext_noop ("Invalid range end")	/* REG_ERANGE */
166     "\0"
167 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
169     "\0"
170 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172     "\0"
173 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
175     "\0"
176 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
178     "\0"
179 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181   };
182 
183 const size_t __re_error_msgid_idx[] attribute_hidden =
184   {
185     REG_NOERROR_IDX,
186     REG_NOMATCH_IDX,
187     REG_BADPAT_IDX,
188     REG_ECOLLATE_IDX,
189     REG_ECTYPE_IDX,
190     REG_EESCAPE_IDX,
191     REG_ESUBREG_IDX,
192     REG_EBRACK_IDX,
193     REG_EPAREN_IDX,
194     REG_EBRACE_IDX,
195     REG_BADBR_IDX,
196     REG_ERANGE_IDX,
197     REG_ESPACE_IDX,
198     REG_BADRPT_IDX,
199     REG_EEND_IDX,
200     REG_ESIZE_IDX,
201     REG_ERPAREN_IDX
202   };
203 
204 /* Entry points for GNU code.  */
205 
206 /* re_compile_pattern is the GNU regular expression compiler: it
207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208    Returns 0 if the pattern was valid, otherwise an error string.
209 
210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211    are set in BUFP on entry.  */
212 
213 const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)214 re_compile_pattern (const char *pattern, size_t length,
215                     struct re_pattern_buffer *bufp)
216 {
217   reg_errcode_t ret;
218 
219   /* And GNU code determines whether or not to get register information
220      by passing null for the REGS argument to re_match, etc., not by
221      setting no_sub, unless RE_NO_SUB is set.  */
222   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
223 
224   /* Match anchors at newline.  */
225   bufp->newline_anchor = 1;
226 
227   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
228 
229   if (!ret)
230     return NULL;
231   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
232 }
233 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)234 weak_alias (__re_compile_pattern, re_compile_pattern)
235 #endif
236 
237 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
238    also be assigned to arbitrarily: each pattern buffer stores its own
239    syntax, so it can be changed between regex compilations.  */
240 /* This has no initializer because initialized variables in Emacs
241    become read-only after dumping.  */
242 reg_syntax_t re_syntax_options;
243 
244 
245 /* Specify the precise syntax of regexps for compilation.  This provides
246    for compatibility for various utilities which historically have
247    different, incompatible syntaxes.
248 
249    The argument SYNTAX is a bit mask comprised of the various bits
250    defined in regex.h.  We return the old syntax.  */
251 
252 reg_syntax_t
253 re_set_syntax (reg_syntax_t syntax)
254 {
255   reg_syntax_t ret = re_syntax_options;
256 
257   re_syntax_options = syntax;
258   return ret;
259 }
260 #ifdef _LIBC
weak_alias(__re_set_syntax,re_set_syntax)261 weak_alias (__re_set_syntax, re_set_syntax)
262 #endif
263 
264 int
265 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 {
267   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
268   char *fastmap = bufp->fastmap;
269 
270   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
271   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
272   if (dfa->init_state != dfa->init_state_word)
273     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
274   if (dfa->init_state != dfa->init_state_nl)
275     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
276   if (dfa->init_state != dfa->init_state_begbuf)
277     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
278   bufp->fastmap_accurate = 1;
279   return 0;
280 }
281 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)282 weak_alias (__re_compile_fastmap, re_compile_fastmap)
283 #endif
284 
285 static inline void
286 __attribute ((always_inline))
287 re_set_fastmap (char *fastmap, int icase, int ch)
288 {
289   fastmap[ch] = 1;
290   if (icase)
291     fastmap[tolower (ch)] = 1;
292 }
293 
294 /* Helper function for re_compile_fastmap.
295    Compile fastmap for the initial_state INIT_STATE.  */
296 
297 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)298 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
299 			 char *fastmap)
300 {
301   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
302   int node_cnt;
303   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
304   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
305     {
306       int node = init_state->nodes.elems[node_cnt];
307       re_token_type_t type = dfa->nodes[node].type;
308 
309       if (type == CHARACTER)
310 	{
311 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
312 #ifdef RE_ENABLE_I18N
313 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
314 	    {
315 	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
316 	      wchar_t wc;
317 	      mbstate_t state;
318 
319 	      p = buf;
320 	      *p++ = dfa->nodes[node].opr.c;
321 	      while (++node < dfa->nodes_len
322 		     &&	dfa->nodes[node].type == CHARACTER
323 		     && dfa->nodes[node].mb_partial)
324 		*p++ = dfa->nodes[node].opr.c;
325 	      memset (&state, '\0', sizeof (state));
326 	      if (mbrtowc (&wc, (const char *) buf, p - buf,
327 			   &state) == p - buf
328 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
329 		      != (size_t) -1))
330 		re_set_fastmap (fastmap, 0, buf[0]);
331 	    }
332 #endif
333 	}
334       else if (type == SIMPLE_BRACKET)
335 	{
336 	  int i, ch;
337 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
338 	    {
339 	      int j;
340 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
341 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
342 		if (w & ((bitset_word_t) 1 << j))
343 		  re_set_fastmap (fastmap, icase, ch);
344 	    }
345 	}
346 #ifdef RE_ENABLE_I18N
347       else if (type == COMPLEX_BRACKET)
348 	{
349 	  int i;
350 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
351 	  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
352 	      || cset->nranges || cset->nchar_classes)
353 	    {
354 # ifdef _LIBC
355 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
356 		{
357 		  /* In this case we want to catch the bytes which are
358 		     the first byte of any collation elements.
359 		     e.g. In da_DK, we want to catch 'a' since "aa"
360 			  is a valid collation element, and don't catch
361 			  'b' since 'b' is the only collation element
362 			  which starts from 'b'.  */
363 		  const int32_t *table = (const int32_t *)
364 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
365 		  for (i = 0; i < SBC_MAX; ++i)
366 		    if (table[i] < 0)
367 		      re_set_fastmap (fastmap, icase, i);
368 		}
369 # else
370 	      if (dfa->mb_cur_max > 1)
371 		for (i = 0; i < SBC_MAX; ++i)
372 		  if (__btowc (i) == WEOF)
373 		    re_set_fastmap (fastmap, icase, i);
374 # endif /* not _LIBC */
375 	    }
376 	  for (i = 0; i < cset->nmbchars; ++i)
377 	    {
378 	      char buf[256];
379 	      mbstate_t state;
380 	      memset (&state, '\0', sizeof (state));
381 	      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
382 		re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
383 	      if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
384 		{
385 		  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
386 		      != (size_t) -1)
387 		    re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
388 		}
389 	    }
390 	}
391 #endif /* RE_ENABLE_I18N */
392       else if (type == OP_PERIOD
393 #ifdef RE_ENABLE_I18N
394 	       || type == OP_UTF8_PERIOD
395 #endif /* RE_ENABLE_I18N */
396 	       || type == END_OF_RE)
397 	{
398 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
399 	  if (type == END_OF_RE)
400 	    bufp->can_be_null = 1;
401 	  return;
402 	}
403     }
404 }
405 
406 /* Entry point for POSIX code.  */
407 /* regcomp takes a regular expression as a string and compiles it.
408 
409    PREG is a regex_t *.  We do not expect any fields to be initialized,
410    since POSIX says we shouldn't.  Thus, we set
411 
412      `buffer' to the compiled pattern;
413      `used' to the length of the compiled pattern;
414      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
415        REG_EXTENDED bit in CFLAGS is set; otherwise, to
416        RE_SYNTAX_POSIX_BASIC;
417      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
418      `fastmap' to an allocated space for the fastmap;
419      `fastmap_accurate' to zero;
420      `re_nsub' to the number of subexpressions in PATTERN.
421 
422    PATTERN is the address of the pattern string.
423 
424    CFLAGS is a series of bits which affect compilation.
425 
426      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
427      use POSIX basic syntax.
428 
429      If REG_NEWLINE is set, then . and [^...] don't match newline.
430      Also, regexec will try a match beginning after every newline.
431 
432      If REG_ICASE is set, then we considers upper- and lowercase
433      versions of letters to be equivalent when matching.
434 
435      If REG_NOSUB is set, then when PREG is passed to regexec, that
436      routine will report only success or failure, and nothing about the
437      registers.
438 
439    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
440    the return codes and their meanings.)  */
441 
442 int
regcomp(regex_t * __restrict preg,const char * __restrict pattern,int cflags)443 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
444 {
445   reg_errcode_t ret;
446   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
447 			 : RE_SYNTAX_POSIX_BASIC);
448 
449   preg->buffer = NULL;
450   preg->allocated = 0;
451   preg->used = 0;
452 
453   /* Try to allocate space for the fastmap.  */
454   preg->fastmap = re_malloc (char, SBC_MAX);
455   if (BE (preg->fastmap == NULL, 0))
456     return REG_ESPACE;
457 
458   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
459 
460   /* If REG_NEWLINE is set, newlines are treated differently.  */
461   if (cflags & REG_NEWLINE)
462     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
463       syntax &= ~RE_DOT_NEWLINE;
464       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
465       /* It also changes the matching behavior.  */
466       preg->newline_anchor = 1;
467     }
468   else
469     preg->newline_anchor = 0;
470   preg->no_sub = !!(cflags & REG_NOSUB);
471   preg->translate = NULL;
472 
473   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
474 
475   /* POSIX doesn't distinguish between an unmatched open-group and an
476      unmatched close-group: both are REG_EPAREN.  */
477   if (ret == REG_ERPAREN)
478     ret = REG_EPAREN;
479 
480   /* We have already checked preg->fastmap != NULL.  */
481   if (BE (ret == REG_NOERROR, 1))
482     /* Compute the fastmap now, since regexec cannot modify the pattern
483        buffer.  This function never fails in this implementation.  */
484     (void) re_compile_fastmap (preg);
485   else
486     {
487       /* Some error occurred while compiling the expression.  */
488       re_free (preg->fastmap);
489       preg->fastmap = NULL;
490     }
491 
492   return (int) ret;
493 }
494 #ifdef _LIBC
weak_alias(__regcomp,regcomp)495 weak_alias (__regcomp, regcomp)
496 #endif
497 
498 /* Returns a message corresponding to an error code, ERRCODE, returned
499    from either regcomp or regexec.   We don't use PREG here.  */
500 
501 size_t
502 regerror (int errcode, const regex_t *__restrict preg,
503           char *__restrict errbuf, size_t errbuf_size)
504 {
505   const char *msg;
506   size_t msg_size;
507 
508   if (BE (errcode < 0
509 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
510 			       / sizeof (__re_error_msgid_idx[0])), 0))
511     /* Only error codes returned by the rest of the code should be passed
512        to this routine.  If we are given anything else, or if other regex
513        code generates an invalid error code, then the program has a bug.
514        Dump core so we can fix it.  */
515     abort ();
516 
517   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
518 
519   msg_size = strlen (msg) + 1; /* Includes the null.  */
520 
521   if (BE (errbuf_size != 0, 1))
522     {
523       if (BE (msg_size > errbuf_size, 0))
524 	{
525 #if defined HAVE_MEMPCPY || defined _LIBC
526 	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
527 #else
528 	  memcpy (errbuf, msg, errbuf_size - 1);
529 	  errbuf[errbuf_size - 1] = 0;
530 #endif
531 	}
532       else
533 	memcpy (errbuf, msg, msg_size);
534     }
535 
536   return msg_size;
537 }
538 #ifdef _LIBC
539 weak_alias (__regerror, regerror)
540 #endif
541 
542 
543 #ifdef RE_ENABLE_I18N
544 /* This static array is used for the map to single-byte characters when
545    UTF-8 is used.  Otherwise we would allocate memory just to initialize
546    it the same all the time.  UTF-8 is the preferred encoding so this is
547    a worthwhile optimization.  */
548 static const bitset_t utf8_sb_map =
549 {
550   /* Set the first 128 bits.  */
551   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
552 };
553 #endif
554 
555 
556 static void
free_dfa_content(re_dfa_t * dfa)557 free_dfa_content (re_dfa_t *dfa)
558 {
559   int i, j;
560 
561   if (dfa->nodes)
562     for (i = 0; i < dfa->nodes_len; ++i)
563       free_token (dfa->nodes + i);
564   re_free (dfa->nexts);
565   for (i = 0; i < dfa->nodes_len; ++i)
566     {
567       if (dfa->eclosures != NULL)
568 	re_node_set_free (dfa->eclosures + i);
569       if (dfa->inveclosures != NULL)
570 	re_node_set_free (dfa->inveclosures + i);
571       if (dfa->edests != NULL)
572 	re_node_set_free (dfa->edests + i);
573     }
574   re_free (dfa->edests);
575   re_free (dfa->eclosures);
576   re_free (dfa->inveclosures);
577   re_free (dfa->nodes);
578 
579   if (dfa->state_table)
580     for (i = 0; i <= dfa->state_hash_mask; ++i)
581       {
582 	struct re_state_table_entry *entry = dfa->state_table + i;
583 	for (j = 0; j < entry->num; ++j)
584 	  {
585 	    re_dfastate_t *state = entry->array[j];
586 	    free_state (state);
587 	  }
588         re_free (entry->array);
589       }
590   re_free (dfa->state_table);
591 #ifdef RE_ENABLE_I18N
592   if (dfa->sb_char != utf8_sb_map)
593     re_free (dfa->sb_char);
594 #endif
595   re_free (dfa->subexp_map);
596 #ifdef DEBUG
597   re_free (dfa->re_str);
598 #endif
599 
600   re_free (dfa);
601 }
602 
603 
604 /* Free dynamically allocated space used by PREG.  */
605 
606 void
regfree(regex_t * preg)607 regfree (regex_t *preg)
608 {
609   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
610   if (BE (dfa != NULL, 1))
611     free_dfa_content (dfa);
612   preg->buffer = NULL;
613   preg->allocated = 0;
614 
615   re_free (preg->fastmap);
616   preg->fastmap = NULL;
617 
618   re_free (preg->translate);
619   preg->translate = NULL;
620 }
621 #ifdef _LIBC
622 weak_alias (__regfree, regfree)
623 #endif
624 
625 /* Entry points compatible with 4.2 BSD regex library.  We don't define
626    them unless specifically requested.  */
627 
628 #if defined _REGEX_RE_COMP || defined _LIBC
629 
630 /* BSD has one and only one pattern buffer.  */
631 static struct re_pattern_buffer re_comp_buf;
632 
633 char *
634 # ifdef _LIBC
635 /* Make these definitions weak in libc, so POSIX programs can redefine
636    these names if they don't use our functions, and still use
637    regcomp/regexec above without link errors.  */
638 weak_function
639 # endif
re_comp(s)640 re_comp (s)
641      const char *s;
642 {
643   reg_errcode_t ret;
644   char *fastmap;
645 
646   if (!s)
647     {
648       if (!re_comp_buf.buffer)
649 	return gettext ("No previous regular expression");
650       return 0;
651     }
652 
653   if (re_comp_buf.buffer)
654     {
655       fastmap = re_comp_buf.fastmap;
656       re_comp_buf.fastmap = NULL;
657       __regfree (&re_comp_buf);
658       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
659       re_comp_buf.fastmap = fastmap;
660     }
661 
662   if (re_comp_buf.fastmap == NULL)
663     {
664       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
665       if (re_comp_buf.fastmap == NULL)
666 	return (char *) gettext (__re_error_msgid
667 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
668     }
669 
670   /* Since `re_exec' always passes NULL for the `regs' argument, we
671      don't need to initialize the pattern buffer fields which affect it.  */
672 
673   /* Match anchors at newlines.  */
674   re_comp_buf.newline_anchor = 1;
675 
676   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
677 
678   if (!ret)
679     return NULL;
680 
681   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
682   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
683 }
684 
685 #ifdef _LIBC
libc_freeres_fn(free_mem)686 libc_freeres_fn (free_mem)
687 {
688   __regfree (&re_comp_buf);
689 }
690 #endif
691 
692 #endif /* _REGEX_RE_COMP */
693 
694 /* Internal entry point.
695    Compile the regular expression PATTERN, whose length is LENGTH.
696    SYNTAX indicate regular expression's syntax.  */
697 
698 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)699 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
700 		     reg_syntax_t syntax)
701 {
702   reg_errcode_t err = REG_NOERROR;
703   re_dfa_t *dfa;
704   re_string_t regexp;
705 
706   /* Initialize the pattern buffer.  */
707   preg->fastmap_accurate = 0;
708   preg->syntax = syntax;
709   preg->not_bol = preg->not_eol = 0;
710   preg->used = 0;
711   preg->re_nsub = 0;
712   preg->can_be_null = 0;
713   preg->regs_allocated = REGS_UNALLOCATED;
714 
715   /* Initialize the dfa.  */
716   dfa = (re_dfa_t *) preg->buffer;
717   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
718     {
719       /* If zero allocated, but buffer is non-null, try to realloc
720 	 enough space.  This loses if buffer's address is bogus, but
721 	 that is the user's responsibility.  If ->buffer is NULL this
722 	 is a simple allocation.  */
723       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
724       if (dfa == NULL)
725 	return REG_ESPACE;
726       preg->allocated = sizeof (re_dfa_t);
727       preg->buffer = (unsigned char *) dfa;
728     }
729   preg->used = sizeof (re_dfa_t);
730 
731   err = init_dfa (dfa, length);
732   if (BE (err != REG_NOERROR, 0))
733     {
734       free_dfa_content (dfa);
735       preg->buffer = NULL;
736       preg->allocated = 0;
737       return err;
738     }
739 #ifdef DEBUG
740   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
741   dfa->re_str = re_malloc (char, length + 1);
742   strncpy (dfa->re_str, pattern, length + 1);
743 #endif
744 
745   __libc_lock_init (dfa->lock);
746 
747   err = re_string_construct (&regexp, pattern, length, preg->translate,
748 			     syntax & RE_ICASE, dfa);
749   if (BE (err != REG_NOERROR, 0))
750     {
751     re_compile_internal_free_return:
752       free_workarea_compile (preg);
753       re_string_destruct (&regexp);
754       free_dfa_content (dfa);
755       preg->buffer = NULL;
756       preg->allocated = 0;
757       return err;
758     }
759 
760   /* Parse the regular expression, and build a structure tree.  */
761   preg->re_nsub = 0;
762   dfa->str_tree = parse (&regexp, preg, syntax, &err);
763   if (BE (dfa->str_tree == NULL, 0))
764     goto re_compile_internal_free_return;
765 
766   /* Analyze the tree and create the nfa.  */
767   err = analyze (preg);
768   if (BE (err != REG_NOERROR, 0))
769     goto re_compile_internal_free_return;
770 
771 #ifdef RE_ENABLE_I18N
772   /* If possible, do searching in single byte encoding to speed things up.  */
773   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
774     optimize_utf8 (dfa);
775 #endif
776 
777   /* Then create the initial state of the dfa.  */
778   err = create_initial_state (dfa);
779 
780   /* Release work areas.  */
781   free_workarea_compile (preg);
782   re_string_destruct (&regexp);
783 
784   if (BE (err != REG_NOERROR, 0))
785     {
786       free_dfa_content (dfa);
787       preg->buffer = NULL;
788       preg->allocated = 0;
789     }
790 
791   return err;
792 }
793 
794 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
795    as the initial length of some arrays.  */
796 
797 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)798 init_dfa (re_dfa_t *dfa, size_t pat_len)
799 {
800   unsigned int table_size;
801 #ifndef _LIBC
802   const char *codeset_name;
803 #endif
804 
805   memset (dfa, '\0', sizeof (re_dfa_t));
806 
807   /* Force allocation of str_tree_storage the first time.  */
808   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
809 
810   /* Avoid overflows.  */
811   if (pat_len == SIZE_MAX)
812     return REG_ESPACE;
813 
814   dfa->nodes_alloc = pat_len + 1;
815   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
816 
817   /*  table_size = 2 ^ ceil(log pat_len) */
818   for (table_size = 1; ; table_size <<= 1)
819     if (table_size > pat_len)
820       break;
821 
822   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
823   dfa->state_hash_mask = table_size - 1;
824 
825   dfa->mb_cur_max = MB_CUR_MAX;
826 #ifdef _LIBC
827   if (dfa->mb_cur_max == 6
828       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
829     dfa->is_utf8 = 1;
830   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
831 		       != 0);
832 #else
833 # ifdef HAVE_LANGINFO_CODESET
834   codeset_name = nl_langinfo (CODESET);
835 # else
836   codeset_name = getenv ("LC_ALL");
837   if (codeset_name == NULL || codeset_name[0] == '\0')
838     codeset_name = getenv ("LC_CTYPE");
839   if (codeset_name == NULL || codeset_name[0] == '\0')
840     codeset_name = getenv ("LANG");
841   if (codeset_name == NULL)
842     codeset_name = "";
843   else if (strchr (codeset_name, '.') !=  NULL)
844     codeset_name = strchr (codeset_name, '.') + 1;
845 # endif
846 
847   if (strcasecmp (codeset_name, "UTF-8") == 0
848       || strcasecmp (codeset_name, "UTF8") == 0)
849     dfa->is_utf8 = 1;
850 
851   /* We check exhaustively in the loop below if this charset is a
852      superset of ASCII.  */
853   dfa->map_notascii = 0;
854 #endif
855 
856 #ifdef RE_ENABLE_I18N
857   if (dfa->mb_cur_max > 1)
858     {
859       if (dfa->is_utf8)
860 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
861       else
862 	{
863 	  int i, j, ch;
864 
865 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
866 	  if (BE (dfa->sb_char == NULL, 0))
867 	    return REG_ESPACE;
868 
869 	  /* Set the bits corresponding to single byte chars.  */
870 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
871 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
872 	      {
873 		wint_t wch = __btowc (ch);
874 		if (wch != WEOF)
875 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
876 # ifndef _LIBC
877 		if (isascii (ch) && wch != ch)
878 		  dfa->map_notascii = 1;
879 # endif
880 	      }
881 	}
882     }
883 #endif
884 
885   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
886     return REG_ESPACE;
887   return REG_NOERROR;
888 }
889 
890 /* Initialize WORD_CHAR table, which indicate which character is
891    "word".  In this case "word" means that it is the word construction
892    character used by some operators like "\<", "\>", etc.  */
893 
894 static void
895 internal_function
init_word_char(re_dfa_t * dfa)896 init_word_char (re_dfa_t *dfa)
897 {
898   int i, j, ch;
899   dfa->word_ops_used = 1;
900   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
901     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
902       if (isalnum (ch) || ch == '_')
903 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
904 }
905 
906 /* Free the work area which are only used while compiling.  */
907 
908 static void
free_workarea_compile(regex_t * preg)909 free_workarea_compile (regex_t *preg)
910 {
911   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
912   bin_tree_storage_t *storage, *next;
913   for (storage = dfa->str_tree_storage; storage; storage = next)
914     {
915       next = storage->next;
916       re_free (storage);
917     }
918   dfa->str_tree_storage = NULL;
919   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
920   dfa->str_tree = NULL;
921   re_free (dfa->org_indices);
922   dfa->org_indices = NULL;
923 }
924 
925 /* Create initial states for all contexts.  */
926 
927 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)928 create_initial_state (re_dfa_t *dfa)
929 {
930   int first, i;
931   reg_errcode_t err;
932   re_node_set init_nodes;
933 
934   /* Initial states have the epsilon closure of the node which is
935      the first node of the regular expression.  */
936   first = dfa->str_tree->first->node_idx;
937   dfa->init_node = first;
938   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
939   if (BE (err != REG_NOERROR, 0))
940     return err;
941 
942   /* The back-references which are in initial states can epsilon transit,
943      since in this case all of the subexpressions can be null.
944      Then we add epsilon closures of the nodes which are the next nodes of
945      the back-references.  */
946   if (dfa->nbackref > 0)
947     for (i = 0; i < init_nodes.nelem; ++i)
948       {
949 	int node_idx = init_nodes.elems[i];
950 	re_token_type_t type = dfa->nodes[node_idx].type;
951 
952 	int clexp_idx;
953 	if (type != OP_BACK_REF)
954 	  continue;
955 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
956 	  {
957 	    re_token_t *clexp_node;
958 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
959 	    if (clexp_node->type == OP_CLOSE_SUBEXP
960 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
961 	      break;
962 	  }
963 	if (clexp_idx == init_nodes.nelem)
964 	  continue;
965 
966 	if (type == OP_BACK_REF)
967 	  {
968 	    int dest_idx = dfa->edests[node_idx].elems[0];
969 	    if (!re_node_set_contains (&init_nodes, dest_idx))
970 	      {
971 		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
972 		i = 0;
973 	      }
974 	  }
975       }
976 
977   /* It must be the first time to invoke acquire_state.  */
978   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
979   /* We don't check ERR here, since the initial state must not be NULL.  */
980   if (BE (dfa->init_state == NULL, 0))
981     return err;
982   if (dfa->init_state->has_constraint)
983     {
984       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
985 						       CONTEXT_WORD);
986       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
987 						     CONTEXT_NEWLINE);
988       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
989 							 &init_nodes,
990 							 CONTEXT_NEWLINE
991 							 | CONTEXT_BEGBUF);
992       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
993 	      || dfa->init_state_begbuf == NULL, 0))
994 	return err;
995     }
996   else
997     dfa->init_state_word = dfa->init_state_nl
998       = dfa->init_state_begbuf = dfa->init_state;
999 
1000   re_node_set_free (&init_nodes);
1001   return REG_NOERROR;
1002 }
1003 
1004 #ifdef RE_ENABLE_I18N
1005 /* If it is possible to do searching in single byte encoding instead of UTF-8
1006    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1007    DFA nodes where needed.  */
1008 
1009 static void
optimize_utf8(re_dfa_t * dfa)1010 optimize_utf8 (re_dfa_t *dfa)
1011 {
1012   int node, i, mb_chars = 0, has_period = 0;
1013 
1014   for (node = 0; node < dfa->nodes_len; ++node)
1015     switch (dfa->nodes[node].type)
1016       {
1017       case CHARACTER:
1018 	if (dfa->nodes[node].opr.c >= 0x80)
1019 	  mb_chars = 1;
1020 	break;
1021       case ANCHOR:
1022 	switch (dfa->nodes[node].opr.idx)
1023 	  {
1024 	  case LINE_FIRST:
1025 	  case LINE_LAST:
1026 	  case BUF_FIRST:
1027 	  case BUF_LAST:
1028 	    break;
1029 	  default:
1030 	    /* Word anchors etc. cannot be handled.  */
1031 	    return;
1032 	  }
1033 	break;
1034       case OP_PERIOD:
1035         has_period = 1;
1036         break;
1037       case OP_BACK_REF:
1038       case OP_ALT:
1039       case END_OF_RE:
1040       case OP_DUP_ASTERISK:
1041       case OP_OPEN_SUBEXP:
1042       case OP_CLOSE_SUBEXP:
1043 	break;
1044       case COMPLEX_BRACKET:
1045 	return;
1046       case SIMPLE_BRACKET:
1047 	/* Just double check.  The non-ASCII range starts at 0x80.  */
1048 	assert (0x80 % BITSET_WORD_BITS == 0);
1049         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1050 	  if (dfa->nodes[node].opr.sbcset[i])
1051 	    return;
1052 	break;
1053       default:
1054 	abort ();
1055       }
1056 
1057   if (mb_chars || has_period)
1058     for (node = 0; node < dfa->nodes_len; ++node)
1059       {
1060 	if (dfa->nodes[node].type == CHARACTER
1061 	    && dfa->nodes[node].opr.c >= 0x80)
1062 	  dfa->nodes[node].mb_partial = 0;
1063 	else if (dfa->nodes[node].type == OP_PERIOD)
1064 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1065       }
1066 
1067   /* The search can be in single byte locale.  */
1068   dfa->mb_cur_max = 1;
1069   dfa->is_utf8 = 0;
1070   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1071 }
1072 #endif
1073 
1074 /* Analyze the structure tree, and calculate "first", "next", "edest",
1075    "eclosure", and "inveclosure".  */
1076 
1077 static reg_errcode_t
analyze(regex_t * preg)1078 analyze (regex_t *preg)
1079 {
1080   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1081   reg_errcode_t ret;
1082 
1083   /* Allocate arrays.  */
1084   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1085   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1086   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1087   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1088   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1089 	  || dfa->eclosures == NULL, 0))
1090     return REG_ESPACE;
1091 
1092   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1093   if (dfa->subexp_map != NULL)
1094     {
1095       int i;
1096       for (i = 0; i < preg->re_nsub; i++)
1097 	dfa->subexp_map[i] = i;
1098       preorder (dfa->str_tree, optimize_subexps, dfa);
1099       for (i = 0; i < preg->re_nsub; i++)
1100 	if (dfa->subexp_map[i] != i)
1101 	  break;
1102       if (i == preg->re_nsub)
1103 	{
1104 	  free (dfa->subexp_map);
1105 	  dfa->subexp_map = NULL;
1106 	}
1107     }
1108 
1109   ret = postorder (dfa->str_tree, lower_subexps, preg);
1110   if (BE (ret != REG_NOERROR, 0))
1111     return ret;
1112   ret = postorder (dfa->str_tree, calc_first, dfa);
1113   if (BE (ret != REG_NOERROR, 0))
1114     return ret;
1115   preorder (dfa->str_tree, calc_next, dfa);
1116   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1117   if (BE (ret != REG_NOERROR, 0))
1118     return ret;
1119   ret = calc_eclosure (dfa);
1120   if (BE (ret != REG_NOERROR, 0))
1121     return ret;
1122 
1123   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1124      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1125   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1126       || dfa->nbackref)
1127     {
1128       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1129       if (BE (dfa->inveclosures == NULL, 0))
1130         return REG_ESPACE;
1131       ret = calc_inveclosure (dfa);
1132     }
1133 
1134   return ret;
1135 }
1136 
1137 /* Our parse trees are very unbalanced, so we cannot use a stack to
1138    implement parse tree visits.  Instead, we use parent pointers and
1139    some hairy code in these two functions.  */
1140 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1141 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1142 	   void *extra)
1143 {
1144   bin_tree_t *node, *prev;
1145 
1146   for (node = root; ; )
1147     {
1148       /* Descend down the tree, preferably to the left (or to the right
1149 	 if that's the only child).  */
1150       while (node->left || node->right)
1151 	if (node->left)
1152           node = node->left;
1153         else
1154           node = node->right;
1155 
1156       do
1157 	{
1158 	  reg_errcode_t err = fn (extra, node);
1159 	  if (BE (err != REG_NOERROR, 0))
1160 	    return err;
1161           if (node->parent == NULL)
1162 	    return REG_NOERROR;
1163 	  prev = node;
1164 	  node = node->parent;
1165 	}
1166       /* Go up while we have a node that is reached from the right.  */
1167       while (node->right == prev || node->right == NULL);
1168       node = node->right;
1169     }
1170 }
1171 
1172 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1173 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1174 	  void *extra)
1175 {
1176   bin_tree_t *node;
1177 
1178   for (node = root; ; )
1179     {
1180       reg_errcode_t err = fn (extra, node);
1181       if (BE (err != REG_NOERROR, 0))
1182 	return err;
1183 
1184       /* Go to the left node, or up and to the right.  */
1185       if (node->left)
1186 	node = node->left;
1187       else
1188 	{
1189 	  bin_tree_t *prev = NULL;
1190 	  while (node->right == prev || node->right == NULL)
1191 	    {
1192 	      prev = node;
1193 	      node = node->parent;
1194 	      if (!node)
1195 	        return REG_NOERROR;
1196 	    }
1197 	  node = node->right;
1198 	}
1199     }
1200 }
1201 
1202 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1203    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1204    backreferences as well.  Requires a preorder visit.  */
1205 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1206 optimize_subexps (void *extra, bin_tree_t *node)
1207 {
1208   re_dfa_t *dfa = (re_dfa_t *) extra;
1209 
1210   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1211     {
1212       int idx = node->token.opr.idx;
1213       node->token.opr.idx = dfa->subexp_map[idx];
1214       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1215     }
1216 
1217   else if (node->token.type == SUBEXP
1218            && node->left && node->left->token.type == SUBEXP)
1219     {
1220       int other_idx = node->left->token.opr.idx;
1221 
1222       node->left = node->left->left;
1223       if (node->left)
1224         node->left->parent = node;
1225 
1226       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1227       if (other_idx < BITSET_WORD_BITS)
1228 	  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1229     }
1230 
1231   return REG_NOERROR;
1232 }
1233 
1234 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1235    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1236 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1237 lower_subexps (void *extra, bin_tree_t *node)
1238 {
1239   regex_t *preg = (regex_t *) extra;
1240   reg_errcode_t err = REG_NOERROR;
1241 
1242   if (node->left && node->left->token.type == SUBEXP)
1243     {
1244       node->left = lower_subexp (&err, preg, node->left);
1245       if (node->left)
1246 	node->left->parent = node;
1247     }
1248   if (node->right && node->right->token.type == SUBEXP)
1249     {
1250       node->right = lower_subexp (&err, preg, node->right);
1251       if (node->right)
1252 	node->right->parent = node;
1253     }
1254 
1255   return err;
1256 }
1257 
1258 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1259 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1260 {
1261   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1262   bin_tree_t *body = node->left;
1263   bin_tree_t *op, *cls, *tree1, *tree;
1264 
1265   if (preg->no_sub
1266       /* We do not optimize empty subexpressions, because otherwise we may
1267 	 have bad CONCAT nodes with NULL children.  This is obviously not
1268 	 very common, so we do not lose much.  An example that triggers
1269 	 this case is the sed "script" /\(\)/x.  */
1270       && node->left != NULL
1271       && (node->token.opr.idx >= BITSET_WORD_BITS
1272 	  || !(dfa->used_bkref_map
1273 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1274     return node->left;
1275 
1276   /* Convert the SUBEXP node to the concatenation of an
1277      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1278   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1279   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1280   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1281   tree = create_tree (dfa, op, tree1, CONCAT);
1282   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1283     {
1284       *err = REG_ESPACE;
1285       return NULL;
1286     }
1287 
1288   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1289   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1290   return tree;
1291 }
1292 
1293 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1294    nodes.  Requires a postorder visit.  */
1295 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1296 calc_first (void *extra, bin_tree_t *node)
1297 {
1298   re_dfa_t *dfa = (re_dfa_t *) extra;
1299   if (node->token.type == CONCAT)
1300     {
1301       node->first = node->left->first;
1302       node->node_idx = node->left->node_idx;
1303     }
1304   else
1305     {
1306       node->first = node;
1307       node->node_idx = re_dfa_add_node (dfa, node->token);
1308       if (BE (node->node_idx == -1, 0))
1309         return REG_ESPACE;
1310     }
1311   return REG_NOERROR;
1312 }
1313 
1314 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1315 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1316 calc_next (void *extra, bin_tree_t *node)
1317 {
1318   switch (node->token.type)
1319     {
1320     case OP_DUP_ASTERISK:
1321       node->left->next = node;
1322       break;
1323     case CONCAT:
1324       node->left->next = node->right->first;
1325       node->right->next = node->next;
1326       break;
1327     default:
1328       if (node->left)
1329 	node->left->next = node->next;
1330       if (node->right)
1331         node->right->next = node->next;
1332       break;
1333     }
1334   return REG_NOERROR;
1335 }
1336 
1337 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1338 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1339 link_nfa_nodes (void *extra, bin_tree_t *node)
1340 {
1341   re_dfa_t *dfa = (re_dfa_t *) extra;
1342   int idx = node->node_idx;
1343   reg_errcode_t err = REG_NOERROR;
1344 
1345   switch (node->token.type)
1346     {
1347     case CONCAT:
1348       break;
1349 
1350     case END_OF_RE:
1351       assert (node->next == NULL);
1352       break;
1353 
1354     case OP_DUP_ASTERISK:
1355     case OP_ALT:
1356       {
1357 	int left, right;
1358 	dfa->has_plural_match = 1;
1359 	if (node->left != NULL)
1360 	  left = node->left->first->node_idx;
1361 	else
1362 	  left = node->next->node_idx;
1363 	if (node->right != NULL)
1364 	  right = node->right->first->node_idx;
1365 	else
1366 	  right = node->next->node_idx;
1367 	assert (left > -1);
1368 	assert (right > -1);
1369 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1370       }
1371       break;
1372 
1373     case ANCHOR:
1374     case OP_OPEN_SUBEXP:
1375     case OP_CLOSE_SUBEXP:
1376       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1377       break;
1378 
1379     case OP_BACK_REF:
1380       dfa->nexts[idx] = node->next->node_idx;
1381       if (node->token.type == OP_BACK_REF)
1382 	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1383       break;
1384 
1385     default:
1386       assert (!IS_EPSILON_NODE (node->token.type));
1387       dfa->nexts[idx] = node->next->node_idx;
1388       break;
1389     }
1390 
1391   return err;
1392 }
1393 
1394 /* Duplicate the epsilon closure of the node ROOT_NODE.
1395    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1396    to their own constraint.  */
1397 
1398 static reg_errcode_t
1399 internal_function
duplicate_node_closure(re_dfa_t * dfa,int top_org_node,int top_clone_node,int root_node,unsigned int init_constraint)1400 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1401 			int root_node, unsigned int init_constraint)
1402 {
1403   int org_node, clone_node, ret;
1404   unsigned int constraint = init_constraint;
1405   for (org_node = top_org_node, clone_node = top_clone_node;;)
1406     {
1407       int org_dest, clone_dest;
1408       if (dfa->nodes[org_node].type == OP_BACK_REF)
1409 	{
1410 	  /* If the back reference epsilon-transit, its destination must
1411 	     also have the constraint.  Then duplicate the epsilon closure
1412 	     of the destination of the back reference, and store it in
1413 	     edests of the back reference.  */
1414 	  org_dest = dfa->nexts[org_node];
1415 	  re_node_set_empty (dfa->edests + clone_node);
1416 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1417 	  if (BE (clone_dest == -1, 0))
1418 	    return REG_ESPACE;
1419 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1420 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1421 	  if (BE (ret < 0, 0))
1422 	    return REG_ESPACE;
1423 	}
1424       else if (dfa->edests[org_node].nelem == 0)
1425 	{
1426 	  /* In case of the node can't epsilon-transit, don't duplicate the
1427 	     destination and store the original destination as the
1428 	     destination of the node.  */
1429 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1430 	  break;
1431 	}
1432       else if (dfa->edests[org_node].nelem == 1)
1433 	{
1434 	  /* In case of the node can epsilon-transit, and it has only one
1435 	     destination.  */
1436 	  org_dest = dfa->edests[org_node].elems[0];
1437 	  re_node_set_empty (dfa->edests + clone_node);
1438 	  if (dfa->nodes[org_node].type == ANCHOR)
1439 	    {
1440 	      /* In case of the node has another constraint, append it.  */
1441 	      if (org_node == root_node && clone_node != org_node)
1442 		{
1443 		  /* ...but if the node is root_node itself, it means the
1444 		     epsilon closure have a loop, then tie it to the
1445 		     destination of the root_node.  */
1446 		  ret = re_node_set_insert (dfa->edests + clone_node,
1447 					    org_dest);
1448 		  if (BE (ret < 0, 0))
1449 		    return REG_ESPACE;
1450 		  break;
1451 		}
1452 	      constraint |= dfa->nodes[org_node].opr.ctx_type;
1453 	    }
1454 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1455 	  if (BE (clone_dest == -1, 0))
1456 	    return REG_ESPACE;
1457 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1458 	  if (BE (ret < 0, 0))
1459 	    return REG_ESPACE;
1460 	}
1461       else /* dfa->edests[org_node].nelem == 2 */
1462 	{
1463 	  /* In case of the node can epsilon-transit, and it has two
1464 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1465 	  org_dest = dfa->edests[org_node].elems[0];
1466 	  re_node_set_empty (dfa->edests + clone_node);
1467 	  /* Search for a duplicated node which satisfies the constraint.  */
1468 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1469 	  if (clone_dest == -1)
1470 	    {
1471 	      /* There are no such a duplicated node, create a new one.  */
1472 	      reg_errcode_t err;
1473 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1474 	      if (BE (clone_dest == -1, 0))
1475 		return REG_ESPACE;
1476 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1477 	      if (BE (ret < 0, 0))
1478 		return REG_ESPACE;
1479 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1480 					    root_node, constraint);
1481 	      if (BE (err != REG_NOERROR, 0))
1482 		return err;
1483 	    }
1484 	  else
1485 	    {
1486 	      /* There are a duplicated node which satisfy the constraint,
1487 		 use it to avoid infinite loop.  */
1488 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489 	      if (BE (ret < 0, 0))
1490 		return REG_ESPACE;
1491 	    }
1492 
1493 	  org_dest = dfa->edests[org_node].elems[1];
1494 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1495 	  if (BE (clone_dest == -1, 0))
1496 	    return REG_ESPACE;
1497 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1498 	  if (BE (ret < 0, 0))
1499 	    return REG_ESPACE;
1500 	}
1501       org_node = org_dest;
1502       clone_node = clone_dest;
1503     }
1504   return REG_NOERROR;
1505 }
1506 
1507 /* Search for a node which is duplicated from the node ORG_NODE, and
1508    satisfies the constraint CONSTRAINT.  */
1509 
1510 static int
search_duplicated_node(const re_dfa_t * dfa,int org_node,unsigned int constraint)1511 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1512 			unsigned int constraint)
1513 {
1514   int idx;
1515   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1516     {
1517       if (org_node == dfa->org_indices[idx]
1518 	  && constraint == dfa->nodes[idx].constraint)
1519 	return idx; /* Found.  */
1520     }
1521   return -1; /* Not found.  */
1522 }
1523 
1524 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1525    Return the index of the new node, or -1 if insufficient storage is
1526    available.  */
1527 
1528 static int
duplicate_node(re_dfa_t * dfa,int org_idx,unsigned int constraint)1529 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1530 {
1531   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1532   if (BE (dup_idx != -1, 1))
1533     {
1534       dfa->nodes[dup_idx].constraint = constraint;
1535       if (dfa->nodes[org_idx].type == ANCHOR)
1536 	dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1537       dfa->nodes[dup_idx].duplicated = 1;
1538 
1539       /* Store the index of the original node.  */
1540       dfa->org_indices[dup_idx] = org_idx;
1541     }
1542   return dup_idx;
1543 }
1544 
1545 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1546 calc_inveclosure (re_dfa_t *dfa)
1547 {
1548   int src, idx, ret;
1549   for (idx = 0; idx < dfa->nodes_len; ++idx)
1550     re_node_set_init_empty (dfa->inveclosures + idx);
1551 
1552   for (src = 0; src < dfa->nodes_len; ++src)
1553     {
1554       int *elems = dfa->eclosures[src].elems;
1555       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1556 	{
1557 	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1558 	  if (BE (ret == -1, 0))
1559 	    return REG_ESPACE;
1560 	}
1561     }
1562 
1563   return REG_NOERROR;
1564 }
1565 
1566 /* Calculate "eclosure" for all the node in DFA.  */
1567 
1568 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1569 calc_eclosure (re_dfa_t *dfa)
1570 {
1571   int node_idx, incomplete;
1572 #ifdef DEBUG
1573   assert (dfa->nodes_len > 0);
1574 #endif
1575   incomplete = 0;
1576   /* For each nodes, calculate epsilon closure.  */
1577   for (node_idx = 0; ; ++node_idx)
1578     {
1579       reg_errcode_t err;
1580       re_node_set eclosure_elem;
1581       if (node_idx == dfa->nodes_len)
1582 	{
1583 	  if (!incomplete)
1584 	    break;
1585 	  incomplete = 0;
1586 	  node_idx = 0;
1587 	}
1588 
1589 #ifdef DEBUG
1590       assert (dfa->eclosures[node_idx].nelem != -1);
1591 #endif
1592 
1593       /* If we have already calculated, skip it.  */
1594       if (dfa->eclosures[node_idx].nelem != 0)
1595 	continue;
1596       /* Calculate epsilon closure of `node_idx'.  */
1597       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1598       if (BE (err != REG_NOERROR, 0))
1599 	return err;
1600 
1601       if (dfa->eclosures[node_idx].nelem == 0)
1602 	{
1603 	  incomplete = 1;
1604 	  re_node_set_free (&eclosure_elem);
1605 	}
1606     }
1607   return REG_NOERROR;
1608 }
1609 
1610 /* Calculate epsilon closure of NODE.  */
1611 
1612 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,int node,int root)1613 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1614 {
1615   reg_errcode_t err;
1616   unsigned int constraint;
1617   int i, incomplete;
1618   re_node_set eclosure;
1619   incomplete = 0;
1620   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1621   if (BE (err != REG_NOERROR, 0))
1622     return err;
1623 
1624   /* This indicates that we are calculating this node now.
1625      We reference this value to avoid infinite loop.  */
1626   dfa->eclosures[node].nelem = -1;
1627 
1628   constraint = ((dfa->nodes[node].type == ANCHOR)
1629 		? dfa->nodes[node].opr.ctx_type : 0);
1630   /* If the current node has constraints, duplicate all nodes.
1631      Since they must inherit the constraints.  */
1632   if (constraint
1633       && dfa->edests[node].nelem
1634       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1635     {
1636       err = duplicate_node_closure (dfa, node, node, node, constraint);
1637       if (BE (err != REG_NOERROR, 0))
1638 	return err;
1639     }
1640 
1641   /* Expand each epsilon destination nodes.  */
1642   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1643     for (i = 0; i < dfa->edests[node].nelem; ++i)
1644       {
1645 	re_node_set eclosure_elem;
1646 	int edest = dfa->edests[node].elems[i];
1647 	/* If calculating the epsilon closure of `edest' is in progress,
1648 	   return intermediate result.  */
1649 	if (dfa->eclosures[edest].nelem == -1)
1650 	  {
1651 	    incomplete = 1;
1652 	    continue;
1653 	  }
1654 	/* If we haven't calculated the epsilon closure of `edest' yet,
1655 	   calculate now. Otherwise use calculated epsilon closure.  */
1656 	if (dfa->eclosures[edest].nelem == 0)
1657 	  {
1658 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1659 	    if (BE (err != REG_NOERROR, 0))
1660 	      return err;
1661 	  }
1662 	else
1663 	  eclosure_elem = dfa->eclosures[edest];
1664 	/* Merge the epsilon closure of `edest'.  */
1665 	re_node_set_merge (&eclosure, &eclosure_elem);
1666 	/* If the epsilon closure of `edest' is incomplete,
1667 	   the epsilon closure of this node is also incomplete.  */
1668 	if (dfa->eclosures[edest].nelem == 0)
1669 	  {
1670 	    incomplete = 1;
1671 	    re_node_set_free (&eclosure_elem);
1672 	  }
1673       }
1674 
1675   /* Epsilon closures include itself.  */
1676   re_node_set_insert (&eclosure, node);
1677   if (incomplete && !root)
1678     dfa->eclosures[node].nelem = 0;
1679   else
1680     dfa->eclosures[node] = eclosure;
1681   *new_set = eclosure;
1682   return REG_NOERROR;
1683 }
1684 
1685 /* Functions for token which are used in the parser.  */
1686 
1687 /* Fetch a token from INPUT.
1688    We must not use this function inside bracket expressions.  */
1689 
1690 static void
1691 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1692 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1693 {
1694   re_string_skip_bytes (input, peek_token (result, input, syntax));
1695 }
1696 
1697 /* Peek a token from INPUT, and return the length of the token.
1698    We must not use this function inside bracket expressions.  */
1699 
1700 static int
1701 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1702 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1703 {
1704   unsigned char c;
1705 
1706   if (re_string_eoi (input))
1707     {
1708       token->type = END_OF_RE;
1709       return 0;
1710     }
1711 
1712   c = re_string_peek_byte (input, 0);
1713   token->opr.c = c;
1714 
1715   token->word_char = 0;
1716 #ifdef RE_ENABLE_I18N
1717   token->mb_partial = 0;
1718   if (input->mb_cur_max > 1 &&
1719       !re_string_first_byte (input, re_string_cur_idx (input)))
1720     {
1721       token->type = CHARACTER;
1722       token->mb_partial = 1;
1723       return 1;
1724     }
1725 #endif
1726   if (c == '\\')
1727     {
1728       unsigned char c2;
1729       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1730 	{
1731 	  token->type = BACK_SLASH;
1732 	  return 1;
1733 	}
1734 
1735       c2 = re_string_peek_byte_case (input, 1);
1736       token->opr.c = c2;
1737       token->type = CHARACTER;
1738 #ifdef RE_ENABLE_I18N
1739       if (input->mb_cur_max > 1)
1740 	{
1741 	  wint_t wc = re_string_wchar_at (input,
1742 					  re_string_cur_idx (input) + 1);
1743 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1744 	}
1745       else
1746 #endif
1747 	token->word_char = IS_WORD_CHAR (c2) != 0;
1748 
1749       switch (c2)
1750 	{
1751 	case '|':
1752 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1753 	    token->type = OP_ALT;
1754 	  break;
1755 	case '1': case '2': case '3': case '4': case '5':
1756 	case '6': case '7': case '8': case '9':
1757 	  if (!(syntax & RE_NO_BK_REFS))
1758 	    {
1759 	      token->type = OP_BACK_REF;
1760 	      token->opr.idx = c2 - '1';
1761 	    }
1762 	  break;
1763 	case '<':
1764 	  if (!(syntax & RE_NO_GNU_OPS))
1765 	    {
1766 	      token->type = ANCHOR;
1767 	      token->opr.ctx_type = WORD_FIRST;
1768 	    }
1769 	  break;
1770 	case '>':
1771 	  if (!(syntax & RE_NO_GNU_OPS))
1772 	    {
1773 	      token->type = ANCHOR;
1774 	      token->opr.ctx_type = WORD_LAST;
1775 	    }
1776 	  break;
1777 	case 'b':
1778 	  if (!(syntax & RE_NO_GNU_OPS))
1779 	    {
1780 	      token->type = ANCHOR;
1781 	      token->opr.ctx_type = WORD_DELIM;
1782 	    }
1783 	  break;
1784 	case 'B':
1785 	  if (!(syntax & RE_NO_GNU_OPS))
1786 	    {
1787 	      token->type = ANCHOR;
1788 	      token->opr.ctx_type = NOT_WORD_DELIM;
1789 	    }
1790 	  break;
1791 	case 'w':
1792 	  if (!(syntax & RE_NO_GNU_OPS))
1793 	    token->type = OP_WORD;
1794 	  break;
1795 	case 'W':
1796 	  if (!(syntax & RE_NO_GNU_OPS))
1797 	    token->type = OP_NOTWORD;
1798 	  break;
1799 	case 's':
1800 	  if (!(syntax & RE_NO_GNU_OPS))
1801 	    token->type = OP_SPACE;
1802 	  break;
1803 	case 'S':
1804 	  if (!(syntax & RE_NO_GNU_OPS))
1805 	    token->type = OP_NOTSPACE;
1806 	  break;
1807 	case '`':
1808 	  if (!(syntax & RE_NO_GNU_OPS))
1809 	    {
1810 	      token->type = ANCHOR;
1811 	      token->opr.ctx_type = BUF_FIRST;
1812 	    }
1813 	  break;
1814 	case '\'':
1815 	  if (!(syntax & RE_NO_GNU_OPS))
1816 	    {
1817 	      token->type = ANCHOR;
1818 	      token->opr.ctx_type = BUF_LAST;
1819 	    }
1820 	  break;
1821 	case '(':
1822 	  if (!(syntax & RE_NO_BK_PARENS))
1823 	    token->type = OP_OPEN_SUBEXP;
1824 	  break;
1825 	case ')':
1826 	  if (!(syntax & RE_NO_BK_PARENS))
1827 	    token->type = OP_CLOSE_SUBEXP;
1828 	  break;
1829 	case '+':
1830 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1831 	    token->type = OP_DUP_PLUS;
1832 	  break;
1833 	case '?':
1834 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1835 	    token->type = OP_DUP_QUESTION;
1836 	  break;
1837 	case '{':
1838 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1839 	    token->type = OP_OPEN_DUP_NUM;
1840 	  break;
1841 	case '}':
1842 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1843 	    token->type = OP_CLOSE_DUP_NUM;
1844 	  break;
1845 	default:
1846 	  break;
1847 	}
1848       return 2;
1849     }
1850 
1851   token->type = CHARACTER;
1852 #ifdef RE_ENABLE_I18N
1853   if (input->mb_cur_max > 1)
1854     {
1855       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1856       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1857     }
1858   else
1859 #endif
1860     token->word_char = IS_WORD_CHAR (token->opr.c);
1861 
1862   switch (c)
1863     {
1864     case '\n':
1865       if (syntax & RE_NEWLINE_ALT)
1866 	token->type = OP_ALT;
1867       break;
1868     case '|':
1869       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1870 	token->type = OP_ALT;
1871       break;
1872     case '*':
1873       token->type = OP_DUP_ASTERISK;
1874       break;
1875     case '+':
1876       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1877 	token->type = OP_DUP_PLUS;
1878       break;
1879     case '?':
1880       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1881 	token->type = OP_DUP_QUESTION;
1882       break;
1883     case '{':
1884       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1885 	token->type = OP_OPEN_DUP_NUM;
1886       break;
1887     case '}':
1888       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1889 	token->type = OP_CLOSE_DUP_NUM;
1890       break;
1891     case '(':
1892       if (syntax & RE_NO_BK_PARENS)
1893 	token->type = OP_OPEN_SUBEXP;
1894       break;
1895     case ')':
1896       if (syntax & RE_NO_BK_PARENS)
1897 	token->type = OP_CLOSE_SUBEXP;
1898       break;
1899     case '[':
1900       token->type = OP_OPEN_BRACKET;
1901       break;
1902     case '.':
1903       token->type = OP_PERIOD;
1904       break;
1905     case '^':
1906       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1907 	  re_string_cur_idx (input) != 0)
1908 	{
1909 	  char prev = re_string_peek_byte (input, -1);
1910 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1911 	    break;
1912 	}
1913       token->type = ANCHOR;
1914       token->opr.ctx_type = LINE_FIRST;
1915       break;
1916     case '$':
1917       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1918 	  re_string_cur_idx (input) + 1 != re_string_length (input))
1919 	{
1920 	  re_token_t next;
1921 	  re_string_skip_bytes (input, 1);
1922 	  peek_token (&next, input, syntax);
1923 	  re_string_skip_bytes (input, -1);
1924 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1925 	    break;
1926 	}
1927       token->type = ANCHOR;
1928       token->opr.ctx_type = LINE_LAST;
1929       break;
1930     default:
1931       break;
1932     }
1933   return 1;
1934 }
1935 
1936 /* Peek a token from INPUT, and return the length of the token.
1937    We must not use this function out of bracket expressions.  */
1938 
1939 static int
1940 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1941 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1942 {
1943   unsigned char c;
1944   if (re_string_eoi (input))
1945     {
1946       token->type = END_OF_RE;
1947       return 0;
1948     }
1949   c = re_string_peek_byte (input, 0);
1950   token->opr.c = c;
1951 
1952 #ifdef RE_ENABLE_I18N
1953   if (input->mb_cur_max > 1 &&
1954       !re_string_first_byte (input, re_string_cur_idx (input)))
1955     {
1956       token->type = CHARACTER;
1957       return 1;
1958     }
1959 #endif /* RE_ENABLE_I18N */
1960 
1961   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1962       && re_string_cur_idx (input) + 1 < re_string_length (input))
1963     {
1964       /* In this case, '\' escape a character.  */
1965       unsigned char c2;
1966       re_string_skip_bytes (input, 1);
1967       c2 = re_string_peek_byte (input, 0);
1968       token->opr.c = c2;
1969       token->type = CHARACTER;
1970       return 1;
1971     }
1972   if (c == '[') /* '[' is a special char in a bracket exps.  */
1973     {
1974       unsigned char c2;
1975       int token_len;
1976       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1977 	c2 = re_string_peek_byte (input, 1);
1978       else
1979 	c2 = 0;
1980       token->opr.c = c2;
1981       token_len = 2;
1982       switch (c2)
1983 	{
1984 	case '.':
1985 	  token->type = OP_OPEN_COLL_ELEM;
1986 	  break;
1987 	case '=':
1988 	  token->type = OP_OPEN_EQUIV_CLASS;
1989 	  break;
1990 	case ':':
1991 	  if (syntax & RE_CHAR_CLASSES)
1992 	    {
1993 	      token->type = OP_OPEN_CHAR_CLASS;
1994 	      break;
1995 	    }
1996 	  /* else fall through.  */
1997 	default:
1998 	  token->type = CHARACTER;
1999 	  token->opr.c = c;
2000 	  token_len = 1;
2001 	  break;
2002 	}
2003       return token_len;
2004     }
2005   switch (c)
2006     {
2007     case '-':
2008       token->type = OP_CHARSET_RANGE;
2009       break;
2010     case ']':
2011       token->type = OP_CLOSE_BRACKET;
2012       break;
2013     case '^':
2014       token->type = OP_NON_MATCH_LIST;
2015       break;
2016     default:
2017       token->type = CHARACTER;
2018     }
2019   return 1;
2020 }
2021 
2022 /* Functions for parser.  */
2023 
2024 /* Entry point of the parser.
2025    Parse the regular expression REGEXP and return the structure tree.
2026    If an error is occured, ERR is set by error code, and return NULL.
2027    This function build the following tree, from regular expression <reg_exp>:
2028 	   CAT
2029 	   / \
2030 	  /   \
2031    <reg_exp>  EOR
2032 
2033    CAT means concatenation.
2034    EOR means end of regular expression.  */
2035 
2036 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2037 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2038        reg_errcode_t *err)
2039 {
2040   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2041   bin_tree_t *tree, *eor, *root;
2042   re_token_t current_token;
2043   dfa->syntax = syntax;
2044   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2045   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2046   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2047     return NULL;
2048   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2049   if (tree != NULL)
2050     root = create_tree (dfa, tree, eor, CONCAT);
2051   else
2052     root = eor;
2053   if (BE (eor == NULL || root == NULL, 0))
2054     {
2055       *err = REG_ESPACE;
2056       return NULL;
2057     }
2058   return root;
2059 }
2060 
2061 /* This function build the following tree, from regular expression
2062    <branch1>|<branch2>:
2063 	   ALT
2064 	   / \
2065 	  /   \
2066    <branch1> <branch2>
2067 
2068    ALT means alternative, which represents the operator `|'.  */
2069 
2070 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2071 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2072 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2073 {
2074   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2075   bin_tree_t *tree, *branch = NULL;
2076   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2077   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2078     return NULL;
2079 
2080   while (token->type == OP_ALT)
2081     {
2082       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2083       if (token->type != OP_ALT && token->type != END_OF_RE
2084 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2085 	{
2086 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2087 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2088 	    return NULL;
2089 	}
2090       else
2091 	branch = NULL;
2092       tree = create_tree (dfa, tree, branch, OP_ALT);
2093       if (BE (tree == NULL, 0))
2094 	{
2095 	  *err = REG_ESPACE;
2096 	  return NULL;
2097 	}
2098     }
2099   return tree;
2100 }
2101 
2102 /* This function build the following tree, from regular expression
2103    <exp1><exp2>:
2104 	CAT
2105 	/ \
2106        /   \
2107    <exp1> <exp2>
2108 
2109    CAT means concatenation.  */
2110 
2111 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2112 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2113 	      reg_syntax_t syntax, int nest, reg_errcode_t *err)
2114 {
2115   bin_tree_t *tree, *exp;
2116   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2117   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2118   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2119     return NULL;
2120 
2121   while (token->type != OP_ALT && token->type != END_OF_RE
2122 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2123     {
2124       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2125       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2126 	{
2127 	  return NULL;
2128 	}
2129       if (tree != NULL && exp != NULL)
2130 	{
2131 	  tree = create_tree (dfa, tree, exp, CONCAT);
2132 	  if (tree == NULL)
2133 	    {
2134 	      *err = REG_ESPACE;
2135 	      return NULL;
2136 	    }
2137 	}
2138       else if (tree == NULL)
2139 	tree = exp;
2140       /* Otherwise exp == NULL, we don't need to create new tree.  */
2141     }
2142   return tree;
2143 }
2144 
2145 /* This function build the following tree, from regular expression a*:
2146 	 *
2147 	 |
2148 	 a
2149 */
2150 
2151 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2152 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2153 		  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2154 {
2155   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2156   bin_tree_t *tree;
2157   switch (token->type)
2158     {
2159     case CHARACTER:
2160       tree = create_token_tree (dfa, NULL, NULL, token);
2161       if (BE (tree == NULL, 0))
2162 	{
2163 	  *err = REG_ESPACE;
2164 	  return NULL;
2165 	}
2166 #ifdef RE_ENABLE_I18N
2167       if (dfa->mb_cur_max > 1)
2168 	{
2169 	  while (!re_string_eoi (regexp)
2170 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2171 	    {
2172 	      bin_tree_t *mbc_remain;
2173 	      fetch_token (token, regexp, syntax);
2174 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2175 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2176 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2177 		{
2178 		  *err = REG_ESPACE;
2179 		  return NULL;
2180 		}
2181 	    }
2182 	}
2183 #endif
2184       break;
2185     case OP_OPEN_SUBEXP:
2186       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2187       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2188 	return NULL;
2189       break;
2190     case OP_OPEN_BRACKET:
2191       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2192       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193 	return NULL;
2194       break;
2195     case OP_BACK_REF:
2196       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2197 	{
2198 	  *err = REG_ESUBREG;
2199 	  return NULL;
2200 	}
2201       dfa->used_bkref_map |= 1 << token->opr.idx;
2202       tree = create_token_tree (dfa, NULL, NULL, token);
2203       if (BE (tree == NULL, 0))
2204 	{
2205 	  *err = REG_ESPACE;
2206 	  return NULL;
2207 	}
2208       ++dfa->nbackref;
2209       dfa->has_mb_node = 1;
2210       break;
2211     case OP_OPEN_DUP_NUM:
2212       if (syntax & RE_CONTEXT_INVALID_DUP)
2213 	{
2214 	  *err = REG_BADRPT;
2215 	  return NULL;
2216 	}
2217       /* FALLTHROUGH */
2218     case OP_DUP_ASTERISK:
2219     case OP_DUP_PLUS:
2220     case OP_DUP_QUESTION:
2221       if (syntax & RE_CONTEXT_INVALID_OPS)
2222 	{
2223 	  *err = REG_BADRPT;
2224 	  return NULL;
2225 	}
2226       else if (syntax & RE_CONTEXT_INDEP_OPS)
2227 	{
2228 	  fetch_token (token, regexp, syntax);
2229 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2230 	}
2231       /* else fall through  */
2232     case OP_CLOSE_SUBEXP:
2233       if ((token->type == OP_CLOSE_SUBEXP) &&
2234 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2235 	{
2236 	  *err = REG_ERPAREN;
2237 	  return NULL;
2238 	}
2239       /* else fall through  */
2240     case OP_CLOSE_DUP_NUM:
2241       /* We treat it as a normal character.  */
2242 
2243       /* Then we can these characters as normal characters.  */
2244       token->type = CHARACTER;
2245       /* mb_partial and word_char bits should be initialized already
2246 	 by peek_token.  */
2247       tree = create_token_tree (dfa, NULL, NULL, token);
2248       if (BE (tree == NULL, 0))
2249 	{
2250 	  *err = REG_ESPACE;
2251 	  return NULL;
2252 	}
2253       break;
2254     case ANCHOR:
2255       if ((token->opr.ctx_type
2256 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2257 	  && dfa->word_ops_used == 0)
2258 	init_word_char (dfa);
2259       if (token->opr.ctx_type == WORD_DELIM
2260           || token->opr.ctx_type == NOT_WORD_DELIM)
2261 	{
2262 	  bin_tree_t *tree_first, *tree_last;
2263 	  if (token->opr.ctx_type == WORD_DELIM)
2264 	    {
2265 	      token->opr.ctx_type = WORD_FIRST;
2266 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2267 	      token->opr.ctx_type = WORD_LAST;
2268             }
2269           else
2270             {
2271 	      token->opr.ctx_type = INSIDE_WORD;
2272 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2273 	      token->opr.ctx_type = INSIDE_NOTWORD;
2274             }
2275 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2276 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2277 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2278 	    {
2279 	      *err = REG_ESPACE;
2280 	      return NULL;
2281 	    }
2282 	}
2283       else
2284 	{
2285 	  tree = create_token_tree (dfa, NULL, NULL, token);
2286 	  if (BE (tree == NULL, 0))
2287 	    {
2288 	      *err = REG_ESPACE;
2289 	      return NULL;
2290 	    }
2291 	}
2292       /* We must return here, since ANCHORs can't be followed
2293 	 by repetition operators.
2294 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2295 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2296       fetch_token (token, regexp, syntax);
2297       return tree;
2298     case OP_PERIOD:
2299       tree = create_token_tree (dfa, NULL, NULL, token);
2300       if (BE (tree == NULL, 0))
2301 	{
2302 	  *err = REG_ESPACE;
2303 	  return NULL;
2304 	}
2305       if (dfa->mb_cur_max > 1)
2306 	dfa->has_mb_node = 1;
2307       break;
2308     case OP_WORD:
2309     case OP_NOTWORD:
2310       tree = build_charclass_op (dfa, regexp->trans,
2311 				 (const unsigned char *) "alnum",
2312 				 (const unsigned char *) "_",
2313 				 token->type == OP_NOTWORD, err);
2314       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2315 	return NULL;
2316       break;
2317     case OP_SPACE:
2318     case OP_NOTSPACE:
2319       tree = build_charclass_op (dfa, regexp->trans,
2320 				 (const unsigned char *) "space",
2321 				 (const unsigned char *) "",
2322 				 token->type == OP_NOTSPACE, err);
2323       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2324 	return NULL;
2325       break;
2326     case OP_ALT:
2327     case END_OF_RE:
2328       return NULL;
2329     case BACK_SLASH:
2330       *err = REG_EESCAPE;
2331       return NULL;
2332     default:
2333       /* Must not happen?  */
2334 #ifdef DEBUG
2335       assert (0);
2336 #endif
2337       return NULL;
2338     }
2339   fetch_token (token, regexp, syntax);
2340 
2341   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2342 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2343     {
2344       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2345       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2346 	return NULL;
2347       /* In BRE consecutive duplications are not allowed.  */
2348       if ((syntax & RE_CONTEXT_INVALID_DUP)
2349 	  && (token->type == OP_DUP_ASTERISK
2350 	      || token->type == OP_OPEN_DUP_NUM))
2351 	{
2352 	  *err = REG_BADRPT;
2353 	  return NULL;
2354 	}
2355     }
2356 
2357   return tree;
2358 }
2359 
2360 /* This function build the following tree, from regular expression
2361    (<reg_exp>):
2362 	 SUBEXP
2363 	    |
2364 	<reg_exp>
2365 */
2366 
2367 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2368 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2369 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2370 {
2371   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2372   bin_tree_t *tree;
2373   size_t cur_nsub;
2374   cur_nsub = preg->re_nsub++;
2375 
2376   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2377 
2378   /* The subexpression may be a null string.  */
2379   if (token->type == OP_CLOSE_SUBEXP)
2380     tree = NULL;
2381   else
2382     {
2383       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2384       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2385         *err = REG_EPAREN;
2386       if (BE (*err != REG_NOERROR, 0))
2387 	return NULL;
2388     }
2389 
2390   if (cur_nsub <= '9' - '1')
2391     dfa->completed_bkref_map |= 1 << cur_nsub;
2392 
2393   tree = create_tree (dfa, tree, NULL, SUBEXP);
2394   if (BE (tree == NULL, 0))
2395     {
2396       *err = REG_ESPACE;
2397       return NULL;
2398     }
2399   tree->token.opr.idx = cur_nsub;
2400   return tree;
2401 }
2402 
2403 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2404 
2405 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2406 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2407 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2408 {
2409   bin_tree_t *tree = NULL, *old_tree = NULL;
2410   int i, start, end, start_idx = re_string_cur_idx (regexp);
2411   re_token_t start_token = *token;
2412 
2413   if (token->type == OP_OPEN_DUP_NUM)
2414     {
2415       end = 0;
2416       start = fetch_number (regexp, token, syntax);
2417       if (start == -1)
2418 	{
2419 	  if (token->type == CHARACTER && token->opr.c == ',')
2420 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2421 	  else
2422 	    {
2423 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2424 	      return NULL;
2425 	    }
2426 	}
2427       if (BE (start != -2, 1))
2428 	{
2429 	  /* We treat "{n}" as "{n,n}".  */
2430 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2431 		 : ((token->type == CHARACTER && token->opr.c == ',')
2432 		    ? fetch_number (regexp, token, syntax) : -2));
2433 	}
2434       if (BE (start == -2 || end == -2, 0))
2435 	{
2436 	  /* Invalid sequence.  */
2437 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2438 	    {
2439 	      if (token->type == END_OF_RE)
2440 		*err = REG_EBRACE;
2441 	      else
2442 		*err = REG_BADBR;
2443 
2444 	      return NULL;
2445 	    }
2446 
2447 	  /* If the syntax bit is set, rollback.  */
2448 	  re_string_set_index (regexp, start_idx);
2449 	  *token = start_token;
2450 	  token->type = CHARACTER;
2451 	  /* mb_partial and word_char bits should be already initialized by
2452 	     peek_token.  */
2453 	  return elem;
2454 	}
2455 
2456       if (BE (end != -1 && start > end, 0))
2457 	{
2458 	  /* First number greater than second.  */
2459 	  *err = REG_BADBR;
2460 	  return NULL;
2461 	}
2462     }
2463   else
2464     {
2465       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2466       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2467     }
2468 
2469   fetch_token (token, regexp, syntax);
2470 
2471   if (BE (elem == NULL, 0))
2472     return NULL;
2473   if (BE (start == 0 && end == 0, 0))
2474     {
2475       postorder (elem, free_tree, NULL);
2476       return NULL;
2477     }
2478 
2479   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2480   if (BE (start > 0, 0))
2481     {
2482       tree = elem;
2483       for (i = 2; i <= start; ++i)
2484 	{
2485 	  elem = duplicate_tree (elem, dfa);
2486 	  tree = create_tree (dfa, tree, elem, CONCAT);
2487 	  if (BE (elem == NULL || tree == NULL, 0))
2488 	    goto parse_dup_op_espace;
2489 	}
2490 
2491       if (start == end)
2492 	return tree;
2493 
2494       /* Duplicate ELEM before it is marked optional.  */
2495       elem = duplicate_tree (elem, dfa);
2496       old_tree = tree;
2497     }
2498   else
2499     old_tree = NULL;
2500 
2501   if (elem->token.type == SUBEXP)
2502 #ifdef _WIN64
2503     postorder (elem, mark_opt_subexp, (void *) (__int64) elem->token.opr.idx);
2504 #else
2505     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2506 #endif
2507 
2508   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2509   if (BE (tree == NULL, 0))
2510     goto parse_dup_op_espace;
2511 
2512   /* This loop is actually executed only when end != -1,
2513      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2514      already created the start+1-th copy.  */
2515   for (i = start + 2; i <= end; ++i)
2516     {
2517       elem = duplicate_tree (elem, dfa);
2518       tree = create_tree (dfa, tree, elem, CONCAT);
2519       if (BE (elem == NULL || tree == NULL, 0))
2520         goto parse_dup_op_espace;
2521 
2522       tree = create_tree (dfa, tree, NULL, OP_ALT);
2523       if (BE (tree == NULL, 0))
2524         goto parse_dup_op_espace;
2525     }
2526 
2527   if (old_tree)
2528     tree = create_tree (dfa, old_tree, tree, CONCAT);
2529 
2530   return tree;
2531 
2532  parse_dup_op_espace:
2533   *err = REG_ESPACE;
2534   return NULL;
2535 }
2536 
2537 /* Size of the names for collating symbol/equivalence_class/character_class.
2538    I'm not sure, but maybe enough.  */
2539 #define BRACKET_NAME_BUF_SIZE 32
2540 
2541 #ifndef _LIBC
2542   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2543      Build the range expression which starts from START_ELEM, and ends
2544      at END_ELEM.  The result are written to MBCSET and SBCSET.
2545      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2546      mbcset->range_ends, is a pointer argument sinse we may
2547      update it.  */
2548 
2549 static reg_errcode_t
2550 internal_function
2551 # ifdef RE_ENABLE_I18N
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,int * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2552 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2553 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2554 # else /* not RE_ENABLE_I18N */
2555 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2556 		 bracket_elem_t *end_elem)
2557 # endif /* not RE_ENABLE_I18N */
2558 {
2559   unsigned int start_ch, end_ch;
2560   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2561   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2562 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2563 	  0))
2564     return REG_ERANGE;
2565 
2566   /* We can handle no multi character collating elements without libc
2567      support.  */
2568   if (BE ((start_elem->type == COLL_SYM
2569 	   && strlen ((char *) start_elem->opr.name) > 1)
2570 	  || (end_elem->type == COLL_SYM
2571 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2572     return REG_ECOLLATE;
2573 
2574 # ifdef RE_ENABLE_I18N
2575   {
2576     wchar_t wc;
2577     wint_t start_wc;
2578     wint_t end_wc;
2579     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2580 
2581     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2582 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2583 		   : 0));
2584     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2585 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2586 		 : 0));
2587     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2588 		? __btowc (start_ch) : start_elem->opr.wch);
2589     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2590 	      ? __btowc (end_ch) : end_elem->opr.wch);
2591     if (start_wc == WEOF || end_wc == WEOF)
2592       return REG_ECOLLATE;
2593     cmp_buf[0] = start_wc;
2594     cmp_buf[4] = end_wc;
2595     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2596       return REG_ERANGE;
2597 
2598     /* Got valid collation sequence values, add them as a new entry.
2599        However, for !_LIBC we have no collation elements: if the
2600        character set is single byte, the single byte character set
2601        that we build below suffices.  parse_bracket_exp passes
2602        no MBCSET if dfa->mb_cur_max == 1.  */
2603     if (mbcset)
2604       {
2605         /* Check the space of the arrays.  */
2606         if (BE (*range_alloc == mbcset->nranges, 0))
2607           {
2608 	    /* There is not enough space, need realloc.  */
2609 	    wchar_t *new_array_start, *new_array_end;
2610 	    int new_nranges;
2611 
2612 	    /* +1 in case of mbcset->nranges is 0.  */
2613 	    new_nranges = 2 * mbcset->nranges + 1;
2614 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2615 	       are NULL if *range_alloc == 0.  */
2616 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2617 				          new_nranges);
2618 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2619 				        new_nranges);
2620 
2621 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2622 	      return REG_ESPACE;
2623 
2624 	    mbcset->range_starts = new_array_start;
2625 	    mbcset->range_ends = new_array_end;
2626 	    *range_alloc = new_nranges;
2627           }
2628 
2629         mbcset->range_starts[mbcset->nranges] = start_wc;
2630         mbcset->range_ends[mbcset->nranges++] = end_wc;
2631       }
2632 
2633     /* Build the table for single byte characters.  */
2634     for (wc = 0; wc < SBC_MAX; ++wc)
2635       {
2636 	cmp_buf[2] = wc;
2637 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2638 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2639 	  bitset_set (sbcset, wc);
2640       }
2641   }
2642 # else /* not RE_ENABLE_I18N */
2643   {
2644     unsigned int ch;
2645     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2646 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2647 		   : 0));
2648     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2649 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2650 		 : 0));
2651     if (start_ch > end_ch)
2652       return REG_ERANGE;
2653     /* Build the table for single byte characters.  */
2654     for (ch = 0; ch < SBC_MAX; ++ch)
2655       if (start_ch <= ch  && ch <= end_ch)
2656 	bitset_set (sbcset, ch);
2657   }
2658 # endif /* not RE_ENABLE_I18N */
2659   return REG_NOERROR;
2660 }
2661 #endif /* not _LIBC */
2662 
2663 #ifndef _LIBC
2664 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2665    Build the collating element which is represented by NAME.
2666    The result are written to MBCSET and SBCSET.
2667    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2668    pointer argument since we may update it.  */
2669 
2670 static reg_errcode_t
2671 internal_function
2672 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,int * coll_sym_alloc,const unsigned char * name)2673 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2674 			int *coll_sym_alloc, const unsigned char *name)
2675 # else /* not RE_ENABLE_I18N */
2676 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2677 # endif /* not RE_ENABLE_I18N */
2678 {
2679   size_t name_len = strlen ((const char *) name);
2680   if (BE (name_len != 1, 0))
2681     return REG_ECOLLATE;
2682   else
2683     {
2684       bitset_set (sbcset, name[0]);
2685       return REG_NOERROR;
2686     }
2687 }
2688 #endif /* not _LIBC */
2689 
2690 /* This function parse bracket expression like "[abc]", "[a-c]",
2691    "[[.a-a.]]" etc.  */
2692 
2693 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2694 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2695 		   reg_syntax_t syntax, reg_errcode_t *err)
2696 {
2697 #ifdef _LIBC
2698   const unsigned char *collseqmb;
2699   const char *collseqwc;
2700   uint32_t nrules;
2701   int32_t table_size;
2702   const int32_t *symb_table;
2703   const unsigned char *extra;
2704 
2705   /* Local function for parse_bracket_exp used in _LIBC environement.
2706      Seek the collating symbol entry correspondings to NAME.
2707      Return the index of the symbol in the SYMB_TABLE.  */
2708 
2709   auto inline int32_t
2710   __attribute ((always_inline))
2711   seek_collating_symbol_entry (name, name_len)
2712 	 const unsigned char *name;
2713 	 size_t name_len;
2714     {
2715       int32_t hash = elem_hash ((const char *) name, name_len);
2716       int32_t elem = hash % table_size;
2717       if (symb_table[2 * elem] != 0)
2718 	{
2719 	  int32_t second = hash % (table_size - 2) + 1;
2720 
2721 	  do
2722 	    {
2723 	      /* First compare the hashing value.  */
2724 	      if (symb_table[2 * elem] == hash
2725 		  /* Compare the length of the name.  */
2726 		  && name_len == extra[symb_table[2 * elem + 1]]
2727 		  /* Compare the name.  */
2728 		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2729 			     name_len) == 0)
2730 		{
2731 		  /* Yep, this is the entry.  */
2732 		  break;
2733 		}
2734 
2735 	      /* Next entry.  */
2736 	      elem += second;
2737 	    }
2738 	  while (symb_table[2 * elem] != 0);
2739 	}
2740       return elem;
2741     }
2742 
2743   /* Local function for parse_bracket_exp used in _LIBC environement.
2744      Look up the collation sequence value of BR_ELEM.
2745      Return the value if succeeded, UINT_MAX otherwise.  */
2746 
2747   auto inline unsigned int
2748   __attribute ((always_inline))
2749   lookup_collation_sequence_value (br_elem)
2750 	 bracket_elem_t *br_elem;
2751     {
2752       if (br_elem->type == SB_CHAR)
2753 	{
2754 	  /*
2755 	  if (MB_CUR_MAX == 1)
2756 	  */
2757 	  if (nrules == 0)
2758 	    return collseqmb[br_elem->opr.ch];
2759 	  else
2760 	    {
2761 	      wint_t wc = __btowc (br_elem->opr.ch);
2762 	      return __collseq_table_lookup (collseqwc, wc);
2763 	    }
2764 	}
2765       else if (br_elem->type == MB_CHAR)
2766 	{
2767 	  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2768 	}
2769       else if (br_elem->type == COLL_SYM)
2770 	{
2771 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2772 	  if (nrules != 0)
2773 	    {
2774 	      int32_t elem, idx;
2775 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2776 						  sym_name_len);
2777 	      if (symb_table[2 * elem] != 0)
2778 		{
2779 		  /* We found the entry.  */
2780 		  idx = symb_table[2 * elem + 1];
2781 		  /* Skip the name of collating element name.  */
2782 		  idx += 1 + extra[idx];
2783 		  /* Skip the byte sequence of the collating element.  */
2784 		  idx += 1 + extra[idx];
2785 		  /* Adjust for the alignment.  */
2786 		  idx = (idx + 3) & ~3;
2787 		  /* Skip the multibyte collation sequence value.  */
2788 		  idx += sizeof (unsigned int);
2789 		  /* Skip the wide char sequence of the collating element.  */
2790 		  idx += sizeof (unsigned int) *
2791 		    (1 + *(unsigned int *) (extra + idx));
2792 		  /* Return the collation sequence value.  */
2793 		  return *(unsigned int *) (extra + idx);
2794 		}
2795 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2796 		{
2797 		  /* No valid character.  Match it as a single byte
2798 		     character.  */
2799 		  return collseqmb[br_elem->opr.name[0]];
2800 		}
2801 	    }
2802 	  else if (sym_name_len == 1)
2803 	    return collseqmb[br_elem->opr.name[0]];
2804 	}
2805       return UINT_MAX;
2806     }
2807 
2808   /* Local function for parse_bracket_exp used in _LIBC environement.
2809      Build the range expression which starts from START_ELEM, and ends
2810      at END_ELEM.  The result are written to MBCSET and SBCSET.
2811      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2812      mbcset->range_ends, is a pointer argument sinse we may
2813      update it.  */
2814 
2815   auto inline reg_errcode_t
2816   __attribute ((always_inline))
2817   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2818 	 re_charset_t *mbcset;
2819 	 int *range_alloc;
2820 	 bitset_t sbcset;
2821 	 bracket_elem_t *start_elem, *end_elem;
2822     {
2823       unsigned int ch;
2824       uint32_t start_collseq;
2825       uint32_t end_collseq;
2826 
2827       /* Equivalence Classes and Character Classes can't be a range
2828 	 start/end.  */
2829       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2830 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2831 	      0))
2832 	return REG_ERANGE;
2833 
2834       start_collseq = lookup_collation_sequence_value (start_elem);
2835       end_collseq = lookup_collation_sequence_value (end_elem);
2836       /* Check start/end collation sequence values.  */
2837       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2838 	return REG_ECOLLATE;
2839       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2840 	return REG_ERANGE;
2841 
2842       /* Got valid collation sequence values, add them as a new entry.
2843 	 However, if we have no collation elements, and the character set
2844 	 is single byte, the single byte character set that we
2845 	 build below suffices. */
2846       if (nrules > 0 || dfa->mb_cur_max > 1)
2847 	{
2848           /* Check the space of the arrays.  */
2849           if (BE (*range_alloc == mbcset->nranges, 0))
2850 	    {
2851 	      /* There is not enough space, need realloc.  */
2852 	      uint32_t *new_array_start;
2853 	      uint32_t *new_array_end;
2854 	      int new_nranges;
2855 
2856 	      /* +1 in case of mbcset->nranges is 0.  */
2857 	      new_nranges = 2 * mbcset->nranges + 1;
2858 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2859 					    new_nranges);
2860 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2861 				          new_nranges);
2862 
2863 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2864 	        return REG_ESPACE;
2865 
2866 	      mbcset->range_starts = new_array_start;
2867 	      mbcset->range_ends = new_array_end;
2868 	      *range_alloc = new_nranges;
2869 	    }
2870 
2871           mbcset->range_starts[mbcset->nranges] = start_collseq;
2872           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2873 	}
2874 
2875       /* Build the table for single byte characters.  */
2876       for (ch = 0; ch < SBC_MAX; ch++)
2877 	{
2878 	  uint32_t ch_collseq;
2879 	  /*
2880 	  if (MB_CUR_MAX == 1)
2881 	  */
2882 	  if (nrules == 0)
2883 	    ch_collseq = collseqmb[ch];
2884 	  else
2885 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2886 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2887 	    bitset_set (sbcset, ch);
2888 	}
2889       return REG_NOERROR;
2890     }
2891 
2892   /* Local function for parse_bracket_exp used in _LIBC environement.
2893      Build the collating element which is represented by NAME.
2894      The result are written to MBCSET and SBCSET.
2895      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2896      pointer argument sinse we may update it.  */
2897 
2898   auto inline reg_errcode_t
2899   __attribute ((always_inline))
2900   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2901 	 re_charset_t *mbcset;
2902 	 int *coll_sym_alloc;
2903 	 bitset_t sbcset;
2904 	 const unsigned char *name;
2905     {
2906       int32_t elem, idx;
2907       size_t name_len = strlen ((const char *) name);
2908       if (nrules != 0)
2909 	{
2910 	  elem = seek_collating_symbol_entry (name, name_len);
2911 	  if (symb_table[2 * elem] != 0)
2912 	    {
2913 	      /* We found the entry.  */
2914 	      idx = symb_table[2 * elem + 1];
2915 	      /* Skip the name of collating element name.  */
2916 	      idx += 1 + extra[idx];
2917 	    }
2918 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
2919 	    {
2920 	      /* No valid character, treat it as a normal
2921 		 character.  */
2922 	      bitset_set (sbcset, name[0]);
2923 	      return REG_NOERROR;
2924 	    }
2925 	  else
2926 	    return REG_ECOLLATE;
2927 
2928 	  /* Got valid collation sequence, add it as a new entry.  */
2929 	  /* Check the space of the arrays.  */
2930 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2931 	    {
2932 	      /* Not enough, realloc it.  */
2933 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
2934 	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2935 	      /* Use realloc since mbcset->coll_syms is NULL
2936 		 if *alloc == 0.  */
2937 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2938 						   new_coll_sym_alloc);
2939 	      if (BE (new_coll_syms == NULL, 0))
2940 		return REG_ESPACE;
2941 	      mbcset->coll_syms = new_coll_syms;
2942 	      *coll_sym_alloc = new_coll_sym_alloc;
2943 	    }
2944 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2945 	  return REG_NOERROR;
2946 	}
2947       else
2948 	{
2949 	  if (BE (name_len != 1, 0))
2950 	    return REG_ECOLLATE;
2951 	  else
2952 	    {
2953 	      bitset_set (sbcset, name[0]);
2954 	      return REG_NOERROR;
2955 	    }
2956 	}
2957     }
2958 #endif
2959 
2960   re_token_t br_token;
2961   re_bitset_ptr_t sbcset;
2962 #ifdef RE_ENABLE_I18N
2963   re_charset_t *mbcset;
2964   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2965   int equiv_class_alloc = 0, char_class_alloc = 0;
2966 #endif /* not RE_ENABLE_I18N */
2967   int non_match = 0;
2968   bin_tree_t *work_tree;
2969   int token_len;
2970   int first_round = 1;
2971 #ifdef _LIBC
2972   collseqmb = (const unsigned char *)
2973     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2974   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2975   if (nrules)
2976     {
2977       /*
2978       if (MB_CUR_MAX > 1)
2979       */
2980       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2981       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2982       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2983 						  _NL_COLLATE_SYMB_TABLEMB);
2984       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2985 						   _NL_COLLATE_SYMB_EXTRAMB);
2986     }
2987 #endif
2988   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
2989 #ifdef RE_ENABLE_I18N
2990   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2991 #endif /* RE_ENABLE_I18N */
2992 #ifdef RE_ENABLE_I18N
2993   if (BE (sbcset == NULL || mbcset == NULL, 0))
2994 #else
2995   if (BE (sbcset == NULL, 0))
2996 #endif /* RE_ENABLE_I18N */
2997     {
2998       *err = REG_ESPACE;
2999       return NULL;
3000     }
3001 
3002   token_len = peek_token_bracket (token, regexp, syntax);
3003   if (BE (token->type == END_OF_RE, 0))
3004     {
3005       *err = REG_BADPAT;
3006       goto parse_bracket_exp_free_return;
3007     }
3008   if (token->type == OP_NON_MATCH_LIST)
3009     {
3010 #ifdef RE_ENABLE_I18N
3011       mbcset->non_match = 1;
3012 #endif /* not RE_ENABLE_I18N */
3013       non_match = 1;
3014       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3015 	bitset_set (sbcset, '\0');
3016       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3017       token_len = peek_token_bracket (token, regexp, syntax);
3018       if (BE (token->type == END_OF_RE, 0))
3019 	{
3020 	  *err = REG_BADPAT;
3021 	  goto parse_bracket_exp_free_return;
3022 	}
3023     }
3024 
3025   /* We treat the first ']' as a normal character.  */
3026   if (token->type == OP_CLOSE_BRACKET)
3027     token->type = CHARACTER;
3028 
3029   while (1)
3030     {
3031       bracket_elem_t start_elem, end_elem;
3032       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3033       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3034       reg_errcode_t ret;
3035       int token_len2 = 0, is_range_exp = 0;
3036       re_token_t token2;
3037 
3038       start_elem.opr.name = start_name_buf;
3039       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3040 				   syntax, first_round);
3041       if (BE (ret != REG_NOERROR, 0))
3042 	{
3043 	  *err = ret;
3044 	  goto parse_bracket_exp_free_return;
3045 	}
3046       first_round = 0;
3047 
3048       /* Get information about the next token.  We need it in any case.  */
3049       token_len = peek_token_bracket (token, regexp, syntax);
3050 
3051       /* Do not check for ranges if we know they are not allowed.  */
3052       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3053 	{
3054 	  if (BE (token->type == END_OF_RE, 0))
3055 	    {
3056 	      *err = REG_EBRACK;
3057 	      goto parse_bracket_exp_free_return;
3058 	    }
3059 	  if (token->type == OP_CHARSET_RANGE)
3060 	    {
3061 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3062 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3063 	      if (BE (token2.type == END_OF_RE, 0))
3064 		{
3065 		  *err = REG_EBRACK;
3066 		  goto parse_bracket_exp_free_return;
3067 		}
3068 	      if (token2.type == OP_CLOSE_BRACKET)
3069 		{
3070 		  /* We treat the last '-' as a normal character.  */
3071 		  re_string_skip_bytes (regexp, -token_len);
3072 		  token->type = CHARACTER;
3073 		}
3074 	      else
3075 		is_range_exp = 1;
3076 	    }
3077 	}
3078 
3079       if (is_range_exp == 1)
3080 	{
3081 	  end_elem.opr.name = end_name_buf;
3082 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3083 				       dfa, syntax, 1);
3084 	  if (BE (ret != REG_NOERROR, 0))
3085 	    {
3086 	      *err = ret;
3087 	      goto parse_bracket_exp_free_return;
3088 	    }
3089 
3090 	  token_len = peek_token_bracket (token, regexp, syntax);
3091 
3092 #ifdef _LIBC
3093 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3094 				  &start_elem, &end_elem);
3095 #else
3096 # ifdef RE_ENABLE_I18N
3097 	  *err = build_range_exp (sbcset,
3098 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3099 				  &range_alloc, &start_elem, &end_elem);
3100 # else
3101 	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3102 # endif
3103 #endif /* RE_ENABLE_I18N */
3104 	  if (BE (*err != REG_NOERROR, 0))
3105 	    goto parse_bracket_exp_free_return;
3106 	}
3107       else
3108 	{
3109 	  switch (start_elem.type)
3110 	    {
3111 	    case SB_CHAR:
3112 	      bitset_set (sbcset, start_elem.opr.ch);
3113 	      break;
3114 #ifdef RE_ENABLE_I18N
3115 	    case MB_CHAR:
3116 	      /* Check whether the array has enough space.  */
3117 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3118 		{
3119 		  wchar_t *new_mbchars;
3120 		  /* Not enough, realloc it.  */
3121 		  /* +1 in case of mbcset->nmbchars is 0.  */
3122 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3123 		  /* Use realloc since array is NULL if *alloc == 0.  */
3124 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3125 					    mbchar_alloc);
3126 		  if (BE (new_mbchars == NULL, 0))
3127 		    goto parse_bracket_exp_espace;
3128 		  mbcset->mbchars = new_mbchars;
3129 		}
3130 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3131 	      break;
3132 #endif /* RE_ENABLE_I18N */
3133 	    case EQUIV_CLASS:
3134 	      *err = build_equiv_class (sbcset,
3135 #ifdef RE_ENABLE_I18N
3136 					mbcset, &equiv_class_alloc,
3137 #endif /* RE_ENABLE_I18N */
3138 					start_elem.opr.name);
3139 	      if (BE (*err != REG_NOERROR, 0))
3140 		goto parse_bracket_exp_free_return;
3141 	      break;
3142 	    case COLL_SYM:
3143 	      *err = build_collating_symbol (sbcset,
3144 #ifdef RE_ENABLE_I18N
3145 					     mbcset, &coll_sym_alloc,
3146 #endif /* RE_ENABLE_I18N */
3147 					     start_elem.opr.name);
3148 	      if (BE (*err != REG_NOERROR, 0))
3149 		goto parse_bracket_exp_free_return;
3150 	      break;
3151 	    case CHAR_CLASS:
3152 	      *err = build_charclass (regexp->trans, sbcset,
3153 #ifdef RE_ENABLE_I18N
3154 				      mbcset, &char_class_alloc,
3155 #endif /* RE_ENABLE_I18N */
3156 				      start_elem.opr.name, syntax);
3157 	      if (BE (*err != REG_NOERROR, 0))
3158 	       goto parse_bracket_exp_free_return;
3159 	      break;
3160 	    default:
3161 	      assert (0);
3162 	      break;
3163 	    }
3164 	}
3165       if (BE (token->type == END_OF_RE, 0))
3166 	{
3167 	  *err = REG_EBRACK;
3168 	  goto parse_bracket_exp_free_return;
3169 	}
3170       if (token->type == OP_CLOSE_BRACKET)
3171 	break;
3172     }
3173 
3174   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3175 
3176   /* If it is non-matching list.  */
3177   if (non_match)
3178     bitset_not (sbcset);
3179 
3180 #ifdef RE_ENABLE_I18N
3181   /* Ensure only single byte characters are set.  */
3182   if (dfa->mb_cur_max > 1)
3183     bitset_mask (sbcset, dfa->sb_char);
3184 
3185   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3186       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3187 						     || mbcset->non_match)))
3188     {
3189       bin_tree_t *mbc_tree;
3190       int sbc_idx;
3191       /* Build a tree for complex bracket.  */
3192       dfa->has_mb_node = 1;
3193       br_token.type = COMPLEX_BRACKET;
3194       br_token.opr.mbcset = mbcset;
3195       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3196       if (BE (mbc_tree == NULL, 0))
3197 	goto parse_bracket_exp_espace;
3198       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3199 	if (sbcset[sbc_idx])
3200 	  break;
3201       /* If there are no bits set in sbcset, there is no point
3202 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3203       if (sbc_idx < BITSET_WORDS)
3204 	{
3205           /* Build a tree for simple bracket.  */
3206           br_token.type = SIMPLE_BRACKET;
3207           br_token.opr.sbcset = sbcset;
3208           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3209           if (BE (work_tree == NULL, 0))
3210             goto parse_bracket_exp_espace;
3211 
3212           /* Then join them by ALT node.  */
3213           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3214           if (BE (work_tree == NULL, 0))
3215             goto parse_bracket_exp_espace;
3216 	}
3217       else
3218 	{
3219 	  re_free (sbcset);
3220 	  work_tree = mbc_tree;
3221 	}
3222     }
3223   else
3224 #endif /* not RE_ENABLE_I18N */
3225     {
3226 #ifdef RE_ENABLE_I18N
3227       free_charset (mbcset);
3228 #endif
3229       /* Build a tree for simple bracket.  */
3230       br_token.type = SIMPLE_BRACKET;
3231       br_token.opr.sbcset = sbcset;
3232       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3233       if (BE (work_tree == NULL, 0))
3234         goto parse_bracket_exp_espace;
3235     }
3236   return work_tree;
3237 
3238  parse_bracket_exp_espace:
3239   *err = REG_ESPACE;
3240  parse_bracket_exp_free_return:
3241   re_free (sbcset);
3242 #ifdef RE_ENABLE_I18N
3243   free_charset (mbcset);
3244 #endif /* RE_ENABLE_I18N */
3245   return NULL;
3246 }
3247 
3248 /* Parse an element in the bracket expression.  */
3249 
3250 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,int accept_hyphen)3251 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3252 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3253 		       reg_syntax_t syntax, int accept_hyphen)
3254 {
3255 #ifdef RE_ENABLE_I18N
3256   int cur_char_size;
3257   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3258   if (cur_char_size > 1)
3259     {
3260       elem->type = MB_CHAR;
3261       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3262       re_string_skip_bytes (regexp, cur_char_size);
3263       return REG_NOERROR;
3264     }
3265 #endif /* RE_ENABLE_I18N */
3266   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3267   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3268       || token->type == OP_OPEN_EQUIV_CLASS)
3269     return parse_bracket_symbol (elem, regexp, token);
3270   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3271     {
3272       /* A '-' must only appear as anything but a range indicator before
3273 	 the closing bracket.  Everything else is an error.  */
3274       re_token_t token2;
3275       (void) peek_token_bracket (&token2, regexp, syntax);
3276       if (token2.type != OP_CLOSE_BRACKET)
3277 	/* The actual error value is not standardized since this whole
3278 	   case is undefined.  But ERANGE makes good sense.  */
3279 	return REG_ERANGE;
3280     }
3281   elem->type = SB_CHAR;
3282   elem->opr.ch = token->opr.c;
3283   return REG_NOERROR;
3284 }
3285 
3286 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3287    such as [:<character_class>:], [.<collating_element>.], and
3288    [=<equivalent_class>=].  */
3289 
3290 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3291 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3292 		      re_token_t *token)
3293 {
3294   unsigned char ch, delim = token->opr.c;
3295   int i = 0;
3296   if (re_string_eoi(regexp))
3297     return REG_EBRACK;
3298   for (;; ++i)
3299     {
3300       if (i >= BRACKET_NAME_BUF_SIZE)
3301 	return REG_EBRACK;
3302       if (token->type == OP_OPEN_CHAR_CLASS)
3303 	ch = re_string_fetch_byte_case (regexp);
3304       else
3305 	ch = re_string_fetch_byte (regexp);
3306       if (re_string_eoi(regexp))
3307 	return REG_EBRACK;
3308       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3309 	break;
3310       elem->opr.name[i] = ch;
3311     }
3312   re_string_skip_bytes (regexp, 1);
3313   elem->opr.name[i] = '\0';
3314   switch (token->type)
3315     {
3316     case OP_OPEN_COLL_ELEM:
3317       elem->type = COLL_SYM;
3318       break;
3319     case OP_OPEN_EQUIV_CLASS:
3320       elem->type = EQUIV_CLASS;
3321       break;
3322     case OP_OPEN_CHAR_CLASS:
3323       elem->type = CHAR_CLASS;
3324       break;
3325     default:
3326       break;
3327     }
3328   return REG_NOERROR;
3329 }
3330 
3331   /* Helper function for parse_bracket_exp.
3332      Build the equivalence class which is represented by NAME.
3333      The result are written to MBCSET and SBCSET.
3334      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3335      is a pointer argument sinse we may update it.  */
3336 
3337 static reg_errcode_t
3338 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,int * equiv_class_alloc,const unsigned char * name)3339 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3340 		   int *equiv_class_alloc, const unsigned char *name)
3341 #else /* not RE_ENABLE_I18N */
3342 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3343 #endif /* not RE_ENABLE_I18N */
3344 {
3345 #ifdef _LIBC
3346   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3347   if (nrules != 0)
3348     {
3349       const int32_t *table, *indirect;
3350       const unsigned char *weights, *extra, *cp;
3351       unsigned char char_buf[2];
3352       int32_t idx1, idx2;
3353       unsigned int ch;
3354       size_t len;
3355       /* This #include defines a local function!  */
3356 # include <locale/weight.h>
3357       /* Calculate the index for equivalence class.  */
3358       cp = name;
3359       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3360       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3361 					       _NL_COLLATE_WEIGHTMB);
3362       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3363 						   _NL_COLLATE_EXTRAMB);
3364       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3365 						_NL_COLLATE_INDIRECTMB);
3366       idx1 = findidx (&cp);
3367       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3368 	/* This isn't a valid character.  */
3369 	return REG_ECOLLATE;
3370 
3371       /* Build single byte matcing table for this equivalence class.  */
3372       char_buf[1] = (unsigned char) '\0';
3373       len = weights[idx1];
3374       for (ch = 0; ch < SBC_MAX; ++ch)
3375 	{
3376 	  char_buf[0] = ch;
3377 	  cp = char_buf;
3378 	  idx2 = findidx (&cp);
3379 /*
3380 	  idx2 = table[ch];
3381 */
3382 	  if (idx2 == 0)
3383 	    /* This isn't a valid character.  */
3384 	    continue;
3385 	  if (len == weights[idx2])
3386 	    {
3387 	      int cnt = 0;
3388 	      while (cnt <= len &&
3389 		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3390 		++cnt;
3391 
3392 	      if (cnt > len)
3393 		bitset_set (sbcset, ch);
3394 	    }
3395 	}
3396       /* Check whether the array has enough space.  */
3397       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3398 	{
3399 	  /* Not enough, realloc it.  */
3400 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3401 	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3402 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3403 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3404 						   int32_t,
3405 						   new_equiv_class_alloc);
3406 	  if (BE (new_equiv_classes == NULL, 0))
3407 	    return REG_ESPACE;
3408 	  mbcset->equiv_classes = new_equiv_classes;
3409 	  *equiv_class_alloc = new_equiv_class_alloc;
3410 	}
3411       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3412     }
3413   else
3414 #endif /* _LIBC */
3415     {
3416       if (BE (strlen ((const char *) name) != 1, 0))
3417 	return REG_ECOLLATE;
3418       bitset_set (sbcset, *name);
3419     }
3420   return REG_NOERROR;
3421 }
3422 
3423   /* Helper function for parse_bracket_exp.
3424      Build the character class which is represented by NAME.
3425      The result are written to MBCSET and SBCSET.
3426      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3427      is a pointer argument sinse we may update it.  */
3428 
3429 static reg_errcode_t
3430 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,int * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3431 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3432 		 re_charset_t *mbcset, int *char_class_alloc,
3433 		 const unsigned char *class_name, reg_syntax_t syntax)
3434 #else /* not RE_ENABLE_I18N */
3435 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3436 		 const unsigned char *class_name, reg_syntax_t syntax)
3437 #endif /* not RE_ENABLE_I18N */
3438 {
3439   int i;
3440   const char *name = (const char *) class_name;
3441 
3442   /* In case of REG_ICASE "upper" and "lower" match the both of
3443      upper and lower cases.  */
3444   if ((syntax & RE_ICASE)
3445       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3446     name = "alpha";
3447 
3448 #ifdef RE_ENABLE_I18N
3449   /* Check the space of the arrays.  */
3450   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3451     {
3452       /* Not enough, realloc it.  */
3453       /* +1 in case of mbcset->nchar_classes is 0.  */
3454       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3455       /* Use realloc since array is NULL if *alloc == 0.  */
3456       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3457 					       new_char_class_alloc);
3458       if (BE (new_char_classes == NULL, 0))
3459 	return REG_ESPACE;
3460       mbcset->char_classes = new_char_classes;
3461       *char_class_alloc = new_char_class_alloc;
3462     }
3463   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3464 #endif /* RE_ENABLE_I18N */
3465 
3466 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3467   do {						\
3468     if (BE (trans != NULL, 0))			\
3469       {						\
3470 	for (i = 0; i < SBC_MAX; ++i)		\
3471   	  if (ctype_func (i))			\
3472 	    bitset_set (sbcset, trans[i]);	\
3473       }						\
3474     else					\
3475       {						\
3476 	for (i = 0; i < SBC_MAX; ++i)		\
3477   	  if (ctype_func (i))			\
3478 	    bitset_set (sbcset, i);		\
3479       }						\
3480   } while (0)
3481 
3482   if (strcmp (name, "alnum") == 0)
3483     BUILD_CHARCLASS_LOOP (isalnum);
3484   else if (strcmp (name, "cntrl") == 0)
3485     BUILD_CHARCLASS_LOOP (iscntrl);
3486   else if (strcmp (name, "lower") == 0)
3487     BUILD_CHARCLASS_LOOP (islower);
3488   else if (strcmp (name, "space") == 0)
3489     BUILD_CHARCLASS_LOOP (isspace);
3490   else if (strcmp (name, "alpha") == 0)
3491     BUILD_CHARCLASS_LOOP (isalpha);
3492   else if (strcmp (name, "digit") == 0)
3493     BUILD_CHARCLASS_LOOP (isdigit);
3494   else if (strcmp (name, "print") == 0)
3495     BUILD_CHARCLASS_LOOP (isprint);
3496   else if (strcmp (name, "upper") == 0)
3497     BUILD_CHARCLASS_LOOP (isupper);
3498   else if (strcmp (name, "blank") == 0)
3499     BUILD_CHARCLASS_LOOP (isblank);
3500   else if (strcmp (name, "graph") == 0)
3501     BUILD_CHARCLASS_LOOP (isgraph);
3502   else if (strcmp (name, "punct") == 0)
3503     BUILD_CHARCLASS_LOOP (ispunct);
3504   else if (strcmp (name, "xdigit") == 0)
3505     BUILD_CHARCLASS_LOOP (isxdigit);
3506   else
3507     return REG_ECTYPE;
3508 
3509   return REG_NOERROR;
3510 }
3511 
3512 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,int non_match,reg_errcode_t * err)3513 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3514 		    const unsigned char *class_name,
3515 		    const unsigned char *extra, int non_match,
3516 		    reg_errcode_t *err)
3517 {
3518   re_bitset_ptr_t sbcset;
3519 #ifdef RE_ENABLE_I18N
3520   re_charset_t *mbcset;
3521   int alloc = 0;
3522 #endif /* not RE_ENABLE_I18N */
3523   reg_errcode_t ret;
3524   re_token_t br_token;
3525   bin_tree_t *tree;
3526 
3527   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3528 #ifdef RE_ENABLE_I18N
3529   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3530 #endif /* RE_ENABLE_I18N */
3531 
3532 #ifdef RE_ENABLE_I18N
3533   if (BE (sbcset == NULL || mbcset == NULL, 0))
3534 #else /* not RE_ENABLE_I18N */
3535   if (BE (sbcset == NULL, 0))
3536 #endif /* not RE_ENABLE_I18N */
3537     {
3538       *err = REG_ESPACE;
3539       return NULL;
3540     }
3541 
3542   if (non_match)
3543     {
3544 #ifdef RE_ENABLE_I18N
3545       /*
3546       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3547 	bitset_set(cset->sbcset, '\0');
3548       */
3549       mbcset->non_match = 1;
3550 #endif /* not RE_ENABLE_I18N */
3551     }
3552 
3553   /* We don't care the syntax in this case.  */
3554   ret = build_charclass (trans, sbcset,
3555 #ifdef RE_ENABLE_I18N
3556 			 mbcset, &alloc,
3557 #endif /* RE_ENABLE_I18N */
3558 			 class_name, 0);
3559 
3560   if (BE (ret != REG_NOERROR, 0))
3561     {
3562       re_free (sbcset);
3563 #ifdef RE_ENABLE_I18N
3564       free_charset (mbcset);
3565 #endif /* RE_ENABLE_I18N */
3566       *err = ret;
3567       return NULL;
3568     }
3569   /* \w match '_' also.  */
3570   for (; *extra; extra++)
3571     bitset_set (sbcset, *extra);
3572 
3573   /* If it is non-matching list.  */
3574   if (non_match)
3575     bitset_not (sbcset);
3576 
3577 #ifdef RE_ENABLE_I18N
3578   /* Ensure only single byte characters are set.  */
3579   if (dfa->mb_cur_max > 1)
3580     bitset_mask (sbcset, dfa->sb_char);
3581 #endif
3582 
3583   /* Build a tree for simple bracket.  */
3584   br_token.type = SIMPLE_BRACKET;
3585   br_token.opr.sbcset = sbcset;
3586   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3587   if (BE (tree == NULL, 0))
3588     goto build_word_op_espace;
3589 
3590 #ifdef RE_ENABLE_I18N
3591   if (dfa->mb_cur_max > 1)
3592     {
3593       bin_tree_t *mbc_tree;
3594       /* Build a tree for complex bracket.  */
3595       br_token.type = COMPLEX_BRACKET;
3596       br_token.opr.mbcset = mbcset;
3597       dfa->has_mb_node = 1;
3598       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3599       if (BE (mbc_tree == NULL, 0))
3600 	goto build_word_op_espace;
3601       /* Then join them by ALT node.  */
3602       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3603       if (BE (mbc_tree != NULL, 1))
3604 	return tree;
3605     }
3606   else
3607     {
3608       free_charset (mbcset);
3609       return tree;
3610     }
3611 #else /* not RE_ENABLE_I18N */
3612   return tree;
3613 #endif /* not RE_ENABLE_I18N */
3614 
3615  build_word_op_espace:
3616   re_free (sbcset);
3617 #ifdef RE_ENABLE_I18N
3618   free_charset (mbcset);
3619 #endif /* RE_ENABLE_I18N */
3620   *err = REG_ESPACE;
3621   return NULL;
3622 }
3623 
3624 /* This is intended for the expressions like "a{1,3}".
3625    Fetch a number from `input', and return the number.
3626    Return -1, if the number field is empty like "{,1}".
3627    Return -2, If an error is occured.  */
3628 
3629 static int
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3630 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3631 {
3632   int num = -1;
3633   unsigned char c;
3634   while (1)
3635     {
3636       fetch_token (token, input, syntax);
3637       c = token->opr.c;
3638       if (BE (token->type == END_OF_RE, 0))
3639 	return -2;
3640       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3641 	break;
3642       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3643 	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3644       num = (num > RE_DUP_MAX) ? -2 : num;
3645     }
3646   return num;
3647 }
3648 
3649 #ifdef RE_ENABLE_I18N
3650 static void
free_charset(re_charset_t * cset)3651 free_charset (re_charset_t *cset)
3652 {
3653   re_free (cset->mbchars);
3654 # ifdef _LIBC
3655   re_free (cset->coll_syms);
3656   re_free (cset->equiv_classes);
3657   re_free (cset->range_starts);
3658   re_free (cset->range_ends);
3659 # endif
3660   re_free (cset->char_classes);
3661   re_free (cset);
3662 }
3663 #endif /* RE_ENABLE_I18N */
3664 
3665 /* Functions for binary tree operation.  */
3666 
3667 /* Create a tree node.  */
3668 
3669 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3670 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3671 	     re_token_type_t type)
3672 {
3673   re_token_t t;
3674   t.type = type;
3675   return create_token_tree (dfa, left, right, &t);
3676 }
3677 
3678 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3679 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3680 		   const re_token_t *token)
3681 {
3682   bin_tree_t *tree;
3683   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3684     {
3685       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3686 
3687       if (storage == NULL)
3688 	return NULL;
3689       storage->next = dfa->str_tree_storage;
3690       dfa->str_tree_storage = storage;
3691       dfa->str_tree_storage_idx = 0;
3692     }
3693   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3694 
3695   tree->parent = NULL;
3696   tree->left = left;
3697   tree->right = right;
3698   tree->token = *token;
3699   tree->token.duplicated = 0;
3700   tree->token.opt_subexp = 0;
3701   tree->first = NULL;
3702   tree->next = NULL;
3703   tree->node_idx = -1;
3704 
3705   if (left != NULL)
3706     left->parent = tree;
3707   if (right != NULL)
3708     right->parent = tree;
3709   return tree;
3710 }
3711 
3712 /* Mark the tree SRC as an optional subexpression.
3713    To be called from preorder or postorder.  */
3714 
3715 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3716 mark_opt_subexp (void *extra, bin_tree_t *node)
3717 {
3718 #ifdef _WIN64
3719   int idx = (int) (__int64) extra;
3720 #else
3721   int idx = (int) (long) extra;
3722 #endif
3723   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3724     node->token.opt_subexp = 1;
3725 
3726   return REG_NOERROR;
3727 }
3728 
3729 /* Free the allocated memory inside NODE. */
3730 
3731 static void
free_token(re_token_t * node)3732 free_token (re_token_t *node)
3733 {
3734 #ifdef RE_ENABLE_I18N
3735   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3736     free_charset (node->opr.mbcset);
3737   else
3738 #endif /* RE_ENABLE_I18N */
3739     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3740       re_free (node->opr.sbcset);
3741 }
3742 
3743 /* Worker function for tree walking.  Free the allocated memory inside NODE
3744    and its children. */
3745 
3746 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3747 free_tree (void *extra, bin_tree_t *node)
3748 {
3749   free_token (&node->token);
3750   return REG_NOERROR;
3751 }
3752 
3753 
3754 /* Duplicate the node SRC, and return new node.  This is a preorder
3755    visit similar to the one implemented by the generic visitor, but
3756    we need more infrastructure to maintain two parallel trees --- so,
3757    it's easier to duplicate.  */
3758 
3759 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3760 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3761 {
3762   const bin_tree_t *node;
3763   bin_tree_t *dup_root;
3764   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3765 
3766   for (node = root; ; )
3767     {
3768       /* Create a new tree and link it back to the current parent.  */
3769       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3770       if (*p_new == NULL)
3771 	return NULL;
3772       (*p_new)->parent = dup_node;
3773       (*p_new)->token.duplicated = 1;
3774       dup_node = *p_new;
3775 
3776       /* Go to the left node, or up and to the right.  */
3777       if (node->left)
3778 	{
3779 	  node = node->left;
3780 	  p_new = &dup_node->left;
3781 	}
3782       else
3783 	{
3784 	  const bin_tree_t *prev = NULL;
3785 	  while (node->right == prev || node->right == NULL)
3786 	    {
3787 	      prev = node;
3788 	      node = node->parent;
3789 	      dup_node = dup_node->parent;
3790 	      if (!node)
3791 	        return dup_root;
3792 	    }
3793 	  node = node->right;
3794 	  p_new = &dup_node->right;
3795 	}
3796     }
3797 }
3798