1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, write to the Free
18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20 
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 					  size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 				     const re_dfastate_t *init_state,
25 				     char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 			       reg_errcode_t (fn (void *, bin_tree_t *)),
38 			       void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 				reg_errcode_t (fn (void *, bin_tree_t *)),
41 				void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 				 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 				   unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 					 int node, int root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
57 			 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 			reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 			  reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 				  re_token_t *token, reg_syntax_t syntax,
64 				  int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 				 re_token_t *token, reg_syntax_t syntax,
67 				 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 				     re_token_t *token, reg_syntax_t syntax,
70 				     int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 				  re_token_t *token, reg_syntax_t syntax,
73 				  int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 				 re_dfa_t *dfa, re_token_t *token,
76 				 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 				      re_token_t *token, reg_syntax_t syntax,
79 				      reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 					    re_string_t *regexp,
82 					    re_token_t *token, int token_len,
83 					    re_dfa_t *dfa,
84 					    reg_syntax_t syntax,
85 					    int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 					  re_string_t *regexp,
88 					  re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 					re_charset_t *mbcset,
92 					int *equiv_class_alloc,
93 					const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 				      bitset_t sbcset,
96 				      re_charset_t *mbcset,
97 				      int *char_class_alloc,
98 				      const unsigned char *class_name,
99 				      reg_syntax_t syntax);
100 #else  /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 					const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 				      bitset_t sbcset,
105 				      const unsigned char *class_name,
106 				      reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 				       RE_TRANSLATE_TYPE trans,
110 				       const unsigned char *class_name,
111 				       const unsigned char *extra,
112 				       int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 				bin_tree_t *left, bin_tree_t *right,
115 				re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 				      bin_tree_t *left, bin_tree_t *right,
118 				      const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 
124 /* This table gives an error message for each of the error codes listed
125    in regex.h.  Obviously the order here has to be same as there.
126    POSIX doesn't require that we do anything for REG_NOERROR,
127    but why not be nice?  */
128 
129 const char __re_error_msgid[] attribute_hidden =
130   {
131 #define REG_NOERROR_IDX	0
132     gettext_noop ("Success")	/* REG_NOERROR */
133     "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135     gettext_noop ("No match")	/* REG_NOMATCH */
136     "\0"
137 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139     "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142     "\0"
143 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145     "\0"
146 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148     "\0"
149 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151     "\0"
152 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
153     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
154     "\0"
155 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157     "\0"
158 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160     "\0"
161 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163     "\0"
164 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165     gettext_noop ("Invalid range end")	/* REG_ERANGE */
166     "\0"
167 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
169     "\0"
170 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172     "\0"
173 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
175     "\0"
176 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
178     "\0"
179 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181   };
182 
183 const size_t __re_error_msgid_idx[] attribute_hidden =
184   {
185     REG_NOERROR_IDX,
186     REG_NOMATCH_IDX,
187     REG_BADPAT_IDX,
188     REG_ECOLLATE_IDX,
189     REG_ECTYPE_IDX,
190     REG_EESCAPE_IDX,
191     REG_ESUBREG_IDX,
192     REG_EBRACK_IDX,
193     REG_EPAREN_IDX,
194     REG_EBRACE_IDX,
195     REG_BADBR_IDX,
196     REG_ERANGE_IDX,
197     REG_ESPACE_IDX,
198     REG_BADRPT_IDX,
199     REG_EEND_IDX,
200     REG_ESIZE_IDX,
201     REG_ERPAREN_IDX
202   };
203 
204 /* Entry points for GNU code.  */
205 
206 /* re_compile_pattern is the GNU regular expression compiler: it
207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208    Returns 0 if the pattern was valid, otherwise an error string.
209 
210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211    are set in BUFP on entry.  */
212 
213 const char *
re_compile_pattern(pattern,length,bufp)214 re_compile_pattern (pattern, length, bufp)
215     const char *pattern;
216     size_t length;
217     struct re_pattern_buffer *bufp;
218 {
219   reg_errcode_t ret;
220 
221   /* And GNU code determines whether or not to get register information
222      by passing null for the REGS argument to re_match, etc., not by
223      setting no_sub, unless RE_NO_SUB is set.  */
224   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
225 
226   /* Match anchors at newline.  */
227   bufp->newline_anchor = 1;
228 
229   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230 
231   if (!ret)
232     return NULL;
233   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
234 }
235 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)236 weak_alias (__re_compile_pattern, re_compile_pattern)
237 #endif
238 
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
240    also be assigned to arbitrarily: each pattern buffer stores its own
241    syntax, so it can be changed between regex compilations.  */
242 /* This has no initializer because initialized variables in Emacs
243    become read-only after dumping.  */
244 reg_syntax_t re_syntax_options;
245 
246 
247 /* Specify the precise syntax of regexps for compilation.  This provides
248    for compatibility for various utilities which historically have
249    different, incompatible syntaxes.
250 
251    The argument SYNTAX is a bit mask comprised of the various bits
252    defined in regex.h.  We return the old syntax.  */
253 
254 reg_syntax_t
255 re_set_syntax (syntax)
256     reg_syntax_t syntax;
257 {
258   reg_syntax_t ret = re_syntax_options;
259 
260   re_syntax_options = syntax;
261   return ret;
262 }
263 #ifdef _LIBC
264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
266 
267 int
268 re_compile_fastmap (bufp)
269     struct re_pattern_buffer *bufp;
270 {
271   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
272   char *fastmap = bufp->fastmap;
273 
274   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
275   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
276   if (dfa->init_state != dfa->init_state_word)
277     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
278   if (dfa->init_state != dfa->init_state_nl)
279     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
280   if (dfa->init_state != dfa->init_state_begbuf)
281     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
282   bufp->fastmap_accurate = 1;
283   return 0;
284 }
285 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
287 #endif
288 
289 static inline void
290 __attribute ((always_inline))
291 re_set_fastmap (char *fastmap, int icase, int ch)
292 {
293   fastmap[ch] = 1;
294   if (icase)
295     fastmap[tolower (ch)] = 1;
296 }
297 
298 /* Helper function for re_compile_fastmap.
299    Compile fastmap for the initial_state INIT_STATE.  */
300 
301 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
303 			 char *fastmap)
304 {
305   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
306   int node_cnt;
307   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
308   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
309     {
310       int node = init_state->nodes.elems[node_cnt];
311       re_token_type_t type = dfa->nodes[node].type;
312 
313       if (type == CHARACTER)
314 	{
315 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
316 #ifdef RE_ENABLE_I18N
317 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
318 	    {
319 	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
320 	      wchar_t wc;
321 	      mbstate_t state;
322 
323 	      p = buf;
324 	      *p++ = dfa->nodes[node].opr.c;
325 	      while (++node < dfa->nodes_len
326 		     &&	dfa->nodes[node].type == CHARACTER
327 		     && dfa->nodes[node].mb_partial)
328 		*p++ = dfa->nodes[node].opr.c;
329 	      memset (&state, '\0', sizeof (state));
330 	      if (mbrtowc (&wc, (const char *) buf, p - buf,
331 			   &state) == p - buf
332 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
333 		      != (size_t) -1))
334 		re_set_fastmap (fastmap, 0, buf[0]);
335 	    }
336 #endif
337 	}
338       else if (type == SIMPLE_BRACKET)
339 	{
340 	  int i, ch;
341 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 	    {
343 	      int j;
344 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 		if (w & ((bitset_word_t) 1 << j))
347 		  re_set_fastmap (fastmap, icase, ch);
348 	    }
349 	}
350 #ifdef RE_ENABLE_I18N
351       else if (type == COMPLEX_BRACKET)
352 	{
353 	  int i;
354 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355 	  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
356 	      || cset->nranges || cset->nchar_classes)
357 	    {
358 # ifdef _LIBC
359 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
360 		{
361 		  /* In this case we want to catch the bytes which are
362 		     the first byte of any collation elements.
363 		     e.g. In da_DK, we want to catch 'a' since "aa"
364 			  is a valid collation element, and don't catch
365 			  'b' since 'b' is the only collation element
366 			  which starts from 'b'.  */
367 		  const int32_t *table = (const int32_t *)
368 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369 		  for (i = 0; i < SBC_MAX; ++i)
370 		    if (table[i] < 0)
371 		      re_set_fastmap (fastmap, icase, i);
372 		}
373 # else
374 	      if (dfa->mb_cur_max > 1)
375 		for (i = 0; i < SBC_MAX; ++i)
376 		  if (__btowc (i) == WEOF)
377 		    re_set_fastmap (fastmap, icase, i);
378 # endif /* not _LIBC */
379 	    }
380 	  for (i = 0; i < cset->nmbchars; ++i)
381 	    {
382 	      char buf[256];
383 	      mbstate_t state;
384 	      memset (&state, '\0', sizeof (state));
385 	      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
386 		re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
387 	      if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
388 		{
389 		  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
390 		      != (size_t) -1)
391 		    re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
392 		}
393 	    }
394 	}
395 #endif /* RE_ENABLE_I18N */
396       else if (type == OP_PERIOD
397 #ifdef RE_ENABLE_I18N
398 	       || type == OP_UTF8_PERIOD
399 #endif /* RE_ENABLE_I18N */
400 	       || type == END_OF_RE)
401 	{
402 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
403 	  if (type == END_OF_RE)
404 	    bufp->can_be_null = 1;
405 	  return;
406 	}
407     }
408 }
409 
410 /* Entry point for POSIX code.  */
411 /* regcomp takes a regular expression as a string and compiles it.
412 
413    PREG is a regex_t *.  We do not expect any fields to be initialized,
414    since POSIX says we shouldn't.  Thus, we set
415 
416      `buffer' to the compiled pattern;
417      `used' to the length of the compiled pattern;
418      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
419        REG_EXTENDED bit in CFLAGS is set; otherwise, to
420        RE_SYNTAX_POSIX_BASIC;
421      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
422      `fastmap' to an allocated space for the fastmap;
423      `fastmap_accurate' to zero;
424      `re_nsub' to the number of subexpressions in PATTERN.
425 
426    PATTERN is the address of the pattern string.
427 
428    CFLAGS is a series of bits which affect compilation.
429 
430      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
431      use POSIX basic syntax.
432 
433      If REG_NEWLINE is set, then . and [^...] don't match newline.
434      Also, regexec will try a match beginning after every newline.
435 
436      If REG_ICASE is set, then we considers upper- and lowercase
437      versions of letters to be equivalent when matching.
438 
439      If REG_NOSUB is set, then when PREG is passed to regexec, that
440      routine will report only success or failure, and nothing about the
441      registers.
442 
443    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
444    the return codes and their meanings.)  */
445 
446 int
regcomp(preg,pattern,cflags)447 regcomp (preg, pattern, cflags)
448     regex_t *__restrict preg;
449     const char *__restrict pattern;
450     int cflags;
451 {
452   reg_errcode_t ret;
453   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
454 			 : RE_SYNTAX_POSIX_BASIC);
455 
456   preg->buffer = NULL;
457   preg->allocated = 0;
458   preg->used = 0;
459 
460   /* Try to allocate space for the fastmap.  */
461   preg->fastmap = re_malloc (char, SBC_MAX);
462   if (BE (preg->fastmap == NULL, 0))
463     return REG_ESPACE;
464 
465   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
466 
467   /* If REG_NEWLINE is set, newlines are treated differently.  */
468   if (cflags & REG_NEWLINE)
469     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
470       syntax &= ~RE_DOT_NEWLINE;
471       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
472       /* It also changes the matching behavior.  */
473       preg->newline_anchor = 1;
474     }
475   else
476     preg->newline_anchor = 0;
477   preg->no_sub = !!(cflags & REG_NOSUB);
478   preg->translate = NULL;
479 
480   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
481 
482   /* POSIX doesn't distinguish between an unmatched open-group and an
483      unmatched close-group: both are REG_EPAREN.  */
484   if (ret == REG_ERPAREN)
485     ret = REG_EPAREN;
486 
487   /* We have already checked preg->fastmap != NULL.  */
488   if (BE (ret == REG_NOERROR, 1))
489     /* Compute the fastmap now, since regexec cannot modify the pattern
490        buffer.  This function never fails in this implementation.  */
491     (void) re_compile_fastmap (preg);
492   else
493     {
494       /* Some error occurred while compiling the expression.  */
495       re_free (preg->fastmap);
496       preg->fastmap = NULL;
497     }
498 
499   return (int) ret;
500 }
501 #ifdef _LIBC
502 weak_alias (__regcomp, regcomp)
503 #endif
504 
505 /* Returns a message corresponding to an error code, ERRCODE, returned
506    from either regcomp or regexec.   We don't use PREG here.  */
507 
508 size_t
509 regerror (errcode, preg, errbuf, errbuf_size)
510     int errcode;
511     const regex_t *__restrict preg;
512     char *__restrict errbuf;
513     size_t errbuf_size;
514 {
515   const char *msg;
516   size_t msg_size;
517 
518   if (BE (errcode < 0
519 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
520 			       / sizeof (__re_error_msgid_idx[0])), 0))
521     /* Only error codes returned by the rest of the code should be passed
522        to this routine.  If we are given anything else, or if other regex
523        code generates an invalid error code, then the program has a bug.
524        Dump core so we can fix it.  */
525     abort ();
526 
527   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
528 
529   msg_size = strlen (msg) + 1; /* Includes the null.  */
530 
531   if (BE (errbuf_size != 0, 1))
532     {
533       if (BE (msg_size > errbuf_size, 0))
534 	{
535 #if defined HAVE_MEMPCPY || defined _LIBC
536 	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
537 #else
538 	  memcpy (errbuf, msg, errbuf_size - 1);
539 	  errbuf[errbuf_size - 1] = 0;
540 #endif
541 	}
542       else
543 	memcpy (errbuf, msg, msg_size);
544     }
545 
546   return msg_size;
547 }
548 #ifdef _LIBC
549 weak_alias (__regerror, regerror)
550 #endif
551 
552 
553 #ifdef RE_ENABLE_I18N
554 /* This static array is used for the map to single-byte characters when
555    UTF-8 is used.  Otherwise we would allocate memory just to initialize
556    it the same all the time.  UTF-8 is the preferred encoding so this is
557    a worthwhile optimization.  */
558 static const bitset_t utf8_sb_map =
559 {
560   /* Set the first 128 bits.  */
561   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
562 };
563 #endif
564 
565 
566 static void
free_dfa_content(re_dfa_t * dfa)567 free_dfa_content (re_dfa_t *dfa)
568 {
569   int i, j;
570 
571   if (dfa->nodes)
572     for (i = 0; i < dfa->nodes_len; ++i)
573       free_token (dfa->nodes + i);
574   re_free (dfa->nexts);
575   for (i = 0; i < dfa->nodes_len; ++i)
576     {
577       if (dfa->eclosures != NULL)
578 	re_node_set_free (dfa->eclosures + i);
579       if (dfa->inveclosures != NULL)
580 	re_node_set_free (dfa->inveclosures + i);
581       if (dfa->edests != NULL)
582 	re_node_set_free (dfa->edests + i);
583     }
584   re_free (dfa->edests);
585   re_free (dfa->eclosures);
586   re_free (dfa->inveclosures);
587   re_free (dfa->nodes);
588 
589   if (dfa->state_table)
590     for (i = 0; i <= dfa->state_hash_mask; ++i)
591       {
592 	struct re_state_table_entry *entry = dfa->state_table + i;
593 	for (j = 0; j < entry->num; ++j)
594 	  {
595 	    re_dfastate_t *state = entry->array[j];
596 	    free_state (state);
597 	  }
598         re_free (entry->array);
599       }
600   re_free (dfa->state_table);
601 #ifdef RE_ENABLE_I18N
602   if (dfa->sb_char != utf8_sb_map)
603     re_free (dfa->sb_char);
604 #endif
605   re_free (dfa->subexp_map);
606 #ifdef DEBUG
607   re_free (dfa->re_str);
608 #endif
609 
610   re_free (dfa);
611 }
612 
613 
614 /* Free dynamically allocated space used by PREG.  */
615 
616 void
regfree(preg)617 regfree (preg)
618     regex_t *preg;
619 {
620   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
621   if (BE (dfa != NULL, 1))
622     free_dfa_content (dfa);
623   preg->buffer = NULL;
624   preg->allocated = 0;
625 
626   re_free (preg->fastmap);
627   preg->fastmap = NULL;
628 
629   re_free (preg->translate);
630   preg->translate = NULL;
631 }
632 #ifdef _LIBC
633 weak_alias (__regfree, regfree)
634 #endif
635 
636 /* Entry points compatible with 4.2 BSD regex library.  We don't define
637    them unless specifically requested.  */
638 
639 #if defined _REGEX_RE_COMP || defined _LIBC
640 
641 /* BSD has one and only one pattern buffer.  */
642 static struct re_pattern_buffer re_comp_buf;
643 
644 char *
645 # ifdef _LIBC
646 /* Make these definitions weak in libc, so POSIX programs can redefine
647    these names if they don't use our functions, and still use
648    regcomp/regexec above without link errors.  */
649 weak_function
650 # endif
re_comp(s)651 re_comp (s)
652      const char *s;
653 {
654   reg_errcode_t ret;
655   char *fastmap;
656 
657   if (!s)
658     {
659       if (!re_comp_buf.buffer)
660 	return gettext ("No previous regular expression");
661       return 0;
662     }
663 
664   if (re_comp_buf.buffer)
665     {
666       fastmap = re_comp_buf.fastmap;
667       re_comp_buf.fastmap = NULL;
668       __regfree (&re_comp_buf);
669       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
670       re_comp_buf.fastmap = fastmap;
671     }
672 
673   if (re_comp_buf.fastmap == NULL)
674     {
675       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
676       if (re_comp_buf.fastmap == NULL)
677 	return (char *) gettext (__re_error_msgid
678 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
679     }
680 
681   /* Since `re_exec' always passes NULL for the `regs' argument, we
682      don't need to initialize the pattern buffer fields which affect it.  */
683 
684   /* Match anchors at newlines.  */
685   re_comp_buf.newline_anchor = 1;
686 
687   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
688 
689   if (!ret)
690     return NULL;
691 
692   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
693   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
694 }
695 
696 #ifdef _LIBC
libc_freeres_fn(free_mem)697 libc_freeres_fn (free_mem)
698 {
699   __regfree (&re_comp_buf);
700 }
701 #endif
702 
703 #endif /* _REGEX_RE_COMP */
704 
705 /* Internal entry point.
706    Compile the regular expression PATTERN, whose length is LENGTH.
707    SYNTAX indicate regular expression's syntax.  */
708 
709 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)710 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
711 		     reg_syntax_t syntax)
712 {
713   reg_errcode_t err = REG_NOERROR;
714   re_dfa_t *dfa;
715   re_string_t regexp;
716 
717   /* Initialize the pattern buffer.  */
718   preg->fastmap_accurate = 0;
719   preg->syntax = syntax;
720   preg->not_bol = preg->not_eol = 0;
721   preg->used = 0;
722   preg->re_nsub = 0;
723   preg->can_be_null = 0;
724   preg->regs_allocated = REGS_UNALLOCATED;
725 
726   /* Initialize the dfa.  */
727   dfa = (re_dfa_t *) preg->buffer;
728   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
729     {
730       /* If zero allocated, but buffer is non-null, try to realloc
731 	 enough space.  This loses if buffer's address is bogus, but
732 	 that is the user's responsibility.  If ->buffer is NULL this
733 	 is a simple allocation.  */
734       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
735       if (dfa == NULL)
736 	return REG_ESPACE;
737       preg->allocated = sizeof (re_dfa_t);
738       preg->buffer = (unsigned char *) dfa;
739     }
740   preg->used = sizeof (re_dfa_t);
741 
742   err = init_dfa (dfa, length);
743   if (BE (err != REG_NOERROR, 0))
744     {
745       free_dfa_content (dfa);
746       preg->buffer = NULL;
747       preg->allocated = 0;
748       return err;
749     }
750 #ifdef DEBUG
751   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
752   dfa->re_str = re_malloc (char, length + 1);
753   strncpy (dfa->re_str, pattern, length + 1);
754 #endif
755 
756   __libc_lock_init (dfa->lock);
757 
758   err = re_string_construct (&regexp, pattern, length, preg->translate,
759 			     syntax & RE_ICASE, dfa);
760   if (BE (err != REG_NOERROR, 0))
761     {
762     re_compile_internal_free_return:
763       free_workarea_compile (preg);
764       re_string_destruct (&regexp);
765       free_dfa_content (dfa);
766       preg->buffer = NULL;
767       preg->allocated = 0;
768       return err;
769     }
770 
771   /* Parse the regular expression, and build a structure tree.  */
772   preg->re_nsub = 0;
773   dfa->str_tree = parse (&regexp, preg, syntax, &err);
774   if (BE (dfa->str_tree == NULL, 0))
775     goto re_compile_internal_free_return;
776 
777   /* Analyze the tree and create the nfa.  */
778   err = analyze (preg);
779   if (BE (err != REG_NOERROR, 0))
780     goto re_compile_internal_free_return;
781 
782 #ifdef RE_ENABLE_I18N
783   /* If possible, do searching in single byte encoding to speed things up.  */
784   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
785     optimize_utf8 (dfa);
786 #endif
787 
788   /* Then create the initial state of the dfa.  */
789   err = create_initial_state (dfa);
790 
791   /* Release work areas.  */
792   free_workarea_compile (preg);
793   re_string_destruct (&regexp);
794 
795   if (BE (err != REG_NOERROR, 0))
796     {
797       free_dfa_content (dfa);
798       preg->buffer = NULL;
799       preg->allocated = 0;
800     }
801 
802   return err;
803 }
804 
805 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
806    as the initial length of some arrays.  */
807 
808 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)809 init_dfa (re_dfa_t *dfa, size_t pat_len)
810 {
811   unsigned int table_size;
812 #ifndef _LIBC
813   char *codeset_name;
814 #endif
815 
816   memset (dfa, '\0', sizeof (re_dfa_t));
817 
818   /* Force allocation of str_tree_storage the first time.  */
819   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
820 
821   /* Avoid overflows.  */
822   if (pat_len == SIZE_MAX)
823     return REG_ESPACE;
824 
825   dfa->nodes_alloc = pat_len + 1;
826   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
827 
828   /*  table_size = 2 ^ ceil(log pat_len) */
829   for (table_size = 1; ; table_size <<= 1)
830     if (table_size > pat_len)
831       break;
832 
833   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
834   dfa->state_hash_mask = table_size - 1;
835 
836   dfa->mb_cur_max = MB_CUR_MAX;
837 #ifdef _LIBC
838   if (dfa->mb_cur_max == 6
839       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
840     dfa->is_utf8 = 1;
841   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
842 		       != 0);
843 #else
844 # ifdef HAVE_LANGINFO_CODESET
845   codeset_name = nl_langinfo (CODESET);
846 # else
847   codeset_name = getenv ("LC_ALL");
848   if (codeset_name == NULL || codeset_name[0] == '\0')
849     codeset_name = getenv ("LC_CTYPE");
850   if (codeset_name == NULL || codeset_name[0] == '\0')
851     codeset_name = getenv ("LANG");
852   if (codeset_name == NULL)
853     codeset_name = "";
854   else if (strchr (codeset_name, '.') !=  NULL)
855     codeset_name = strchr (codeset_name, '.') + 1;
856 # endif
857 
858   if (strcasecmp (codeset_name, "UTF-8") == 0
859       || strcasecmp (codeset_name, "UTF8") == 0)
860     dfa->is_utf8 = 1;
861 
862   /* We check exhaustively in the loop below if this charset is a
863      superset of ASCII.  */
864   dfa->map_notascii = 0;
865 #endif
866 
867 #ifdef RE_ENABLE_I18N
868   if (dfa->mb_cur_max > 1)
869     {
870       if (dfa->is_utf8)
871 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
872       else
873 	{
874 	  int i, j, ch;
875 
876 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
877 	  if (BE (dfa->sb_char == NULL, 0))
878 	    return REG_ESPACE;
879 
880 	  /* Set the bits corresponding to single byte chars.  */
881 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
882 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
883 	      {
884 		wint_t wch = __btowc (ch);
885 		if (wch != WEOF)
886 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
887 # ifndef _LIBC
888 		if (isascii (ch) && wch != ch)
889 		  dfa->map_notascii = 1;
890 # endif
891 	      }
892 	}
893     }
894 #endif
895 
896   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
897     return REG_ESPACE;
898   return REG_NOERROR;
899 }
900 
901 /* Initialize WORD_CHAR table, which indicate which character is
902    "word".  In this case "word" means that it is the word construction
903    character used by some operators like "\<", "\>", etc.  */
904 
905 static void
906 internal_function
init_word_char(re_dfa_t * dfa)907 init_word_char (re_dfa_t *dfa)
908 {
909   int i, j, ch;
910   dfa->word_ops_used = 1;
911   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
912     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
913       if (isalnum (ch) || ch == '_')
914 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
915 }
916 
917 /* Free the work area which are only used while compiling.  */
918 
919 static void
free_workarea_compile(regex_t * preg)920 free_workarea_compile (regex_t *preg)
921 {
922   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
923   bin_tree_storage_t *storage, *next;
924   for (storage = dfa->str_tree_storage; storage; storage = next)
925     {
926       next = storage->next;
927       re_free (storage);
928     }
929   dfa->str_tree_storage = NULL;
930   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
931   dfa->str_tree = NULL;
932   re_free (dfa->org_indices);
933   dfa->org_indices = NULL;
934 }
935 
936 /* Create initial states for all contexts.  */
937 
938 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)939 create_initial_state (re_dfa_t *dfa)
940 {
941   int first, i;
942   reg_errcode_t err;
943   re_node_set init_nodes;
944 
945   /* Initial states have the epsilon closure of the node which is
946      the first node of the regular expression.  */
947   first = dfa->str_tree->first->node_idx;
948   dfa->init_node = first;
949   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
950   if (BE (err != REG_NOERROR, 0))
951     return err;
952 
953   /* The back-references which are in initial states can epsilon transit,
954      since in this case all of the subexpressions can be null.
955      Then we add epsilon closures of the nodes which are the next nodes of
956      the back-references.  */
957   if (dfa->nbackref > 0)
958     for (i = 0; i < init_nodes.nelem; ++i)
959       {
960 	int node_idx = init_nodes.elems[i];
961 	re_token_type_t type = dfa->nodes[node_idx].type;
962 
963 	int clexp_idx;
964 	if (type != OP_BACK_REF)
965 	  continue;
966 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
967 	  {
968 	    re_token_t *clexp_node;
969 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
970 	    if (clexp_node->type == OP_CLOSE_SUBEXP
971 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
972 	      break;
973 	  }
974 	if (clexp_idx == init_nodes.nelem)
975 	  continue;
976 
977 	if (type == OP_BACK_REF)
978 	  {
979 	    int dest_idx = dfa->edests[node_idx].elems[0];
980 	    if (!re_node_set_contains (&init_nodes, dest_idx))
981 	      {
982 		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
983 		i = 0;
984 	      }
985 	  }
986       }
987 
988   /* It must be the first time to invoke acquire_state.  */
989   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
990   /* We don't check ERR here, since the initial state must not be NULL.  */
991   if (BE (dfa->init_state == NULL, 0))
992     return err;
993   if (dfa->init_state->has_constraint)
994     {
995       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
996 						       CONTEXT_WORD);
997       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
998 						     CONTEXT_NEWLINE);
999       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1000 							 &init_nodes,
1001 							 CONTEXT_NEWLINE
1002 							 | CONTEXT_BEGBUF);
1003       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1004 	      || dfa->init_state_begbuf == NULL, 0))
1005 	return err;
1006     }
1007   else
1008     dfa->init_state_word = dfa->init_state_nl
1009       = dfa->init_state_begbuf = dfa->init_state;
1010 
1011   re_node_set_free (&init_nodes);
1012   return REG_NOERROR;
1013 }
1014 
1015 #ifdef RE_ENABLE_I18N
1016 /* If it is possible to do searching in single byte encoding instead of UTF-8
1017    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1018    DFA nodes where needed.  */
1019 
1020 static void
optimize_utf8(re_dfa_t * dfa)1021 optimize_utf8 (re_dfa_t *dfa)
1022 {
1023   int node, i, mb_chars = 0, has_period = 0;
1024 
1025   for (node = 0; node < dfa->nodes_len; ++node)
1026     switch (dfa->nodes[node].type)
1027       {
1028       case CHARACTER:
1029 	if (dfa->nodes[node].opr.c >= 0x80)
1030 	  mb_chars = 1;
1031 	break;
1032       case ANCHOR:
1033 	switch (dfa->nodes[node].opr.idx)
1034 	  {
1035 	  case LINE_FIRST:
1036 	  case LINE_LAST:
1037 	  case BUF_FIRST:
1038 	  case BUF_LAST:
1039 	    break;
1040 	  default:
1041 	    /* Word anchors etc. cannot be handled.  */
1042 	    return;
1043 	  }
1044 	break;
1045       case OP_PERIOD:
1046         has_period = 1;
1047         break;
1048       case OP_BACK_REF:
1049       case OP_ALT:
1050       case END_OF_RE:
1051       case OP_DUP_ASTERISK:
1052       case OP_OPEN_SUBEXP:
1053       case OP_CLOSE_SUBEXP:
1054 	break;
1055       case COMPLEX_BRACKET:
1056 	return;
1057       case SIMPLE_BRACKET:
1058 	/* Just double check.  The non-ASCII range starts at 0x80.  */
1059 	assert (0x80 % BITSET_WORD_BITS == 0);
1060         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1061 	  if (dfa->nodes[node].opr.sbcset[i])
1062 	    return;
1063 	break;
1064       default:
1065 	abort ();
1066       }
1067 
1068   if (mb_chars || has_period)
1069     for (node = 0; node < dfa->nodes_len; ++node)
1070       {
1071 	if (dfa->nodes[node].type == CHARACTER
1072 	    && dfa->nodes[node].opr.c >= 0x80)
1073 	  dfa->nodes[node].mb_partial = 0;
1074 	else if (dfa->nodes[node].type == OP_PERIOD)
1075 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1076       }
1077 
1078   /* The search can be in single byte locale.  */
1079   dfa->mb_cur_max = 1;
1080   dfa->is_utf8 = 0;
1081   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1082 }
1083 #endif
1084 
1085 /* Analyze the structure tree, and calculate "first", "next", "edest",
1086    "eclosure", and "inveclosure".  */
1087 
1088 static reg_errcode_t
analyze(regex_t * preg)1089 analyze (regex_t *preg)
1090 {
1091   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1092   reg_errcode_t ret;
1093 
1094   /* Allocate arrays.  */
1095   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1096   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1097   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1098   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1099   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1100 	  || dfa->eclosures == NULL, 0))
1101     return REG_ESPACE;
1102 
1103   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1104   if (dfa->subexp_map != NULL)
1105     {
1106       int i;
1107       for (i = 0; i < preg->re_nsub; i++)
1108 	dfa->subexp_map[i] = i;
1109       preorder (dfa->str_tree, optimize_subexps, dfa);
1110       for (i = 0; i < preg->re_nsub; i++)
1111 	if (dfa->subexp_map[i] != i)
1112 	  break;
1113       if (i == preg->re_nsub)
1114 	{
1115 	  free (dfa->subexp_map);
1116 	  dfa->subexp_map = NULL;
1117 	}
1118     }
1119 
1120   ret = postorder (dfa->str_tree, lower_subexps, preg);
1121   if (BE (ret != REG_NOERROR, 0))
1122     return ret;
1123   ret = postorder (dfa->str_tree, calc_first, dfa);
1124   if (BE (ret != REG_NOERROR, 0))
1125     return ret;
1126   preorder (dfa->str_tree, calc_next, dfa);
1127   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1128   if (BE (ret != REG_NOERROR, 0))
1129     return ret;
1130   ret = calc_eclosure (dfa);
1131   if (BE (ret != REG_NOERROR, 0))
1132     return ret;
1133 
1134   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1135      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1136   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1137       || dfa->nbackref)
1138     {
1139       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1140       if (BE (dfa->inveclosures == NULL, 0))
1141         return REG_ESPACE;
1142       ret = calc_inveclosure (dfa);
1143     }
1144 
1145   return ret;
1146 }
1147 
1148 /* Our parse trees are very unbalanced, so we cannot use a stack to
1149    implement parse tree visits.  Instead, we use parent pointers and
1150    some hairy code in these two functions.  */
1151 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1152 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1153 	   void *extra)
1154 {
1155   bin_tree_t *node, *prev;
1156 
1157   for (node = root; ; )
1158     {
1159       /* Descend down the tree, preferably to the left (or to the right
1160 	 if that's the only child).  */
1161       while (node->left || node->right)
1162 	if (node->left)
1163           node = node->left;
1164         else
1165           node = node->right;
1166 
1167       do
1168 	{
1169 	  reg_errcode_t err = fn (extra, node);
1170 	  if (BE (err != REG_NOERROR, 0))
1171 	    return err;
1172           if (node->parent == NULL)
1173 	    return REG_NOERROR;
1174 	  prev = node;
1175 	  node = node->parent;
1176 	}
1177       /* Go up while we have a node that is reached from the right.  */
1178       while (node->right == prev || node->right == NULL);
1179       node = node->right;
1180     }
1181 }
1182 
1183 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1184 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1185 	  void *extra)
1186 {
1187   bin_tree_t *node;
1188 
1189   for (node = root; ; )
1190     {
1191       reg_errcode_t err = fn (extra, node);
1192       if (BE (err != REG_NOERROR, 0))
1193 	return err;
1194 
1195       /* Go to the left node, or up and to the right.  */
1196       if (node->left)
1197 	node = node->left;
1198       else
1199 	{
1200 	  bin_tree_t *prev = NULL;
1201 	  while (node->right == prev || node->right == NULL)
1202 	    {
1203 	      prev = node;
1204 	      node = node->parent;
1205 	      if (!node)
1206 	        return REG_NOERROR;
1207 	    }
1208 	  node = node->right;
1209 	}
1210     }
1211 }
1212 
1213 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1214    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1215    backreferences as well.  Requires a preorder visit.  */
1216 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1217 optimize_subexps (void *extra, bin_tree_t *node)
1218 {
1219   re_dfa_t *dfa = (re_dfa_t *) extra;
1220 
1221   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1222     {
1223       int idx = node->token.opr.idx;
1224       node->token.opr.idx = dfa->subexp_map[idx];
1225       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1226     }
1227 
1228   else if (node->token.type == SUBEXP
1229            && node->left && node->left->token.type == SUBEXP)
1230     {
1231       int other_idx = node->left->token.opr.idx;
1232 
1233       node->left = node->left->left;
1234       if (node->left)
1235         node->left->parent = node;
1236 
1237       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1238       if (other_idx < BITSET_WORD_BITS)
1239 	  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1240     }
1241 
1242   return REG_NOERROR;
1243 }
1244 
1245 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1246    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1247 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1248 lower_subexps (void *extra, bin_tree_t *node)
1249 {
1250   regex_t *preg = (regex_t *) extra;
1251   reg_errcode_t err = REG_NOERROR;
1252 
1253   if (node->left && node->left->token.type == SUBEXP)
1254     {
1255       node->left = lower_subexp (&err, preg, node->left);
1256       if (node->left)
1257 	node->left->parent = node;
1258     }
1259   if (node->right && node->right->token.type == SUBEXP)
1260     {
1261       node->right = lower_subexp (&err, preg, node->right);
1262       if (node->right)
1263 	node->right->parent = node;
1264     }
1265 
1266   return err;
1267 }
1268 
1269 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1270 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1271 {
1272   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1273   bin_tree_t *body = node->left;
1274   bin_tree_t *op, *cls, *tree1, *tree;
1275 
1276   if (preg->no_sub
1277       /* We do not optimize empty subexpressions, because otherwise we may
1278 	 have bad CONCAT nodes with NULL children.  This is obviously not
1279 	 very common, so we do not lose much.  An example that triggers
1280 	 this case is the sed "script" /\(\)/x.  */
1281       && node->left != NULL
1282       && (node->token.opr.idx >= BITSET_WORD_BITS
1283 	  || !(dfa->used_bkref_map
1284 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1285     return node->left;
1286 
1287   /* Convert the SUBEXP node to the concatenation of an
1288      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1289   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1290   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1291   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1292   tree = create_tree (dfa, op, tree1, CONCAT);
1293   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1294     {
1295       *err = REG_ESPACE;
1296       return NULL;
1297     }
1298 
1299   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1300   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1301   return tree;
1302 }
1303 
1304 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1305    nodes.  Requires a postorder visit.  */
1306 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1307 calc_first (void *extra, bin_tree_t *node)
1308 {
1309   re_dfa_t *dfa = (re_dfa_t *) extra;
1310   if (node->token.type == CONCAT)
1311     {
1312       node->first = node->left->first;
1313       node->node_idx = node->left->node_idx;
1314     }
1315   else
1316     {
1317       node->first = node;
1318       node->node_idx = re_dfa_add_node (dfa, node->token);
1319       if (BE (node->node_idx == -1, 0))
1320         return REG_ESPACE;
1321     }
1322   return REG_NOERROR;
1323 }
1324 
1325 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1326 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1327 calc_next (void *extra, bin_tree_t *node)
1328 {
1329   switch (node->token.type)
1330     {
1331     case OP_DUP_ASTERISK:
1332       node->left->next = node;
1333       break;
1334     case CONCAT:
1335       node->left->next = node->right->first;
1336       node->right->next = node->next;
1337       break;
1338     default:
1339       if (node->left)
1340 	node->left->next = node->next;
1341       if (node->right)
1342         node->right->next = node->next;
1343       break;
1344     }
1345   return REG_NOERROR;
1346 }
1347 
1348 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1349 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1350 link_nfa_nodes (void *extra, bin_tree_t *node)
1351 {
1352   re_dfa_t *dfa = (re_dfa_t *) extra;
1353   int idx = node->node_idx;
1354   reg_errcode_t err = REG_NOERROR;
1355 
1356   switch (node->token.type)
1357     {
1358     case CONCAT:
1359       break;
1360 
1361     case END_OF_RE:
1362       assert (node->next == NULL);
1363       break;
1364 
1365     case OP_DUP_ASTERISK:
1366     case OP_ALT:
1367       {
1368 	int left, right;
1369 	dfa->has_plural_match = 1;
1370 	if (node->left != NULL)
1371 	  left = node->left->first->node_idx;
1372 	else
1373 	  left = node->next->node_idx;
1374 	if (node->right != NULL)
1375 	  right = node->right->first->node_idx;
1376 	else
1377 	  right = node->next->node_idx;
1378 	assert (left > -1);
1379 	assert (right > -1);
1380 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1381       }
1382       break;
1383 
1384     case ANCHOR:
1385     case OP_OPEN_SUBEXP:
1386     case OP_CLOSE_SUBEXP:
1387       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1388       break;
1389 
1390     case OP_BACK_REF:
1391       dfa->nexts[idx] = node->next->node_idx;
1392       if (node->token.type == OP_BACK_REF)
1393 	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1394       break;
1395 
1396     default:
1397       assert (!IS_EPSILON_NODE (node->token.type));
1398       dfa->nexts[idx] = node->next->node_idx;
1399       break;
1400     }
1401 
1402   return err;
1403 }
1404 
1405 /* Duplicate the epsilon closure of the node ROOT_NODE.
1406    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1407    to their own constraint.  */
1408 
1409 static reg_errcode_t
1410 internal_function
duplicate_node_closure(re_dfa_t * dfa,int top_org_node,int top_clone_node,int root_node,unsigned int init_constraint)1411 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1412 			int root_node, unsigned int init_constraint)
1413 {
1414   int org_node, clone_node, ret;
1415   unsigned int constraint = init_constraint;
1416   for (org_node = top_org_node, clone_node = top_clone_node;;)
1417     {
1418       int org_dest, clone_dest;
1419       if (dfa->nodes[org_node].type == OP_BACK_REF)
1420 	{
1421 	  /* If the back reference epsilon-transit, its destination must
1422 	     also have the constraint.  Then duplicate the epsilon closure
1423 	     of the destination of the back reference, and store it in
1424 	     edests of the back reference.  */
1425 	  org_dest = dfa->nexts[org_node];
1426 	  re_node_set_empty (dfa->edests + clone_node);
1427 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1428 	  if (BE (clone_dest == -1, 0))
1429 	    return REG_ESPACE;
1430 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1431 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1432 	  if (BE (ret < 0, 0))
1433 	    return REG_ESPACE;
1434 	}
1435       else if (dfa->edests[org_node].nelem == 0)
1436 	{
1437 	  /* In case of the node can't epsilon-transit, don't duplicate the
1438 	     destination and store the original destination as the
1439 	     destination of the node.  */
1440 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1441 	  break;
1442 	}
1443       else if (dfa->edests[org_node].nelem == 1)
1444 	{
1445 	  /* In case of the node can epsilon-transit, and it has only one
1446 	     destination.  */
1447 	  org_dest = dfa->edests[org_node].elems[0];
1448 	  re_node_set_empty (dfa->edests + clone_node);
1449 	  if (dfa->nodes[org_node].type == ANCHOR)
1450 	    {
1451 	      /* In case of the node has another constraint, append it.  */
1452 	      if (org_node == root_node && clone_node != org_node)
1453 		{
1454 		  /* ...but if the node is root_node itself, it means the
1455 		     epsilon closure have a loop, then tie it to the
1456 		     destination of the root_node.  */
1457 		  ret = re_node_set_insert (dfa->edests + clone_node,
1458 					    org_dest);
1459 		  if (BE (ret < 0, 0))
1460 		    return REG_ESPACE;
1461 		  break;
1462 		}
1463 	      constraint |= dfa->nodes[org_node].opr.ctx_type;
1464 	    }
1465 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1466 	  if (BE (clone_dest == -1, 0))
1467 	    return REG_ESPACE;
1468 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1469 	  if (BE (ret < 0, 0))
1470 	    return REG_ESPACE;
1471 	}
1472       else /* dfa->edests[org_node].nelem == 2 */
1473 	{
1474 	  /* In case of the node can epsilon-transit, and it has two
1475 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1476 	  org_dest = dfa->edests[org_node].elems[0];
1477 	  re_node_set_empty (dfa->edests + clone_node);
1478 	  /* Search for a duplicated node which satisfies the constraint.  */
1479 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1480 	  if (clone_dest == -1)
1481 	    {
1482 	      /* There are no such a duplicated node, create a new one.  */
1483 	      reg_errcode_t err;
1484 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1485 	      if (BE (clone_dest == -1, 0))
1486 		return REG_ESPACE;
1487 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1488 	      if (BE (ret < 0, 0))
1489 		return REG_ESPACE;
1490 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1491 					    root_node, constraint);
1492 	      if (BE (err != REG_NOERROR, 0))
1493 		return err;
1494 	    }
1495 	  else
1496 	    {
1497 	      /* There are a duplicated node which satisfy the constraint,
1498 		 use it to avoid infinite loop.  */
1499 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1500 	      if (BE (ret < 0, 0))
1501 		return REG_ESPACE;
1502 	    }
1503 
1504 	  org_dest = dfa->edests[org_node].elems[1];
1505 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1506 	  if (BE (clone_dest == -1, 0))
1507 	    return REG_ESPACE;
1508 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509 	  if (BE (ret < 0, 0))
1510 	    return REG_ESPACE;
1511 	}
1512       org_node = org_dest;
1513       clone_node = clone_dest;
1514     }
1515   return REG_NOERROR;
1516 }
1517 
1518 /* Search for a node which is duplicated from the node ORG_NODE, and
1519    satisfies the constraint CONSTRAINT.  */
1520 
1521 static int
search_duplicated_node(const re_dfa_t * dfa,int org_node,unsigned int constraint)1522 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1523 			unsigned int constraint)
1524 {
1525   int idx;
1526   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1527     {
1528       if (org_node == dfa->org_indices[idx]
1529 	  && constraint == dfa->nodes[idx].constraint)
1530 	return idx; /* Found.  */
1531     }
1532   return -1; /* Not found.  */
1533 }
1534 
1535 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1536    Return the index of the new node, or -1 if insufficient storage is
1537    available.  */
1538 
1539 static int
duplicate_node(re_dfa_t * dfa,int org_idx,unsigned int constraint)1540 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1541 {
1542   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1543   if (BE (dup_idx != -1, 1))
1544     {
1545       dfa->nodes[dup_idx].constraint = constraint;
1546       if (dfa->nodes[org_idx].type == ANCHOR)
1547 	dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1548       dfa->nodes[dup_idx].duplicated = 1;
1549 
1550       /* Store the index of the original node.  */
1551       dfa->org_indices[dup_idx] = org_idx;
1552     }
1553   return dup_idx;
1554 }
1555 
1556 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1557 calc_inveclosure (re_dfa_t *dfa)
1558 {
1559   int src, idx, ret;
1560   for (idx = 0; idx < dfa->nodes_len; ++idx)
1561     re_node_set_init_empty (dfa->inveclosures + idx);
1562 
1563   for (src = 0; src < dfa->nodes_len; ++src)
1564     {
1565       int *elems = dfa->eclosures[src].elems;
1566       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1567 	{
1568 	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1569 	  if (BE (ret == -1, 0))
1570 	    return REG_ESPACE;
1571 	}
1572     }
1573 
1574   return REG_NOERROR;
1575 }
1576 
1577 /* Calculate "eclosure" for all the node in DFA.  */
1578 
1579 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1580 calc_eclosure (re_dfa_t *dfa)
1581 {
1582   int node_idx, incomplete;
1583 #ifdef DEBUG
1584   assert (dfa->nodes_len > 0);
1585 #endif
1586   incomplete = 0;
1587   /* For each nodes, calculate epsilon closure.  */
1588   for (node_idx = 0; ; ++node_idx)
1589     {
1590       reg_errcode_t err;
1591       re_node_set eclosure_elem;
1592       if (node_idx == dfa->nodes_len)
1593 	{
1594 	  if (!incomplete)
1595 	    break;
1596 	  incomplete = 0;
1597 	  node_idx = 0;
1598 	}
1599 
1600 #ifdef DEBUG
1601       assert (dfa->eclosures[node_idx].nelem != -1);
1602 #endif
1603 
1604       /* If we have already calculated, skip it.  */
1605       if (dfa->eclosures[node_idx].nelem != 0)
1606 	continue;
1607       /* Calculate epsilon closure of `node_idx'.  */
1608       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1609       if (BE (err != REG_NOERROR, 0))
1610 	return err;
1611 
1612       if (dfa->eclosures[node_idx].nelem == 0)
1613 	{
1614 	  incomplete = 1;
1615 	  re_node_set_free (&eclosure_elem);
1616 	}
1617     }
1618   return REG_NOERROR;
1619 }
1620 
1621 /* Calculate epsilon closure of NODE.  */
1622 
1623 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,int node,int root)1624 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1625 {
1626   reg_errcode_t err;
1627   unsigned int constraint;
1628   int i, incomplete;
1629   re_node_set eclosure;
1630   incomplete = 0;
1631   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1632   if (BE (err != REG_NOERROR, 0))
1633     return err;
1634 
1635   /* This indicates that we are calculating this node now.
1636      We reference this value to avoid infinite loop.  */
1637   dfa->eclosures[node].nelem = -1;
1638 
1639   constraint = ((dfa->nodes[node].type == ANCHOR)
1640 		? dfa->nodes[node].opr.ctx_type : 0);
1641   /* If the current node has constraints, duplicate all nodes.
1642      Since they must inherit the constraints.  */
1643   if (constraint
1644       && dfa->edests[node].nelem
1645       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1646     {
1647       err = duplicate_node_closure (dfa, node, node, node, constraint);
1648       if (BE (err != REG_NOERROR, 0))
1649 	return err;
1650     }
1651 
1652   /* Expand each epsilon destination nodes.  */
1653   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1654     for (i = 0; i < dfa->edests[node].nelem; ++i)
1655       {
1656 	re_node_set eclosure_elem;
1657 	int edest = dfa->edests[node].elems[i];
1658 	/* If calculating the epsilon closure of `edest' is in progress,
1659 	   return intermediate result.  */
1660 	if (dfa->eclosures[edest].nelem == -1)
1661 	  {
1662 	    incomplete = 1;
1663 	    continue;
1664 	  }
1665 	/* If we haven't calculated the epsilon closure of `edest' yet,
1666 	   calculate now. Otherwise use calculated epsilon closure.  */
1667 	if (dfa->eclosures[edest].nelem == 0)
1668 	  {
1669 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1670 	    if (BE (err != REG_NOERROR, 0))
1671 	      return err;
1672 	  }
1673 	else
1674 	  eclosure_elem = dfa->eclosures[edest];
1675 	/* Merge the epsilon closure of `edest'.  */
1676 	re_node_set_merge (&eclosure, &eclosure_elem);
1677 	/* If the epsilon closure of `edest' is incomplete,
1678 	   the epsilon closure of this node is also incomplete.  */
1679 	if (dfa->eclosures[edest].nelem == 0)
1680 	  {
1681 	    incomplete = 1;
1682 	    re_node_set_free (&eclosure_elem);
1683 	  }
1684       }
1685 
1686   /* Epsilon closures include itself.  */
1687   re_node_set_insert (&eclosure, node);
1688   if (incomplete && !root)
1689     dfa->eclosures[node].nelem = 0;
1690   else
1691     dfa->eclosures[node] = eclosure;
1692   *new_set = eclosure;
1693   return REG_NOERROR;
1694 }
1695 
1696 /* Functions for token which are used in the parser.  */
1697 
1698 /* Fetch a token from INPUT.
1699    We must not use this function inside bracket expressions.  */
1700 
1701 static void
1702 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1703 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1704 {
1705   re_string_skip_bytes (input, peek_token (result, input, syntax));
1706 }
1707 
1708 /* Peek a token from INPUT, and return the length of the token.
1709    We must not use this function inside bracket expressions.  */
1710 
1711 static int
1712 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1713 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1714 {
1715   unsigned char c;
1716 
1717   if (re_string_eoi (input))
1718     {
1719       token->type = END_OF_RE;
1720       return 0;
1721     }
1722 
1723   c = re_string_peek_byte (input, 0);
1724   token->opr.c = c;
1725 
1726   token->word_char = 0;
1727 #ifdef RE_ENABLE_I18N
1728   token->mb_partial = 0;
1729   if (input->mb_cur_max > 1 &&
1730       !re_string_first_byte (input, re_string_cur_idx (input)))
1731     {
1732       token->type = CHARACTER;
1733       token->mb_partial = 1;
1734       return 1;
1735     }
1736 #endif
1737   if (c == '\\')
1738     {
1739       unsigned char c2;
1740       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1741 	{
1742 	  token->type = BACK_SLASH;
1743 	  return 1;
1744 	}
1745 
1746       c2 = re_string_peek_byte_case (input, 1);
1747       token->opr.c = c2;
1748       token->type = CHARACTER;
1749 #ifdef RE_ENABLE_I18N
1750       if (input->mb_cur_max > 1)
1751 	{
1752 	  wint_t wc = re_string_wchar_at (input,
1753 					  re_string_cur_idx (input) + 1);
1754 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1755 	}
1756       else
1757 #endif
1758 	token->word_char = IS_WORD_CHAR (c2) != 0;
1759 
1760       switch (c2)
1761 	{
1762 	case '|':
1763 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1764 	    token->type = OP_ALT;
1765 	  break;
1766 	case '1': case '2': case '3': case '4': case '5':
1767 	case '6': case '7': case '8': case '9':
1768 	  if (!(syntax & RE_NO_BK_REFS))
1769 	    {
1770 	      token->type = OP_BACK_REF;
1771 	      token->opr.idx = c2 - '1';
1772 	    }
1773 	  break;
1774 	case '<':
1775 	  if (!(syntax & RE_NO_GNU_OPS))
1776 	    {
1777 	      token->type = ANCHOR;
1778 	      token->opr.ctx_type = WORD_FIRST;
1779 	    }
1780 	  break;
1781 	case '>':
1782 	  if (!(syntax & RE_NO_GNU_OPS))
1783 	    {
1784 	      token->type = ANCHOR;
1785 	      token->opr.ctx_type = WORD_LAST;
1786 	    }
1787 	  break;
1788 	case 'b':
1789 	  if (!(syntax & RE_NO_GNU_OPS))
1790 	    {
1791 	      token->type = ANCHOR;
1792 	      token->opr.ctx_type = WORD_DELIM;
1793 	    }
1794 	  break;
1795 	case 'B':
1796 	  if (!(syntax & RE_NO_GNU_OPS))
1797 	    {
1798 	      token->type = ANCHOR;
1799 	      token->opr.ctx_type = NOT_WORD_DELIM;
1800 	    }
1801 	  break;
1802 	case 'w':
1803 	  if (!(syntax & RE_NO_GNU_OPS))
1804 	    token->type = OP_WORD;
1805 	  break;
1806 	case 'W':
1807 	  if (!(syntax & RE_NO_GNU_OPS))
1808 	    token->type = OP_NOTWORD;
1809 	  break;
1810 	case 's':
1811 	  if (!(syntax & RE_NO_GNU_OPS))
1812 	    token->type = OP_SPACE;
1813 	  break;
1814 	case 'S':
1815 	  if (!(syntax & RE_NO_GNU_OPS))
1816 	    token->type = OP_NOTSPACE;
1817 	  break;
1818 	case '`':
1819 	  if (!(syntax & RE_NO_GNU_OPS))
1820 	    {
1821 	      token->type = ANCHOR;
1822 	      token->opr.ctx_type = BUF_FIRST;
1823 	    }
1824 	  break;
1825 	case '\'':
1826 	  if (!(syntax & RE_NO_GNU_OPS))
1827 	    {
1828 	      token->type = ANCHOR;
1829 	      token->opr.ctx_type = BUF_LAST;
1830 	    }
1831 	  break;
1832 	case '(':
1833 	  if (!(syntax & RE_NO_BK_PARENS))
1834 	    token->type = OP_OPEN_SUBEXP;
1835 	  break;
1836 	case ')':
1837 	  if (!(syntax & RE_NO_BK_PARENS))
1838 	    token->type = OP_CLOSE_SUBEXP;
1839 	  break;
1840 	case '+':
1841 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1842 	    token->type = OP_DUP_PLUS;
1843 	  break;
1844 	case '?':
1845 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1846 	    token->type = OP_DUP_QUESTION;
1847 	  break;
1848 	case '{':
1849 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1850 	    token->type = OP_OPEN_DUP_NUM;
1851 	  break;
1852 	case '}':
1853 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1854 	    token->type = OP_CLOSE_DUP_NUM;
1855 	  break;
1856 	default:
1857 	  break;
1858 	}
1859       return 2;
1860     }
1861 
1862   token->type = CHARACTER;
1863 #ifdef RE_ENABLE_I18N
1864   if (input->mb_cur_max > 1)
1865     {
1866       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1867       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1868     }
1869   else
1870 #endif
1871     token->word_char = IS_WORD_CHAR (token->opr.c);
1872 
1873   switch (c)
1874     {
1875     case '\n':
1876       if (syntax & RE_NEWLINE_ALT)
1877 	token->type = OP_ALT;
1878       break;
1879     case '|':
1880       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1881 	token->type = OP_ALT;
1882       break;
1883     case '*':
1884       token->type = OP_DUP_ASTERISK;
1885       break;
1886     case '+':
1887       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1888 	token->type = OP_DUP_PLUS;
1889       break;
1890     case '?':
1891       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1892 	token->type = OP_DUP_QUESTION;
1893       break;
1894     case '{':
1895       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1896 	token->type = OP_OPEN_DUP_NUM;
1897       break;
1898     case '}':
1899       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1900 	token->type = OP_CLOSE_DUP_NUM;
1901       break;
1902     case '(':
1903       if (syntax & RE_NO_BK_PARENS)
1904 	token->type = OP_OPEN_SUBEXP;
1905       break;
1906     case ')':
1907       if (syntax & RE_NO_BK_PARENS)
1908 	token->type = OP_CLOSE_SUBEXP;
1909       break;
1910     case '[':
1911       token->type = OP_OPEN_BRACKET;
1912       break;
1913     case '.':
1914       token->type = OP_PERIOD;
1915       break;
1916     case '^':
1917       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1918 	  re_string_cur_idx (input) != 0)
1919 	{
1920 	  char prev = re_string_peek_byte (input, -1);
1921 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1922 	    break;
1923 	}
1924       token->type = ANCHOR;
1925       token->opr.ctx_type = LINE_FIRST;
1926       break;
1927     case '$':
1928       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1929 	  re_string_cur_idx (input) + 1 != re_string_length (input))
1930 	{
1931 	  re_token_t next;
1932 	  re_string_skip_bytes (input, 1);
1933 	  peek_token (&next, input, syntax);
1934 	  re_string_skip_bytes (input, -1);
1935 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1936 	    break;
1937 	}
1938       token->type = ANCHOR;
1939       token->opr.ctx_type = LINE_LAST;
1940       break;
1941     default:
1942       break;
1943     }
1944   return 1;
1945 }
1946 
1947 /* Peek a token from INPUT, and return the length of the token.
1948    We must not use this function out of bracket expressions.  */
1949 
1950 static int
1951 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1952 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1953 {
1954   unsigned char c;
1955   if (re_string_eoi (input))
1956     {
1957       token->type = END_OF_RE;
1958       return 0;
1959     }
1960   c = re_string_peek_byte (input, 0);
1961   token->opr.c = c;
1962 
1963 #ifdef RE_ENABLE_I18N
1964   if (input->mb_cur_max > 1 &&
1965       !re_string_first_byte (input, re_string_cur_idx (input)))
1966     {
1967       token->type = CHARACTER;
1968       return 1;
1969     }
1970 #endif /* RE_ENABLE_I18N */
1971 
1972   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1973       && re_string_cur_idx (input) + 1 < re_string_length (input))
1974     {
1975       /* In this case, '\' escape a character.  */
1976       unsigned char c2;
1977       re_string_skip_bytes (input, 1);
1978       c2 = re_string_peek_byte (input, 0);
1979       token->opr.c = c2;
1980       token->type = CHARACTER;
1981       return 1;
1982     }
1983   if (c == '[') /* '[' is a special char in a bracket exps.  */
1984     {
1985       unsigned char c2;
1986       int token_len;
1987       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1988 	c2 = re_string_peek_byte (input, 1);
1989       else
1990 	c2 = 0;
1991       token->opr.c = c2;
1992       token_len = 2;
1993       switch (c2)
1994 	{
1995 	case '.':
1996 	  token->type = OP_OPEN_COLL_ELEM;
1997 	  break;
1998 	case '=':
1999 	  token->type = OP_OPEN_EQUIV_CLASS;
2000 	  break;
2001 	case ':':
2002 	  if (syntax & RE_CHAR_CLASSES)
2003 	    {
2004 	      token->type = OP_OPEN_CHAR_CLASS;
2005 	      break;
2006 	    }
2007 	  /* else fall through.  */
2008 	default:
2009 	  token->type = CHARACTER;
2010 	  token->opr.c = c;
2011 	  token_len = 1;
2012 	  break;
2013 	}
2014       return token_len;
2015     }
2016   switch (c)
2017     {
2018     case '-':
2019       token->type = OP_CHARSET_RANGE;
2020       break;
2021     case ']':
2022       token->type = OP_CLOSE_BRACKET;
2023       break;
2024     case '^':
2025       token->type = OP_NON_MATCH_LIST;
2026       break;
2027     default:
2028       token->type = CHARACTER;
2029     }
2030   return 1;
2031 }
2032 
2033 /* Functions for parser.  */
2034 
2035 /* Entry point of the parser.
2036    Parse the regular expression REGEXP and return the structure tree.
2037    If an error is occured, ERR is set by error code, and return NULL.
2038    This function build the following tree, from regular expression <reg_exp>:
2039 	   CAT
2040 	   / \
2041 	  /   \
2042    <reg_exp>  EOR
2043 
2044    CAT means concatenation.
2045    EOR means end of regular expression.  */
2046 
2047 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2048 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2049        reg_errcode_t *err)
2050 {
2051   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2052   bin_tree_t *tree, *eor, *root;
2053   re_token_t current_token;
2054   dfa->syntax = syntax;
2055   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2056   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2057   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2058     return NULL;
2059   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2060   if (tree != NULL)
2061     root = create_tree (dfa, tree, eor, CONCAT);
2062   else
2063     root = eor;
2064   if (BE (eor == NULL || root == NULL, 0))
2065     {
2066       *err = REG_ESPACE;
2067       return NULL;
2068     }
2069   return root;
2070 }
2071 
2072 /* This function build the following tree, from regular expression
2073    <branch1>|<branch2>:
2074 	   ALT
2075 	   / \
2076 	  /   \
2077    <branch1> <branch2>
2078 
2079    ALT means alternative, which represents the operator `|'.  */
2080 
2081 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2082 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2083 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2084 {
2085   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2086   bin_tree_t *tree, *branch = NULL;
2087   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2088   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2089     return NULL;
2090 
2091   while (token->type == OP_ALT)
2092     {
2093       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2094       if (token->type != OP_ALT && token->type != END_OF_RE
2095 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2096 	{
2097 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2098 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2099 	    return NULL;
2100 	}
2101       else
2102 	branch = NULL;
2103       tree = create_tree (dfa, tree, branch, OP_ALT);
2104       if (BE (tree == NULL, 0))
2105 	{
2106 	  *err = REG_ESPACE;
2107 	  return NULL;
2108 	}
2109     }
2110   return tree;
2111 }
2112 
2113 /* This function build the following tree, from regular expression
2114    <exp1><exp2>:
2115 	CAT
2116 	/ \
2117        /   \
2118    <exp1> <exp2>
2119 
2120    CAT means concatenation.  */
2121 
2122 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2123 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2124 	      reg_syntax_t syntax, int nest, reg_errcode_t *err)
2125 {
2126   bin_tree_t *tree, *exp;
2127   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2128   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2129   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2130     return NULL;
2131 
2132   while (token->type != OP_ALT && token->type != END_OF_RE
2133 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2134     {
2135       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2136       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2137 	{
2138 	  return NULL;
2139 	}
2140       if (tree != NULL && exp != NULL)
2141 	{
2142 	  tree = create_tree (dfa, tree, exp, CONCAT);
2143 	  if (tree == NULL)
2144 	    {
2145 	      *err = REG_ESPACE;
2146 	      return NULL;
2147 	    }
2148 	}
2149       else if (tree == NULL)
2150 	tree = exp;
2151       /* Otherwise exp == NULL, we don't need to create new tree.  */
2152     }
2153   return tree;
2154 }
2155 
2156 /* This function build the following tree, from regular expression a*:
2157 	 *
2158 	 |
2159 	 a
2160 */
2161 
2162 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2163 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2164 		  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2165 {
2166   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2167   bin_tree_t *tree;
2168   switch (token->type)
2169     {
2170     case CHARACTER:
2171       tree = create_token_tree (dfa, NULL, NULL, token);
2172       if (BE (tree == NULL, 0))
2173 	{
2174 	  *err = REG_ESPACE;
2175 	  return NULL;
2176 	}
2177 #ifdef RE_ENABLE_I18N
2178       if (dfa->mb_cur_max > 1)
2179 	{
2180 	  while (!re_string_eoi (regexp)
2181 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2182 	    {
2183 	      bin_tree_t *mbc_remain;
2184 	      fetch_token (token, regexp, syntax);
2185 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2186 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2187 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2188 		{
2189 		  *err = REG_ESPACE;
2190 		  return NULL;
2191 		}
2192 	    }
2193 	}
2194 #endif
2195       break;
2196     case OP_OPEN_SUBEXP:
2197       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2198       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2199 	return NULL;
2200       break;
2201     case OP_OPEN_BRACKET:
2202       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2203       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2204 	return NULL;
2205       break;
2206     case OP_BACK_REF:
2207       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2208 	{
2209 	  *err = REG_ESUBREG;
2210 	  return NULL;
2211 	}
2212       dfa->used_bkref_map |= 1 << token->opr.idx;
2213       tree = create_token_tree (dfa, NULL, NULL, token);
2214       if (BE (tree == NULL, 0))
2215 	{
2216 	  *err = REG_ESPACE;
2217 	  return NULL;
2218 	}
2219       ++dfa->nbackref;
2220       dfa->has_mb_node = 1;
2221       break;
2222     case OP_OPEN_DUP_NUM:
2223       if (syntax & RE_CONTEXT_INVALID_DUP)
2224 	{
2225 	  *err = REG_BADRPT;
2226 	  return NULL;
2227 	}
2228       /* FALLTHROUGH */
2229     case OP_DUP_ASTERISK:
2230     case OP_DUP_PLUS:
2231     case OP_DUP_QUESTION:
2232       if (syntax & RE_CONTEXT_INVALID_OPS)
2233 	{
2234 	  *err = REG_BADRPT;
2235 	  return NULL;
2236 	}
2237       else if (syntax & RE_CONTEXT_INDEP_OPS)
2238 	{
2239 	  fetch_token (token, regexp, syntax);
2240 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2241 	}
2242       /* else fall through  */
2243     case OP_CLOSE_SUBEXP:
2244       if ((token->type == OP_CLOSE_SUBEXP) &&
2245 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2246 	{
2247 	  *err = REG_ERPAREN;
2248 	  return NULL;
2249 	}
2250       /* else fall through  */
2251     case OP_CLOSE_DUP_NUM:
2252       /* We treat it as a normal character.  */
2253 
2254       /* Then we can these characters as normal characters.  */
2255       token->type = CHARACTER;
2256       /* mb_partial and word_char bits should be initialized already
2257 	 by peek_token.  */
2258       tree = create_token_tree (dfa, NULL, NULL, token);
2259       if (BE (tree == NULL, 0))
2260 	{
2261 	  *err = REG_ESPACE;
2262 	  return NULL;
2263 	}
2264       break;
2265     case ANCHOR:
2266       if ((token->opr.ctx_type
2267 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2268 	  && dfa->word_ops_used == 0)
2269 	init_word_char (dfa);
2270       if (token->opr.ctx_type == WORD_DELIM
2271           || token->opr.ctx_type == NOT_WORD_DELIM)
2272 	{
2273 	  bin_tree_t *tree_first, *tree_last;
2274 	  if (token->opr.ctx_type == WORD_DELIM)
2275 	    {
2276 	      token->opr.ctx_type = WORD_FIRST;
2277 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2278 	      token->opr.ctx_type = WORD_LAST;
2279             }
2280           else
2281             {
2282 	      token->opr.ctx_type = INSIDE_WORD;
2283 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2284 	      token->opr.ctx_type = INSIDE_NOTWORD;
2285             }
2286 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2287 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2288 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2289 	    {
2290 	      *err = REG_ESPACE;
2291 	      return NULL;
2292 	    }
2293 	}
2294       else
2295 	{
2296 	  tree = create_token_tree (dfa, NULL, NULL, token);
2297 	  if (BE (tree == NULL, 0))
2298 	    {
2299 	      *err = REG_ESPACE;
2300 	      return NULL;
2301 	    }
2302 	}
2303       /* We must return here, since ANCHORs can't be followed
2304 	 by repetition operators.
2305 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2306 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2307       fetch_token (token, regexp, syntax);
2308       return tree;
2309     case OP_PERIOD:
2310       tree = create_token_tree (dfa, NULL, NULL, token);
2311       if (BE (tree == NULL, 0))
2312 	{
2313 	  *err = REG_ESPACE;
2314 	  return NULL;
2315 	}
2316       if (dfa->mb_cur_max > 1)
2317 	dfa->has_mb_node = 1;
2318       break;
2319     case OP_WORD:
2320     case OP_NOTWORD:
2321       tree = build_charclass_op (dfa, regexp->trans,
2322 				 (const unsigned char *) "alnum",
2323 				 (const unsigned char *) "_",
2324 				 token->type == OP_NOTWORD, err);
2325       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2326 	return NULL;
2327       break;
2328     case OP_SPACE:
2329     case OP_NOTSPACE:
2330       tree = build_charclass_op (dfa, regexp->trans,
2331 				 (const unsigned char *) "space",
2332 				 (const unsigned char *) "",
2333 				 token->type == OP_NOTSPACE, err);
2334       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2335 	return NULL;
2336       break;
2337     case OP_ALT:
2338     case END_OF_RE:
2339       return NULL;
2340     case BACK_SLASH:
2341       *err = REG_EESCAPE;
2342       return NULL;
2343     default:
2344       /* Must not happen?  */
2345 #ifdef DEBUG
2346       assert (0);
2347 #endif
2348       return NULL;
2349     }
2350   fetch_token (token, regexp, syntax);
2351 
2352   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2353 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2354     {
2355       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2356       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2357 	return NULL;
2358       /* In BRE consecutive duplications are not allowed.  */
2359       if ((syntax & RE_CONTEXT_INVALID_DUP)
2360 	  && (token->type == OP_DUP_ASTERISK
2361 	      || token->type == OP_OPEN_DUP_NUM))
2362 	{
2363 	  *err = REG_BADRPT;
2364 	  return NULL;
2365 	}
2366     }
2367 
2368   return tree;
2369 }
2370 
2371 /* This function build the following tree, from regular expression
2372    (<reg_exp>):
2373 	 SUBEXP
2374 	    |
2375 	<reg_exp>
2376 */
2377 
2378 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2379 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2380 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2381 {
2382   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2383   bin_tree_t *tree;
2384   size_t cur_nsub;
2385   cur_nsub = preg->re_nsub++;
2386 
2387   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2388 
2389   /* The subexpression may be a null string.  */
2390   if (token->type == OP_CLOSE_SUBEXP)
2391     tree = NULL;
2392   else
2393     {
2394       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2395       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2396         *err = REG_EPAREN;
2397       if (BE (*err != REG_NOERROR, 0))
2398 	return NULL;
2399     }
2400 
2401   if (cur_nsub <= '9' - '1')
2402     dfa->completed_bkref_map |= 1 << cur_nsub;
2403 
2404   tree = create_tree (dfa, tree, NULL, SUBEXP);
2405   if (BE (tree == NULL, 0))
2406     {
2407       *err = REG_ESPACE;
2408       return NULL;
2409     }
2410   tree->token.opr.idx = cur_nsub;
2411   return tree;
2412 }
2413 
2414 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2415 
2416 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2417 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2418 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2419 {
2420   bin_tree_t *tree = NULL, *old_tree = NULL;
2421   int i, start, end, start_idx = re_string_cur_idx (regexp);
2422   re_token_t start_token = *token;
2423 
2424   if (token->type == OP_OPEN_DUP_NUM)
2425     {
2426       end = 0;
2427       start = fetch_number (regexp, token, syntax);
2428       if (start == -1)
2429 	{
2430 	  if (token->type == CHARACTER && token->opr.c == ',')
2431 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2432 	  else
2433 	    {
2434 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2435 	      return NULL;
2436 	    }
2437 	}
2438       if (BE (start != -2, 1))
2439 	{
2440 	  /* We treat "{n}" as "{n,n}".  */
2441 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2442 		 : ((token->type == CHARACTER && token->opr.c == ',')
2443 		    ? fetch_number (regexp, token, syntax) : -2));
2444 	}
2445       if (BE (start == -2 || end == -2, 0))
2446 	{
2447 	  /* Invalid sequence.  */
2448 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2449 	    {
2450 	      if (token->type == END_OF_RE)
2451 		*err = REG_EBRACE;
2452 	      else
2453 		*err = REG_BADBR;
2454 
2455 	      return NULL;
2456 	    }
2457 
2458 	  /* If the syntax bit is set, rollback.  */
2459 	  re_string_set_index (regexp, start_idx);
2460 	  *token = start_token;
2461 	  token->type = CHARACTER;
2462 	  /* mb_partial and word_char bits should be already initialized by
2463 	     peek_token.  */
2464 	  return elem;
2465 	}
2466 
2467       if (BE (end != -1 && start > end, 0))
2468 	{
2469 	  /* First number greater than second.  */
2470 	  *err = REG_BADBR;
2471 	  return NULL;
2472 	}
2473     }
2474   else
2475     {
2476       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2477       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2478     }
2479 
2480   fetch_token (token, regexp, syntax);
2481 
2482   if (BE (elem == NULL, 0))
2483     return NULL;
2484   if (BE (start == 0 && end == 0, 0))
2485     {
2486       postorder (elem, free_tree, NULL);
2487       return NULL;
2488     }
2489 
2490   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2491   if (BE (start > 0, 0))
2492     {
2493       tree = elem;
2494       for (i = 2; i <= start; ++i)
2495 	{
2496 	  elem = duplicate_tree (elem, dfa);
2497 	  tree = create_tree (dfa, tree, elem, CONCAT);
2498 	  if (BE (elem == NULL || tree == NULL, 0))
2499 	    goto parse_dup_op_espace;
2500 	}
2501 
2502       if (start == end)
2503 	return tree;
2504 
2505       /* Duplicate ELEM before it is marked optional.  */
2506       elem = duplicate_tree (elem, dfa);
2507       old_tree = tree;
2508     }
2509   else
2510     old_tree = NULL;
2511 
2512   if (elem->token.type == SUBEXP)
2513     postorder (elem, mark_opt_subexp, (void *) (ptrdiff_t) elem->token.opr.idx);
2514 
2515   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2516   if (BE (tree == NULL, 0))
2517     goto parse_dup_op_espace;
2518 
2519   /* This loop is actually executed only when end != -1,
2520      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2521      already created the start+1-th copy.  */
2522   for (i = start + 2; i <= end; ++i)
2523     {
2524       elem = duplicate_tree (elem, dfa);
2525       tree = create_tree (dfa, tree, elem, CONCAT);
2526       if (BE (elem == NULL || tree == NULL, 0))
2527         goto parse_dup_op_espace;
2528 
2529       tree = create_tree (dfa, tree, NULL, OP_ALT);
2530       if (BE (tree == NULL, 0))
2531         goto parse_dup_op_espace;
2532     }
2533 
2534   if (old_tree)
2535     tree = create_tree (dfa, old_tree, tree, CONCAT);
2536 
2537   return tree;
2538 
2539  parse_dup_op_espace:
2540   *err = REG_ESPACE;
2541   return NULL;
2542 }
2543 
2544 /* Size of the names for collating symbol/equivalence_class/character_class.
2545    I'm not sure, but maybe enough.  */
2546 #define BRACKET_NAME_BUF_SIZE 32
2547 
2548 #ifndef _LIBC
2549   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2550      Build the range expression which starts from START_ELEM, and ends
2551      at END_ELEM.  The result are written to MBCSET and SBCSET.
2552      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2553      mbcset->range_ends, is a pointer argument sinse we may
2554      update it.  */
2555 
2556 static reg_errcode_t
2557 internal_function
2558 # ifdef RE_ENABLE_I18N
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,int * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2559 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2560 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2561 # else /* not RE_ENABLE_I18N */
2562 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2563 		 bracket_elem_t *end_elem)
2564 # endif /* not RE_ENABLE_I18N */
2565 {
2566   unsigned int start_ch, end_ch;
2567   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2568   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2569 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2570 	  0))
2571     return REG_ERANGE;
2572 
2573   /* We can handle no multi character collating elements without libc
2574      support.  */
2575   if (BE ((start_elem->type == COLL_SYM
2576 	   && strlen ((char *) start_elem->opr.name) > 1)
2577 	  || (end_elem->type == COLL_SYM
2578 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2579     return REG_ECOLLATE;
2580 
2581 # ifdef RE_ENABLE_I18N
2582   {
2583     wchar_t wc;
2584     wint_t start_wc;
2585     wint_t end_wc;
2586     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2587 
2588     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2589 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2590 		   : 0));
2591     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2592 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2593 		 : 0));
2594     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2595 		? __btowc (start_ch) : start_elem->opr.wch);
2596     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2597 	      ? __btowc (end_ch) : end_elem->opr.wch);
2598     if (start_wc == WEOF || end_wc == WEOF)
2599       return REG_ECOLLATE;
2600     cmp_buf[0] = start_wc;
2601     cmp_buf[4] = end_wc;
2602     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2603       return REG_ERANGE;
2604 
2605     /* Got valid collation sequence values, add them as a new entry.
2606        However, for !_LIBC we have no collation elements: if the
2607        character set is single byte, the single byte character set
2608        that we build below suffices.  parse_bracket_exp passes
2609        no MBCSET if dfa->mb_cur_max == 1.  */
2610     if (mbcset)
2611       {
2612         /* Check the space of the arrays.  */
2613         if (BE (*range_alloc == mbcset->nranges, 0))
2614           {
2615 	    /* There is not enough space, need realloc.  */
2616 	    wchar_t *new_array_start, *new_array_end;
2617 	    int new_nranges;
2618 
2619 	    /* +1 in case of mbcset->nranges is 0.  */
2620 	    new_nranges = 2 * mbcset->nranges + 1;
2621 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2622 	       are NULL if *range_alloc == 0.  */
2623 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2624 				          new_nranges);
2625 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2626 				        new_nranges);
2627 
2628 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2629 	      return REG_ESPACE;
2630 
2631 	    mbcset->range_starts = new_array_start;
2632 	    mbcset->range_ends = new_array_end;
2633 	    *range_alloc = new_nranges;
2634           }
2635 
2636         mbcset->range_starts[mbcset->nranges] = start_wc;
2637         mbcset->range_ends[mbcset->nranges++] = end_wc;
2638       }
2639 
2640     /* Build the table for single byte characters.  */
2641     for (wc = 0; wc < SBC_MAX; ++wc)
2642       {
2643 	cmp_buf[2] = wc;
2644 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2645 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2646 	  bitset_set (sbcset, wc);
2647       }
2648   }
2649 # else /* not RE_ENABLE_I18N */
2650   {
2651     unsigned int ch;
2652     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2653 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2654 		   : 0));
2655     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2656 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2657 		 : 0));
2658     if (start_ch > end_ch)
2659       return REG_ERANGE;
2660     /* Build the table for single byte characters.  */
2661     for (ch = 0; ch < SBC_MAX; ++ch)
2662       if (start_ch <= ch  && ch <= end_ch)
2663 	bitset_set (sbcset, ch);
2664   }
2665 # endif /* not RE_ENABLE_I18N */
2666   return REG_NOERROR;
2667 }
2668 #endif /* not _LIBC */
2669 
2670 #ifndef _LIBC
2671 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2672    Build the collating element which is represented by NAME.
2673    The result are written to MBCSET and SBCSET.
2674    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2675    pointer argument since we may update it.  */
2676 
2677 static reg_errcode_t
2678 internal_function
2679 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,int * coll_sym_alloc,const unsigned char * name)2680 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2681 			int *coll_sym_alloc, const unsigned char *name)
2682 # else /* not RE_ENABLE_I18N */
2683 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2684 # endif /* not RE_ENABLE_I18N */
2685 {
2686   size_t name_len = strlen ((const char *) name);
2687   if (BE (name_len != 1, 0))
2688     return REG_ECOLLATE;
2689   else
2690     {
2691       bitset_set (sbcset, name[0]);
2692       return REG_NOERROR;
2693     }
2694 }
2695 #endif /* not _LIBC */
2696 
2697 /* This function parse bracket expression like "[abc]", "[a-c]",
2698    "[[.a-a.]]" etc.  */
2699 
2700 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2701 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2702 		   reg_syntax_t syntax, reg_errcode_t *err)
2703 {
2704 #ifdef _LIBC
2705   const unsigned char *collseqmb;
2706   const char *collseqwc;
2707   uint32_t nrules;
2708   int32_t table_size;
2709   const int32_t *symb_table;
2710   const unsigned char *extra;
2711 
2712   /* Local function for parse_bracket_exp used in _LIBC environement.
2713      Seek the collating symbol entry correspondings to NAME.
2714      Return the index of the symbol in the SYMB_TABLE.  */
2715 
2716   auto inline int32_t
2717   __attribute ((always_inline))
2718   seek_collating_symbol_entry (name, name_len)
2719 	 const unsigned char *name;
2720 	 size_t name_len;
2721     {
2722       int32_t hash = elem_hash ((const char *) name, name_len);
2723       int32_t elem = hash % table_size;
2724       if (symb_table[2 * elem] != 0)
2725 	{
2726 	  int32_t second = hash % (table_size - 2) + 1;
2727 
2728 	  do
2729 	    {
2730 	      /* First compare the hashing value.  */
2731 	      if (symb_table[2 * elem] == hash
2732 		  /* Compare the length of the name.  */
2733 		  && name_len == extra[symb_table[2 * elem + 1]]
2734 		  /* Compare the name.  */
2735 		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2736 			     name_len) == 0)
2737 		{
2738 		  /* Yep, this is the entry.  */
2739 		  break;
2740 		}
2741 
2742 	      /* Next entry.  */
2743 	      elem += second;
2744 	    }
2745 	  while (symb_table[2 * elem] != 0);
2746 	}
2747       return elem;
2748     }
2749 
2750   /* Local function for parse_bracket_exp used in _LIBC environment.
2751      Look up the collation sequence value of BR_ELEM.
2752      Return the value if succeeded, UINT_MAX otherwise.  */
2753 
2754   auto inline unsigned int
2755   __attribute ((always_inline))
2756   lookup_collation_sequence_value (br_elem)
2757 	 bracket_elem_t *br_elem;
2758     {
2759       if (br_elem->type == SB_CHAR)
2760 	{
2761 	  /*
2762 	  if (MB_CUR_MAX == 1)
2763 	  */
2764 	  if (nrules == 0)
2765 	    return collseqmb[br_elem->opr.ch];
2766 	  else
2767 	    {
2768 	      wint_t wc = __btowc (br_elem->opr.ch);
2769 	      return __collseq_table_lookup (collseqwc, wc);
2770 	    }
2771 	}
2772       else if (br_elem->type == MB_CHAR)
2773 	{
2774 	  if (nrules != 0)
2775 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2776 	}
2777       else if (br_elem->type == COLL_SYM)
2778 	{
2779 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2780 	  if (nrules != 0)
2781 	    {
2782 	      int32_t elem, idx;
2783 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2784 						  sym_name_len);
2785 	      if (symb_table[2 * elem] != 0)
2786 		{
2787 		  /* We found the entry.  */
2788 		  idx = symb_table[2 * elem + 1];
2789 		  /* Skip the name of collating element name.  */
2790 		  idx += 1 + extra[idx];
2791 		  /* Skip the byte sequence of the collating element.  */
2792 		  idx += 1 + extra[idx];
2793 		  /* Adjust for the alignment.  */
2794 		  idx = (idx + 3) & ~3;
2795 		  /* Skip the multibyte collation sequence value.  */
2796 		  idx += sizeof (unsigned int);
2797 		  /* Skip the wide char sequence of the collating element.  */
2798 		  idx += sizeof (unsigned int) *
2799 		    (1 + *(unsigned int *) (extra + idx));
2800 		  /* Return the collation sequence value.  */
2801 		  return *(unsigned int *) (extra + idx);
2802 		}
2803 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2804 		{
2805 		  /* No valid character.  Match it as a single byte
2806 		     character.  */
2807 		  return collseqmb[br_elem->opr.name[0]];
2808 		}
2809 	    }
2810 	  else if (sym_name_len == 1)
2811 	    return collseqmb[br_elem->opr.name[0]];
2812 	}
2813       return UINT_MAX;
2814     }
2815 
2816   /* Local function for parse_bracket_exp used in _LIBC environement.
2817      Build the range expression which starts from START_ELEM, and ends
2818      at END_ELEM.  The result are written to MBCSET and SBCSET.
2819      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2820      mbcset->range_ends, is a pointer argument sinse we may
2821      update it.  */
2822 
2823   auto inline reg_errcode_t
2824   __attribute ((always_inline))
2825   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2826 	 re_charset_t *mbcset;
2827 	 int *range_alloc;
2828 	 bitset_t sbcset;
2829 	 bracket_elem_t *start_elem, *end_elem;
2830     {
2831       unsigned int ch;
2832       uint32_t start_collseq;
2833       uint32_t end_collseq;
2834 
2835       /* Equivalence Classes and Character Classes can't be a range
2836 	 start/end.  */
2837       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2838 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2839 	      0))
2840 	return REG_ERANGE;
2841 
2842       start_collseq = lookup_collation_sequence_value (start_elem);
2843       end_collseq = lookup_collation_sequence_value (end_elem);
2844       /* Check start/end collation sequence values.  */
2845       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2846 	return REG_ECOLLATE;
2847       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2848 	return REG_ERANGE;
2849 
2850       /* Got valid collation sequence values, add them as a new entry.
2851 	 However, if we have no collation elements, and the character set
2852 	 is single byte, the single byte character set that we
2853 	 build below suffices. */
2854       if (nrules > 0 || dfa->mb_cur_max > 1)
2855 	{
2856           /* Check the space of the arrays.  */
2857           if (BE (*range_alloc == mbcset->nranges, 0))
2858 	    {
2859 	      /* There is not enough space, need realloc.  */
2860 	      uint32_t *new_array_start;
2861 	      uint32_t *new_array_end;
2862 	      int new_nranges;
2863 
2864 	      /* +1 in case of mbcset->nranges is 0.  */
2865 	      new_nranges = 2 * mbcset->nranges + 1;
2866 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2867 					    new_nranges);
2868 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2869 				          new_nranges);
2870 
2871 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2872 	        return REG_ESPACE;
2873 
2874 	      mbcset->range_starts = new_array_start;
2875 	      mbcset->range_ends = new_array_end;
2876 	      *range_alloc = new_nranges;
2877 	    }
2878 
2879           mbcset->range_starts[mbcset->nranges] = start_collseq;
2880           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2881 	}
2882 
2883       /* Build the table for single byte characters.  */
2884       for (ch = 0; ch < SBC_MAX; ch++)
2885 	{
2886 	  uint32_t ch_collseq;
2887 	  /*
2888 	  if (MB_CUR_MAX == 1)
2889 	  */
2890 	  if (nrules == 0)
2891 	    ch_collseq = collseqmb[ch];
2892 	  else
2893 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2894 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2895 	    bitset_set (sbcset, ch);
2896 	}
2897       return REG_NOERROR;
2898     }
2899 
2900   /* Local function for parse_bracket_exp used in _LIBC environement.
2901      Build the collating element which is represented by NAME.
2902      The result are written to MBCSET and SBCSET.
2903      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2904      pointer argument sinse we may update it.  */
2905 
2906   auto inline reg_errcode_t
2907   __attribute ((always_inline))
2908   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2909 	 re_charset_t *mbcset;
2910 	 int *coll_sym_alloc;
2911 	 bitset_t sbcset;
2912 	 const unsigned char *name;
2913     {
2914       int32_t elem, idx;
2915       size_t name_len = strlen ((const char *) name);
2916       if (nrules != 0)
2917 	{
2918 	  elem = seek_collating_symbol_entry (name, name_len);
2919 	  if (symb_table[2 * elem] != 0)
2920 	    {
2921 	      /* We found the entry.  */
2922 	      idx = symb_table[2 * elem + 1];
2923 	      /* Skip the name of collating element name.  */
2924 	      idx += 1 + extra[idx];
2925 	    }
2926 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
2927 	    {
2928 	      /* No valid character, treat it as a normal
2929 		 character.  */
2930 	      bitset_set (sbcset, name[0]);
2931 	      return REG_NOERROR;
2932 	    }
2933 	  else
2934 	    return REG_ECOLLATE;
2935 
2936 	  /* Got valid collation sequence, add it as a new entry.  */
2937 	  /* Check the space of the arrays.  */
2938 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2939 	    {
2940 	      /* Not enough, realloc it.  */
2941 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
2942 	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2943 	      /* Use realloc since mbcset->coll_syms is NULL
2944 		 if *alloc == 0.  */
2945 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2946 						   new_coll_sym_alloc);
2947 	      if (BE (new_coll_syms == NULL, 0))
2948 		return REG_ESPACE;
2949 	      mbcset->coll_syms = new_coll_syms;
2950 	      *coll_sym_alloc = new_coll_sym_alloc;
2951 	    }
2952 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2953 	  return REG_NOERROR;
2954 	}
2955       else
2956 	{
2957 	  if (BE (name_len != 1, 0))
2958 	    return REG_ECOLLATE;
2959 	  else
2960 	    {
2961 	      bitset_set (sbcset, name[0]);
2962 	      return REG_NOERROR;
2963 	    }
2964 	}
2965     }
2966 #endif
2967 
2968   re_token_t br_token;
2969   re_bitset_ptr_t sbcset;
2970 #ifdef RE_ENABLE_I18N
2971   re_charset_t *mbcset;
2972   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2973   int equiv_class_alloc = 0, char_class_alloc = 0;
2974 #endif /* not RE_ENABLE_I18N */
2975   int non_match = 0;
2976   bin_tree_t *work_tree;
2977   int token_len;
2978   int first_round = 1;
2979 #ifdef _LIBC
2980   collseqmb = (const unsigned char *)
2981     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2982   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2983   if (nrules)
2984     {
2985       /*
2986       if (MB_CUR_MAX > 1)
2987       */
2988       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2989       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2990       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2991 						  _NL_COLLATE_SYMB_TABLEMB);
2992       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2993 						   _NL_COLLATE_SYMB_EXTRAMB);
2994     }
2995 #endif
2996   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
2997 #ifdef RE_ENABLE_I18N
2998   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2999 #endif /* RE_ENABLE_I18N */
3000 #ifdef RE_ENABLE_I18N
3001   if (BE (sbcset == NULL || mbcset == NULL, 0))
3002 #else
3003   if (BE (sbcset == NULL, 0))
3004 #endif /* RE_ENABLE_I18N */
3005     {
3006       *err = REG_ESPACE;
3007       return NULL;
3008     }
3009 
3010   token_len = peek_token_bracket (token, regexp, syntax);
3011   if (BE (token->type == END_OF_RE, 0))
3012     {
3013       *err = REG_BADPAT;
3014       goto parse_bracket_exp_free_return;
3015     }
3016   if (token->type == OP_NON_MATCH_LIST)
3017     {
3018 #ifdef RE_ENABLE_I18N
3019       mbcset->non_match = 1;
3020 #endif /* not RE_ENABLE_I18N */
3021       non_match = 1;
3022       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3023 	bitset_set (sbcset, '\n');
3024       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3025       token_len = peek_token_bracket (token, regexp, syntax);
3026       if (BE (token->type == END_OF_RE, 0))
3027 	{
3028 	  *err = REG_BADPAT;
3029 	  goto parse_bracket_exp_free_return;
3030 	}
3031     }
3032 
3033   /* We treat the first ']' as a normal character.  */
3034   if (token->type == OP_CLOSE_BRACKET)
3035     token->type = CHARACTER;
3036 
3037   while (1)
3038     {
3039       // Got warnings about being used uninitialized.
3040       // bracket_elem_t start_elem, end_elem;
3041       bracket_elem_t start_elem = {.type=0, .opr.name=NULL}, end_elem = {.type=0, .opr.name=NULL};
3042       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3043       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3044       reg_errcode_t ret;
3045       int token_len2 = 0, is_range_exp = 0;
3046       re_token_t token2;
3047 
3048       start_elem.opr.name = start_name_buf;
3049       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3050 				   syntax, first_round);
3051       if (BE (ret != REG_NOERROR, 0))
3052 	{
3053 	  *err = ret;
3054 	  goto parse_bracket_exp_free_return;
3055 	}
3056       first_round = 0;
3057 
3058       /* Get information about the next token.  We need it in any case.  */
3059       token_len = peek_token_bracket (token, regexp, syntax);
3060 
3061       /* Do not check for ranges if we know they are not allowed.  */
3062       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3063 	{
3064 	  if (BE (token->type == END_OF_RE, 0))
3065 	    {
3066 	      *err = REG_EBRACK;
3067 	      goto parse_bracket_exp_free_return;
3068 	    }
3069 	  if (token->type == OP_CHARSET_RANGE)
3070 	    {
3071 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3072 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3073 	      if (BE (token2.type == END_OF_RE, 0))
3074 		{
3075 		  *err = REG_EBRACK;
3076 		  goto parse_bracket_exp_free_return;
3077 		}
3078 	      if (token2.type == OP_CLOSE_BRACKET)
3079 		{
3080 		  /* We treat the last '-' as a normal character.  */
3081 		  re_string_skip_bytes (regexp, -token_len);
3082 		  token->type = CHARACTER;
3083 		}
3084 	      else
3085 		is_range_exp = 1;
3086 	    }
3087 	}
3088 
3089       if (is_range_exp == 1)
3090 	{
3091 	  end_elem.opr.name = end_name_buf;
3092 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3093 				       dfa, syntax, 1);
3094 	  if (BE (ret != REG_NOERROR, 0))
3095 	    {
3096 	      *err = ret;
3097 	      goto parse_bracket_exp_free_return;
3098 	    }
3099 
3100 	  token_len = peek_token_bracket (token, regexp, syntax);
3101 
3102 #ifdef _LIBC
3103 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3104 				  &start_elem, &end_elem);
3105 #else
3106 # ifdef RE_ENABLE_I18N
3107 	  *err = build_range_exp (sbcset,
3108 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3109 				  &range_alloc, &start_elem, &end_elem);
3110 # else
3111 	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3112 # endif
3113 #endif /* RE_ENABLE_I18N */
3114 	  if (BE (*err != REG_NOERROR, 0))
3115 	    goto parse_bracket_exp_free_return;
3116 	}
3117       else
3118 	{
3119 	  switch (start_elem.type)
3120 	    {
3121 	    case SB_CHAR:
3122 	      bitset_set (sbcset, start_elem.opr.ch);
3123 	      break;
3124 #ifdef RE_ENABLE_I18N
3125 	    case MB_CHAR:
3126 	      /* Check whether the array has enough space.  */
3127 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3128 		{
3129 		  wchar_t *new_mbchars;
3130 		  /* Not enough, realloc it.  */
3131 		  /* +1 in case of mbcset->nmbchars is 0.  */
3132 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3133 		  /* Use realloc since array is NULL if *alloc == 0.  */
3134 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3135 					    mbchar_alloc);
3136 		  if (BE (new_mbchars == NULL, 0))
3137 		    goto parse_bracket_exp_espace;
3138 		  mbcset->mbchars = new_mbchars;
3139 		}
3140 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3141 	      break;
3142 #endif /* RE_ENABLE_I18N */
3143 	    case EQUIV_CLASS:
3144 	      *err = build_equiv_class (sbcset,
3145 #ifdef RE_ENABLE_I18N
3146 					mbcset, &equiv_class_alloc,
3147 #endif /* RE_ENABLE_I18N */
3148 					start_elem.opr.name);
3149 	      if (BE (*err != REG_NOERROR, 0))
3150 		goto parse_bracket_exp_free_return;
3151 	      break;
3152 	    case COLL_SYM:
3153 	      *err = build_collating_symbol (sbcset,
3154 #ifdef RE_ENABLE_I18N
3155 					     mbcset, &coll_sym_alloc,
3156 #endif /* RE_ENABLE_I18N */
3157 					     start_elem.opr.name);
3158 	      if (BE (*err != REG_NOERROR, 0))
3159 		goto parse_bracket_exp_free_return;
3160 	      break;
3161 	    case CHAR_CLASS:
3162 	      *err = build_charclass (regexp->trans, sbcset,
3163 #ifdef RE_ENABLE_I18N
3164 				      mbcset, &char_class_alloc,
3165 #endif /* RE_ENABLE_I18N */
3166 				      start_elem.opr.name, syntax);
3167 	      if (BE (*err != REG_NOERROR, 0))
3168 	       goto parse_bracket_exp_free_return;
3169 	      break;
3170 	    default:
3171 	      assert (0);
3172 	      break;
3173 	    }
3174 	}
3175       if (BE (token->type == END_OF_RE, 0))
3176 	{
3177 	  *err = REG_EBRACK;
3178 	  goto parse_bracket_exp_free_return;
3179 	}
3180       if (token->type == OP_CLOSE_BRACKET)
3181 	break;
3182     }
3183 
3184   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3185 
3186   /* If it is non-matching list.  */
3187   if (non_match)
3188     bitset_not (sbcset);
3189 
3190 #ifdef RE_ENABLE_I18N
3191   /* Ensure only single byte characters are set.  */
3192   if (dfa->mb_cur_max > 1)
3193     bitset_mask (sbcset, dfa->sb_char);
3194 
3195   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3196       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3197 						     || mbcset->non_match)))
3198     {
3199       bin_tree_t *mbc_tree;
3200       int sbc_idx;
3201       /* Build a tree for complex bracket.  */
3202       dfa->has_mb_node = 1;
3203       br_token.type = COMPLEX_BRACKET;
3204       br_token.opr.mbcset = mbcset;
3205       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3206       if (BE (mbc_tree == NULL, 0))
3207 	goto parse_bracket_exp_espace;
3208       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3209 	if (sbcset[sbc_idx])
3210 	  break;
3211       /* If there are no bits set in sbcset, there is no point
3212 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3213       if (sbc_idx < BITSET_WORDS)
3214 	{
3215           /* Build a tree for simple bracket.  */
3216           br_token.type = SIMPLE_BRACKET;
3217           br_token.opr.sbcset = sbcset;
3218           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3219           if (BE (work_tree == NULL, 0))
3220             goto parse_bracket_exp_espace;
3221 
3222           /* Then join them by ALT node.  */
3223           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3224           if (BE (work_tree == NULL, 0))
3225             goto parse_bracket_exp_espace;
3226 	}
3227       else
3228 	{
3229 	  re_free (sbcset);
3230 	  work_tree = mbc_tree;
3231 	}
3232     }
3233   else
3234 #endif /* not RE_ENABLE_I18N */
3235     {
3236 #ifdef RE_ENABLE_I18N
3237       free_charset (mbcset);
3238 #endif
3239       /* Build a tree for simple bracket.  */
3240       br_token.type = SIMPLE_BRACKET;
3241       br_token.opr.sbcset = sbcset;
3242       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3243       if (BE (work_tree == NULL, 0))
3244         goto parse_bracket_exp_espace;
3245     }
3246   return work_tree;
3247 
3248  parse_bracket_exp_espace:
3249   *err = REG_ESPACE;
3250  parse_bracket_exp_free_return:
3251   re_free (sbcset);
3252 #ifdef RE_ENABLE_I18N
3253   free_charset (mbcset);
3254 #endif /* RE_ENABLE_I18N */
3255   return NULL;
3256 }
3257 
3258 /* Parse an element in the bracket expression.  */
3259 
3260 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,int accept_hyphen)3261 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3262 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3263 		       reg_syntax_t syntax, int accept_hyphen)
3264 {
3265 #ifdef RE_ENABLE_I18N
3266   int cur_char_size;
3267   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3268   if (cur_char_size > 1)
3269     {
3270       elem->type = MB_CHAR;
3271       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3272       re_string_skip_bytes (regexp, cur_char_size);
3273       return REG_NOERROR;
3274     }
3275 #endif /* RE_ENABLE_I18N */
3276   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3277   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3278       || token->type == OP_OPEN_EQUIV_CLASS)
3279     return parse_bracket_symbol (elem, regexp, token);
3280   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3281     {
3282       /* A '-' must only appear as anything but a range indicator before
3283 	 the closing bracket.  Everything else is an error.  */
3284       re_token_t token2;
3285       (void) peek_token_bracket (&token2, regexp, syntax);
3286       if (token2.type != OP_CLOSE_BRACKET)
3287 	/* The actual error value is not standardized since this whole
3288 	   case is undefined.  But ERANGE makes good sense.  */
3289 	return REG_ERANGE;
3290     }
3291   elem->type = SB_CHAR;
3292   elem->opr.ch = token->opr.c;
3293   return REG_NOERROR;
3294 }
3295 
3296 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3297    such as [:<character_class>:], [.<collating_element>.], and
3298    [=<equivalent_class>=].  */
3299 
3300 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3301 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3302 		      re_token_t *token)
3303 {
3304   unsigned char ch, delim = token->opr.c;
3305   int i = 0;
3306   if (re_string_eoi(regexp))
3307     return REG_EBRACK;
3308   for (;; ++i)
3309     {
3310       if (i >= BRACKET_NAME_BUF_SIZE)
3311 	return REG_EBRACK;
3312       if (token->type == OP_OPEN_CHAR_CLASS)
3313 	ch = re_string_fetch_byte_case (regexp);
3314       else
3315 	ch = re_string_fetch_byte (regexp);
3316       if (re_string_eoi(regexp))
3317 	return REG_EBRACK;
3318       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3319 	break;
3320       elem->opr.name[i] = ch;
3321     }
3322   re_string_skip_bytes (regexp, 1);
3323   elem->opr.name[i] = '\0';
3324   switch (token->type)
3325     {
3326     case OP_OPEN_COLL_ELEM:
3327       elem->type = COLL_SYM;
3328       break;
3329     case OP_OPEN_EQUIV_CLASS:
3330       elem->type = EQUIV_CLASS;
3331       break;
3332     case OP_OPEN_CHAR_CLASS:
3333       elem->type = CHAR_CLASS;
3334       break;
3335     default:
3336       break;
3337     }
3338   return REG_NOERROR;
3339 }
3340 
3341   /* Helper function for parse_bracket_exp.
3342      Build the equivalence class which is represented by NAME.
3343      The result are written to MBCSET and SBCSET.
3344      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3345      is a pointer argument sinse we may update it.  */
3346 
3347 static reg_errcode_t
3348 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,int * equiv_class_alloc,const unsigned char * name)3349 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3350 		   int *equiv_class_alloc, const unsigned char *name)
3351 #else /* not RE_ENABLE_I18N */
3352 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3353 #endif /* not RE_ENABLE_I18N */
3354 {
3355 #ifdef _LIBC
3356   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3357   if (nrules != 0)
3358     {
3359       const int32_t *table, *indirect;
3360       const unsigned char *weights, *extra, *cp;
3361       unsigned char char_buf[2];
3362       int32_t idx1, idx2;
3363       unsigned int ch;
3364       size_t len;
3365       /* This #include defines a local function!  */
3366 # include <locale/weight.h>
3367       /* Calculate the index for equivalence class.  */
3368       cp = name;
3369       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3370       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3371 					       _NL_COLLATE_WEIGHTMB);
3372       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3373 						   _NL_COLLATE_EXTRAMB);
3374       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3375 						_NL_COLLATE_INDIRECTMB);
3376       idx1 = findidx (&cp);
3377       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3378 	/* This isn't a valid character.  */
3379 	return REG_ECOLLATE;
3380 
3381       /* Build single byte matcing table for this equivalence class.  */
3382       char_buf[1] = (unsigned char) '\0';
3383       len = weights[idx1 & 0xffffff];
3384       for (ch = 0; ch < SBC_MAX; ++ch)
3385 	{
3386 	  char_buf[0] = ch;
3387 	  cp = char_buf;
3388 	  idx2 = findidx (&cp);
3389 /*
3390 	  idx2 = table[ch];
3391 */
3392 	  if (idx2 == 0)
3393 	    /* This isn't a valid character.  */
3394 	    continue;
3395 	  /* Compare only if the length matches and the collation rule
3396 	     index is the same.  */
3397 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3398 	    {
3399 	      int cnt = 0;
3400 
3401 	      while (cnt <= len &&
3402 		     weights[(idx1 & 0xffffff) + 1 + cnt]
3403 		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3404 		++cnt;
3405 
3406 	      if (cnt > len)
3407 		bitset_set (sbcset, ch);
3408 	    }
3409 	}
3410       /* Check whether the array has enough space.  */
3411       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3412 	{
3413 	  /* Not enough, realloc it.  */
3414 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3415 	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3416 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3417 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3418 						   int32_t,
3419 						   new_equiv_class_alloc);
3420 	  if (BE (new_equiv_classes == NULL, 0))
3421 	    return REG_ESPACE;
3422 	  mbcset->equiv_classes = new_equiv_classes;
3423 	  *equiv_class_alloc = new_equiv_class_alloc;
3424 	}
3425       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3426     }
3427   else
3428 #endif /* _LIBC */
3429     {
3430       if (BE (strlen ((const char *) name) != 1, 0))
3431 	return REG_ECOLLATE;
3432       bitset_set (sbcset, *name);
3433     }
3434   return REG_NOERROR;
3435 }
3436 
3437   /* Helper function for parse_bracket_exp.
3438      Build the character class which is represented by NAME.
3439      The result are written to MBCSET and SBCSET.
3440      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3441      is a pointer argument sinse we may update it.  */
3442 
3443 static reg_errcode_t
3444 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,int * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3445 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3446 		 re_charset_t *mbcset, int *char_class_alloc,
3447 		 const unsigned char *class_name, reg_syntax_t syntax)
3448 #else /* not RE_ENABLE_I18N */
3449 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3450 		 const unsigned char *class_name, reg_syntax_t syntax)
3451 #endif /* not RE_ENABLE_I18N */
3452 {
3453   int i;
3454   const char *name = (const char *) class_name;
3455 
3456   /* In case of REG_ICASE "upper" and "lower" match the both of
3457      upper and lower cases.  */
3458   if ((syntax & RE_ICASE)
3459       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3460     name = "alpha";
3461 
3462 #ifdef RE_ENABLE_I18N
3463   /* Check the space of the arrays.  */
3464   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3465     {
3466       /* Not enough, realloc it.  */
3467       /* +1 in case of mbcset->nchar_classes is 0.  */
3468       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3469       /* Use realloc since array is NULL if *alloc == 0.  */
3470       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3471 					       new_char_class_alloc);
3472       if (BE (new_char_classes == NULL, 0))
3473 	return REG_ESPACE;
3474       mbcset->char_classes = new_char_classes;
3475       *char_class_alloc = new_char_class_alloc;
3476     }
3477   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3478 #endif /* RE_ENABLE_I18N */
3479 
3480 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3481   do {						\
3482     if (BE (trans != NULL, 0))			\
3483       {						\
3484 	for (i = 0; i < SBC_MAX; ++i)		\
3485   	  if (ctype_func (i))			\
3486 	    bitset_set (sbcset, trans[i]);	\
3487       }						\
3488     else					\
3489       {						\
3490 	for (i = 0; i < SBC_MAX; ++i)		\
3491   	  if (ctype_func (i))			\
3492 	    bitset_set (sbcset, i);		\
3493       }						\
3494   } while (0)
3495 
3496   if (strcmp (name, "alnum") == 0)
3497     BUILD_CHARCLASS_LOOP (isalnum);
3498   else if (strcmp (name, "cntrl") == 0)
3499     BUILD_CHARCLASS_LOOP (iscntrl);
3500   else if (strcmp (name, "lower") == 0)
3501     BUILD_CHARCLASS_LOOP (islower);
3502   else if (strcmp (name, "space") == 0)
3503     BUILD_CHARCLASS_LOOP (isspace);
3504   else if (strcmp (name, "alpha") == 0)
3505     BUILD_CHARCLASS_LOOP (isalpha);
3506   else if (strcmp (name, "digit") == 0)
3507     BUILD_CHARCLASS_LOOP (isdigit);
3508   else if (strcmp (name, "print") == 0)
3509     BUILD_CHARCLASS_LOOP (isprint);
3510   else if (strcmp (name, "upper") == 0)
3511     BUILD_CHARCLASS_LOOP (isupper);
3512   else if (strcmp (name, "blank") == 0)
3513     BUILD_CHARCLASS_LOOP (isblank);
3514   else if (strcmp (name, "graph") == 0)
3515     BUILD_CHARCLASS_LOOP (isgraph);
3516   else if (strcmp (name, "punct") == 0)
3517     BUILD_CHARCLASS_LOOP (ispunct);
3518   else if (strcmp (name, "xdigit") == 0)
3519     BUILD_CHARCLASS_LOOP (isxdigit);
3520   else
3521     return REG_ECTYPE;
3522 
3523   return REG_NOERROR;
3524 }
3525 
3526 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,int non_match,reg_errcode_t * err)3527 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3528 		    const unsigned char *class_name,
3529 		    const unsigned char *extra, int non_match,
3530 		    reg_errcode_t *err)
3531 {
3532   re_bitset_ptr_t sbcset;
3533 #ifdef RE_ENABLE_I18N
3534   re_charset_t *mbcset;
3535   int alloc = 0;
3536 #endif /* not RE_ENABLE_I18N */
3537   reg_errcode_t ret;
3538   re_token_t br_token;
3539   bin_tree_t *tree;
3540 
3541   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3542 #ifdef RE_ENABLE_I18N
3543   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3544 #endif /* RE_ENABLE_I18N */
3545 
3546 #ifdef RE_ENABLE_I18N
3547   if (BE (sbcset == NULL || mbcset == NULL, 0))
3548 #else /* not RE_ENABLE_I18N */
3549   if (BE (sbcset == NULL, 0))
3550 #endif /* not RE_ENABLE_I18N */
3551     {
3552       *err = REG_ESPACE;
3553       return NULL;
3554     }
3555 
3556   if (non_match)
3557     {
3558 #ifdef RE_ENABLE_I18N
3559       mbcset->non_match = 1;
3560 #endif /* not RE_ENABLE_I18N */
3561     }
3562 
3563   /* We don't care the syntax in this case.  */
3564   ret = build_charclass (trans, sbcset,
3565 #ifdef RE_ENABLE_I18N
3566 			 mbcset, &alloc,
3567 #endif /* RE_ENABLE_I18N */
3568 			 class_name, 0);
3569 
3570   if (BE (ret != REG_NOERROR, 0))
3571     {
3572       re_free (sbcset);
3573 #ifdef RE_ENABLE_I18N
3574       free_charset (mbcset);
3575 #endif /* RE_ENABLE_I18N */
3576       *err = ret;
3577       return NULL;
3578     }
3579   /* \w match '_' also.  */
3580   for (; *extra; extra++)
3581     bitset_set (sbcset, *extra);
3582 
3583   /* If it is non-matching list.  */
3584   if (non_match)
3585     bitset_not (sbcset);
3586 
3587 #ifdef RE_ENABLE_I18N
3588   /* Ensure only single byte characters are set.  */
3589   if (dfa->mb_cur_max > 1)
3590     bitset_mask (sbcset, dfa->sb_char);
3591 #endif
3592 
3593   /* Build a tree for simple bracket.  */
3594   br_token.type = SIMPLE_BRACKET;
3595   br_token.opr.sbcset = sbcset;
3596   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3597   if (BE (tree == NULL, 0))
3598     goto build_word_op_espace;
3599 
3600 #ifdef RE_ENABLE_I18N
3601   if (dfa->mb_cur_max > 1)
3602     {
3603       bin_tree_t *mbc_tree;
3604       /* Build a tree for complex bracket.  */
3605       br_token.type = COMPLEX_BRACKET;
3606       br_token.opr.mbcset = mbcset;
3607       dfa->has_mb_node = 1;
3608       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3609       if (BE (mbc_tree == NULL, 0))
3610 	goto build_word_op_espace;
3611       /* Then join them by ALT node.  */
3612       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3613       if (BE (mbc_tree != NULL, 1))
3614 	return tree;
3615     }
3616   else
3617     {
3618       free_charset (mbcset);
3619       return tree;
3620     }
3621 #else /* not RE_ENABLE_I18N */
3622   return tree;
3623 #endif /* not RE_ENABLE_I18N */
3624 
3625  build_word_op_espace:
3626   re_free (sbcset);
3627 #ifdef RE_ENABLE_I18N
3628   free_charset (mbcset);
3629 #endif /* RE_ENABLE_I18N */
3630   *err = REG_ESPACE;
3631   return NULL;
3632 }
3633 
3634 /* This is intended for the expressions like "a{1,3}".
3635    Fetch a number from `input', and return the number.
3636    Return -1, if the number field is empty like "{,1}".
3637    Return -2, If an error is occured.  */
3638 
3639 static int
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3640 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3641 {
3642   int num = -1;
3643   unsigned char c;
3644   while (1)
3645     {
3646       fetch_token (token, input, syntax);
3647       c = token->opr.c;
3648       if (BE (token->type == END_OF_RE, 0))
3649 	return -2;
3650       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3651 	break;
3652       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3653 	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3654       num = (num > RE_DUP_MAX) ? -2 : num;
3655     }
3656   return num;
3657 }
3658 
3659 #ifdef RE_ENABLE_I18N
3660 static void
free_charset(re_charset_t * cset)3661 free_charset (re_charset_t *cset)
3662 {
3663   re_free (cset->mbchars);
3664 # ifdef _LIBC
3665   re_free (cset->coll_syms);
3666   re_free (cset->equiv_classes);
3667   re_free (cset->range_starts);
3668   re_free (cset->range_ends);
3669 # endif
3670   re_free (cset->char_classes);
3671   re_free (cset);
3672 }
3673 #endif /* RE_ENABLE_I18N */
3674 
3675 /* Functions for binary tree operation.  */
3676 
3677 /* Create a tree node.  */
3678 
3679 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3680 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3681 	     re_token_type_t type)
3682 {
3683   re_token_t t;
3684   t.type = type;
3685   return create_token_tree (dfa, left, right, &t);
3686 }
3687 
3688 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3689 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3690 		   const re_token_t *token)
3691 {
3692   bin_tree_t *tree;
3693   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3694     {
3695       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3696 
3697       if (storage == NULL)
3698 	return NULL;
3699       storage->next = dfa->str_tree_storage;
3700       dfa->str_tree_storage = storage;
3701       dfa->str_tree_storage_idx = 0;
3702     }
3703   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3704 
3705   tree->parent = NULL;
3706   tree->left = left;
3707   tree->right = right;
3708   tree->token = *token;
3709   tree->token.duplicated = 0;
3710   tree->token.opt_subexp = 0;
3711   tree->first = NULL;
3712   tree->next = NULL;
3713   tree->node_idx = -1;
3714 
3715   if (left != NULL)
3716     left->parent = tree;
3717   if (right != NULL)
3718     right->parent = tree;
3719   return tree;
3720 }
3721 
3722 /* Mark the tree SRC as an optional subexpression.
3723    To be called from preorder or postorder.  */
3724 
3725 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3726 mark_opt_subexp (void *extra, bin_tree_t *node)
3727 {
3728   int idx = (int) (ptrdiff_t) extra;
3729   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3730     node->token.opt_subexp = 1;
3731 
3732   return REG_NOERROR;
3733 }
3734 
3735 /* Free the allocated memory inside NODE. */
3736 
3737 static void
free_token(re_token_t * node)3738 free_token (re_token_t *node)
3739 {
3740 #ifdef RE_ENABLE_I18N
3741   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3742     free_charset (node->opr.mbcset);
3743   else
3744 #endif /* RE_ENABLE_I18N */
3745     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3746       re_free (node->opr.sbcset);
3747 }
3748 
3749 /* Worker function for tree walking.  Free the allocated memory inside NODE
3750    and its children. */
3751 
3752 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3753 free_tree (void *extra, bin_tree_t *node)
3754 {
3755   free_token (&node->token);
3756   return REG_NOERROR;
3757 }
3758 
3759 
3760 /* Duplicate the node SRC, and return new node.  This is a preorder
3761    visit similar to the one implemented by the generic visitor, but
3762    we need more infrastructure to maintain two parallel trees --- so,
3763    it's easier to duplicate.  */
3764 
3765 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3766 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3767 {
3768   const bin_tree_t *node;
3769   bin_tree_t *dup_root;
3770   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3771 
3772   for (node = root; ; )
3773     {
3774       /* Create a new tree and link it back to the current parent.  */
3775       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3776       if (*p_new == NULL)
3777 	return NULL;
3778       (*p_new)->parent = dup_node;
3779       (*p_new)->token.duplicated = 1;
3780       dup_node = *p_new;
3781 
3782       /* Go to the left node, or up and to the right.  */
3783       if (node->left)
3784 	{
3785 	  node = node->left;
3786 	  p_new = &dup_node->left;
3787 	}
3788       else
3789 	{
3790 	  const bin_tree_t *prev = NULL;
3791 	  while (node->right == prev || node->right == NULL)
3792 	    {
3793 	      prev = node;
3794 	      node = node->parent;
3795 	      dup_node = dup_node->parent;
3796 	      if (!node)
3797 	        return dup_root;
3798 	    }
3799 	  node = node->right;
3800 	  p_new = &dup_node->right;
3801 	}
3802     }
3803 }
3804