1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, write to the Free
18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20 
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 					  size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 				     const re_dfastate_t *init_state,
25 				     char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 			       reg_errcode_t (fn (void *, bin_tree_t *)),
38 			       void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 				reg_errcode_t (fn (void *, bin_tree_t *)),
41 				void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 				 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 				   unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 					 int node, int root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
57 			 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 			reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 			  reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 				  re_token_t *token, reg_syntax_t syntax,
64 				  int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 				 re_token_t *token, reg_syntax_t syntax,
67 				 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 				     re_token_t *token, reg_syntax_t syntax,
70 				     int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 				  re_token_t *token, reg_syntax_t syntax,
73 				  int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 				 re_dfa_t *dfa, re_token_t *token,
76 				 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 				      re_token_t *token, reg_syntax_t syntax,
79 				      reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 					    re_string_t *regexp,
82 					    re_token_t *token, int token_len,
83 					    re_dfa_t *dfa,
84 					    reg_syntax_t syntax,
85 					    int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 					  re_string_t *regexp,
88 					  re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 					re_charset_t *mbcset,
92 					int *equiv_class_alloc,
93 					const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 				      bitset_t sbcset,
96 				      re_charset_t *mbcset,
97 				      int *char_class_alloc,
98 				      const unsigned char *class_name,
99 				      reg_syntax_t syntax);
100 #else  /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 					const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 				      bitset_t sbcset,
105 				      const unsigned char *class_name,
106 				      reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 				       RE_TRANSLATE_TYPE trans,
110 				       const unsigned char *class_name,
111 				       const unsigned char *extra,
112 				       int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 				bin_tree_t *left, bin_tree_t *right,
115 				re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 				      bin_tree_t *left, bin_tree_t *right,
118 				      const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 
124 /* This table gives an error message for each of the error codes listed
125    in regex.h.  Obviously the order here has to be same as there.
126    POSIX doesn't require that we do anything for REG_NOERROR,
127    but why not be nice?  */
128 
129 const char __re_error_msgid[] attribute_hidden =
130   {
131 #define REG_NOERROR_IDX	0
132     gettext_noop ("Success")	/* REG_NOERROR */
133     "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135     gettext_noop ("No match")	/* REG_NOMATCH */
136     "\0"
137 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139     "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142     "\0"
143 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145     "\0"
146 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148     "\0"
149 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151     "\0"
152 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
153     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
154     "\0"
155 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157     "\0"
158 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160     "\0"
161 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163     "\0"
164 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165     gettext_noop ("Invalid range end")	/* REG_ERANGE */
166     "\0"
167 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
169     "\0"
170 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172     "\0"
173 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
175     "\0"
176 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
178     "\0"
179 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181   };
182 
183 const size_t __re_error_msgid_idx[] attribute_hidden =
184   {
185     REG_NOERROR_IDX,
186     REG_NOMATCH_IDX,
187     REG_BADPAT_IDX,
188     REG_ECOLLATE_IDX,
189     REG_ECTYPE_IDX,
190     REG_EESCAPE_IDX,
191     REG_ESUBREG_IDX,
192     REG_EBRACK_IDX,
193     REG_EPAREN_IDX,
194     REG_EBRACE_IDX,
195     REG_BADBR_IDX,
196     REG_ERANGE_IDX,
197     REG_ESPACE_IDX,
198     REG_BADRPT_IDX,
199     REG_EEND_IDX,
200     REG_ESIZE_IDX,
201     REG_ERPAREN_IDX
202   };
203 
204 /* Entry points for GNU code.  */
205 
206 /* re_compile_pattern is the GNU regular expression compiler: it
207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208    Returns 0 if the pattern was valid, otherwise an error string.
209 
210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211    are set in BUFP on entry.  */
212 
213 const char *
re_compile_pattern(pattern,length,bufp)214 re_compile_pattern (pattern, length, bufp)
215     const char *pattern;
216     size_t length;
217     struct re_pattern_buffer *bufp;
218 {
219   reg_errcode_t ret;
220 
221   /* And GNU code determines whether or not to get register information
222      by passing null for the REGS argument to re_match, etc., not by
223      setting no_sub, unless RE_NO_SUB is set.  */
224   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
225 
226   /* Match anchors at newline.  */
227   bufp->newline_anchor = 1;
228 
229   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230 
231   if (!ret)
232     return NULL;
233   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
234 }
235 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)236 weak_alias (__re_compile_pattern, re_compile_pattern)
237 #endif
238 
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
240    also be assigned to arbitrarily: each pattern buffer stores its own
241    syntax, so it can be changed between regex compilations.  */
242 /* This has no initializer because initialized variables in Emacs
243    become read-only after dumping.  */
244 reg_syntax_t re_syntax_options;
245 
246 
247 /* Specify the precise syntax of regexps for compilation.  This provides
248    for compatibility for various utilities which historically have
249    different, incompatible syntaxes.
250 
251    The argument SYNTAX is a bit mask comprised of the various bits
252    defined in regex.h.  We return the old syntax.  */
253 
254 reg_syntax_t
255 re_set_syntax (syntax)
256     reg_syntax_t syntax;
257 {
258   reg_syntax_t ret = re_syntax_options;
259 
260   re_syntax_options = syntax;
261   return ret;
262 }
263 #ifdef _LIBC
264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
266 
267 int
268 re_compile_fastmap (bufp)
269     struct re_pattern_buffer *bufp;
270 {
271   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
272   char *fastmap = bufp->fastmap;
273 
274   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
275   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
276   if (dfa->init_state != dfa->init_state_word)
277     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
278   if (dfa->init_state != dfa->init_state_nl)
279     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
280   if (dfa->init_state != dfa->init_state_begbuf)
281     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
282   bufp->fastmap_accurate = 1;
283   return 0;
284 }
285 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
287 #endif
288 
289 static inline void
290 __attribute ((always_inline))
291 re_set_fastmap (char *fastmap, int icase, int ch)
292 {
293   fastmap[ch] = 1;
294   if (icase)
295     fastmap[tolower (ch)] = 1;
296 }
297 
298 /* Helper function for re_compile_fastmap.
299    Compile fastmap for the initial_state INIT_STATE.  */
300 
301 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
303 			 char *fastmap)
304 {
305   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
306   int node_cnt;
307   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
308   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
309     {
310       int node = init_state->nodes.elems[node_cnt];
311       re_token_type_t type = dfa->nodes[node].type;
312 
313       if (type == CHARACTER)
314 	{
315 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
316 #ifdef RE_ENABLE_I18N
317 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
318 	    {
319 	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
320 	      wchar_t wc;
321 	      mbstate_t state;
322 
323 	      p = buf;
324 	      *p++ = dfa->nodes[node].opr.c;
325 	      while (++node < dfa->nodes_len
326 		     &&	dfa->nodes[node].type == CHARACTER
327 		     && dfa->nodes[node].mb_partial)
328 		*p++ = dfa->nodes[node].opr.c;
329 	      memset (&state, '\0', sizeof (state));
330 	      if (mbrtowc (&wc, (const char *) buf, p - buf,
331 			   &state) == p - buf
332 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
333 		      != (size_t) -1))
334 		re_set_fastmap (fastmap, 0, buf[0]);
335 	    }
336 #endif
337 	}
338       else if (type == SIMPLE_BRACKET)
339 	{
340 	  int i, ch;
341 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 	    {
343 	      int j;
344 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 		if (w & ((bitset_word_t) 1 << j))
347 		  re_set_fastmap (fastmap, icase, ch);
348 	    }
349 	}
350 #ifdef RE_ENABLE_I18N
351       else if (type == COMPLEX_BRACKET)
352 	{
353 	  int i;
354 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355 	  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
356 	      || cset->nranges || cset->nchar_classes)
357 	    {
358 # ifdef _LIBC
359 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
360 		{
361 		  /* In this case we want to catch the bytes which are
362 		     the first byte of any collation elements.
363 		     e.g. In da_DK, we want to catch 'a' since "aa"
364 			  is a valid collation element, and don't catch
365 			  'b' since 'b' is the only collation element
366 			  which starts from 'b'.  */
367 		  const int32_t *table = (const int32_t *)
368 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369 		  for (i = 0; i < SBC_MAX; ++i)
370 		    if (table[i] < 0)
371 		      re_set_fastmap (fastmap, icase, i);
372 		}
373 # else
374 	      if (dfa->mb_cur_max > 1)
375 		for (i = 0; i < SBC_MAX; ++i)
376 		  if (__btowc (i) == WEOF)
377 		    re_set_fastmap (fastmap, icase, i);
378 # endif /* not _LIBC */
379 	    }
380 	  for (i = 0; i < cset->nmbchars; ++i)
381 	    {
382 	      char buf[256];
383 	      mbstate_t state;
384 	      memset (&state, '\0', sizeof (state));
385 	      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
386 		re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
387 	      if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
388 		{
389 		  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
390 		      != (size_t) -1)
391 		    re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
392 		}
393 	    }
394 	}
395 #endif /* RE_ENABLE_I18N */
396       else if (type == OP_PERIOD
397 #ifdef RE_ENABLE_I18N
398 	       || type == OP_UTF8_PERIOD
399 #endif /* RE_ENABLE_I18N */
400 	       || type == END_OF_RE)
401 	{
402 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
403 	  if (type == END_OF_RE)
404 	    bufp->can_be_null = 1;
405 	  return;
406 	}
407     }
408 }
409 
410 /* Entry point for POSIX code.  */
411 /* regcomp takes a regular expression as a string and compiles it.
412 
413    PREG is a regex_t *.  We do not expect any fields to be initialized,
414    since POSIX says we shouldn't.  Thus, we set
415 
416      `buffer' to the compiled pattern;
417      `used' to the length of the compiled pattern;
418      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
419        REG_EXTENDED bit in CFLAGS is set; otherwise, to
420        RE_SYNTAX_POSIX_BASIC;
421      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
422      `fastmap' to an allocated space for the fastmap;
423      `fastmap_accurate' to zero;
424      `re_nsub' to the number of subexpressions in PATTERN.
425 
426    PATTERN is the address of the pattern string.
427 
428    CFLAGS is a series of bits which affect compilation.
429 
430      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
431      use POSIX basic syntax.
432 
433      If REG_NEWLINE is set, then . and [^...] don't match newline.
434      Also, regexec will try a match beginning after every newline.
435 
436      If REG_ICASE is set, then we considers upper- and lowercase
437      versions of letters to be equivalent when matching.
438 
439      If REG_NOSUB is set, then when PREG is passed to regexec, that
440      routine will report only success or failure, and nothing about the
441      registers.
442 
443    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
444    the return codes and their meanings.)  */
445 
446 int
regcomp(preg,pattern,cflags)447 regcomp (preg, pattern, cflags)
448     regex_t *__restrict preg;
449     const char *__restrict pattern;
450     int cflags;
451 {
452   reg_errcode_t ret;
453   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
454 			 : RE_SYNTAX_POSIX_BASIC);
455 
456   preg->buffer = NULL;
457   preg->allocated = 0;
458   preg->used = 0;
459 
460   /* Try to allocate space for the fastmap.  */
461   preg->fastmap = re_malloc (char, SBC_MAX);
462   if (BE (preg->fastmap == NULL, 0))
463     return REG_ESPACE;
464 
465   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
466 
467   /* If REG_NEWLINE is set, newlines are treated differently.  */
468   if (cflags & REG_NEWLINE)
469     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
470       syntax &= ~RE_DOT_NEWLINE;
471       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
472       /* It also changes the matching behavior.  */
473       preg->newline_anchor = 1;
474     }
475   else
476     preg->newline_anchor = 0;
477   preg->no_sub = !!(cflags & REG_NOSUB);
478   preg->translate = NULL;
479 
480   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
481 
482   /* POSIX doesn't distinguish between an unmatched open-group and an
483      unmatched close-group: both are REG_EPAREN.  */
484   if (ret == REG_ERPAREN)
485     ret = REG_EPAREN;
486 
487   /* We have already checked preg->fastmap != NULL.  */
488   if (BE (ret == REG_NOERROR, 1))
489     /* Compute the fastmap now, since regexec cannot modify the pattern
490        buffer.  This function never fails in this implementation.  */
491     (void) re_compile_fastmap (preg);
492   else
493     {
494       /* Some error occurred while compiling the expression.  */
495       re_free (preg->fastmap);
496       preg->fastmap = NULL;
497     }
498 
499   return (int) ret;
500 }
501 #ifdef _LIBC
weak_alias(__regcomp,regcomp)502 weak_alias (__regcomp, regcomp)
503 #endif
504 
505 /* Returns a message corresponding to an error code, ERRCODE, returned
506    from either regcomp or regexec.   We don't use PREG here.  */
507 
508 size_t regerror ( int errcode, const regex_t *__restrict preg,
509 				  char *__restrict errbuf, size_t errbuf_size )
510 {
511   const char *msg;
512   size_t msg_size;
513 
514   if (BE (errcode < 0
515 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
516 			       / sizeof (__re_error_msgid_idx[0])), 0))
517     /* Only error codes returned by the rest of the code should be passed
518        to this routine.  If we are given anything else, or if other regex
519        code generates an invalid error code, then the program has a bug.
520        Dump core so we can fix it.  */
521     abort ();
522 
523   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
524 
525   msg_size = strlen (msg) + 1; /* Includes the null.  */
526 
527   if (BE (errbuf_size != 0, 1))
528     {
529       if (BE (msg_size > errbuf_size, 0))
530 	{
531 #if defined HAVE_MEMPCPY || defined _LIBC
532 	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
533 #else
534 	  memmove (errbuf, msg, errbuf_size - 1);
535 	  errbuf[errbuf_size - 1] = 0;
536 #endif
537 	}
538       else
539 	memmove (errbuf, msg, msg_size);
540     }
541 
542   return msg_size;
543 }
544 #ifdef _LIBC
545 weak_alias (__regerror, regerror)
546 #endif
547 
548 
549 #ifdef RE_ENABLE_I18N
550 /* This static array is used for the map to single-byte characters when
551    UTF-8 is used.  Otherwise we would allocate memory just to initialize
552    it the same all the time.  UTF-8 is the preferred encoding so this is
553    a worthwhile optimization.  */
554 static const bitset_t utf8_sb_map =
555 {
556   /* Set the first 128 bits.  */
557   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
558 };
559 #endif
560 
561 
562 static void
free_dfa_content(re_dfa_t * dfa)563 free_dfa_content (re_dfa_t *dfa)
564 {
565   int i, j;
566 
567   if (dfa->nodes)
568     for (i = 0; i < dfa->nodes_len; ++i)
569       free_token (dfa->nodes + i);
570   re_free (dfa->nexts);
571   for (i = 0; i < dfa->nodes_len; ++i)
572     {
573       if (dfa->eclosures != NULL)
574 	re_node_set_free (dfa->eclosures + i);
575       if (dfa->inveclosures != NULL)
576 	re_node_set_free (dfa->inveclosures + i);
577       if (dfa->edests != NULL)
578 	re_node_set_free (dfa->edests + i);
579     }
580   re_free (dfa->edests);
581   re_free (dfa->eclosures);
582   re_free (dfa->inveclosures);
583   re_free (dfa->nodes);
584 
585   if (dfa->state_table)
586     for (i = 0; i <= dfa->state_hash_mask; ++i)
587       {
588 	struct re_state_table_entry *entry = dfa->state_table + i;
589 	for (j = 0; j < entry->num; ++j)
590 	  {
591 	    re_dfastate_t *state = entry->array[j];
592 	    free_state (state);
593 	  }
594         re_free (entry->array);
595       }
596   re_free (dfa->state_table);
597 #ifdef RE_ENABLE_I18N
598   if (dfa->sb_char != utf8_sb_map)
599     re_free (dfa->sb_char);
600 #endif
601   re_free (dfa->subexp_map);
602 #ifdef DEBUG
603   re_free (dfa->re_str);
604 #endif
605 
606   re_free (dfa);
607 }
608 
609 
610 /* Free dynamically allocated space used by PREG.  */
611 
612 void
regfree(preg)613 regfree (preg)
614     regex_t *preg;
615 {
616   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
617   if (BE (dfa != NULL, 1))
618     free_dfa_content (dfa);
619   preg->buffer = NULL;
620   preg->allocated = 0;
621 
622   re_free (preg->fastmap);
623   preg->fastmap = NULL;
624 
625   re_free (preg->translate);
626   preg->translate = NULL;
627 }
628 #ifdef _LIBC
629 weak_alias (__regfree, regfree)
630 #endif
631 
632 /* Entry points compatible with 4.2 BSD regex library.  We don't define
633    them unless specifically requested.  */
634 
635 #if defined _REGEX_RE_COMP || defined _LIBC
636 
637 /* BSD has one and only one pattern buffer.  */
638 static struct re_pattern_buffer re_comp_buf;
639 
640 char *
641 # ifdef _LIBC
642 /* Make these definitions weak in libc, so POSIX programs can redefine
643    these names if they don't use our functions, and still use
644    regcomp/regexec above without link errors.  */
645 weak_function
646 # endif
re_comp(s)647 re_comp (s)
648      const char *s;
649 {
650   reg_errcode_t ret;
651   char *fastmap;
652 
653   if (!s)
654     {
655       if (!re_comp_buf.buffer)
656 	return gettext ("No previous regular expression");
657       return 0;
658     }
659 
660   if (re_comp_buf.buffer)
661     {
662       fastmap = re_comp_buf.fastmap;
663       re_comp_buf.fastmap = NULL;
664       __regfree (&re_comp_buf);
665       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
666       re_comp_buf.fastmap = fastmap;
667     }
668 
669   if (re_comp_buf.fastmap == NULL)
670     {
671       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
672       if (re_comp_buf.fastmap == NULL)
673 	return (char *) gettext (__re_error_msgid
674 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
675     }
676 
677   /* Since `re_exec' always passes NULL for the `regs' argument, we
678      don't need to initialize the pattern buffer fields which affect it.  */
679 
680   /* Match anchors at newlines.  */
681   re_comp_buf.newline_anchor = 1;
682 
683   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
684 
685   if (!ret)
686     return NULL;
687 
688   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
689   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
690 }
691 
692 #ifdef _LIBC
libc_freeres_fn(free_mem)693 libc_freeres_fn (free_mem)
694 {
695   __regfree (&re_comp_buf);
696 }
697 #endif
698 
699 #endif /* _REGEX_RE_COMP */
700 
701 /* Internal entry point.
702    Compile the regular expression PATTERN, whose length is LENGTH.
703    SYNTAX indicate regular expression's syntax.  */
704 
705 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)706 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
707 		     reg_syntax_t syntax)
708 {
709   reg_errcode_t err = REG_NOERROR;
710   re_dfa_t *dfa;
711   re_string_t regexp;
712 
713   /* Initialize the pattern buffer.  */
714   preg->fastmap_accurate = 0;
715   preg->syntax = syntax;
716   preg->not_bol = preg->not_eol = 0;
717   preg->used = 0;
718   preg->re_nsub = 0;
719   preg->can_be_null = 0;
720   preg->regs_allocated = REGS_UNALLOCATED;
721 
722   /* Initialize the dfa.  */
723   dfa = (re_dfa_t *) preg->buffer;
724   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
725     {
726       /* If zero allocated, but buffer is non-null, try to realloc
727 	 enough space.  This loses if buffer's address is bogus, but
728 	 that is the user's responsibility.  If ->buffer is NULL this
729 	 is a simple allocation.  */
730       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
731       if (dfa == NULL)
732 	return REG_ESPACE;
733       preg->allocated = sizeof (re_dfa_t);
734       preg->buffer = (unsigned char *) dfa;
735     }
736   preg->used = sizeof (re_dfa_t);
737 
738   err = init_dfa (dfa, length);
739   if (BE (err != REG_NOERROR, 0))
740     {
741       free_dfa_content (dfa);
742       preg->buffer = NULL;
743       preg->allocated = 0;
744       return err;
745     }
746 #ifdef DEBUG
747   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
748   dfa->re_str = re_malloc (char, length + 1);
749   strncpy (dfa->re_str, pattern, length + 1);
750 #endif
751 
752   __libc_lock_init (dfa->lock);
753 
754   err = re_string_construct (&regexp, pattern, length, preg->translate,
755 			     syntax & RE_ICASE, dfa);
756   if (BE (err != REG_NOERROR, 0))
757     {
758     re_compile_internal_free_return:
759       free_workarea_compile (preg);
760       re_string_destruct (&regexp);
761       free_dfa_content (dfa);
762       preg->buffer = NULL;
763       preg->allocated = 0;
764       return err;
765     }
766 
767   /* Parse the regular expression, and build a structure tree.  */
768   preg->re_nsub = 0;
769   dfa->str_tree = parse (&regexp, preg, syntax, &err);
770   if (BE (dfa->str_tree == NULL, 0))
771     goto re_compile_internal_free_return;
772 
773   /* Analyze the tree and create the nfa.  */
774   err = analyze (preg);
775   if (BE (err != REG_NOERROR, 0))
776     goto re_compile_internal_free_return;
777 
778 #ifdef RE_ENABLE_I18N
779   /* If possible, do searching in single byte encoding to speed things up.  */
780   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
781     optimize_utf8 (dfa);
782 #endif
783 
784   /* Then create the initial state of the dfa.  */
785   err = create_initial_state (dfa);
786 
787   /* Release work areas.  */
788   free_workarea_compile (preg);
789   re_string_destruct (&regexp);
790 
791   if (BE (err != REG_NOERROR, 0))
792     {
793       free_dfa_content (dfa);
794       preg->buffer = NULL;
795       preg->allocated = 0;
796     }
797 
798   return err;
799 }
800 
801 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
802    as the initial length of some arrays.  */
803 
804 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)805 init_dfa (re_dfa_t *dfa, size_t pat_len)
806 {
807   unsigned int table_size;
808 #ifndef _LIBC
809   char *codeset_name;
810 #endif
811 
812   memset (dfa, '\0', sizeof (re_dfa_t));
813 
814   /* Force allocation of str_tree_storage the first time.  */
815   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
816 
817   /* Avoid overflows.  */
818   if (pat_len == SIZE_MAX)
819     return REG_ESPACE;
820 
821   dfa->nodes_alloc = pat_len + 1;
822   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
823 
824   /*  table_size = 2 ^ ceil(log pat_len) */
825   for (table_size = 1; ; table_size <<= 1)
826     if (table_size > pat_len)
827       break;
828 
829   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
830   dfa->state_hash_mask = table_size - 1;
831 
832   dfa->mb_cur_max = MB_CUR_MAX;
833 #ifdef _LIBC
834   if (dfa->mb_cur_max == 6
835       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
836     dfa->is_utf8 = 1;
837   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
838 		       != 0);
839 #else
840 # ifdef HAVE_LANGINFO_CODESET
841   codeset_name = nl_langinfo (CODESET);
842 # else
843   codeset_name = getenv ("LC_ALL");
844   if (codeset_name == NULL || codeset_name[0] == '\0')
845     codeset_name = getenv ("LC_CTYPE");
846   if (codeset_name == NULL || codeset_name[0] == '\0')
847     codeset_name = getenv ("LANG");
848   if (codeset_name == NULL)
849     codeset_name = "";
850   else if (strchr (codeset_name, '.') !=  NULL)
851     codeset_name = strchr (codeset_name, '.') + 1;
852 # endif
853 
854   if (strcasecmp (codeset_name, "UTF-8") == 0
855       || strcasecmp (codeset_name, "UTF8") == 0)
856     dfa->is_utf8 = 1;
857 
858   /* We check exhaustively in the loop below if this charset is a
859      superset of ASCII.  */
860   dfa->map_notascii = 0;
861 #endif
862 
863 #ifdef RE_ENABLE_I18N
864   if (dfa->mb_cur_max > 1)
865     {
866       if (dfa->is_utf8)
867 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
868       else
869 	{
870 	  int i, j, ch;
871 
872 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
873 	  if (BE (dfa->sb_char == NULL, 0))
874 	    return REG_ESPACE;
875 
876 	  /* Set the bits corresponding to single byte chars.  */
877 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
878 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
879 	      {
880 		wint_t wch = __btowc (ch);
881 		if (wch != WEOF)
882 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
883 # ifndef _LIBC
884 		if (isascii (ch) && wch != ch)
885 		  dfa->map_notascii = 1;
886 # endif
887 	      }
888 	}
889     }
890 #endif
891 
892   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
893     return REG_ESPACE;
894   return REG_NOERROR;
895 }
896 
897 /* Initialize WORD_CHAR table, which indicate which character is
898    "word".  In this case "word" means that it is the word construction
899    character used by some operators like "\<", "\>", etc.  */
900 
901 static void
902 internal_function
init_word_char(re_dfa_t * dfa)903 init_word_char (re_dfa_t *dfa)
904 {
905   int i, j, ch;
906   dfa->word_ops_used = 1;
907   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
908     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
909       if (isalnum (ch) || ch == '_')
910 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
911 }
912 
913 /* Free the work area which are only used while compiling.  */
914 
915 static void
free_workarea_compile(regex_t * preg)916 free_workarea_compile (regex_t *preg)
917 {
918   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
919   bin_tree_storage_t *storage, *next;
920   for (storage = dfa->str_tree_storage; storage; storage = next)
921     {
922       next = storage->next;
923       re_free (storage);
924     }
925   dfa->str_tree_storage = NULL;
926   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
927   dfa->str_tree = NULL;
928   re_free (dfa->org_indices);
929   dfa->org_indices = NULL;
930 }
931 
932 /* Create initial states for all contexts.  */
933 
934 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)935 create_initial_state (re_dfa_t *dfa)
936 {
937   int first, i;
938   reg_errcode_t err;
939   re_node_set init_nodes;
940 
941   /* Initial states have the epsilon closure of the node which is
942      the first node of the regular expression.  */
943   first = dfa->str_tree->first->node_idx;
944   dfa->init_node = first;
945   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
946   if (BE (err != REG_NOERROR, 0))
947     return err;
948 
949   /* The back-references which are in initial states can epsilon transit,
950      since in this case all of the subexpressions can be null.
951      Then we add epsilon closures of the nodes which are the next nodes of
952      the back-references.  */
953   if (dfa->nbackref > 0)
954     for (i = 0; i < init_nodes.nelem; ++i)
955       {
956 	int node_idx = init_nodes.elems[i];
957 	re_token_type_t type = dfa->nodes[node_idx].type;
958 
959 	int clexp_idx;
960 	if (type != OP_BACK_REF)
961 	  continue;
962 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
963 	  {
964 	    re_token_t *clexp_node;
965 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
966 	    if (clexp_node->type == OP_CLOSE_SUBEXP
967 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
968 	      break;
969 	  }
970 	if (clexp_idx == init_nodes.nelem)
971 	  continue;
972 
973 	if (type == OP_BACK_REF)
974 	  {
975 	    int dest_idx = dfa->edests[node_idx].elems[0];
976 	    if (!re_node_set_contains (&init_nodes, dest_idx))
977 	      {
978 		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
979 		i = 0;
980 	      }
981 	  }
982       }
983 
984   /* It must be the first time to invoke acquire_state.  */
985   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
986   /* We don't check ERR here, since the initial state must not be NULL.  */
987   if (BE (dfa->init_state == NULL, 0))
988     return err;
989   if (dfa->init_state->has_constraint)
990     {
991       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
992 						       CONTEXT_WORD);
993       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
994 						     CONTEXT_NEWLINE);
995       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
996 							 &init_nodes,
997 							 CONTEXT_NEWLINE
998 							 | CONTEXT_BEGBUF);
999       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1000 	      || dfa->init_state_begbuf == NULL, 0))
1001 	return err;
1002     }
1003   else
1004     dfa->init_state_word = dfa->init_state_nl
1005       = dfa->init_state_begbuf = dfa->init_state;
1006 
1007   re_node_set_free (&init_nodes);
1008   return REG_NOERROR;
1009 }
1010 
1011 #ifdef RE_ENABLE_I18N
1012 /* If it is possible to do searching in single byte encoding instead of UTF-8
1013    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1014    DFA nodes where needed.  */
1015 
1016 static void
optimize_utf8(re_dfa_t * dfa)1017 optimize_utf8 (re_dfa_t *dfa)
1018 {
1019   int node, i, mb_chars = 0, has_period = 0;
1020 
1021   for (node = 0; node < dfa->nodes_len; ++node)
1022     switch (dfa->nodes[node].type)
1023       {
1024       case CHARACTER:
1025 	if (dfa->nodes[node].opr.c >= 0x80)
1026 	  mb_chars = 1;
1027 	break;
1028       case ANCHOR:
1029 	switch (dfa->nodes[node].opr.idx)
1030 	  {
1031 	  case LINE_FIRST:
1032 	  case LINE_LAST:
1033 	  case BUF_FIRST:
1034 	  case BUF_LAST:
1035 	    break;
1036 	  default:
1037 	    /* Word anchors etc. cannot be handled.  */
1038 	    return;
1039 	  }
1040 	break;
1041       case OP_PERIOD:
1042         has_period = 1;
1043         break;
1044       case OP_BACK_REF:
1045       case OP_ALT:
1046       case END_OF_RE:
1047       case OP_DUP_ASTERISK:
1048       case OP_OPEN_SUBEXP:
1049       case OP_CLOSE_SUBEXP:
1050 	break;
1051       case COMPLEX_BRACKET:
1052 	return;
1053       case SIMPLE_BRACKET:
1054 	/* Just double check.  The non-ASCII range starts at 0x80.  */
1055 	assert (0x80 % BITSET_WORD_BITS == 0);
1056         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1057 	  if (dfa->nodes[node].opr.sbcset[i])
1058 	    return;
1059 	break;
1060       default:
1061 	abort ();
1062       }
1063 
1064   if (mb_chars || has_period)
1065     for (node = 0; node < dfa->nodes_len; ++node)
1066       {
1067 	if (dfa->nodes[node].type == CHARACTER
1068 	    && dfa->nodes[node].opr.c >= 0x80)
1069 	  dfa->nodes[node].mb_partial = 0;
1070 	else if (dfa->nodes[node].type == OP_PERIOD)
1071 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1072       }
1073 
1074   /* The search can be in single byte locale.  */
1075   dfa->mb_cur_max = 1;
1076   dfa->is_utf8 = 0;
1077   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1078 }
1079 #endif
1080 
1081 /* Analyze the structure tree, and calculate "first", "next", "edest",
1082    "eclosure", and "inveclosure".  */
1083 
1084 static reg_errcode_t
analyze(regex_t * preg)1085 analyze (regex_t *preg)
1086 {
1087   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1088   reg_errcode_t ret;
1089 
1090   /* Allocate arrays.  */
1091   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1092   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1093   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1094   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1095   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1096 	  || dfa->eclosures == NULL, 0))
1097     return REG_ESPACE;
1098 
1099   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1100   if (dfa->subexp_map != NULL)
1101     {
1102       int i;
1103       for (i = 0; i < preg->re_nsub; i++)
1104 	dfa->subexp_map[i] = i;
1105       preorder (dfa->str_tree, optimize_subexps, dfa);
1106       for (i = 0; i < preg->re_nsub; i++)
1107 	if (dfa->subexp_map[i] != i)
1108 	  break;
1109       if (i == preg->re_nsub)
1110 	{
1111 	  free (dfa->subexp_map);
1112 	  dfa->subexp_map = NULL;
1113 	}
1114     }
1115 
1116   ret = postorder (dfa->str_tree, lower_subexps, preg);
1117   if (BE (ret != REG_NOERROR, 0))
1118     return ret;
1119   ret = postorder (dfa->str_tree, calc_first, dfa);
1120   if (BE (ret != REG_NOERROR, 0))
1121     return ret;
1122   preorder (dfa->str_tree, calc_next, dfa);
1123   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1124   if (BE (ret != REG_NOERROR, 0))
1125     return ret;
1126   ret = calc_eclosure (dfa);
1127   if (BE (ret != REG_NOERROR, 0))
1128     return ret;
1129 
1130   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1131      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1132   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1133       || dfa->nbackref)
1134     {
1135       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1136       if (BE (dfa->inveclosures == NULL, 0))
1137         return REG_ESPACE;
1138       ret = calc_inveclosure (dfa);
1139     }
1140 
1141   return ret;
1142 }
1143 
1144 /* Our parse trees are very unbalanced, so we cannot use a stack to
1145    implement parse tree visits.  Instead, we use parent pointers and
1146    some hairy code in these two functions.  */
1147 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1148 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1149 	   void *extra)
1150 {
1151   bin_tree_t *node, *prev;
1152 
1153   for (node = root; ; )
1154     {
1155       /* Descend down the tree, preferably to the left (or to the right
1156 	 if that's the only child).  */
1157       while (node->left || node->right)
1158 	if (node->left)
1159           node = node->left;
1160         else
1161           node = node->right;
1162 
1163       do
1164 	{
1165 	  reg_errcode_t err = fn (extra, node);
1166 	  if (BE (err != REG_NOERROR, 0))
1167 	    return err;
1168           if (node->parent == NULL)
1169 	    return REG_NOERROR;
1170 	  prev = node;
1171 	  node = node->parent;
1172 	}
1173       /* Go up while we have a node that is reached from the right.  */
1174       while (node->right == prev || node->right == NULL);
1175       node = node->right;
1176     }
1177 }
1178 
1179 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1180 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1181 	  void *extra)
1182 {
1183   bin_tree_t *node;
1184 
1185   for (node = root; ; )
1186     {
1187       reg_errcode_t err = fn (extra, node);
1188       if (BE (err != REG_NOERROR, 0))
1189 	return err;
1190 
1191       /* Go to the left node, or up and to the right.  */
1192       if (node->left)
1193 	node = node->left;
1194       else
1195 	{
1196 	  bin_tree_t *prev = NULL;
1197 	  while (node->right == prev || node->right == NULL)
1198 	    {
1199 	      prev = node;
1200 	      node = node->parent;
1201 	      if (!node)
1202 	        return REG_NOERROR;
1203 	    }
1204 	  node = node->right;
1205 	}
1206     }
1207 }
1208 
1209 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1210    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1211    backreferences as well.  Requires a preorder visit.  */
1212 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1213 optimize_subexps (void *extra, bin_tree_t *node)
1214 {
1215   re_dfa_t *dfa = (re_dfa_t *) extra;
1216 
1217   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1218     {
1219       int idx = node->token.opr.idx;
1220       node->token.opr.idx = dfa->subexp_map[idx];
1221       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1222     }
1223 
1224   else if (node->token.type == SUBEXP
1225            && node->left && node->left->token.type == SUBEXP)
1226     {
1227       int other_idx = node->left->token.opr.idx;
1228 
1229       node->left = node->left->left;
1230       if (node->left)
1231         node->left->parent = node;
1232 
1233       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1234       if (other_idx < BITSET_WORD_BITS)
1235 	  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1236     }
1237 
1238   return REG_NOERROR;
1239 }
1240 
1241 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1242    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1243 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1244 lower_subexps (void *extra, bin_tree_t *node)
1245 {
1246   regex_t *preg = (regex_t *) extra;
1247   reg_errcode_t err = REG_NOERROR;
1248 
1249   if (node->left && node->left->token.type == SUBEXP)
1250     {
1251       node->left = lower_subexp (&err, preg, node->left);
1252       if (node->left)
1253 	node->left->parent = node;
1254     }
1255   if (node->right && node->right->token.type == SUBEXP)
1256     {
1257       node->right = lower_subexp (&err, preg, node->right);
1258       if (node->right)
1259 	node->right->parent = node;
1260     }
1261 
1262   return err;
1263 }
1264 
1265 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1266 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1267 {
1268   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1269   bin_tree_t *body = node->left;
1270   bin_tree_t *op, *cls, *tree1, *tree;
1271 
1272   if (preg->no_sub
1273       /* We do not optimize empty subexpressions, because otherwise we may
1274 	 have bad CONCAT nodes with NULL children.  This is obviously not
1275 	 very common, so we do not lose much.  An example that triggers
1276 	 this case is the sed "script" /\(\)/x.  */
1277       && node->left != NULL
1278       && (node->token.opr.idx >= BITSET_WORD_BITS
1279 	  || !(dfa->used_bkref_map
1280 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1281     return node->left;
1282 
1283   /* Convert the SUBEXP node to the concatenation of an
1284      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1285   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1286   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1287   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1288   tree = create_tree (dfa, op, tree1, CONCAT);
1289   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1290     {
1291       *err = REG_ESPACE;
1292       return NULL;
1293     }
1294 
1295   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1296   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1297   return tree;
1298 }
1299 
1300 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1301    nodes.  Requires a postorder visit.  */
1302 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1303 calc_first (void *extra, bin_tree_t *node)
1304 {
1305   re_dfa_t *dfa = (re_dfa_t *) extra;
1306   if (node->token.type == CONCAT)
1307     {
1308       node->first = node->left->first;
1309       node->node_idx = node->left->node_idx;
1310     }
1311   else
1312     {
1313       node->first = node;
1314       node->node_idx = re_dfa_add_node (dfa, node->token);
1315       if (BE (node->node_idx == -1, 0))
1316         return REG_ESPACE;
1317     }
1318   return REG_NOERROR;
1319 }
1320 
1321 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1322 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1323 calc_next (void *extra, bin_tree_t *node)
1324 {
1325   switch (node->token.type)
1326     {
1327     case OP_DUP_ASTERISK:
1328       node->left->next = node;
1329       break;
1330     case CONCAT:
1331       node->left->next = node->right->first;
1332       node->right->next = node->next;
1333       break;
1334     default:
1335       if (node->left)
1336 	node->left->next = node->next;
1337       if (node->right)
1338         node->right->next = node->next;
1339       break;
1340     }
1341   return REG_NOERROR;
1342 }
1343 
1344 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1345 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1346 link_nfa_nodes (void *extra, bin_tree_t *node)
1347 {
1348   re_dfa_t *dfa = (re_dfa_t *) extra;
1349   int idx = node->node_idx;
1350   reg_errcode_t err = REG_NOERROR;
1351 
1352   switch (node->token.type)
1353     {
1354     case CONCAT:
1355       break;
1356 
1357     case END_OF_RE:
1358       assert (node->next == NULL);
1359       break;
1360 
1361     case OP_DUP_ASTERISK:
1362     case OP_ALT:
1363       {
1364 	int left, right;
1365 	dfa->has_plural_match = 1;
1366 	if (node->left != NULL)
1367 	  left = node->left->first->node_idx;
1368 	else
1369 	  left = node->next->node_idx;
1370 	if (node->right != NULL)
1371 	  right = node->right->first->node_idx;
1372 	else
1373 	  right = node->next->node_idx;
1374 	assert (left > -1);
1375 	assert (right > -1);
1376 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1377       }
1378       break;
1379 
1380     case ANCHOR:
1381     case OP_OPEN_SUBEXP:
1382     case OP_CLOSE_SUBEXP:
1383       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1384       break;
1385 
1386     case OP_BACK_REF:
1387       dfa->nexts[idx] = node->next->node_idx;
1388       if (node->token.type == OP_BACK_REF)
1389 	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1390       break;
1391 
1392     default:
1393       assert (!IS_EPSILON_NODE (node->token.type));
1394       dfa->nexts[idx] = node->next->node_idx;
1395       break;
1396     }
1397 
1398   return err;
1399 }
1400 
1401 /* Duplicate the epsilon closure of the node ROOT_NODE.
1402    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1403    to their own constraint.  */
1404 
1405 static reg_errcode_t
1406 internal_function
duplicate_node_closure(re_dfa_t * dfa,int top_org_node,int top_clone_node,int root_node,unsigned int init_constraint)1407 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1408 			int root_node, unsigned int init_constraint)
1409 {
1410   int org_node, clone_node, ret;
1411   unsigned int constraint = init_constraint;
1412   for (org_node = top_org_node, clone_node = top_clone_node;;)
1413     {
1414       int org_dest, clone_dest;
1415       if (dfa->nodes[org_node].type == OP_BACK_REF)
1416 	{
1417 	  /* If the back reference epsilon-transit, its destination must
1418 	     also have the constraint.  Then duplicate the epsilon closure
1419 	     of the destination of the back reference, and store it in
1420 	     edests of the back reference.  */
1421 	  org_dest = dfa->nexts[org_node];
1422 	  re_node_set_empty (dfa->edests + clone_node);
1423 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1424 	  if (BE (clone_dest == -1, 0))
1425 	    return REG_ESPACE;
1426 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1427 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1428 	  if (BE (ret < 0, 0))
1429 	    return REG_ESPACE;
1430 	}
1431       else if (dfa->edests[org_node].nelem == 0)
1432 	{
1433 	  /* In case of the node can't epsilon-transit, don't duplicate the
1434 	     destination and store the original destination as the
1435 	     destination of the node.  */
1436 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1437 	  break;
1438 	}
1439       else if (dfa->edests[org_node].nelem == 1)
1440 	{
1441 	  /* In case of the node can epsilon-transit, and it has only one
1442 	     destination.  */
1443 	  org_dest = dfa->edests[org_node].elems[0];
1444 	  re_node_set_empty (dfa->edests + clone_node);
1445 	  if (dfa->nodes[org_node].type == ANCHOR)
1446 	    {
1447 	      /* In case of the node has another constraint, append it.  */
1448 	      if (org_node == root_node && clone_node != org_node)
1449 		{
1450 		  /* ...but if the node is root_node itself, it means the
1451 		     epsilon closure have a loop, then tie it to the
1452 		     destination of the root_node.  */
1453 		  ret = re_node_set_insert (dfa->edests + clone_node,
1454 					    org_dest);
1455 		  if (BE (ret < 0, 0))
1456 		    return REG_ESPACE;
1457 		  break;
1458 		}
1459 	      constraint |= dfa->nodes[org_node].opr.ctx_type;
1460 	    }
1461 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1462 	  if (BE (clone_dest == -1, 0))
1463 	    return REG_ESPACE;
1464 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1465 	  if (BE (ret < 0, 0))
1466 	    return REG_ESPACE;
1467 	}
1468       else /* dfa->edests[org_node].nelem == 2 */
1469 	{
1470 	  /* In case of the node can epsilon-transit, and it has two
1471 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1472 	  org_dest = dfa->edests[org_node].elems[0];
1473 	  re_node_set_empty (dfa->edests + clone_node);
1474 	  /* Search for a duplicated node which satisfies the constraint.  */
1475 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1476 	  if (clone_dest == -1)
1477 	    {
1478 	      /* There are no such a duplicated node, create a new one.  */
1479 	      reg_errcode_t err;
1480 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1481 	      if (BE (clone_dest == -1, 0))
1482 		return REG_ESPACE;
1483 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1484 	      if (BE (ret < 0, 0))
1485 		return REG_ESPACE;
1486 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1487 					    root_node, constraint);
1488 	      if (BE (err != REG_NOERROR, 0))
1489 		return err;
1490 	    }
1491 	  else
1492 	    {
1493 	      /* There are a duplicated node which satisfy the constraint,
1494 		 use it to avoid infinite loop.  */
1495 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1496 	      if (BE (ret < 0, 0))
1497 		return REG_ESPACE;
1498 	    }
1499 
1500 	  org_dest = dfa->edests[org_node].elems[1];
1501 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1502 	  if (BE (clone_dest == -1, 0))
1503 	    return REG_ESPACE;
1504 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1505 	  if (BE (ret < 0, 0))
1506 	    return REG_ESPACE;
1507 	}
1508       org_node = org_dest;
1509       clone_node = clone_dest;
1510     }
1511   return REG_NOERROR;
1512 }
1513 
1514 /* Search for a node which is duplicated from the node ORG_NODE, and
1515    satisfies the constraint CONSTRAINT.  */
1516 
1517 static int
search_duplicated_node(const re_dfa_t * dfa,int org_node,unsigned int constraint)1518 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1519 			unsigned int constraint)
1520 {
1521   int idx;
1522   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1523     {
1524       if (org_node == dfa->org_indices[idx]
1525 	  && constraint == dfa->nodes[idx].constraint)
1526 	return idx; /* Found.  */
1527     }
1528   return -1; /* Not found.  */
1529 }
1530 
1531 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1532    Return the index of the new node, or -1 if insufficient storage is
1533    available.  */
1534 
1535 static int
duplicate_node(re_dfa_t * dfa,int org_idx,unsigned int constraint)1536 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1537 {
1538   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1539   if (BE (dup_idx != -1, 1))
1540     {
1541       dfa->nodes[dup_idx].constraint = constraint;
1542       if (dfa->nodes[org_idx].type == ANCHOR)
1543 	dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1544       dfa->nodes[dup_idx].duplicated = 1;
1545 
1546       /* Store the index of the original node.  */
1547       dfa->org_indices[dup_idx] = org_idx;
1548     }
1549   return dup_idx;
1550 }
1551 
1552 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1553 calc_inveclosure (re_dfa_t *dfa)
1554 {
1555   int src, idx, ret;
1556   for (idx = 0; idx < dfa->nodes_len; ++idx)
1557     re_node_set_init_empty (dfa->inveclosures + idx);
1558 
1559   for (src = 0; src < dfa->nodes_len; ++src)
1560     {
1561       int *elems = dfa->eclosures[src].elems;
1562       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1563 	{
1564 	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1565 	  if (BE (ret == -1, 0))
1566 	    return REG_ESPACE;
1567 	}
1568     }
1569 
1570   return REG_NOERROR;
1571 }
1572 
1573 /* Calculate "eclosure" for all the node in DFA.  */
1574 
1575 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1576 calc_eclosure (re_dfa_t *dfa)
1577 {
1578   int node_idx, incomplete;
1579 #ifdef DEBUG
1580   assert (dfa->nodes_len > 0);
1581 #endif
1582   incomplete = 0;
1583   /* For each nodes, calculate epsilon closure.  */
1584   for (node_idx = 0; ; ++node_idx)
1585     {
1586       reg_errcode_t err;
1587       re_node_set eclosure_elem;
1588       if (node_idx == dfa->nodes_len)
1589 	{
1590 	  if (!incomplete)
1591 	    break;
1592 	  incomplete = 0;
1593 	  node_idx = 0;
1594 	}
1595 
1596 #ifdef DEBUG
1597       assert (dfa->eclosures[node_idx].nelem != -1);
1598 #endif
1599 
1600       /* If we have already calculated, skip it.  */
1601       if (dfa->eclosures[node_idx].nelem != 0)
1602 	continue;
1603       /* Calculate epsilon closure of `node_idx'.  */
1604       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1605       if (BE (err != REG_NOERROR, 0))
1606 	return err;
1607 
1608       if (dfa->eclosures[node_idx].nelem == 0)
1609 	{
1610 	  incomplete = 1;
1611 	  re_node_set_free (&eclosure_elem);
1612 	}
1613     }
1614   return REG_NOERROR;
1615 }
1616 
1617 /* Calculate epsilon closure of NODE.  */
1618 
1619 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,int node,int root)1620 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1621 {
1622   reg_errcode_t err;
1623   unsigned int constraint;
1624   int i, incomplete;
1625   re_node_set eclosure;
1626   incomplete = 0;
1627   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1628   if (BE (err != REG_NOERROR, 0))
1629     return err;
1630 
1631   /* This indicates that we are calculating this node now.
1632      We reference this value to avoid infinite loop.  */
1633   dfa->eclosures[node].nelem = -1;
1634 
1635   constraint = ((dfa->nodes[node].type == ANCHOR)
1636 		? dfa->nodes[node].opr.ctx_type : 0);
1637   /* If the current node has constraints, duplicate all nodes.
1638      Since they must inherit the constraints.  */
1639   if (constraint
1640       && dfa->edests[node].nelem
1641       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1642     {
1643       err = duplicate_node_closure (dfa, node, node, node, constraint);
1644       if (BE (err != REG_NOERROR, 0))
1645 	return err;
1646     }
1647 
1648   /* Expand each epsilon destination nodes.  */
1649   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1650     for (i = 0; i < dfa->edests[node].nelem; ++i)
1651       {
1652 	re_node_set eclosure_elem;
1653 	int edest = dfa->edests[node].elems[i];
1654 	/* If calculating the epsilon closure of `edest' is in progress,
1655 	   return intermediate result.  */
1656 	if (dfa->eclosures[edest].nelem == -1)
1657 	  {
1658 	    incomplete = 1;
1659 	    continue;
1660 	  }
1661 	/* If we haven't calculated the epsilon closure of `edest' yet,
1662 	   calculate now. Otherwise use calculated epsilon closure.  */
1663 	if (dfa->eclosures[edest].nelem == 0)
1664 	  {
1665 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1666 	    if (BE (err != REG_NOERROR, 0))
1667 	      return err;
1668 	  }
1669 	else
1670 	  eclosure_elem = dfa->eclosures[edest];
1671 	/* Merge the epsilon closure of `edest'.  */
1672 	re_node_set_merge (&eclosure, &eclosure_elem);
1673 	/* If the epsilon closure of `edest' is incomplete,
1674 	   the epsilon closure of this node is also incomplete.  */
1675 	if (dfa->eclosures[edest].nelem == 0)
1676 	  {
1677 	    incomplete = 1;
1678 	    re_node_set_free (&eclosure_elem);
1679 	  }
1680       }
1681 
1682   /* Epsilon closures include itself.  */
1683   re_node_set_insert (&eclosure, node);
1684   if (incomplete && !root)
1685     dfa->eclosures[node].nelem = 0;
1686   else
1687     dfa->eclosures[node] = eclosure;
1688   *new_set = eclosure;
1689   return REG_NOERROR;
1690 }
1691 
1692 /* Functions for token which are used in the parser.  */
1693 
1694 /* Fetch a token from INPUT.
1695    We must not use this function inside bracket expressions.  */
1696 
1697 static void
1698 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1699 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1700 {
1701   re_string_skip_bytes (input, peek_token (result, input, syntax));
1702 }
1703 
1704 /* Peek a token from INPUT, and return the length of the token.
1705    We must not use this function inside bracket expressions.  */
1706 
1707 static int
1708 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1709 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1710 {
1711   unsigned char c;
1712 
1713   if (re_string_eoi (input))
1714     {
1715       token->type = END_OF_RE;
1716       return 0;
1717     }
1718 
1719   c = re_string_peek_byte (input, 0);
1720   token->opr.c = c;
1721 
1722   token->word_char = 0;
1723 #ifdef RE_ENABLE_I18N
1724   token->mb_partial = 0;
1725   if (input->mb_cur_max > 1 &&
1726       !re_string_first_byte (input, re_string_cur_idx (input)))
1727     {
1728       token->type = CHARACTER;
1729       token->mb_partial = 1;
1730       return 1;
1731     }
1732 #endif
1733   if (c == '\\')
1734     {
1735       unsigned char c2;
1736       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1737 	{
1738 	  token->type = BACK_SLASH;
1739 	  return 1;
1740 	}
1741 
1742       c2 = re_string_peek_byte_case (input, 1);
1743       token->opr.c = c2;
1744       token->type = CHARACTER;
1745 #ifdef RE_ENABLE_I18N
1746       if (input->mb_cur_max > 1)
1747 	{
1748 	  wint_t wc = re_string_wchar_at (input,
1749 					  re_string_cur_idx (input) + 1);
1750 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1751 	}
1752       else
1753 #endif
1754 	token->word_char = IS_WORD_CHAR (c2) != 0;
1755 
1756       switch (c2)
1757 	{
1758 	case '|':
1759 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1760 	    token->type = OP_ALT;
1761 	  break;
1762 	case '1': case '2': case '3': case '4': case '5':
1763 	case '6': case '7': case '8': case '9':
1764 	  if (!(syntax & RE_NO_BK_REFS))
1765 	    {
1766 	      token->type = OP_BACK_REF;
1767 	      token->opr.idx = c2 - '1';
1768 	    }
1769 	  break;
1770 	case '<':
1771 	  if (!(syntax & RE_NO_GNU_OPS))
1772 	    {
1773 	      token->type = ANCHOR;
1774 	      token->opr.ctx_type = WORD_FIRST;
1775 	    }
1776 	  break;
1777 	case '>':
1778 	  if (!(syntax & RE_NO_GNU_OPS))
1779 	    {
1780 	      token->type = ANCHOR;
1781 	      token->opr.ctx_type = WORD_LAST;
1782 	    }
1783 	  break;
1784 	case 'b':
1785 	  if (!(syntax & RE_NO_GNU_OPS))
1786 	    {
1787 	      token->type = ANCHOR;
1788 	      token->opr.ctx_type = WORD_DELIM;
1789 	    }
1790 	  break;
1791 	case 'B':
1792 	  if (!(syntax & RE_NO_GNU_OPS))
1793 	    {
1794 	      token->type = ANCHOR;
1795 	      token->opr.ctx_type = NOT_WORD_DELIM;
1796 	    }
1797 	  break;
1798 	case 'w':
1799 	  if (!(syntax & RE_NO_GNU_OPS))
1800 	    token->type = OP_WORD;
1801 	  break;
1802 	case 'W':
1803 	  if (!(syntax & RE_NO_GNU_OPS))
1804 	    token->type = OP_NOTWORD;
1805 	  break;
1806 	case 's':
1807 	  if (!(syntax & RE_NO_GNU_OPS))
1808 	    token->type = OP_SPACE;
1809 	  break;
1810 	case 'S':
1811 	  if (!(syntax & RE_NO_GNU_OPS))
1812 	    token->type = OP_NOTSPACE;
1813 	  break;
1814 	case '`':
1815 	  if (!(syntax & RE_NO_GNU_OPS))
1816 	    {
1817 	      token->type = ANCHOR;
1818 	      token->opr.ctx_type = BUF_FIRST;
1819 	    }
1820 	  break;
1821 	case '\'':
1822 	  if (!(syntax & RE_NO_GNU_OPS))
1823 	    {
1824 	      token->type = ANCHOR;
1825 	      token->opr.ctx_type = BUF_LAST;
1826 	    }
1827 	  break;
1828 	case '(':
1829 	  if (!(syntax & RE_NO_BK_PARENS))
1830 	    token->type = OP_OPEN_SUBEXP;
1831 	  break;
1832 	case ')':
1833 	  if (!(syntax & RE_NO_BK_PARENS))
1834 	    token->type = OP_CLOSE_SUBEXP;
1835 	  break;
1836 	case '+':
1837 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1838 	    token->type = OP_DUP_PLUS;
1839 	  break;
1840 	case '?':
1841 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1842 	    token->type = OP_DUP_QUESTION;
1843 	  break;
1844 	case '{':
1845 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1846 	    token->type = OP_OPEN_DUP_NUM;
1847 	  break;
1848 	case '}':
1849 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1850 	    token->type = OP_CLOSE_DUP_NUM;
1851 	  break;
1852 	default:
1853 	  break;
1854 	}
1855       return 2;
1856     }
1857 
1858   token->type = CHARACTER;
1859 #ifdef RE_ENABLE_I18N
1860   if (input->mb_cur_max > 1)
1861     {
1862       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1863       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1864     }
1865   else
1866 #endif
1867     token->word_char = IS_WORD_CHAR (token->opr.c);
1868 
1869   switch (c)
1870     {
1871     case '\n':
1872       if (syntax & RE_NEWLINE_ALT)
1873 	token->type = OP_ALT;
1874       break;
1875     case '|':
1876       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1877 	token->type = OP_ALT;
1878       break;
1879     case '*':
1880       token->type = OP_DUP_ASTERISK;
1881       break;
1882     case '+':
1883       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1884 	token->type = OP_DUP_PLUS;
1885       break;
1886     case '?':
1887       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1888 	token->type = OP_DUP_QUESTION;
1889       break;
1890     case '{':
1891       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1892 	token->type = OP_OPEN_DUP_NUM;
1893       break;
1894     case '}':
1895       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1896 	token->type = OP_CLOSE_DUP_NUM;
1897       break;
1898     case '(':
1899       if (syntax & RE_NO_BK_PARENS)
1900 	token->type = OP_OPEN_SUBEXP;
1901       break;
1902     case ')':
1903       if (syntax & RE_NO_BK_PARENS)
1904 	token->type = OP_CLOSE_SUBEXP;
1905       break;
1906     case '[':
1907       token->type = OP_OPEN_BRACKET;
1908       break;
1909     case '.':
1910       token->type = OP_PERIOD;
1911       break;
1912     case '^':
1913       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1914 	  re_string_cur_idx (input) != 0)
1915 	{
1916 	  char prev = re_string_peek_byte (input, -1);
1917 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1918 	    break;
1919 	}
1920       token->type = ANCHOR;
1921       token->opr.ctx_type = LINE_FIRST;
1922       break;
1923     case '$':
1924       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1925 	  re_string_cur_idx (input) + 1 != re_string_length (input))
1926 	{
1927 	  re_token_t next;
1928 	  re_string_skip_bytes (input, 1);
1929 	  peek_token (&next, input, syntax);
1930 	  re_string_skip_bytes (input, -1);
1931 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1932 	    break;
1933 	}
1934       token->type = ANCHOR;
1935       token->opr.ctx_type = LINE_LAST;
1936       break;
1937     default:
1938       break;
1939     }
1940   return 1;
1941 }
1942 
1943 /* Peek a token from INPUT, and return the length of the token.
1944    We must not use this function out of bracket expressions.  */
1945 
1946 static int
1947 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1948 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1949 {
1950   unsigned char c;
1951   if (re_string_eoi (input))
1952     {
1953       token->type = END_OF_RE;
1954       return 0;
1955     }
1956   c = re_string_peek_byte (input, 0);
1957   token->opr.c = c;
1958 
1959 #ifdef RE_ENABLE_I18N
1960   if (input->mb_cur_max > 1 &&
1961       !re_string_first_byte (input, re_string_cur_idx (input)))
1962     {
1963       token->type = CHARACTER;
1964       return 1;
1965     }
1966 #endif /* RE_ENABLE_I18N */
1967 
1968   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1969       && re_string_cur_idx (input) + 1 < re_string_length (input))
1970     {
1971       /* In this case, '\' escape a character.  */
1972       unsigned char c2;
1973       re_string_skip_bytes (input, 1);
1974       c2 = re_string_peek_byte (input, 0);
1975       token->opr.c = c2;
1976       token->type = CHARACTER;
1977       return 1;
1978     }
1979   if (c == '[') /* '[' is a special char in a bracket exps.  */
1980     {
1981       unsigned char c2;
1982       int token_len;
1983       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1984 	c2 = re_string_peek_byte (input, 1);
1985       else
1986 	c2 = 0;
1987       token->opr.c = c2;
1988       token_len = 2;
1989       switch (c2)
1990 	{
1991 	case '.':
1992 	  token->type = OP_OPEN_COLL_ELEM;
1993 	  break;
1994 	case '=':
1995 	  token->type = OP_OPEN_EQUIV_CLASS;
1996 	  break;
1997 	case ':':
1998 	  if (syntax & RE_CHAR_CLASSES)
1999 	    {
2000 	      token->type = OP_OPEN_CHAR_CLASS;
2001 	      break;
2002 	    }
2003 	  /* else fall through.  */
2004 	default:
2005 	  token->type = CHARACTER;
2006 	  token->opr.c = c;
2007 	  token_len = 1;
2008 	  break;
2009 	}
2010       return token_len;
2011     }
2012   switch (c)
2013     {
2014     case '-':
2015       token->type = OP_CHARSET_RANGE;
2016       break;
2017     case ']':
2018       token->type = OP_CLOSE_BRACKET;
2019       break;
2020     case '^':
2021       token->type = OP_NON_MATCH_LIST;
2022       break;
2023     default:
2024       token->type = CHARACTER;
2025     }
2026   return 1;
2027 }
2028 
2029 /* Functions for parser.  */
2030 
2031 /* Entry point of the parser.
2032    Parse the regular expression REGEXP and return the structure tree.
2033    If an error is occured, ERR is set by error code, and return NULL.
2034    This function build the following tree, from regular expression <reg_exp>:
2035 	   CAT
2036 	   / \
2037 	  /   \
2038    <reg_exp>  EOR
2039 
2040    CAT means concatenation.
2041    EOR means end of regular expression.  */
2042 
2043 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2044 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2045        reg_errcode_t *err)
2046 {
2047   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2048   bin_tree_t *tree, *eor, *root;
2049   re_token_t current_token;
2050   dfa->syntax = syntax;
2051   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2052   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2053   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2054     return NULL;
2055   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2056   if (tree != NULL)
2057     root = create_tree (dfa, tree, eor, CONCAT);
2058   else
2059     root = eor;
2060   if (BE (eor == NULL || root == NULL, 0))
2061     {
2062       *err = REG_ESPACE;
2063       return NULL;
2064     }
2065   return root;
2066 }
2067 
2068 /* This function build the following tree, from regular expression
2069    <branch1>|<branch2>:
2070 	   ALT
2071 	   / \
2072 	  /   \
2073    <branch1> <branch2>
2074 
2075    ALT means alternative, which represents the operator `|'.  */
2076 
2077 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2078 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2079 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2080 {
2081   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2082   bin_tree_t *tree, *branch = NULL;
2083   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2084   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2085     return NULL;
2086 
2087   while (token->type == OP_ALT)
2088     {
2089       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2090       if (token->type != OP_ALT && token->type != END_OF_RE
2091 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2092 	{
2093 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2094 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2095 	    return NULL;
2096 	}
2097       else
2098 	branch = NULL;
2099       tree = create_tree (dfa, tree, branch, OP_ALT);
2100       if (BE (tree == NULL, 0))
2101 	{
2102 	  *err = REG_ESPACE;
2103 	  return NULL;
2104 	}
2105     }
2106   return tree;
2107 }
2108 
2109 /* This function build the following tree, from regular expression
2110    <exp1><exp2>:
2111 	CAT
2112 	/ \
2113        /   \
2114    <exp1> <exp2>
2115 
2116    CAT means concatenation.  */
2117 
2118 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2119 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2120 	      reg_syntax_t syntax, int nest, reg_errcode_t *err)
2121 {
2122   bin_tree_t *tree, *exp;
2123   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2124   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2125   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2126     return NULL;
2127 
2128   while (token->type != OP_ALT && token->type != END_OF_RE
2129 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2130     {
2131       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2132       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2133 	{
2134 	  return NULL;
2135 	}
2136       if (tree != NULL && exp != NULL)
2137 	{
2138 	  tree = create_tree (dfa, tree, exp, CONCAT);
2139 	  if (tree == NULL)
2140 	    {
2141 	      *err = REG_ESPACE;
2142 	      return NULL;
2143 	    }
2144 	}
2145       else if (tree == NULL)
2146 	tree = exp;
2147       /* Otherwise exp == NULL, we don't need to create new tree.  */
2148     }
2149   return tree;
2150 }
2151 
2152 /* This function build the following tree, from regular expression a*:
2153 	 *
2154 	 |
2155 	 a
2156 */
2157 
2158 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2159 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2160 		  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2161 {
2162   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2163   bin_tree_t *tree;
2164   switch (token->type)
2165     {
2166     case CHARACTER:
2167       tree = create_token_tree (dfa, NULL, NULL, token);
2168       if (BE (tree == NULL, 0))
2169 	{
2170 	  *err = REG_ESPACE;
2171 	  return NULL;
2172 	}
2173 #ifdef RE_ENABLE_I18N
2174       if (dfa->mb_cur_max > 1)
2175 	{
2176 	  while (!re_string_eoi (regexp)
2177 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2178 	    {
2179 	      bin_tree_t *mbc_remain;
2180 	      fetch_token (token, regexp, syntax);
2181 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2182 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2183 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2184 		{
2185 		  *err = REG_ESPACE;
2186 		  return NULL;
2187 		}
2188 	    }
2189 	}
2190 #endif
2191       break;
2192     case OP_OPEN_SUBEXP:
2193       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2194       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2195 	return NULL;
2196       break;
2197     case OP_OPEN_BRACKET:
2198       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2199       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2200 	return NULL;
2201       break;
2202     case OP_BACK_REF:
2203       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2204 	{
2205 	  *err = REG_ESUBREG;
2206 	  return NULL;
2207 	}
2208       dfa->used_bkref_map |= 1 << token->opr.idx;
2209       tree = create_token_tree (dfa, NULL, NULL, token);
2210       if (BE (tree == NULL, 0))
2211 	{
2212 	  *err = REG_ESPACE;
2213 	  return NULL;
2214 	}
2215       ++dfa->nbackref;
2216       dfa->has_mb_node = 1;
2217       break;
2218     case OP_OPEN_DUP_NUM:
2219       if (syntax & RE_CONTEXT_INVALID_DUP)
2220 	{
2221 	  *err = REG_BADRPT;
2222 	  return NULL;
2223 	}
2224       /* FALLTHROUGH */
2225     case OP_DUP_ASTERISK:
2226     case OP_DUP_PLUS:
2227     case OP_DUP_QUESTION:
2228       if (syntax & RE_CONTEXT_INVALID_OPS)
2229 	{
2230 	  *err = REG_BADRPT;
2231 	  return NULL;
2232 	}
2233       else if (syntax & RE_CONTEXT_INDEP_OPS)
2234 	{
2235 	  fetch_token (token, regexp, syntax);
2236 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2237 	}
2238       /* else fall through  */
2239     case OP_CLOSE_SUBEXP:
2240       if ((token->type == OP_CLOSE_SUBEXP) &&
2241 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2242 	{
2243 	  *err = REG_ERPAREN;
2244 	  return NULL;
2245 	}
2246       /* else fall through  */
2247     case OP_CLOSE_DUP_NUM:
2248       /* We treat it as a normal character.  */
2249 
2250       /* Then we can these characters as normal characters.  */
2251       token->type = CHARACTER;
2252       /* mb_partial and word_char bits should be initialized already
2253 	 by peek_token.  */
2254       tree = create_token_tree (dfa, NULL, NULL, token);
2255       if (BE (tree == NULL, 0))
2256 	{
2257 	  *err = REG_ESPACE;
2258 	  return NULL;
2259 	}
2260       break;
2261     case ANCHOR:
2262       if ((token->opr.ctx_type
2263 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2264 	  && dfa->word_ops_used == 0)
2265 	init_word_char (dfa);
2266       if (token->opr.ctx_type == WORD_DELIM
2267           || token->opr.ctx_type == NOT_WORD_DELIM)
2268 	{
2269 	  bin_tree_t *tree_first, *tree_last;
2270 	  if (token->opr.ctx_type == WORD_DELIM)
2271 	    {
2272 	      token->opr.ctx_type = WORD_FIRST;
2273 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2274 	      token->opr.ctx_type = WORD_LAST;
2275             }
2276           else
2277             {
2278 	      token->opr.ctx_type = INSIDE_WORD;
2279 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2280 	      token->opr.ctx_type = INSIDE_NOTWORD;
2281             }
2282 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2283 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2284 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2285 	    {
2286 	      *err = REG_ESPACE;
2287 	      return NULL;
2288 	    }
2289 	}
2290       else
2291 	{
2292 	  tree = create_token_tree (dfa, NULL, NULL, token);
2293 	  if (BE (tree == NULL, 0))
2294 	    {
2295 	      *err = REG_ESPACE;
2296 	      return NULL;
2297 	    }
2298 	}
2299       /* We must return here, since ANCHORs can't be followed
2300 	 by repetition operators.
2301 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2302 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2303       fetch_token (token, regexp, syntax);
2304       return tree;
2305     case OP_PERIOD:
2306       tree = create_token_tree (dfa, NULL, NULL, token);
2307       if (BE (tree == NULL, 0))
2308 	{
2309 	  *err = REG_ESPACE;
2310 	  return NULL;
2311 	}
2312       if (dfa->mb_cur_max > 1)
2313 	dfa->has_mb_node = 1;
2314       break;
2315     case OP_WORD:
2316     case OP_NOTWORD:
2317       tree = build_charclass_op (dfa, regexp->trans,
2318 				 (const unsigned char *) "alnum",
2319 				 (const unsigned char *) "_",
2320 				 token->type == OP_NOTWORD, err);
2321       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2322 	return NULL;
2323       break;
2324     case OP_SPACE:
2325     case OP_NOTSPACE:
2326       tree = build_charclass_op (dfa, regexp->trans,
2327 				 (const unsigned char *) "space",
2328 				 (const unsigned char *) "",
2329 				 token->type == OP_NOTSPACE, err);
2330       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2331 	return NULL;
2332       break;
2333     case OP_ALT:
2334     case END_OF_RE:
2335       return NULL;
2336     case BACK_SLASH:
2337       *err = REG_EESCAPE;
2338       return NULL;
2339     default:
2340       /* Must not happen?  */
2341 #ifdef DEBUG
2342       assert (0);
2343 #endif
2344       return NULL;
2345     }
2346   fetch_token (token, regexp, syntax);
2347 
2348   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2349 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2350     {
2351       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2352       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2353 	return NULL;
2354       /* In BRE consecutive duplications are not allowed.  */
2355       if ((syntax & RE_CONTEXT_INVALID_DUP)
2356 	  && (token->type == OP_DUP_ASTERISK
2357 	      || token->type == OP_OPEN_DUP_NUM))
2358 	{
2359 	  *err = REG_BADRPT;
2360 	  return NULL;
2361 	}
2362     }
2363 
2364   return tree;
2365 }
2366 
2367 /* This function build the following tree, from regular expression
2368    (<reg_exp>):
2369 	 SUBEXP
2370 	    |
2371 	<reg_exp>
2372 */
2373 
2374 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2375 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2376 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2377 {
2378   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2379   bin_tree_t *tree;
2380   size_t cur_nsub;
2381   cur_nsub = preg->re_nsub++;
2382 
2383   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2384 
2385   /* The subexpression may be a null string.  */
2386   if (token->type == OP_CLOSE_SUBEXP)
2387     tree = NULL;
2388   else
2389     {
2390       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2391       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2392         *err = REG_EPAREN;
2393       if (BE (*err != REG_NOERROR, 0))
2394 	return NULL;
2395     }
2396 
2397   if (cur_nsub <= '9' - '1')
2398     dfa->completed_bkref_map |= 1 << cur_nsub;
2399 
2400   tree = create_tree (dfa, tree, NULL, SUBEXP);
2401   if (BE (tree == NULL, 0))
2402     {
2403       *err = REG_ESPACE;
2404       return NULL;
2405     }
2406   tree->token.opr.idx = cur_nsub;
2407   return tree;
2408 }
2409 
2410 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2411 
2412 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2413 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2414 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2415 {
2416   bin_tree_t *tree = NULL, *old_tree = NULL;
2417   int i, start, end, start_idx = re_string_cur_idx (regexp);
2418   re_token_t start_token = *token;
2419 
2420   if (token->type == OP_OPEN_DUP_NUM)
2421     {
2422       end = 0;
2423       start = fetch_number (regexp, token, syntax);
2424       if (start == -1)
2425 	{
2426 	  if (token->type == CHARACTER && token->opr.c == ',')
2427 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2428 	  else
2429 	    {
2430 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2431 	      return NULL;
2432 	    }
2433 	}
2434       if (BE (start != -2, 1))
2435 	{
2436 	  /* We treat "{n}" as "{n,n}".  */
2437 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2438 		 : ((token->type == CHARACTER && token->opr.c == ',')
2439 		    ? fetch_number (regexp, token, syntax) : -2));
2440 	}
2441       if (BE (start == -2 || end == -2, 0))
2442 	{
2443 	  /* Invalid sequence.  */
2444 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2445 	    {
2446 	      if (token->type == END_OF_RE)
2447 		*err = REG_EBRACE;
2448 	      else
2449 		*err = REG_BADBR;
2450 
2451 	      return NULL;
2452 	    }
2453 
2454 	  /* If the syntax bit is set, rollback.  */
2455 	  re_string_set_index (regexp, start_idx);
2456 	  *token = start_token;
2457 	  token->type = CHARACTER;
2458 	  /* mb_partial and word_char bits should be already initialized by
2459 	     peek_token.  */
2460 	  return elem;
2461 	}
2462 
2463       if (BE (end != -1 && start > end, 0))
2464 	{
2465 	  /* First number greater than second.  */
2466 	  *err = REG_BADBR;
2467 	  return NULL;
2468 	}
2469     }
2470   else
2471     {
2472       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2473       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2474     }
2475 
2476   fetch_token (token, regexp, syntax);
2477 
2478   if (BE (elem == NULL, 0))
2479     return NULL;
2480   if (BE (start == 0 && end == 0, 0))
2481     {
2482       postorder (elem, free_tree, NULL);
2483       return NULL;
2484     }
2485 
2486   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2487   if (BE (start > 0, 0))
2488     {
2489       tree = elem;
2490       for (i = 2; i <= start; ++i)
2491 	{
2492 	  elem = duplicate_tree (elem, dfa);
2493 	  tree = create_tree (dfa, tree, elem, CONCAT);
2494 	  if (BE (elem == NULL || tree == NULL, 0))
2495 	    goto parse_dup_op_espace;
2496 	}
2497 
2498       if (start == end)
2499 	return tree;
2500 
2501       /* Duplicate ELEM before it is marked optional.  */
2502       elem = duplicate_tree (elem, dfa);
2503       old_tree = tree;
2504     }
2505   else
2506     old_tree = NULL;
2507 
2508   if (elem->token.type == SUBEXP)
2509     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2510 
2511   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2512   if (BE (tree == NULL, 0))
2513     goto parse_dup_op_espace;
2514 
2515   /* This loop is actually executed only when end != -1,
2516      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2517      already created the start+1-th copy.  */
2518   for (i = start + 2; i <= end; ++i)
2519     {
2520       elem = duplicate_tree (elem, dfa);
2521       tree = create_tree (dfa, tree, elem, CONCAT);
2522       if (BE (elem == NULL || tree == NULL, 0))
2523         goto parse_dup_op_espace;
2524 
2525       tree = create_tree (dfa, tree, NULL, OP_ALT);
2526       if (BE (tree == NULL, 0))
2527         goto parse_dup_op_espace;
2528     }
2529 
2530   if (old_tree)
2531     tree = create_tree (dfa, old_tree, tree, CONCAT);
2532 
2533   return tree;
2534 
2535  parse_dup_op_espace:
2536   *err = REG_ESPACE;
2537   return NULL;
2538 }
2539 
2540 /* Size of the names for collating symbol/equivalence_class/character_class.
2541    I'm not sure, but maybe enough.  */
2542 #define BRACKET_NAME_BUF_SIZE 32
2543 
2544 #ifndef _LIBC
2545   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2546      Build the range expression which starts from START_ELEM, and ends
2547      at END_ELEM.  The result are written to MBCSET and SBCSET.
2548      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2549      mbcset->range_ends, is a pointer argument sinse we may
2550      update it.  */
2551 
2552 static reg_errcode_t
2553 internal_function
2554 # ifdef RE_ENABLE_I18N
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,int * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2555 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2556 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2557 # else /* not RE_ENABLE_I18N */
2558 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2559 		 bracket_elem_t *end_elem)
2560 # endif /* not RE_ENABLE_I18N */
2561 {
2562   unsigned int start_ch, end_ch;
2563   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2564   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2565 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2566 	  0))
2567     return REG_ERANGE;
2568 
2569   /* We can handle no multi character collating elements without libc
2570      support.  */
2571   if (BE ((start_elem->type == COLL_SYM
2572 	   && strlen ((char *) start_elem->opr.name) > 1)
2573 	  || (end_elem->type == COLL_SYM
2574 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2575     return REG_ECOLLATE;
2576 
2577 # ifdef RE_ENABLE_I18N
2578   {
2579     wchar_t wc;
2580     wint_t start_wc;
2581     wint_t end_wc;
2582     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2583 
2584     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2585 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2586 		   : 0));
2587     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2588 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2589 		 : 0));
2590     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2591 		? __btowc (start_ch) : start_elem->opr.wch);
2592     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2593 	      ? __btowc (end_ch) : end_elem->opr.wch);
2594     if (start_wc == WEOF || end_wc == WEOF)
2595       return REG_ECOLLATE;
2596     cmp_buf[0] = start_wc;
2597     cmp_buf[4] = end_wc;
2598     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2599       return REG_ERANGE;
2600 
2601     /* Got valid collation sequence values, add them as a new entry.
2602        However, for !_LIBC we have no collation elements: if the
2603        character set is single byte, the single byte character set
2604        that we build below suffices.  parse_bracket_exp passes
2605        no MBCSET if dfa->mb_cur_max == 1.  */
2606     if (mbcset)
2607       {
2608         /* Check the space of the arrays.  */
2609         if (BE (*range_alloc == mbcset->nranges, 0))
2610           {
2611 	    /* There is not enough space, need realloc.  */
2612 	    wchar_t *new_array_start, *new_array_end;
2613 	    int new_nranges;
2614 
2615 	    /* +1 in case of mbcset->nranges is 0.  */
2616 	    new_nranges = 2 * mbcset->nranges + 1;
2617 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2618 	       are NULL if *range_alloc == 0.  */
2619 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2620 				          new_nranges);
2621 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2622 				        new_nranges);
2623 
2624 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2625 	      return REG_ESPACE;
2626 
2627 	    mbcset->range_starts = new_array_start;
2628 	    mbcset->range_ends = new_array_end;
2629 	    *range_alloc = new_nranges;
2630           }
2631 
2632         mbcset->range_starts[mbcset->nranges] = start_wc;
2633         mbcset->range_ends[mbcset->nranges++] = end_wc;
2634       }
2635 
2636     /* Build the table for single byte characters.  */
2637     for (wc = 0; wc < SBC_MAX; ++wc)
2638       {
2639 	cmp_buf[2] = wc;
2640 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2641 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2642 	  bitset_set (sbcset, wc);
2643       }
2644   }
2645 # else /* not RE_ENABLE_I18N */
2646   {
2647     unsigned int ch;
2648     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2649 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2650 		   : 0));
2651     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2652 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2653 		 : 0));
2654     if (start_ch > end_ch)
2655       return REG_ERANGE;
2656     /* Build the table for single byte characters.  */
2657     for (ch = 0; ch < SBC_MAX; ++ch)
2658       if (start_ch <= ch  && ch <= end_ch)
2659 	bitset_set (sbcset, ch);
2660   }
2661 # endif /* not RE_ENABLE_I18N */
2662   return REG_NOERROR;
2663 }
2664 #endif /* not _LIBC */
2665 
2666 #ifndef _LIBC
2667 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2668    Build the collating element which is represented by NAME.
2669    The result are written to MBCSET and SBCSET.
2670    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2671    pointer argument since we may update it.  */
2672 
2673 static reg_errcode_t
2674 internal_function
2675 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,int * coll_sym_alloc,const unsigned char * name)2676 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2677 			int *coll_sym_alloc, const unsigned char *name)
2678 # else /* not RE_ENABLE_I18N */
2679 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2680 # endif /* not RE_ENABLE_I18N */
2681 {
2682   size_t name_len = strlen ((const char *) name);
2683   if (BE (name_len != 1, 0))
2684     return REG_ECOLLATE;
2685   else
2686     {
2687       bitset_set (sbcset, name[0]);
2688       return REG_NOERROR;
2689     }
2690 }
2691 #endif /* not _LIBC */
2692 
2693 /* This function parse bracket expression like "[abc]", "[a-c]",
2694    "[[.a-a.]]" etc.  */
2695 
2696 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2697 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2698 		   reg_syntax_t syntax, reg_errcode_t *err)
2699 {
2700 #ifdef _LIBC
2701   const unsigned char *collseqmb;
2702   const char *collseqwc;
2703   uint32_t nrules;
2704   int32_t table_size;
2705   const int32_t *symb_table;
2706   const unsigned char *extra;
2707 
2708   /* Local function for parse_bracket_exp used in _LIBC environement.
2709      Seek the collating symbol entry correspondings to NAME.
2710      Return the index of the symbol in the SYMB_TABLE.  */
2711 
2712   auto inline int32_t
2713   __attribute ((always_inline))
2714   seek_collating_symbol_entry (name, name_len)
2715 	 const unsigned char *name;
2716 	 size_t name_len;
2717     {
2718       int32_t hash = elem_hash ((const char *) name, name_len);
2719       int32_t elem = hash % table_size;
2720       if (symb_table[2 * elem] != 0)
2721 	{
2722 	  int32_t second = hash % (table_size - 2) + 1;
2723 
2724 	  do
2725 	    {
2726 	      /* First compare the hashing value.  */
2727 	      if (symb_table[2 * elem] == hash
2728 		  /* Compare the length of the name.  */
2729 		  && name_len == extra[symb_table[2 * elem + 1]]
2730 		  /* Compare the name.  */
2731 		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2732 			     name_len) == 0)
2733 		{
2734 		  /* Yep, this is the entry.  */
2735 		  break;
2736 		}
2737 
2738 	      /* Next entry.  */
2739 	      elem += second;
2740 	    }
2741 	  while (symb_table[2 * elem] != 0);
2742 	}
2743       return elem;
2744     }
2745 
2746   /* Local function for parse_bracket_exp used in _LIBC environement.
2747      Look up the collation sequence value of BR_ELEM.
2748      Return the value if succeeded, UINT_MAX otherwise.  */
2749 
2750   auto inline unsigned int
2751   __attribute ((always_inline))
2752   lookup_collation_sequence_value (br_elem)
2753 	 bracket_elem_t *br_elem;
2754     {
2755       if (br_elem->type == SB_CHAR)
2756 	{
2757 	  /*
2758 	  if (MB_CUR_MAX == 1)
2759 	  */
2760 	  if (nrules == 0)
2761 	    return collseqmb[br_elem->opr.ch];
2762 	  else
2763 	    {
2764 	      wint_t wc = __btowc (br_elem->opr.ch);
2765 	      return __collseq_table_lookup (collseqwc, wc);
2766 	    }
2767 	}
2768       else if (br_elem->type == MB_CHAR)
2769 	{
2770 	  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2771 	}
2772       else if (br_elem->type == COLL_SYM)
2773 	{
2774 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2775 	  if (nrules != 0)
2776 	    {
2777 	      int32_t elem, idx;
2778 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2779 						  sym_name_len);
2780 	      if (symb_table[2 * elem] != 0)
2781 		{
2782 		  /* We found the entry.  */
2783 		  idx = symb_table[2 * elem + 1];
2784 		  /* Skip the name of collating element name.  */
2785 		  idx += 1 + extra[idx];
2786 		  /* Skip the byte sequence of the collating element.  */
2787 		  idx += 1 + extra[idx];
2788 		  /* Adjust for the alignment.  */
2789 		  idx = (idx + 3) & ~3;
2790 		  /* Skip the multibyte collation sequence value.  */
2791 		  idx += sizeof (unsigned int);
2792 		  /* Skip the wide char sequence of the collating element.  */
2793 		  idx += sizeof (unsigned int) *
2794 		    (1 + *(unsigned int *) (extra + idx));
2795 		  /* Return the collation sequence value.  */
2796 		  return *(unsigned int *) (extra + idx);
2797 		}
2798 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2799 		{
2800 		  /* No valid character.  Match it as a single byte
2801 		     character.  */
2802 		  return collseqmb[br_elem->opr.name[0]];
2803 		}
2804 	    }
2805 	  else if (sym_name_len == 1)
2806 	    return collseqmb[br_elem->opr.name[0]];
2807 	}
2808       return UINT_MAX;
2809     }
2810 
2811   /* Local function for parse_bracket_exp used in _LIBC environement.
2812      Build the range expression which starts from START_ELEM, and ends
2813      at END_ELEM.  The result are written to MBCSET and SBCSET.
2814      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2815      mbcset->range_ends, is a pointer argument sinse we may
2816      update it.  */
2817 
2818   auto inline reg_errcode_t
2819   __attribute ((always_inline))
2820   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2821 	 re_charset_t *mbcset;
2822 	 int *range_alloc;
2823 	 bitset_t sbcset;
2824 	 bracket_elem_t *start_elem, *end_elem;
2825     {
2826       unsigned int ch;
2827       uint32_t start_collseq;
2828       uint32_t end_collseq;
2829 
2830       /* Equivalence Classes and Character Classes can't be a range
2831 	 start/end.  */
2832       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2833 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2834 	      0))
2835 	return REG_ERANGE;
2836 
2837       start_collseq = lookup_collation_sequence_value (start_elem);
2838       end_collseq = lookup_collation_sequence_value (end_elem);
2839       /* Check start/end collation sequence values.  */
2840       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2841 	return REG_ECOLLATE;
2842       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2843 	return REG_ERANGE;
2844 
2845       /* Got valid collation sequence values, add them as a new entry.
2846 	 However, if we have no collation elements, and the character set
2847 	 is single byte, the single byte character set that we
2848 	 build below suffices. */
2849       if (nrules > 0 || dfa->mb_cur_max > 1)
2850 	{
2851           /* Check the space of the arrays.  */
2852           if (BE (*range_alloc == mbcset->nranges, 0))
2853 	    {
2854 	      /* There is not enough space, need realloc.  */
2855 	      uint32_t *new_array_start;
2856 	      uint32_t *new_array_end;
2857 	      int new_nranges;
2858 
2859 	      /* +1 in case of mbcset->nranges is 0.  */
2860 	      new_nranges = 2 * mbcset->nranges + 1;
2861 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2862 					    new_nranges);
2863 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2864 				          new_nranges);
2865 
2866 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2867 	        return REG_ESPACE;
2868 
2869 	      mbcset->range_starts = new_array_start;
2870 	      mbcset->range_ends = new_array_end;
2871 	      *range_alloc = new_nranges;
2872 	    }
2873 
2874           mbcset->range_starts[mbcset->nranges] = start_collseq;
2875           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2876 	}
2877 
2878       /* Build the table for single byte characters.  */
2879       for (ch = 0; ch < SBC_MAX; ch++)
2880 	{
2881 	  uint32_t ch_collseq;
2882 	  /*
2883 	  if (MB_CUR_MAX == 1)
2884 	  */
2885 	  if (nrules == 0)
2886 	    ch_collseq = collseqmb[ch];
2887 	  else
2888 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2889 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2890 	    bitset_set (sbcset, ch);
2891 	}
2892       return REG_NOERROR;
2893     }
2894 
2895   /* Local function for parse_bracket_exp used in _LIBC environement.
2896      Build the collating element which is represented by NAME.
2897      The result are written to MBCSET and SBCSET.
2898      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2899      pointer argument sinse we may update it.  */
2900 
2901   auto inline reg_errcode_t
2902   __attribute ((always_inline))
2903   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2904 	 re_charset_t *mbcset;
2905 	 int *coll_sym_alloc;
2906 	 bitset_t sbcset;
2907 	 const unsigned char *name;
2908     {
2909       int32_t elem, idx;
2910       size_t name_len = strlen ((const char *) name);
2911       if (nrules != 0)
2912 	{
2913 	  elem = seek_collating_symbol_entry (name, name_len);
2914 	  if (symb_table[2 * elem] != 0)
2915 	    {
2916 	      /* We found the entry.  */
2917 	      idx = symb_table[2 * elem + 1];
2918 	      /* Skip the name of collating element name.  */
2919 	      idx += 1 + extra[idx];
2920 	    }
2921 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
2922 	    {
2923 	      /* No valid character, treat it as a normal
2924 		 character.  */
2925 	      bitset_set (sbcset, name[0]);
2926 	      return REG_NOERROR;
2927 	    }
2928 	  else
2929 	    return REG_ECOLLATE;
2930 
2931 	  /* Got valid collation sequence, add it as a new entry.  */
2932 	  /* Check the space of the arrays.  */
2933 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2934 	    {
2935 	      /* Not enough, realloc it.  */
2936 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
2937 	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2938 	      /* Use realloc since mbcset->coll_syms is NULL
2939 		 if *alloc == 0.  */
2940 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2941 						   new_coll_sym_alloc);
2942 	      if (BE (new_coll_syms == NULL, 0))
2943 		return REG_ESPACE;
2944 	      mbcset->coll_syms = new_coll_syms;
2945 	      *coll_sym_alloc = new_coll_sym_alloc;
2946 	    }
2947 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2948 	  return REG_NOERROR;
2949 	}
2950       else
2951 	{
2952 	  if (BE (name_len != 1, 0))
2953 	    return REG_ECOLLATE;
2954 	  else
2955 	    {
2956 	      bitset_set (sbcset, name[0]);
2957 	      return REG_NOERROR;
2958 	    }
2959 	}
2960     }
2961 #endif
2962 
2963   re_token_t br_token;
2964   re_bitset_ptr_t sbcset;
2965 #ifdef RE_ENABLE_I18N
2966   re_charset_t *mbcset;
2967   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2968   int equiv_class_alloc = 0, char_class_alloc = 0;
2969 #endif /* not RE_ENABLE_I18N */
2970   int non_match = 0;
2971   bin_tree_t *work_tree;
2972   int token_len;
2973   int first_round = 1;
2974 #ifdef _LIBC
2975   collseqmb = (const unsigned char *)
2976     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2977   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2978   if (nrules)
2979     {
2980       /*
2981       if (MB_CUR_MAX > 1)
2982       */
2983       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2984       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2985       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2986 						  _NL_COLLATE_SYMB_TABLEMB);
2987       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2988 						   _NL_COLLATE_SYMB_EXTRAMB);
2989     }
2990 #endif
2991   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
2992 #ifdef RE_ENABLE_I18N
2993   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2994 #endif /* RE_ENABLE_I18N */
2995 #ifdef RE_ENABLE_I18N
2996   if (BE (sbcset == NULL || mbcset == NULL, 0))
2997 #else
2998   if (BE (sbcset == NULL, 0))
2999 #endif /* RE_ENABLE_I18N */
3000     {
3001       *err = REG_ESPACE;
3002       return NULL;
3003     }
3004 
3005   token_len = peek_token_bracket (token, regexp, syntax);
3006   if (BE (token->type == END_OF_RE, 0))
3007     {
3008       *err = REG_BADPAT;
3009       goto parse_bracket_exp_free_return;
3010     }
3011   if (token->type == OP_NON_MATCH_LIST)
3012     {
3013 #ifdef RE_ENABLE_I18N
3014       mbcset->non_match = 1;
3015 #endif /* not RE_ENABLE_I18N */
3016       non_match = 1;
3017       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3018 	bitset_set (sbcset, '\0');
3019       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3020       token_len = peek_token_bracket (token, regexp, syntax);
3021       if (BE (token->type == END_OF_RE, 0))
3022 	{
3023 	  *err = REG_BADPAT;
3024 	  goto parse_bracket_exp_free_return;
3025 	}
3026     }
3027 
3028   /* We treat the first ']' as a normal character.  */
3029   if (token->type == OP_CLOSE_BRACKET)
3030     token->type = CHARACTER;
3031 
3032   while (1)
3033     {
3034       bracket_elem_t start_elem, end_elem;
3035       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3036       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3037       reg_errcode_t ret;
3038       int token_len2 = 0, is_range_exp = 0;
3039       re_token_t token2;
3040 
3041       start_elem.opr.name = start_name_buf;
3042       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3043 				   syntax, first_round);
3044       if (BE (ret != REG_NOERROR, 0))
3045 	{
3046 	  *err = ret;
3047 	  goto parse_bracket_exp_free_return;
3048 	}
3049       first_round = 0;
3050 
3051       /* Get information about the next token.  We need it in any case.  */
3052       token_len = peek_token_bracket (token, regexp, syntax);
3053 
3054       /* Do not check for ranges if we know they are not allowed.  */
3055       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3056 	{
3057 	  if (BE (token->type == END_OF_RE, 0))
3058 	    {
3059 	      *err = REG_EBRACK;
3060 	      goto parse_bracket_exp_free_return;
3061 	    }
3062 	  if (token->type == OP_CHARSET_RANGE)
3063 	    {
3064 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3065 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3066 	      if (BE (token2.type == END_OF_RE, 0))
3067 		{
3068 		  *err = REG_EBRACK;
3069 		  goto parse_bracket_exp_free_return;
3070 		}
3071 	      if (token2.type == OP_CLOSE_BRACKET)
3072 		{
3073 		  /* We treat the last '-' as a normal character.  */
3074 		  re_string_skip_bytes (regexp, -token_len);
3075 		  token->type = CHARACTER;
3076 		}
3077 	      else
3078 		is_range_exp = 1;
3079 	    }
3080 	}
3081 
3082       if (is_range_exp == 1)
3083 	{
3084 	  end_elem.opr.name = end_name_buf;
3085 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3086 				       dfa, syntax, 1);
3087 	  if (BE (ret != REG_NOERROR, 0))
3088 	    {
3089 	      *err = ret;
3090 	      goto parse_bracket_exp_free_return;
3091 	    }
3092 
3093 	  token_len = peek_token_bracket (token, regexp, syntax);
3094 
3095 #ifdef _LIBC
3096 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3097 				  &start_elem, &end_elem);
3098 #else
3099 # ifdef RE_ENABLE_I18N
3100 	  *err = build_range_exp (sbcset,
3101 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3102 				  &range_alloc, &start_elem, &end_elem);
3103 # else
3104 	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3105 # endif
3106 #endif /* RE_ENABLE_I18N */
3107 	  if (BE (*err != REG_NOERROR, 0))
3108 	    goto parse_bracket_exp_free_return;
3109 	}
3110       else
3111 	{
3112 	  switch (start_elem.type)
3113 	    {
3114 	    case SB_CHAR:
3115 	      bitset_set (sbcset, start_elem.opr.ch);
3116 	      break;
3117 #ifdef RE_ENABLE_I18N
3118 	    case MB_CHAR:
3119 	      /* Check whether the array has enough space.  */
3120 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3121 		{
3122 		  wchar_t *new_mbchars;
3123 		  /* Not enough, realloc it.  */
3124 		  /* +1 in case of mbcset->nmbchars is 0.  */
3125 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3126 		  /* Use realloc since array is NULL if *alloc == 0.  */
3127 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3128 					    mbchar_alloc);
3129 		  if (BE (new_mbchars == NULL, 0))
3130 		    goto parse_bracket_exp_espace;
3131 		  mbcset->mbchars = new_mbchars;
3132 		}
3133 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3134 	      break;
3135 #endif /* RE_ENABLE_I18N */
3136 	    case EQUIV_CLASS:
3137 	      *err = build_equiv_class (sbcset,
3138 #ifdef RE_ENABLE_I18N
3139 					mbcset, &equiv_class_alloc,
3140 #endif /* RE_ENABLE_I18N */
3141 					start_elem.opr.name);
3142 	      if (BE (*err != REG_NOERROR, 0))
3143 		goto parse_bracket_exp_free_return;
3144 	      break;
3145 	    case COLL_SYM:
3146 	      *err = build_collating_symbol (sbcset,
3147 #ifdef RE_ENABLE_I18N
3148 					     mbcset, &coll_sym_alloc,
3149 #endif /* RE_ENABLE_I18N */
3150 					     start_elem.opr.name);
3151 	      if (BE (*err != REG_NOERROR, 0))
3152 		goto parse_bracket_exp_free_return;
3153 	      break;
3154 	    case CHAR_CLASS:
3155 	      *err = build_charclass (regexp->trans, sbcset,
3156 #ifdef RE_ENABLE_I18N
3157 				      mbcset, &char_class_alloc,
3158 #endif /* RE_ENABLE_I18N */
3159 				      start_elem.opr.name, syntax);
3160 	      if (BE (*err != REG_NOERROR, 0))
3161 	       goto parse_bracket_exp_free_return;
3162 	      break;
3163 	    default:
3164 	      assert (0);
3165 	      break;
3166 	    }
3167 	}
3168       if (BE (token->type == END_OF_RE, 0))
3169 	{
3170 	  *err = REG_EBRACK;
3171 	  goto parse_bracket_exp_free_return;
3172 	}
3173       if (token->type == OP_CLOSE_BRACKET)
3174 	break;
3175     }
3176 
3177   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3178 
3179   /* If it is non-matching list.  */
3180   if (non_match)
3181     bitset_not (sbcset);
3182 
3183 #ifdef RE_ENABLE_I18N
3184   /* Ensure only single byte characters are set.  */
3185   if (dfa->mb_cur_max > 1)
3186     bitset_mask (sbcset, dfa->sb_char);
3187 
3188   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3189       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3190 						     || mbcset->non_match)))
3191     {
3192       bin_tree_t *mbc_tree;
3193       int sbc_idx;
3194       /* Build a tree for complex bracket.  */
3195       dfa->has_mb_node = 1;
3196       br_token.type = COMPLEX_BRACKET;
3197       br_token.opr.mbcset = mbcset;
3198       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3199       if (BE (mbc_tree == NULL, 0))
3200 	goto parse_bracket_exp_espace;
3201       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3202 	if (sbcset[sbc_idx])
3203 	  break;
3204       /* If there are no bits set in sbcset, there is no point
3205 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3206       if (sbc_idx < BITSET_WORDS)
3207 	{
3208           /* Build a tree for simple bracket.  */
3209           br_token.type = SIMPLE_BRACKET;
3210           br_token.opr.sbcset = sbcset;
3211           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3212           if (BE (work_tree == NULL, 0))
3213             goto parse_bracket_exp_espace;
3214 
3215           /* Then join them by ALT node.  */
3216           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3217           if (BE (work_tree == NULL, 0))
3218             goto parse_bracket_exp_espace;
3219 	}
3220       else
3221 	{
3222 	  re_free (sbcset);
3223 	  work_tree = mbc_tree;
3224 	}
3225     }
3226   else
3227 #endif /* not RE_ENABLE_I18N */
3228     {
3229 #ifdef RE_ENABLE_I18N
3230       free_charset (mbcset);
3231 #endif
3232       /* Build a tree for simple bracket.  */
3233       br_token.type = SIMPLE_BRACKET;
3234       br_token.opr.sbcset = sbcset;
3235       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3236       if (BE (work_tree == NULL, 0))
3237         goto parse_bracket_exp_espace;
3238     }
3239   return work_tree;
3240 
3241  parse_bracket_exp_espace:
3242   *err = REG_ESPACE;
3243  parse_bracket_exp_free_return:
3244   re_free (sbcset);
3245 #ifdef RE_ENABLE_I18N
3246   free_charset (mbcset);
3247 #endif /* RE_ENABLE_I18N */
3248   return NULL;
3249 }
3250 
3251 /* Parse an element in the bracket expression.  */
3252 
3253 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,int accept_hyphen)3254 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3255 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3256 		       reg_syntax_t syntax, int accept_hyphen)
3257 {
3258 #ifdef RE_ENABLE_I18N
3259   int cur_char_size;
3260   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3261   if (cur_char_size > 1)
3262     {
3263       elem->type = MB_CHAR;
3264       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3265       re_string_skip_bytes (regexp, cur_char_size);
3266       return REG_NOERROR;
3267     }
3268 #endif /* RE_ENABLE_I18N */
3269   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3270   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3271       || token->type == OP_OPEN_EQUIV_CLASS)
3272     return parse_bracket_symbol (elem, regexp, token);
3273   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3274     {
3275       /* A '-' must only appear as anything but a range indicator before
3276 	 the closing bracket.  Everything else is an error.  */
3277       re_token_t token2;
3278       (void) peek_token_bracket (&token2, regexp, syntax);
3279       if (token2.type != OP_CLOSE_BRACKET)
3280 	/* The actual error value is not standardized since this whole
3281 	   case is undefined.  But ERANGE makes good sense.  */
3282 	return REG_ERANGE;
3283     }
3284   elem->type = SB_CHAR;
3285   elem->opr.ch = token->opr.c;
3286   return REG_NOERROR;
3287 }
3288 
3289 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3290    such as [:<character_class>:], [.<collating_element>.], and
3291    [=<equivalent_class>=].  */
3292 
3293 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3294 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3295 		      re_token_t *token)
3296 {
3297   unsigned char ch, delim = token->opr.c;
3298   int i = 0;
3299   if (re_string_eoi(regexp))
3300     return REG_EBRACK;
3301   for (;; ++i)
3302     {
3303       if (i >= BRACKET_NAME_BUF_SIZE)
3304 	return REG_EBRACK;
3305       if (token->type == OP_OPEN_CHAR_CLASS)
3306 	ch = re_string_fetch_byte_case (regexp);
3307       else
3308 	ch = re_string_fetch_byte (regexp);
3309       if (re_string_eoi(regexp))
3310 	return REG_EBRACK;
3311       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3312 	break;
3313       elem->opr.name[i] = ch;
3314     }
3315   re_string_skip_bytes (regexp, 1);
3316   elem->opr.name[i] = '\0';
3317   switch (token->type)
3318     {
3319     case OP_OPEN_COLL_ELEM:
3320       elem->type = COLL_SYM;
3321       break;
3322     case OP_OPEN_EQUIV_CLASS:
3323       elem->type = EQUIV_CLASS;
3324       break;
3325     case OP_OPEN_CHAR_CLASS:
3326       elem->type = CHAR_CLASS;
3327       break;
3328     default:
3329       break;
3330     }
3331   return REG_NOERROR;
3332 }
3333 
3334   /* Helper function for parse_bracket_exp.
3335      Build the equivalence class which is represented by NAME.
3336      The result are written to MBCSET and SBCSET.
3337      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3338      is a pointer argument sinse we may update it.  */
3339 
3340 static reg_errcode_t
3341 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,int * equiv_class_alloc,const unsigned char * name)3342 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3343 		   int *equiv_class_alloc, const unsigned char *name)
3344 #else /* not RE_ENABLE_I18N */
3345 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3346 #endif /* not RE_ENABLE_I18N */
3347 {
3348 #ifdef _LIBC
3349   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3350   if (nrules != 0)
3351     {
3352       const int32_t *table, *indirect;
3353       const unsigned char *weights, *extra, *cp;
3354       unsigned char char_buf[2];
3355       int32_t idx1, idx2;
3356       unsigned int ch;
3357       size_t len;
3358       /* This #include defines a local function!  */
3359 # include <locale/weight.h>
3360       /* Calculate the index for equivalence class.  */
3361       cp = name;
3362       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3363       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3364 					       _NL_COLLATE_WEIGHTMB);
3365       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3366 						   _NL_COLLATE_EXTRAMB);
3367       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3368 						_NL_COLLATE_INDIRECTMB);
3369       idx1 = findidx (&cp);
3370       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3371 	/* This isn't a valid character.  */
3372 	return REG_ECOLLATE;
3373 
3374       /* Build single byte matcing table for this equivalence class.  */
3375       char_buf[1] = (unsigned char) '\0';
3376       len = weights[idx1];
3377       for (ch = 0; ch < SBC_MAX; ++ch)
3378 	{
3379 	  char_buf[0] = ch;
3380 	  cp = char_buf;
3381 	  idx2 = findidx (&cp);
3382 /*
3383 	  idx2 = table[ch];
3384 */
3385 	  if (idx2 == 0)
3386 	    /* This isn't a valid character.  */
3387 	    continue;
3388 	  if (len == weights[idx2])
3389 	    {
3390 	      int cnt = 0;
3391 	      while (cnt <= len &&
3392 		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3393 		++cnt;
3394 
3395 	      if (cnt > len)
3396 		bitset_set (sbcset, ch);
3397 	    }
3398 	}
3399       /* Check whether the array has enough space.  */
3400       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3401 	{
3402 	  /* Not enough, realloc it.  */
3403 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3404 	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3405 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3406 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3407 						   int32_t,
3408 						   new_equiv_class_alloc);
3409 	  if (BE (new_equiv_classes == NULL, 0))
3410 	    return REG_ESPACE;
3411 	  mbcset->equiv_classes = new_equiv_classes;
3412 	  *equiv_class_alloc = new_equiv_class_alloc;
3413 	}
3414       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3415     }
3416   else
3417 #endif /* _LIBC */
3418     {
3419       if (BE (strlen ((const char *) name) != 1, 0))
3420 	return REG_ECOLLATE;
3421       bitset_set (sbcset, *name);
3422     }
3423   return REG_NOERROR;
3424 }
3425 
3426   /* Helper function for parse_bracket_exp.
3427      Build the character class which is represented by NAME.
3428      The result are written to MBCSET and SBCSET.
3429      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3430      is a pointer argument sinse we may update it.  */
3431 
3432 static reg_errcode_t
3433 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,int * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3434 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3435 		 re_charset_t *mbcset, int *char_class_alloc,
3436 		 const unsigned char *class_name, reg_syntax_t syntax)
3437 #else /* not RE_ENABLE_I18N */
3438 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3439 		 const unsigned char *class_name, reg_syntax_t syntax)
3440 #endif /* not RE_ENABLE_I18N */
3441 {
3442   int i;
3443   const char *name = (const char *) class_name;
3444 
3445   /* In case of REG_ICASE "upper" and "lower" match the both of
3446      upper and lower cases.  */
3447   if ((syntax & RE_ICASE)
3448       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3449     name = "alpha";
3450 
3451 #ifdef RE_ENABLE_I18N
3452   /* Check the space of the arrays.  */
3453   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3454     {
3455       /* Not enough, realloc it.  */
3456       /* +1 in case of mbcset->nchar_classes is 0.  */
3457       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3458       /* Use realloc since array is NULL if *alloc == 0.  */
3459       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3460 					       new_char_class_alloc);
3461       if (BE (new_char_classes == NULL, 0))
3462 	return REG_ESPACE;
3463       mbcset->char_classes = new_char_classes;
3464       *char_class_alloc = new_char_class_alloc;
3465     }
3466   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3467 #endif /* RE_ENABLE_I18N */
3468 
3469 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3470   do {						\
3471     if (BE (trans != NULL, 0))			\
3472       {						\
3473 	for (i = 0; i < SBC_MAX; ++i)		\
3474   	  if (ctype_func (i))			\
3475 	    bitset_set (sbcset, trans[i]);	\
3476       }						\
3477     else					\
3478       {						\
3479 	for (i = 0; i < SBC_MAX; ++i)		\
3480   	  if (ctype_func (i))			\
3481 	    bitset_set (sbcset, i);		\
3482       }						\
3483   } while (0)
3484 
3485   if (strcmp (name, "alnum") == 0)
3486     BUILD_CHARCLASS_LOOP (isalnum);
3487   else if (strcmp (name, "cntrl") == 0)
3488     BUILD_CHARCLASS_LOOP (iscntrl);
3489   else if (strcmp (name, "lower") == 0)
3490     BUILD_CHARCLASS_LOOP (islower);
3491   else if (strcmp (name, "space") == 0)
3492     BUILD_CHARCLASS_LOOP (isspace);
3493   else if (strcmp (name, "alpha") == 0)
3494     BUILD_CHARCLASS_LOOP (isalpha);
3495   else if (strcmp (name, "digit") == 0)
3496     BUILD_CHARCLASS_LOOP (isdigit);
3497   else if (strcmp (name, "print") == 0)
3498     BUILD_CHARCLASS_LOOP (isprint);
3499   else if (strcmp (name, "upper") == 0)
3500     BUILD_CHARCLASS_LOOP (isupper);
3501   else if (strcmp (name, "blank") == 0)
3502     BUILD_CHARCLASS_LOOP (isblank);
3503   else if (strcmp (name, "graph") == 0)
3504     BUILD_CHARCLASS_LOOP (isgraph);
3505   else if (strcmp (name, "punct") == 0)
3506     BUILD_CHARCLASS_LOOP (ispunct);
3507   else if (strcmp (name, "xdigit") == 0)
3508     BUILD_CHARCLASS_LOOP (isxdigit);
3509   else
3510     return REG_ECTYPE;
3511 
3512   return REG_NOERROR;
3513 }
3514 
3515 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,int non_match,reg_errcode_t * err)3516 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3517 		    const unsigned char *class_name,
3518 		    const unsigned char *extra, int non_match,
3519 		    reg_errcode_t *err)
3520 {
3521   re_bitset_ptr_t sbcset;
3522 #ifdef RE_ENABLE_I18N
3523   re_charset_t *mbcset;
3524   int alloc = 0;
3525 #endif /* not RE_ENABLE_I18N */
3526   reg_errcode_t ret;
3527   re_token_t br_token;
3528   bin_tree_t *tree;
3529 
3530   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3531 #ifdef RE_ENABLE_I18N
3532   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3533 #endif /* RE_ENABLE_I18N */
3534 
3535 #ifdef RE_ENABLE_I18N
3536   if (BE (sbcset == NULL || mbcset == NULL, 0))
3537 #else /* not RE_ENABLE_I18N */
3538   if (BE (sbcset == NULL, 0))
3539 #endif /* not RE_ENABLE_I18N */
3540     {
3541       *err = REG_ESPACE;
3542       return NULL;
3543     }
3544 
3545   if (non_match)
3546     {
3547 #ifdef RE_ENABLE_I18N
3548       /*
3549       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3550 	bitset_set(cset->sbcset, '\0');
3551       */
3552       mbcset->non_match = 1;
3553 #endif /* not RE_ENABLE_I18N */
3554     }
3555 
3556   /* We don't care the syntax in this case.  */
3557   ret = build_charclass (trans, sbcset,
3558 #ifdef RE_ENABLE_I18N
3559 			 mbcset, &alloc,
3560 #endif /* RE_ENABLE_I18N */
3561 			 class_name, 0);
3562 
3563   if (BE (ret != REG_NOERROR, 0))
3564     {
3565       re_free (sbcset);
3566 #ifdef RE_ENABLE_I18N
3567       free_charset (mbcset);
3568 #endif /* RE_ENABLE_I18N */
3569       *err = ret;
3570       return NULL;
3571     }
3572   /* \w match '_' also.  */
3573   for (; *extra; extra++)
3574     bitset_set (sbcset, *extra);
3575 
3576   /* If it is non-matching list.  */
3577   if (non_match)
3578     bitset_not (sbcset);
3579 
3580 #ifdef RE_ENABLE_I18N
3581   /* Ensure only single byte characters are set.  */
3582   if (dfa->mb_cur_max > 1)
3583     bitset_mask (sbcset, dfa->sb_char);
3584 #endif
3585 
3586   /* Build a tree for simple bracket.  */
3587   br_token.type = SIMPLE_BRACKET;
3588   br_token.opr.sbcset = sbcset;
3589   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3590   if (BE (tree == NULL, 0))
3591     goto build_word_op_espace;
3592 
3593 #ifdef RE_ENABLE_I18N
3594   if (dfa->mb_cur_max > 1)
3595     {
3596       bin_tree_t *mbc_tree;
3597       /* Build a tree for complex bracket.  */
3598       br_token.type = COMPLEX_BRACKET;
3599       br_token.opr.mbcset = mbcset;
3600       dfa->has_mb_node = 1;
3601       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3602       if (BE (mbc_tree == NULL, 0))
3603 	goto build_word_op_espace;
3604       /* Then join them by ALT node.  */
3605       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3606       if (BE (mbc_tree != NULL, 1))
3607 	return tree;
3608     }
3609   else
3610     {
3611       free_charset (mbcset);
3612       return tree;
3613     }
3614 #else /* not RE_ENABLE_I18N */
3615   return tree;
3616 #endif /* not RE_ENABLE_I18N */
3617 
3618  build_word_op_espace:
3619   re_free (sbcset);
3620 #ifdef RE_ENABLE_I18N
3621   free_charset (mbcset);
3622 #endif /* RE_ENABLE_I18N */
3623   *err = REG_ESPACE;
3624   return NULL;
3625 }
3626 
3627 /* This is intended for the expressions like "a{1,3}".
3628    Fetch a number from `input', and return the number.
3629    Return -1, if the number field is empty like "{,1}".
3630    Return -2, If an error is occured.  */
3631 
3632 static int
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3633 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3634 {
3635   int num = -1;
3636   unsigned char c;
3637   while (1)
3638     {
3639       fetch_token (token, input, syntax);
3640       c = token->opr.c;
3641       if (BE (token->type == END_OF_RE, 0))
3642 	return -2;
3643       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3644 	break;
3645       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3646 	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3647       num = (num > RE_DUP_MAX) ? -2 : num;
3648     }
3649   return num;
3650 }
3651 
3652 #ifdef RE_ENABLE_I18N
3653 static void
free_charset(re_charset_t * cset)3654 free_charset (re_charset_t *cset)
3655 {
3656   re_free (cset->mbchars);
3657 # ifdef _LIBC
3658   re_free (cset->coll_syms);
3659   re_free (cset->equiv_classes);
3660   re_free (cset->range_starts);
3661   re_free (cset->range_ends);
3662 # endif
3663   re_free (cset->char_classes);
3664   re_free (cset);
3665 }
3666 #endif /* RE_ENABLE_I18N */
3667 
3668 /* Functions for binary tree operation.  */
3669 
3670 /* Create a tree node.  */
3671 
3672 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3673 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3674 	     re_token_type_t type)
3675 {
3676   re_token_t t;
3677   t.type = type;
3678   return create_token_tree (dfa, left, right, &t);
3679 }
3680 
3681 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3682 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3683 		   const re_token_t *token)
3684 {
3685   bin_tree_t *tree;
3686   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3687     {
3688       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3689 
3690       if (storage == NULL)
3691 	return NULL;
3692       storage->next = dfa->str_tree_storage;
3693       dfa->str_tree_storage = storage;
3694       dfa->str_tree_storage_idx = 0;
3695     }
3696   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3697 
3698   tree->parent = NULL;
3699   tree->left = left;
3700   tree->right = right;
3701   tree->token = *token;
3702   tree->token.duplicated = 0;
3703   tree->token.opt_subexp = 0;
3704   tree->first = NULL;
3705   tree->next = NULL;
3706   tree->node_idx = -1;
3707 
3708   if (left != NULL)
3709     left->parent = tree;
3710   if (right != NULL)
3711     right->parent = tree;
3712   return tree;
3713 }
3714 
3715 /* Mark the tree SRC as an optional subexpression.
3716    To be called from preorder or postorder.  */
3717 
3718 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3719 mark_opt_subexp (void *extra, bin_tree_t *node)
3720 {
3721   int idx = (int) (long) extra;
3722   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3723     node->token.opt_subexp = 1;
3724 
3725   return REG_NOERROR;
3726 }
3727 
3728 /* Free the allocated memory inside NODE. */
3729 
3730 static void
free_token(re_token_t * node)3731 free_token (re_token_t *node)
3732 {
3733 #ifdef RE_ENABLE_I18N
3734   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3735     free_charset (node->opr.mbcset);
3736   else
3737 #endif /* RE_ENABLE_I18N */
3738     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3739       re_free (node->opr.sbcset);
3740 }
3741 
3742 /* Worker function for tree walking.  Free the allocated memory inside NODE
3743    and its children. */
3744 
3745 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3746 free_tree (void *extra, bin_tree_t *node)
3747 {
3748   free_token (&node->token);
3749   return REG_NOERROR;
3750 }
3751 
3752 
3753 /* Duplicate the node SRC, and return new node.  This is a preorder
3754    visit similar to the one implemented by the generic visitor, but
3755    we need more infrastructure to maintain two parallel trees --- so,
3756    it's easier to duplicate.  */
3757 
3758 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3759 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3760 {
3761   const bin_tree_t *node;
3762   bin_tree_t *dup_root;
3763   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3764 
3765   for (node = root; ; )
3766     {
3767       /* Create a new tree and link it back to the current parent.  */
3768       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3769       if (*p_new == NULL)
3770 	return NULL;
3771       (*p_new)->parent = dup_node;
3772       (*p_new)->token.duplicated = 1;
3773       dup_node = *p_new;
3774 
3775       /* Go to the left node, or up and to the right.  */
3776       if (node->left)
3777 	{
3778 	  node = node->left;
3779 	  p_new = &dup_node->left;
3780 	}
3781       else
3782 	{
3783 	  const bin_tree_t *prev = NULL;
3784 	  while (node->right == prev || node->right == NULL)
3785 	    {
3786 	      prev = node;
3787 	      node = node->parent;
3788 	      dup_node = dup_node->parent;
3789 	      if (!node)
3790 	        return dup_root;
3791 	    }
3792 	  node = node->right;
3793 	  p_new = &dup_node->right;
3794 	}
3795     }
3796 }
3797