1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2012 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License along
17    with this program; if not, see <http://www.gnu.org/licenses/>.  */
18 
19 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
20 					  size_t length, reg_syntax_t syntax);
21 static void re_compile_fastmap_iter (regex_t *bufp,
22 				     const re_dfastate_t *init_state,
23 				     char *fastmap);
24 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
25 #ifdef RE_ENABLE_I18N
26 static void free_charset (re_charset_t *cset);
27 #endif /* RE_ENABLE_I18N */
28 static void free_workarea_compile (regex_t *preg);
29 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
30 #ifdef RE_ENABLE_I18N
31 static void optimize_utf8 (re_dfa_t *dfa);
32 #endif
33 static reg_errcode_t analyze (regex_t *preg);
34 static reg_errcode_t preorder (bin_tree_t *root,
35 			       reg_errcode_t (fn (void *, bin_tree_t *)),
36 			       void *extra);
37 static reg_errcode_t postorder (bin_tree_t *root,
38 				reg_errcode_t (fn (void *, bin_tree_t *)),
39 				void *extra);
40 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
41 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
42 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
43 				 bin_tree_t *node);
44 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
45 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
46 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
47 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
48 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
49 				   unsigned int constraint);
50 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
51 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
52 					 Idx node, bool root);
53 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
54 static Idx fetch_number (re_string_t *input, re_token_t *token,
55 			 reg_syntax_t syntax);
56 static int peek_token (re_token_t *token, re_string_t *input,
57 			reg_syntax_t syntax) internal_function;
58 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
59 			  reg_syntax_t syntax, reg_errcode_t *err);
60 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
61 				  re_token_t *token, reg_syntax_t syntax,
62 				  Idx nest, reg_errcode_t *err);
63 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
64 				 re_token_t *token, reg_syntax_t syntax,
65 				 Idx nest, reg_errcode_t *err);
66 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
67 				     re_token_t *token, reg_syntax_t syntax,
68 				     Idx nest, reg_errcode_t *err);
69 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
70 				  re_token_t *token, reg_syntax_t syntax,
71 				  Idx nest, reg_errcode_t *err);
72 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
73 				 re_dfa_t *dfa, re_token_t *token,
74 				 reg_syntax_t syntax, reg_errcode_t *err);
75 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
76 				      re_token_t *token, reg_syntax_t syntax,
77 				      reg_errcode_t *err);
78 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
79 					    re_string_t *regexp,
80 					    re_token_t *token, int token_len,
81 					    re_dfa_t *dfa,
82 					    reg_syntax_t syntax,
83 					    bool accept_hyphen);
84 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
85 					  re_string_t *regexp,
86 					  re_token_t *token);
87 #ifdef RE_ENABLE_I18N
88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
89 					re_charset_t *mbcset,
90 					Idx *equiv_class_alloc,
91 					const unsigned char *name);
92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
93 				      bitset_t sbcset,
94 				      re_charset_t *mbcset,
95 				      Idx *char_class_alloc,
96 				      const unsigned char *class_name,
97 				      reg_syntax_t syntax);
98 #else  /* not RE_ENABLE_I18N */
99 static reg_errcode_t build_equiv_class (bitset_t sbcset,
100 					const unsigned char *name);
101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
102 				      bitset_t sbcset,
103 				      const unsigned char *class_name,
104 				      reg_syntax_t syntax);
105 #endif /* not RE_ENABLE_I18N */
106 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
107 				       RE_TRANSLATE_TYPE trans,
108 				       const unsigned char *class_name,
109 				       const unsigned char *extra,
110 				       bool non_match, reg_errcode_t *err);
111 static bin_tree_t *create_tree (re_dfa_t *dfa,
112 				bin_tree_t *left, bin_tree_t *right,
113 				re_token_type_t type);
114 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
115 				      bin_tree_t *left, bin_tree_t *right,
116 				      const re_token_t *token);
117 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
118 static void free_token (re_token_t *node);
119 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
120 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
121 
122 /* This table gives an error message for each of the error codes listed
123    in regex.h.  Obviously the order here has to be same as there.
124    POSIX doesn't require that we do anything for REG_NOERROR,
125    but why not be nice?  */
126 
127 static const char __re_error_msgid[] =
128   {
129 #define REG_NOERROR_IDX	0
130     gettext_noop ("Success")	/* REG_NOERROR */
131     "\0"
132 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
133     gettext_noop ("No match")	/* REG_NOMATCH */
134     "\0"
135 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
136     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
137     "\0"
138 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
139     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
140     "\0"
141 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
142     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
143     "\0"
144 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
145     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
146     "\0"
147 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
148     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
149     "\0"
150 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
151     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
152     "\0"
153 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
154     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
155     "\0"
156 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
157     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
158     "\0"
159 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
160     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
161     "\0"
162 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
163     gettext_noop ("Invalid range end")	/* REG_ERANGE */
164     "\0"
165 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
166     gettext_noop ("Memory exhausted") /* REG_ESPACE */
167     "\0"
168 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
169     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
170     "\0"
171 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
172     gettext_noop ("Premature end of regular expression") /* REG_EEND */
173     "\0"
174 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
175     gettext_noop ("Regular expression too big") /* REG_ESIZE */
176     "\0"
177 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
178     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
179   };
180 
181 static const size_t __re_error_msgid_idx[] =
182   {
183     REG_NOERROR_IDX,
184     REG_NOMATCH_IDX,
185     REG_BADPAT_IDX,
186     REG_ECOLLATE_IDX,
187     REG_ECTYPE_IDX,
188     REG_EESCAPE_IDX,
189     REG_ESUBREG_IDX,
190     REG_EBRACK_IDX,
191     REG_EPAREN_IDX,
192     REG_EBRACE_IDX,
193     REG_BADBR_IDX,
194     REG_ERANGE_IDX,
195     REG_ESPACE_IDX,
196     REG_BADRPT_IDX,
197     REG_EEND_IDX,
198     REG_ESIZE_IDX,
199     REG_ERPAREN_IDX
200   };
201 
202 /* Entry points for GNU code.  */
203 
204 /* re_compile_pattern is the GNU regular expression compiler: it
205    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
206    Returns 0 if the pattern was valid, otherwise an error string.
207 
208    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
209    are set in BUFP on entry.  */
210 
211 #ifdef _LIBC
212 const char *
re_compile_pattern(pattern,length,bufp)213 re_compile_pattern (pattern, length, bufp)
214     const char *pattern;
215     size_t length;
216     struct re_pattern_buffer *bufp;
217 #else /* size_t might promote */
218 const char *
219 re_compile_pattern (const char *pattern, size_t length,
220 		    struct re_pattern_buffer *bufp)
221 #endif
222 {
223   reg_errcode_t ret;
224 
225   /* And GNU code determines whether or not to get register information
226      by passing null for the REGS argument to re_match, etc., not by
227      setting no_sub, unless RE_NO_SUB is set.  */
228   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
229 
230   /* Match anchors at newline.  */
231   bufp->newline_anchor = 1;
232 
233   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
234 
235   if (!ret)
236     return NULL;
237   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
238 }
239 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)240 weak_alias (__re_compile_pattern, re_compile_pattern)
241 #endif
242 
243 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
244    also be assigned to arbitrarily: each pattern buffer stores its own
245    syntax, so it can be changed between regex compilations.  */
246 /* This has no initializer because initialized variables in Emacs
247    become read-only after dumping.  */
248 reg_syntax_t re_syntax_options;
249 
250 
251 /* Specify the precise syntax of regexps for compilation.  This provides
252    for compatibility for various utilities which historically have
253    different, incompatible syntaxes.
254 
255    The argument SYNTAX is a bit mask comprised of the various bits
256    defined in regex.h.  We return the old syntax.  */
257 
258 reg_syntax_t
259 re_set_syntax (syntax)
260     reg_syntax_t syntax;
261 {
262   reg_syntax_t ret = re_syntax_options;
263 
264   re_syntax_options = syntax;
265   return ret;
266 }
267 #ifdef _LIBC
268 weak_alias (__re_set_syntax, re_set_syntax)
269 #endif
270 
271 int
272 re_compile_fastmap (bufp)
273     struct re_pattern_buffer *bufp;
274 {
275   re_dfa_t *dfa = bufp->buffer;
276   char *fastmap = bufp->fastmap;
277 
278   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
279   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
280   if (dfa->init_state != dfa->init_state_word)
281     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
282   if (dfa->init_state != dfa->init_state_nl)
283     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
284   if (dfa->init_state != dfa->init_state_begbuf)
285     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
286   bufp->fastmap_accurate = 1;
287   return 0;
288 }
289 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)290 weak_alias (__re_compile_fastmap, re_compile_fastmap)
291 #endif
292 
293 static inline void
294 __attribute ((always_inline))
295 re_set_fastmap (char *fastmap, bool icase, int ch)
296 {
297   fastmap[ch] = 1;
298   if (icase)
299     fastmap[tolower (ch)] = 1;
300 }
301 
302 /* Helper function for re_compile_fastmap.
303    Compile fastmap for the initial_state INIT_STATE.  */
304 
305 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)306 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
307 			 char *fastmap)
308 {
309   re_dfa_t *dfa = bufp->buffer;
310   Idx node_cnt;
311   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
312   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
313     {
314       Idx node = init_state->nodes.elems[node_cnt];
315       re_token_type_t type = dfa->nodes[node].type;
316 
317       if (type == CHARACTER)
318 	{
319 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
320 #ifdef RE_ENABLE_I18N
321 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
322 	    {
323 	      unsigned char buf[MB_LEN_MAX];
324 	      unsigned char *p;
325 	      wchar_t wc;
326 	      mbstate_t state;
327 
328 	      p = buf;
329 	      *p++ = dfa->nodes[node].opr.c;
330 	      while (++node < dfa->nodes_len
331 		     &&	dfa->nodes[node].type == CHARACTER
332 		     && dfa->nodes[node].mb_partial)
333 		*p++ = dfa->nodes[node].opr.c;
334 	      memset (&state, '\0', sizeof (state));
335 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
336 			     &state) == p - buf
337 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
338 		      != (size_t) -1))
339 		re_set_fastmap (fastmap, false, buf[0]);
340 	    }
341 #endif
342 	}
343       else if (type == SIMPLE_BRACKET)
344 	{
345 	  int i, ch;
346 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
347 	    {
348 	      int j;
349 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
350 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
351 		if (w & ((bitset_word_t) 1 << j))
352 		  re_set_fastmap (fastmap, icase, ch);
353 	    }
354 	}
355 #ifdef RE_ENABLE_I18N
356       else if (type == COMPLEX_BRACKET)
357 	{
358 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
359 	  Idx i;
360 
361 # ifdef _LIBC
362 	  /* See if we have to try all bytes which start multiple collation
363 	     elements.
364 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
365 		  collation element, and don't catch 'b' since 'b' is
366 		  the only collation element which starts from 'b' (and
367 		  it is caught by SIMPLE_BRACKET).  */
368 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
369 		  && (cset->ncoll_syms || cset->nranges))
370 		{
371 		  const int32_t *table = (const int32_t *)
372 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
373 		  for (i = 0; i < SBC_MAX; ++i)
374 		    if (table[i] < 0)
375 		      re_set_fastmap (fastmap, icase, i);
376 		}
377 # endif /* _LIBC */
378 
379 	  /* See if we have to start the match at all multibyte characters,
380 	     i.e. where we would not find an invalid sequence.  This only
381 	     applies to multibyte character sets; for single byte character
382 	     sets, the SIMPLE_BRACKET again suffices.  */
383 	  if (dfa->mb_cur_max > 1
384 	      && (cset->nchar_classes || cset->non_match || cset->nranges
385 # ifdef _LIBC
386 		  || cset->nequiv_classes
387 # endif /* _LIBC */
388 		 ))
389 	    {
390 	      unsigned char c = 0;
391 	      do
392 		{
393 		  mbstate_t mbs;
394 		  memset (&mbs, 0, sizeof (mbs));
395 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
396 		    re_set_fastmap (fastmap, false, (int) c);
397 		}
398 	      while (++c != 0);
399 	    }
400 
401 	  else
402 	    {
403 	      /* ... Else catch all bytes which can start the mbchars.  */
404 	      for (i = 0; i < cset->nmbchars; ++i)
405 		{
406 		  char buf[256];
407 		  mbstate_t state;
408 		  memset (&state, '\0', sizeof (state));
409 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
410 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
411 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
412 		    {
413 		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
414 			  != (size_t) -1)
415 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
416 		    }
417 		}
418 	    }
419 	}
420 #endif /* RE_ENABLE_I18N */
421       else if (type == OP_PERIOD
422 #ifdef RE_ENABLE_I18N
423 	       || type == OP_UTF8_PERIOD
424 #endif /* RE_ENABLE_I18N */
425 	       || type == END_OF_RE)
426 	{
427 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
428 	  if (type == END_OF_RE)
429 	    bufp->can_be_null = 1;
430 	  return;
431 	}
432     }
433 }
434 
435 /* Entry point for POSIX code.  */
436 /* regcomp takes a regular expression as a string and compiles it.
437 
438    PREG is a regex_t *.  We do not expect any fields to be initialized,
439    since POSIX says we shouldn't.  Thus, we set
440 
441      'buffer' to the compiled pattern;
442      'used' to the length of the compiled pattern;
443      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
444        REG_EXTENDED bit in CFLAGS is set; otherwise, to
445        RE_SYNTAX_POSIX_BASIC;
446      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
447      'fastmap' to an allocated space for the fastmap;
448      'fastmap_accurate' to zero;
449      're_nsub' to the number of subexpressions in PATTERN.
450 
451    PATTERN is the address of the pattern string.
452 
453    CFLAGS is a series of bits which affect compilation.
454 
455      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
456      use POSIX basic syntax.
457 
458      If REG_NEWLINE is set, then . and [^...] don't match newline.
459      Also, regexec will try a match beginning after every newline.
460 
461      If REG_ICASE is set, then we considers upper- and lowercase
462      versions of letters to be equivalent when matching.
463 
464      If REG_NOSUB is set, then when PREG is passed to regexec, that
465      routine will report only success or failure, and nothing about the
466      registers.
467 
468    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
469    the return codes and their meanings.)  */
470 
471 int
regcomp(preg,pattern,cflags)472 regcomp (preg, pattern, cflags)
473     regex_t *_Restrict_ preg;
474     const char *_Restrict_ pattern;
475     int cflags;
476 {
477   reg_errcode_t ret;
478   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
479 			 : RE_SYNTAX_POSIX_BASIC);
480 
481   preg->buffer = NULL;
482   preg->allocated = 0;
483   preg->used = 0;
484 
485   /* Try to allocate space for the fastmap.  */
486   preg->fastmap = re_malloc (char, SBC_MAX);
487   if (BE (preg->fastmap == NULL, 0))
488     return REG_ESPACE;
489 
490   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
491 
492   /* If REG_NEWLINE is set, newlines are treated differently.  */
493   if (cflags & REG_NEWLINE)
494     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
495       syntax &= ~RE_DOT_NEWLINE;
496       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
497       /* It also changes the matching behavior.  */
498       preg->newline_anchor = 1;
499     }
500   else
501     preg->newline_anchor = 0;
502   preg->no_sub = !!(cflags & REG_NOSUB);
503   preg->translate = NULL;
504 
505   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
506 
507   /* POSIX doesn't distinguish between an unmatched open-group and an
508      unmatched close-group: both are REG_EPAREN.  */
509   if (ret == REG_ERPAREN)
510     ret = REG_EPAREN;
511 
512   /* We have already checked preg->fastmap != NULL.  */
513   if (BE (ret == REG_NOERROR, 1))
514     /* Compute the fastmap now, since regexec cannot modify the pattern
515        buffer.  This function never fails in this implementation.  */
516     (void) re_compile_fastmap (preg);
517   else
518     {
519       /* Some error occurred while compiling the expression.  */
520       re_free (preg->fastmap);
521       preg->fastmap = NULL;
522     }
523 
524   return (int) ret;
525 }
526 #ifdef _LIBC
527 weak_alias (__regcomp, regcomp)
528 #endif
529 
530 /* Returns a message corresponding to an error code, ERRCODE, returned
531    from either regcomp or regexec.   We don't use PREG here.  */
532 
533 #ifdef _LIBC
534 size_t
535 regerror (errcode, preg, errbuf, errbuf_size)
536     int errcode;
537     const regex_t *_Restrict_ preg;
538     char *_Restrict_ errbuf;
539     size_t errbuf_size;
540 #else /* size_t might promote */
541 size_t
542 regerror (int errcode, const regex_t *_Restrict_ preg,
543 	  char *_Restrict_ errbuf, size_t errbuf_size)
544 #endif
545 {
546   const char *msg;
547   size_t msg_size;
548 
549   if (BE (errcode < 0
550 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
551 			       / sizeof (__re_error_msgid_idx[0])), 0))
552     /* Only error codes returned by the rest of the code should be passed
553        to this routine.  If we are given anything else, or if other regex
554        code generates an invalid error code, then the program has a bug.
555        Dump core so we can fix it.  */
556     abort ();
557 
558   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
559 
560   msg_size = strlen (msg) + 1; /* Includes the null.  */
561 
562   if (BE (errbuf_size != 0, 1))
563     {
564       size_t cpy_size = msg_size;
565       if (BE (msg_size > errbuf_size, 0))
566 	{
567 	  cpy_size = errbuf_size - 1;
568 	  errbuf[cpy_size] = '\0';
569 	}
570       memcpy (errbuf, msg, cpy_size);
571     }
572 
573   return msg_size;
574 }
575 #ifdef _LIBC
576 weak_alias (__regerror, regerror)
577 #endif
578 
579 
580 #ifdef RE_ENABLE_I18N
581 /* This static array is used for the map to single-byte characters when
582    UTF-8 is used.  Otherwise we would allocate memory just to initialize
583    it the same all the time.  UTF-8 is the preferred encoding so this is
584    a worthwhile optimization.  */
585 static const bitset_t utf8_sb_map =
586 {
587   /* Set the first 128 bits.  */
588 # ifdef __GNUC__
589   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
590 # else
591 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
592 #   error "bitset_word_t is narrower than 32 bits"
593 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
594   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
595 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
596   BITSET_WORD_MAX, BITSET_WORD_MAX,
597 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
598   BITSET_WORD_MAX,
599 #  endif
600   (BITSET_WORD_MAX
601    >> (SBC_MAX % BITSET_WORD_BITS == 0
602        ? 0
603        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
604 # endif
605 };
606 #endif
607 
608 
609 static void
free_dfa_content(re_dfa_t * dfa)610 free_dfa_content (re_dfa_t *dfa)
611 {
612   Idx i, j;
613 
614   if (dfa->nodes)
615     for (i = 0; i < dfa->nodes_len; ++i)
616       free_token (dfa->nodes + i);
617   re_free (dfa->nexts);
618   for (i = 0; i < dfa->nodes_len; ++i)
619     {
620       if (dfa->eclosures != NULL)
621 	re_node_set_free (dfa->eclosures + i);
622       if (dfa->inveclosures != NULL)
623 	re_node_set_free (dfa->inveclosures + i);
624       if (dfa->edests != NULL)
625 	re_node_set_free (dfa->edests + i);
626     }
627   re_free (dfa->edests);
628   re_free (dfa->eclosures);
629   re_free (dfa->inveclosures);
630   re_free (dfa->nodes);
631 
632   if (dfa->state_table)
633     for (i = 0; i <= dfa->state_hash_mask; ++i)
634       {
635 	struct re_state_table_entry *entry = dfa->state_table + i;
636 	for (j = 0; j < entry->num; ++j)
637 	  {
638 	    re_dfastate_t *state = entry->array[j];
639 	    free_state (state);
640 	  }
641 	re_free (entry->array);
642       }
643   re_free (dfa->state_table);
644 #ifdef RE_ENABLE_I18N
645   if (dfa->sb_char != utf8_sb_map)
646     re_free (dfa->sb_char);
647 #endif
648   re_free (dfa->subexp_map);
649 #ifdef DEBUG
650   re_free (dfa->re_str);
651 #endif
652 
653   re_free (dfa);
654 }
655 
656 
657 /* Free dynamically allocated space used by PREG.  */
658 
659 void
regfree(preg)660 regfree (preg)
661     regex_t *preg;
662 {
663   re_dfa_t *dfa = preg->buffer;
664   if (BE (dfa != NULL, 1))
665     free_dfa_content (dfa);
666   preg->buffer = NULL;
667   preg->allocated = 0;
668 
669   re_free (preg->fastmap);
670   preg->fastmap = NULL;
671 
672   re_free (preg->translate);
673   preg->translate = NULL;
674 }
675 #ifdef _LIBC
676 weak_alias (__regfree, regfree)
677 #endif
678 
679 /* Entry points compatible with 4.2 BSD regex library.  We don't define
680    them unless specifically requested.  */
681 
682 #if defined _REGEX_RE_COMP || defined _LIBC
683 
684 /* BSD has one and only one pattern buffer.  */
685 static struct re_pattern_buffer re_comp_buf;
686 
687 char *
688 # ifdef _LIBC
689 /* Make these definitions weak in libc, so POSIX programs can redefine
690    these names if they don't use our functions, and still use
691    regcomp/regexec above without link errors.  */
692 weak_function
693 # endif
re_comp(s)694 re_comp (s)
695      const char *s;
696 {
697   reg_errcode_t ret;
698   char *fastmap;
699 
700   if (!s)
701     {
702       if (!re_comp_buf.buffer)
703 	return gettext ("No previous regular expression");
704       return 0;
705     }
706 
707   if (re_comp_buf.buffer)
708     {
709       fastmap = re_comp_buf.fastmap;
710       re_comp_buf.fastmap = NULL;
711       __regfree (&re_comp_buf);
712       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
713       re_comp_buf.fastmap = fastmap;
714     }
715 
716   if (re_comp_buf.fastmap == NULL)
717     {
718       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
719       if (re_comp_buf.fastmap == NULL)
720 	return (char *) gettext (__re_error_msgid
721 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
722     }
723 
724   /* Since 're_exec' always passes NULL for the 'regs' argument, we
725      don't need to initialize the pattern buffer fields which affect it.  */
726 
727   /* Match anchors at newlines.  */
728   re_comp_buf.newline_anchor = 1;
729 
730   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
731 
732   if (!ret)
733     return NULL;
734 
735   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
736   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
737 }
738 
739 #ifdef _LIBC
libc_freeres_fn(free_mem)740 libc_freeres_fn (free_mem)
741 {
742   __regfree (&re_comp_buf);
743 }
744 #endif
745 
746 #endif /* _REGEX_RE_COMP */
747 
748 /* Internal entry point.
749    Compile the regular expression PATTERN, whose length is LENGTH.
750    SYNTAX indicate regular expression's syntax.  */
751 
752 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
754 		     reg_syntax_t syntax)
755 {
756   reg_errcode_t err = REG_NOERROR;
757   re_dfa_t *dfa;
758   re_string_t regexp;
759 
760   /* Initialize the pattern buffer.  */
761   preg->fastmap_accurate = 0;
762   preg->syntax = syntax;
763   preg->not_bol = preg->not_eol = 0;
764   preg->used = 0;
765   preg->re_nsub = 0;
766   preg->can_be_null = 0;
767   preg->regs_allocated = REGS_UNALLOCATED;
768 
769   /* Initialize the dfa.  */
770   dfa = preg->buffer;
771   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
772     {
773       /* If zero allocated, but buffer is non-null, try to realloc
774 	 enough space.  This loses if buffer's address is bogus, but
775 	 that is the user's responsibility.  If ->buffer is NULL this
776 	 is a simple allocation.  */
777       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
778       if (dfa == NULL)
779 	return REG_ESPACE;
780       preg->allocated = sizeof (re_dfa_t);
781       preg->buffer = dfa;
782     }
783   preg->used = sizeof (re_dfa_t);
784 
785   err = init_dfa (dfa, length);
786   if (BE (err != REG_NOERROR, 0))
787     {
788       free_dfa_content (dfa);
789       preg->buffer = NULL;
790       preg->allocated = 0;
791       return err;
792     }
793 #ifdef DEBUG
794   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
795   dfa->re_str = re_malloc (char, length + 1);
796   strncpy (dfa->re_str, pattern, length + 1);
797 #endif
798 
799   __libc_lock_init (dfa->lock);
800 
801   err = re_string_construct (&regexp, pattern, length, preg->translate,
802 			     (syntax & RE_ICASE) != 0, dfa);
803   if (BE (err != REG_NOERROR, 0))
804     {
805     re_compile_internal_free_return:
806       free_workarea_compile (preg);
807       re_string_destruct (&regexp);
808       free_dfa_content (dfa);
809       preg->buffer = NULL;
810       preg->allocated = 0;
811       return err;
812     }
813 
814   /* Parse the regular expression, and build a structure tree.  */
815   preg->re_nsub = 0;
816   dfa->str_tree = parse (&regexp, preg, syntax, &err);
817   if (BE (dfa->str_tree == NULL, 0))
818     goto re_compile_internal_free_return;
819 
820   /* Analyze the tree and create the nfa.  */
821   err = analyze (preg);
822   if (BE (err != REG_NOERROR, 0))
823     goto re_compile_internal_free_return;
824 
825 #ifdef RE_ENABLE_I18N
826   /* If possible, do searching in single byte encoding to speed things up.  */
827   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
828     optimize_utf8 (dfa);
829 #endif
830 
831   /* Then create the initial state of the dfa.  */
832   err = create_initial_state (dfa);
833 
834   /* Release work areas.  */
835   free_workarea_compile (preg);
836   re_string_destruct (&regexp);
837 
838   if (BE (err != REG_NOERROR, 0))
839     {
840       free_dfa_content (dfa);
841       preg->buffer = NULL;
842       preg->allocated = 0;
843     }
844 
845   return err;
846 }
847 
848 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
849    as the initial length of some arrays.  */
850 
851 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)852 init_dfa (re_dfa_t *dfa, size_t pat_len)
853 {
854   __re_size_t table_size;
855 #ifndef _LIBC
856   const char *codeset_name;
857 #endif
858 #ifdef RE_ENABLE_I18N
859   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
860 #else
861   size_t max_i18n_object_size = 0;
862 #endif
863   size_t max_object_size =
864     MAX (sizeof (struct re_state_table_entry),
865 	 MAX (sizeof (re_token_t),
866 	      MAX (sizeof (re_node_set),
867 		   MAX (sizeof (regmatch_t),
868 			max_i18n_object_size))));
869 
870   memset (dfa, '\0', sizeof (re_dfa_t));
871 
872   /* Force allocation of str_tree_storage the first time.  */
873   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
874 
875   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
876      calculation below, and for similar doubling calculations
877      elsewhere.  And it's <= rather than <, because some of the
878      doubling calculations add 1 afterwards.  */
879   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
880     return REG_ESPACE;
881 
882   dfa->nodes_alloc = pat_len + 1;
883   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
884 
885   /*  table_size = 2 ^ ceil(log pat_len) */
886   for (table_size = 1; ; table_size <<= 1)
887     if (table_size > pat_len)
888       break;
889 
890   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
891   dfa->state_hash_mask = table_size - 1;
892 
893   dfa->mb_cur_max = MB_CUR_MAX;
894 #ifdef _LIBC
895   if (dfa->mb_cur_max == 6
896       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
897     dfa->is_utf8 = 1;
898   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
899 		       != 0);
900 #else
901   codeset_name = nl_langinfo (CODESET);
902   if (strcasecmp (codeset_name, "UTF-8") == 0
903       || strcasecmp (codeset_name, "UTF8") == 0)
904     dfa->is_utf8 = 1;
905 
906   /* We check exhaustively in the loop below if this charset is a
907      superset of ASCII.  */
908   dfa->map_notascii = 0;
909 #endif
910 
911 #ifdef RE_ENABLE_I18N
912   if (dfa->mb_cur_max > 1)
913     {
914       if (dfa->is_utf8)
915 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
916       else
917 	{
918 	  int i, j, ch;
919 
920 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
921 	  if (BE (dfa->sb_char == NULL, 0))
922 	    return REG_ESPACE;
923 
924 	  /* Set the bits corresponding to single byte chars.  */
925 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
926 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
927 	      {
928 		wint_t wch = __btowc (ch);
929 		if (wch != WEOF)
930 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
931 # ifndef _LIBC
932 		if (isascii (ch) && wch != ch)
933 		  dfa->map_notascii = 1;
934 # endif
935 	      }
936 	}
937     }
938 #endif
939 
940   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
941     return REG_ESPACE;
942   return REG_NOERROR;
943 }
944 
945 /* Initialize WORD_CHAR table, which indicate which character is
946    "word".  In this case "word" means that it is the word construction
947    character used by some operators like "\<", "\>", etc.  */
948 
949 static void
950 internal_function
init_word_char(re_dfa_t * dfa)951 init_word_char (re_dfa_t *dfa)
952 {
953   dfa->word_ops_used = 1;
954   int i = 0;
955   int j;
956   int ch = 0;
957   if (BE (dfa->map_notascii == 0, 1))
958     {
959       if (BITSET_WORD_BITS == 64)
960 	{
961 	  dfa->word_char[0] = UINT64_C (0x03ff000000000000);
962 	  dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
963 	  i = 2;
964 	}
965       else if (BITSET_WORD_BITS == 32)
966 	{
967 	  dfa->word_char[0] = UINT32_C (0x00000000);
968 	  dfa->word_char[1] = UINT32_C (0x03ff0000);
969 	  dfa->word_char[2] = UINT32_C (0x87fffffe);
970 	  dfa->word_char[3] = UINT32_C (0x07fffffe);
971 	  i = 4;
972 	}
973       else
974         goto general_case;
975       ch = 128;
976 
977       if (BE (dfa->is_utf8, 1))
978 	{
979 	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
980 	  return;
981 	}
982     }
983 
984  general_case:
985   for (; i < BITSET_WORDS; ++i)
986     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
987       if (isalnum (ch) || ch == '_')
988 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
989 }
990 
991 /* Free the work area which are only used while compiling.  */
992 
993 static void
free_workarea_compile(regex_t * preg)994 free_workarea_compile (regex_t *preg)
995 {
996   re_dfa_t *dfa = preg->buffer;
997   bin_tree_storage_t *storage, *next;
998   for (storage = dfa->str_tree_storage; storage; storage = next)
999     {
1000       next = storage->next;
1001       re_free (storage);
1002     }
1003   dfa->str_tree_storage = NULL;
1004   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1005   dfa->str_tree = NULL;
1006   re_free (dfa->org_indices);
1007   dfa->org_indices = NULL;
1008 }
1009 
1010 /* Create initial states for all contexts.  */
1011 
1012 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)1013 create_initial_state (re_dfa_t *dfa)
1014 {
1015   Idx first, i;
1016   reg_errcode_t err;
1017   re_node_set init_nodes;
1018 
1019   /* Initial states have the epsilon closure of the node which is
1020      the first node of the regular expression.  */
1021   first = dfa->str_tree->first->node_idx;
1022   dfa->init_node = first;
1023   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1024   if (BE (err != REG_NOERROR, 0))
1025     return err;
1026 
1027   /* The back-references which are in initial states can epsilon transit,
1028      since in this case all of the subexpressions can be null.
1029      Then we add epsilon closures of the nodes which are the next nodes of
1030      the back-references.  */
1031   if (dfa->nbackref > 0)
1032     for (i = 0; i < init_nodes.nelem; ++i)
1033       {
1034 	Idx node_idx = init_nodes.elems[i];
1035 	re_token_type_t type = dfa->nodes[node_idx].type;
1036 
1037 	Idx clexp_idx;
1038 	if (type != OP_BACK_REF)
1039 	  continue;
1040 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1041 	  {
1042 	    re_token_t *clexp_node;
1043 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1044 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1045 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1046 	      break;
1047 	  }
1048 	if (clexp_idx == init_nodes.nelem)
1049 	  continue;
1050 
1051 	if (type == OP_BACK_REF)
1052 	  {
1053 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1054 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1055 	      {
1056 		reg_errcode_t merge_err
1057                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1058 		if (merge_err != REG_NOERROR)
1059 		  return merge_err;
1060 		i = 0;
1061 	      }
1062 	  }
1063       }
1064 
1065   /* It must be the first time to invoke acquire_state.  */
1066   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1067   /* We don't check ERR here, since the initial state must not be NULL.  */
1068   if (BE (dfa->init_state == NULL, 0))
1069     return err;
1070   if (dfa->init_state->has_constraint)
1071     {
1072       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1073 						       CONTEXT_WORD);
1074       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1075 						     CONTEXT_NEWLINE);
1076       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1077 							 &init_nodes,
1078 							 CONTEXT_NEWLINE
1079 							 | CONTEXT_BEGBUF);
1080       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1081 	      || dfa->init_state_begbuf == NULL, 0))
1082 	return err;
1083     }
1084   else
1085     dfa->init_state_word = dfa->init_state_nl
1086       = dfa->init_state_begbuf = dfa->init_state;
1087 
1088   re_node_set_free (&init_nodes);
1089   return REG_NOERROR;
1090 }
1091 
1092 #ifdef RE_ENABLE_I18N
1093 /* If it is possible to do searching in single byte encoding instead of UTF-8
1094    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1095    DFA nodes where needed.  */
1096 
1097 static void
optimize_utf8(re_dfa_t * dfa)1098 optimize_utf8 (re_dfa_t *dfa)
1099 {
1100   Idx node;
1101   int i;
1102   bool mb_chars = false;
1103   bool has_period = false;
1104 
1105   for (node = 0; node < dfa->nodes_len; ++node)
1106     switch (dfa->nodes[node].type)
1107       {
1108       case CHARACTER:
1109 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1110 	  mb_chars = true;
1111 	break;
1112       case ANCHOR:
1113 	switch (dfa->nodes[node].opr.ctx_type)
1114 	  {
1115 	  case LINE_FIRST:
1116 	  case LINE_LAST:
1117 	  case BUF_FIRST:
1118 	  case BUF_LAST:
1119 	    break;
1120 	  default:
1121 	    /* Word anchors etc. cannot be handled.  It's okay to test
1122 	       opr.ctx_type since constraints (for all DFA nodes) are
1123 	       created by ORing one or more opr.ctx_type values.  */
1124 	    return;
1125 	  }
1126 	break;
1127       case OP_PERIOD:
1128 	has_period = true;
1129 	break;
1130       case OP_BACK_REF:
1131       case OP_ALT:
1132       case END_OF_RE:
1133       case OP_DUP_ASTERISK:
1134       case OP_OPEN_SUBEXP:
1135       case OP_CLOSE_SUBEXP:
1136 	break;
1137       case COMPLEX_BRACKET:
1138 	return;
1139       case SIMPLE_BRACKET:
1140 	/* Just double check.  */
1141 	{
1142 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1143 			? 0
1144 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1145 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1146 	    {
1147 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1148 		return;
1149 	      rshift = 0;
1150 	    }
1151 	}
1152 	break;
1153       default:
1154 	abort ();
1155       }
1156 
1157   if (mb_chars || has_period)
1158     for (node = 0; node < dfa->nodes_len; ++node)
1159       {
1160 	if (dfa->nodes[node].type == CHARACTER
1161 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1162 	  dfa->nodes[node].mb_partial = 0;
1163 	else if (dfa->nodes[node].type == OP_PERIOD)
1164 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1165       }
1166 
1167   /* The search can be in single byte locale.  */
1168   dfa->mb_cur_max = 1;
1169   dfa->is_utf8 = 0;
1170   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1171 }
1172 #endif
1173 
1174 /* Analyze the structure tree, and calculate "first", "next", "edest",
1175    "eclosure", and "inveclosure".  */
1176 
1177 static reg_errcode_t
analyze(regex_t * preg)1178 analyze (regex_t *preg)
1179 {
1180   re_dfa_t *dfa = preg->buffer;
1181   reg_errcode_t ret;
1182 
1183   /* Allocate arrays.  */
1184   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1185   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1186   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1187   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1188   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1189 	  || dfa->eclosures == NULL, 0))
1190     return REG_ESPACE;
1191 
1192   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1193   if (dfa->subexp_map != NULL)
1194     {
1195       Idx i;
1196       for (i = 0; i < preg->re_nsub; i++)
1197 	dfa->subexp_map[i] = i;
1198       preorder (dfa->str_tree, optimize_subexps, dfa);
1199       for (i = 0; i < preg->re_nsub; i++)
1200 	if (dfa->subexp_map[i] != i)
1201 	  break;
1202       if (i == preg->re_nsub)
1203 	{
1204 	  free (dfa->subexp_map);
1205 	  dfa->subexp_map = NULL;
1206 	}
1207     }
1208 
1209   ret = postorder (dfa->str_tree, lower_subexps, preg);
1210   if (BE (ret != REG_NOERROR, 0))
1211     return ret;
1212   ret = postorder (dfa->str_tree, calc_first, dfa);
1213   if (BE (ret != REG_NOERROR, 0))
1214     return ret;
1215   preorder (dfa->str_tree, calc_next, dfa);
1216   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1217   if (BE (ret != REG_NOERROR, 0))
1218     return ret;
1219   ret = calc_eclosure (dfa);
1220   if (BE (ret != REG_NOERROR, 0))
1221     return ret;
1222 
1223   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1224      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1225   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1226       || dfa->nbackref)
1227     {
1228       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1229       if (BE (dfa->inveclosures == NULL, 0))
1230 	return REG_ESPACE;
1231       ret = calc_inveclosure (dfa);
1232     }
1233 
1234   return ret;
1235 }
1236 
1237 /* Our parse trees are very unbalanced, so we cannot use a stack to
1238    implement parse tree visits.  Instead, we use parent pointers and
1239    some hairy code in these two functions.  */
1240 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1241 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1242 	   void *extra)
1243 {
1244   bin_tree_t *node, *prev;
1245 
1246   for (node = root; ; )
1247     {
1248       /* Descend down the tree, preferably to the left (or to the right
1249 	 if that's the only child).  */
1250       while (node->left || node->right)
1251 	if (node->left)
1252 	  node = node->left;
1253 	else
1254 	  node = node->right;
1255 
1256       do
1257 	{
1258 	  reg_errcode_t err = fn (extra, node);
1259 	  if (BE (err != REG_NOERROR, 0))
1260 	    return err;
1261 	  if (node->parent == NULL)
1262 	    return REG_NOERROR;
1263 	  prev = node;
1264 	  node = node->parent;
1265 	}
1266       /* Go up while we have a node that is reached from the right.  */
1267       while (node->right == prev || node->right == NULL);
1268       node = node->right;
1269     }
1270 }
1271 
1272 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1273 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1274 	  void *extra)
1275 {
1276   bin_tree_t *node;
1277 
1278   for (node = root; ; )
1279     {
1280       reg_errcode_t err = fn (extra, node);
1281       if (BE (err != REG_NOERROR, 0))
1282 	return err;
1283 
1284       /* Go to the left node, or up and to the right.  */
1285       if (node->left)
1286 	node = node->left;
1287       else
1288 	{
1289 	  bin_tree_t *prev = NULL;
1290 	  while (node->right == prev || node->right == NULL)
1291 	    {
1292 	      prev = node;
1293 	      node = node->parent;
1294 	      if (!node)
1295 		return REG_NOERROR;
1296 	    }
1297 	  node = node->right;
1298 	}
1299     }
1300 }
1301 
1302 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1303    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1304    backreferences as well.  Requires a preorder visit.  */
1305 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1306 optimize_subexps (void *extra, bin_tree_t *node)
1307 {
1308   re_dfa_t *dfa = (re_dfa_t *) extra;
1309 
1310   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1311     {
1312       int idx = node->token.opr.idx;
1313       node->token.opr.idx = dfa->subexp_map[idx];
1314       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1315     }
1316 
1317   else if (node->token.type == SUBEXP
1318 	   && node->left && node->left->token.type == SUBEXP)
1319     {
1320       Idx other_idx = node->left->token.opr.idx;
1321 
1322       node->left = node->left->left;
1323       if (node->left)
1324 	node->left->parent = node;
1325 
1326       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1327       if (other_idx < BITSET_WORD_BITS)
1328 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1329     }
1330 
1331   return REG_NOERROR;
1332 }
1333 
1334 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1335    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1336 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1337 lower_subexps (void *extra, bin_tree_t *node)
1338 {
1339   regex_t *preg = (regex_t *) extra;
1340   reg_errcode_t err = REG_NOERROR;
1341 
1342   if (node->left && node->left->token.type == SUBEXP)
1343     {
1344       node->left = lower_subexp (&err, preg, node->left);
1345       if (node->left)
1346 	node->left->parent = node;
1347     }
1348   if (node->right && node->right->token.type == SUBEXP)
1349     {
1350       node->right = lower_subexp (&err, preg, node->right);
1351       if (node->right)
1352 	node->right->parent = node;
1353     }
1354 
1355   return err;
1356 }
1357 
1358 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1359 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1360 {
1361   re_dfa_t *dfa = preg->buffer;
1362   bin_tree_t *body = node->left;
1363   bin_tree_t *op, *cls, *tree1, *tree;
1364 
1365   if (preg->no_sub
1366       /* We do not optimize empty subexpressions, because otherwise we may
1367 	 have bad CONCAT nodes with NULL children.  This is obviously not
1368 	 very common, so we do not lose much.  An example that triggers
1369 	 this case is the sed "script" /\(\)/x.  */
1370       && node->left != NULL
1371       && (node->token.opr.idx >= BITSET_WORD_BITS
1372 	  || !(dfa->used_bkref_map
1373 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1374     return node->left;
1375 
1376   /* Convert the SUBEXP node to the concatenation of an
1377      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1378   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1379   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1380   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1381   tree = create_tree (dfa, op, tree1, CONCAT);
1382   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1383     {
1384       *err = REG_ESPACE;
1385       return NULL;
1386     }
1387 
1388   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1389   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1390   return tree;
1391 }
1392 
1393 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1394    nodes.  Requires a postorder visit.  */
1395 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1396 calc_first (void *extra, bin_tree_t *node)
1397 {
1398   re_dfa_t *dfa = (re_dfa_t *) extra;
1399   if (node->token.type == CONCAT)
1400     {
1401       node->first = node->left->first;
1402       node->node_idx = node->left->node_idx;
1403     }
1404   else
1405     {
1406       node->first = node;
1407       node->node_idx = re_dfa_add_node (dfa, node->token);
1408       if (BE (node->node_idx == REG_MISSING, 0))
1409 	return REG_ESPACE;
1410       if (node->token.type == ANCHOR)
1411 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1412     }
1413   return REG_NOERROR;
1414 }
1415 
1416 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1417 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1418 calc_next (void *extra, bin_tree_t *node)
1419 {
1420   switch (node->token.type)
1421     {
1422     case OP_DUP_ASTERISK:
1423       node->left->next = node;
1424       break;
1425     case CONCAT:
1426       node->left->next = node->right->first;
1427       node->right->next = node->next;
1428       break;
1429     default:
1430       if (node->left)
1431 	node->left->next = node->next;
1432       if (node->right)
1433 	node->right->next = node->next;
1434       break;
1435     }
1436   return REG_NOERROR;
1437 }
1438 
1439 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1440 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1441 link_nfa_nodes (void *extra, bin_tree_t *node)
1442 {
1443   re_dfa_t *dfa = (re_dfa_t *) extra;
1444   Idx idx = node->node_idx;
1445   reg_errcode_t err = REG_NOERROR;
1446 
1447   switch (node->token.type)
1448     {
1449     case CONCAT:
1450       break;
1451 
1452     case END_OF_RE:
1453       assert (node->next == NULL);
1454       break;
1455 
1456     case OP_DUP_ASTERISK:
1457     case OP_ALT:
1458       {
1459 	Idx left, right;
1460 	dfa->has_plural_match = 1;
1461 	if (node->left != NULL)
1462 	  left = node->left->first->node_idx;
1463 	else
1464 	  left = node->next->node_idx;
1465 	if (node->right != NULL)
1466 	  right = node->right->first->node_idx;
1467 	else
1468 	  right = node->next->node_idx;
1469 	assert (REG_VALID_INDEX (left));
1470 	assert (REG_VALID_INDEX (right));
1471 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1472       }
1473       break;
1474 
1475     case ANCHOR:
1476     case OP_OPEN_SUBEXP:
1477     case OP_CLOSE_SUBEXP:
1478       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1479       break;
1480 
1481     case OP_BACK_REF:
1482       dfa->nexts[idx] = node->next->node_idx;
1483       if (node->token.type == OP_BACK_REF)
1484 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1485       break;
1486 
1487     default:
1488       assert (!IS_EPSILON_NODE (node->token.type));
1489       dfa->nexts[idx] = node->next->node_idx;
1490       break;
1491     }
1492 
1493   return err;
1494 }
1495 
1496 /* Duplicate the epsilon closure of the node ROOT_NODE.
1497    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1498    to their own constraint.  */
1499 
1500 static reg_errcode_t
1501 internal_function
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1502 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1503 			Idx root_node, unsigned int init_constraint)
1504 {
1505   Idx org_node, clone_node;
1506   bool ok;
1507   unsigned int constraint = init_constraint;
1508   for (org_node = top_org_node, clone_node = top_clone_node;;)
1509     {
1510       Idx org_dest, clone_dest;
1511       if (dfa->nodes[org_node].type == OP_BACK_REF)
1512 	{
1513 	  /* If the back reference epsilon-transit, its destination must
1514 	     also have the constraint.  Then duplicate the epsilon closure
1515 	     of the destination of the back reference, and store it in
1516 	     edests of the back reference.  */
1517 	  org_dest = dfa->nexts[org_node];
1518 	  re_node_set_empty (dfa->edests + clone_node);
1519 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1520 	  if (BE (clone_dest == REG_MISSING, 0))
1521 	    return REG_ESPACE;
1522 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1523 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1524 	  if (BE (! ok, 0))
1525 	    return REG_ESPACE;
1526 	}
1527       else if (dfa->edests[org_node].nelem == 0)
1528 	{
1529 	  /* In case of the node can't epsilon-transit, don't duplicate the
1530 	     destination and store the original destination as the
1531 	     destination of the node.  */
1532 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1533 	  break;
1534 	}
1535       else if (dfa->edests[org_node].nelem == 1)
1536 	{
1537 	  /* In case of the node can epsilon-transit, and it has only one
1538 	     destination.  */
1539 	  org_dest = dfa->edests[org_node].elems[0];
1540 	  re_node_set_empty (dfa->edests + clone_node);
1541 	  /* If the node is root_node itself, it means the epsilon closure
1542 	     has a loop.  Then tie it to the destination of the root_node.  */
1543 	  if (org_node == root_node && clone_node != org_node)
1544 	    {
1545 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1546 	      if (BE (! ok, 0))
1547 	        return REG_ESPACE;
1548 	      break;
1549 	    }
1550 	  /* In case the node has another constraint, append it.  */
1551 	  constraint |= dfa->nodes[org_node].constraint;
1552 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1553 	  if (BE (clone_dest == REG_MISSING, 0))
1554 	    return REG_ESPACE;
1555 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556 	  if (BE (! ok, 0))
1557 	    return REG_ESPACE;
1558 	}
1559       else /* dfa->edests[org_node].nelem == 2 */
1560 	{
1561 	  /* In case of the node can epsilon-transit, and it has two
1562 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1563 	  org_dest = dfa->edests[org_node].elems[0];
1564 	  re_node_set_empty (dfa->edests + clone_node);
1565 	  /* Search for a duplicated node which satisfies the constraint.  */
1566 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1567 	  if (clone_dest == REG_MISSING)
1568 	    {
1569 	      /* There is no such duplicated node, create a new one.  */
1570 	      reg_errcode_t err;
1571 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1572 	      if (BE (clone_dest == REG_MISSING, 0))
1573 		return REG_ESPACE;
1574 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1575 	      if (BE (! ok, 0))
1576 		return REG_ESPACE;
1577 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1578 					    root_node, constraint);
1579 	      if (BE (err != REG_NOERROR, 0))
1580 		return err;
1581 	    }
1582 	  else
1583 	    {
1584 	      /* There is a duplicated node which satisfies the constraint,
1585 		 use it to avoid infinite loop.  */
1586 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1587 	      if (BE (! ok, 0))
1588 		return REG_ESPACE;
1589 	    }
1590 
1591 	  org_dest = dfa->edests[org_node].elems[1];
1592 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1593 	  if (BE (clone_dest == REG_MISSING, 0))
1594 	    return REG_ESPACE;
1595 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1596 	  if (BE (! ok, 0))
1597 	    return REG_ESPACE;
1598 	}
1599       org_node = org_dest;
1600       clone_node = clone_dest;
1601     }
1602   return REG_NOERROR;
1603 }
1604 
1605 /* Search for a node which is duplicated from the node ORG_NODE, and
1606    satisfies the constraint CONSTRAINT.  */
1607 
1608 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1609 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1610 			unsigned int constraint)
1611 {
1612   Idx idx;
1613   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1614     {
1615       if (org_node == dfa->org_indices[idx]
1616 	  && constraint == dfa->nodes[idx].constraint)
1617 	return idx; /* Found.  */
1618     }
1619   return REG_MISSING; /* Not found.  */
1620 }
1621 
1622 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1623    Return the index of the new node, or REG_MISSING if insufficient storage is
1624    available.  */
1625 
1626 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1627 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1628 {
1629   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1630   if (BE (dup_idx != REG_MISSING, 1))
1631     {
1632       dfa->nodes[dup_idx].constraint = constraint;
1633       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1634       dfa->nodes[dup_idx].duplicated = 1;
1635 
1636       /* Store the index of the original node.  */
1637       dfa->org_indices[dup_idx] = org_idx;
1638     }
1639   return dup_idx;
1640 }
1641 
1642 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1643 calc_inveclosure (re_dfa_t *dfa)
1644 {
1645   Idx src, idx;
1646   bool ok;
1647   for (idx = 0; idx < dfa->nodes_len; ++idx)
1648     re_node_set_init_empty (dfa->inveclosures + idx);
1649 
1650   for (src = 0; src < dfa->nodes_len; ++src)
1651     {
1652       Idx *elems = dfa->eclosures[src].elems;
1653       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1654 	{
1655 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1656 	  if (BE (! ok, 0))
1657 	    return REG_ESPACE;
1658 	}
1659     }
1660 
1661   return REG_NOERROR;
1662 }
1663 
1664 /* Calculate "eclosure" for all the node in DFA.  */
1665 
1666 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1667 calc_eclosure (re_dfa_t *dfa)
1668 {
1669   Idx node_idx;
1670   bool incomplete;
1671 #ifdef DEBUG
1672   assert (dfa->nodes_len > 0);
1673 #endif
1674   incomplete = false;
1675   /* For each nodes, calculate epsilon closure.  */
1676   for (node_idx = 0; ; ++node_idx)
1677     {
1678       reg_errcode_t err;
1679       re_node_set eclosure_elem;
1680       if (node_idx == dfa->nodes_len)
1681 	{
1682 	  if (!incomplete)
1683 	    break;
1684 	  incomplete = false;
1685 	  node_idx = 0;
1686 	}
1687 
1688 #ifdef DEBUG
1689       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1690 #endif
1691 
1692       /* If we have already calculated, skip it.  */
1693       if (dfa->eclosures[node_idx].nelem != 0)
1694 	continue;
1695       /* Calculate epsilon closure of 'node_idx'.  */
1696       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1697       if (BE (err != REG_NOERROR, 0))
1698 	return err;
1699 
1700       if (dfa->eclosures[node_idx].nelem == 0)
1701 	{
1702 	  incomplete = true;
1703 	  re_node_set_free (&eclosure_elem);
1704 	}
1705     }
1706   return REG_NOERROR;
1707 }
1708 
1709 /* Calculate epsilon closure of NODE.  */
1710 
1711 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1712 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1713 {
1714   reg_errcode_t err;
1715   Idx i;
1716   re_node_set eclosure;
1717   bool ok;
1718   bool incomplete = false;
1719   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1720   if (BE (err != REG_NOERROR, 0))
1721     return err;
1722 
1723   /* This indicates that we are calculating this node now.
1724      We reference this value to avoid infinite loop.  */
1725   dfa->eclosures[node].nelem = REG_MISSING;
1726 
1727   /* If the current node has constraints, duplicate all nodes
1728      since they must inherit the constraints.  */
1729   if (dfa->nodes[node].constraint
1730       && dfa->edests[node].nelem
1731       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1732     {
1733       err = duplicate_node_closure (dfa, node, node, node,
1734 				    dfa->nodes[node].constraint);
1735       if (BE (err != REG_NOERROR, 0))
1736 	return err;
1737     }
1738 
1739   /* Expand each epsilon destination nodes.  */
1740   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1741     for (i = 0; i < dfa->edests[node].nelem; ++i)
1742       {
1743 	re_node_set eclosure_elem;
1744 	Idx edest = dfa->edests[node].elems[i];
1745 	/* If calculating the epsilon closure of 'edest' is in progress,
1746 	   return intermediate result.  */
1747 	if (dfa->eclosures[edest].nelem == REG_MISSING)
1748 	  {
1749 	    incomplete = true;
1750 	    continue;
1751 	  }
1752 	/* If we haven't calculated the epsilon closure of 'edest' yet,
1753 	   calculate now. Otherwise use calculated epsilon closure.  */
1754 	if (dfa->eclosures[edest].nelem == 0)
1755 	  {
1756 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1757 	    if (BE (err != REG_NOERROR, 0))
1758 	      return err;
1759 	  }
1760 	else
1761 	  eclosure_elem = dfa->eclosures[edest];
1762 	/* Merge the epsilon closure of 'edest'.  */
1763 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1764 	if (BE (err != REG_NOERROR, 0))
1765 	  return err;
1766 	/* If the epsilon closure of 'edest' is incomplete,
1767 	   the epsilon closure of this node is also incomplete.  */
1768 	if (dfa->eclosures[edest].nelem == 0)
1769 	  {
1770 	    incomplete = true;
1771 	    re_node_set_free (&eclosure_elem);
1772 	  }
1773       }
1774 
1775   /* An epsilon closure includes itself.  */
1776   ok = re_node_set_insert (&eclosure, node);
1777   if (BE (! ok, 0))
1778     return REG_ESPACE;
1779   if (incomplete && !root)
1780     dfa->eclosures[node].nelem = 0;
1781   else
1782     dfa->eclosures[node] = eclosure;
1783   *new_set = eclosure;
1784   return REG_NOERROR;
1785 }
1786 
1787 /* Functions for token which are used in the parser.  */
1788 
1789 /* Fetch a token from INPUT.
1790    We must not use this function inside bracket expressions.  */
1791 
1792 static void
1793 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1794 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1795 {
1796   re_string_skip_bytes (input, peek_token (result, input, syntax));
1797 }
1798 
1799 /* Peek a token from INPUT, and return the length of the token.
1800    We must not use this function inside bracket expressions.  */
1801 
1802 static int
1803 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1804 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1805 {
1806   unsigned char c;
1807 
1808   if (re_string_eoi (input))
1809     {
1810       token->type = END_OF_RE;
1811       return 0;
1812     }
1813 
1814   c = re_string_peek_byte (input, 0);
1815   token->opr.c = c;
1816 
1817   token->word_char = 0;
1818 #ifdef RE_ENABLE_I18N
1819   token->mb_partial = 0;
1820   if (input->mb_cur_max > 1 &&
1821       !re_string_first_byte (input, re_string_cur_idx (input)))
1822     {
1823       token->type = CHARACTER;
1824       token->mb_partial = 1;
1825       return 1;
1826     }
1827 #endif
1828   if (c == '\\')
1829     {
1830       unsigned char c2;
1831       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1832 	{
1833 	  token->type = BACK_SLASH;
1834 	  return 1;
1835 	}
1836 
1837       c2 = re_string_peek_byte_case (input, 1);
1838       token->opr.c = c2;
1839       token->type = CHARACTER;
1840 #ifdef RE_ENABLE_I18N
1841       if (input->mb_cur_max > 1)
1842 	{
1843 	  wint_t wc = re_string_wchar_at (input,
1844 					  re_string_cur_idx (input) + 1);
1845 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1846 	}
1847       else
1848 #endif
1849 	token->word_char = IS_WORD_CHAR (c2) != 0;
1850 
1851       switch (c2)
1852 	{
1853 	case '|':
1854 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1855 	    token->type = OP_ALT;
1856 	  break;
1857 	case '1': case '2': case '3': case '4': case '5':
1858 	case '6': case '7': case '8': case '9':
1859 	  if (!(syntax & RE_NO_BK_REFS))
1860 	    {
1861 	      token->type = OP_BACK_REF;
1862 	      token->opr.idx = c2 - '1';
1863 	    }
1864 	  break;
1865 	case '<':
1866 	  if (!(syntax & RE_NO_GNU_OPS))
1867 	    {
1868 	      token->type = ANCHOR;
1869 	      token->opr.ctx_type = WORD_FIRST;
1870 	    }
1871 	  break;
1872 	case '>':
1873 	  if (!(syntax & RE_NO_GNU_OPS))
1874 	    {
1875 	      token->type = ANCHOR;
1876 	      token->opr.ctx_type = WORD_LAST;
1877 	    }
1878 	  break;
1879 	case 'b':
1880 	  if (!(syntax & RE_NO_GNU_OPS))
1881 	    {
1882 	      token->type = ANCHOR;
1883 	      token->opr.ctx_type = WORD_DELIM;
1884 	    }
1885 	  break;
1886 	case 'B':
1887 	  if (!(syntax & RE_NO_GNU_OPS))
1888 	    {
1889 	      token->type = ANCHOR;
1890 	      token->opr.ctx_type = NOT_WORD_DELIM;
1891 	    }
1892 	  break;
1893 	case 'w':
1894 	  if (!(syntax & RE_NO_GNU_OPS))
1895 	    token->type = OP_WORD;
1896 	  break;
1897 	case 'W':
1898 	  if (!(syntax & RE_NO_GNU_OPS))
1899 	    token->type = OP_NOTWORD;
1900 	  break;
1901 	case 's':
1902 	  if (!(syntax & RE_NO_GNU_OPS))
1903 	    token->type = OP_SPACE;
1904 	  break;
1905 	case 'S':
1906 	  if (!(syntax & RE_NO_GNU_OPS))
1907 	    token->type = OP_NOTSPACE;
1908 	  break;
1909 	case '`':
1910 	  if (!(syntax & RE_NO_GNU_OPS))
1911 	    {
1912 	      token->type = ANCHOR;
1913 	      token->opr.ctx_type = BUF_FIRST;
1914 	    }
1915 	  break;
1916 	case '\'':
1917 	  if (!(syntax & RE_NO_GNU_OPS))
1918 	    {
1919 	      token->type = ANCHOR;
1920 	      token->opr.ctx_type = BUF_LAST;
1921 	    }
1922 	  break;
1923 	case '(':
1924 	  if (!(syntax & RE_NO_BK_PARENS))
1925 	    token->type = OP_OPEN_SUBEXP;
1926 	  break;
1927 	case ')':
1928 	  if (!(syntax & RE_NO_BK_PARENS))
1929 	    token->type = OP_CLOSE_SUBEXP;
1930 	  break;
1931 	case '+':
1932 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1933 	    token->type = OP_DUP_PLUS;
1934 	  break;
1935 	case '?':
1936 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1937 	    token->type = OP_DUP_QUESTION;
1938 	  break;
1939 	case '{':
1940 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1941 	    token->type = OP_OPEN_DUP_NUM;
1942 	  break;
1943 	case '}':
1944 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1945 	    token->type = OP_CLOSE_DUP_NUM;
1946 	  break;
1947 	default:
1948 	  break;
1949 	}
1950       return 2;
1951     }
1952 
1953   token->type = CHARACTER;
1954 #ifdef RE_ENABLE_I18N
1955   if (input->mb_cur_max > 1)
1956     {
1957       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1958       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1959     }
1960   else
1961 #endif
1962     token->word_char = IS_WORD_CHAR (token->opr.c);
1963 
1964   switch (c)
1965     {
1966     case '\n':
1967       if (syntax & RE_NEWLINE_ALT)
1968 	token->type = OP_ALT;
1969       break;
1970     case '|':
1971       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1972 	token->type = OP_ALT;
1973       break;
1974     case '*':
1975       token->type = OP_DUP_ASTERISK;
1976       break;
1977     case '+':
1978       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1979 	token->type = OP_DUP_PLUS;
1980       break;
1981     case '?':
1982       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1983 	token->type = OP_DUP_QUESTION;
1984       break;
1985     case '{':
1986       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1987 	token->type = OP_OPEN_DUP_NUM;
1988       break;
1989     case '}':
1990       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1991 	token->type = OP_CLOSE_DUP_NUM;
1992       break;
1993     case '(':
1994       if (syntax & RE_NO_BK_PARENS)
1995 	token->type = OP_OPEN_SUBEXP;
1996       break;
1997     case ')':
1998       if (syntax & RE_NO_BK_PARENS)
1999 	token->type = OP_CLOSE_SUBEXP;
2000       break;
2001     case '[':
2002       token->type = OP_OPEN_BRACKET;
2003       break;
2004     case '.':
2005       token->type = OP_PERIOD;
2006       break;
2007     case '^':
2008       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2009 	  re_string_cur_idx (input) != 0)
2010 	{
2011 	  char prev = re_string_peek_byte (input, -1);
2012 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2013 	    break;
2014 	}
2015       token->type = ANCHOR;
2016       token->opr.ctx_type = LINE_FIRST;
2017       break;
2018     case '$':
2019       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2020 	  re_string_cur_idx (input) + 1 != re_string_length (input))
2021 	{
2022 	  re_token_t next;
2023 	  re_string_skip_bytes (input, 1);
2024 	  peek_token (&next, input, syntax);
2025 	  re_string_skip_bytes (input, -1);
2026 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2027 	    break;
2028 	}
2029       token->type = ANCHOR;
2030       token->opr.ctx_type = LINE_LAST;
2031       break;
2032     default:
2033       break;
2034     }
2035   return 1;
2036 }
2037 
2038 /* Peek a token from INPUT, and return the length of the token.
2039    We must not use this function out of bracket expressions.  */
2040 
2041 static int
2042 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)2043 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2044 {
2045   unsigned char c;
2046   if (re_string_eoi (input))
2047     {
2048       token->type = END_OF_RE;
2049       return 0;
2050     }
2051   c = re_string_peek_byte (input, 0);
2052   token->opr.c = c;
2053 
2054 #ifdef RE_ENABLE_I18N
2055   if (input->mb_cur_max > 1 &&
2056       !re_string_first_byte (input, re_string_cur_idx (input)))
2057     {
2058       token->type = CHARACTER;
2059       return 1;
2060     }
2061 #endif /* RE_ENABLE_I18N */
2062 
2063   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2064       && re_string_cur_idx (input) + 1 < re_string_length (input))
2065     {
2066       /* In this case, '\' escape a character.  */
2067       unsigned char c2;
2068       re_string_skip_bytes (input, 1);
2069       c2 = re_string_peek_byte (input, 0);
2070       token->opr.c = c2;
2071       token->type = CHARACTER;
2072       return 1;
2073     }
2074   if (c == '[') /* '[' is a special char in a bracket exps.  */
2075     {
2076       unsigned char c2;
2077       int token_len;
2078       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2079 	c2 = re_string_peek_byte (input, 1);
2080       else
2081 	c2 = 0;
2082       token->opr.c = c2;
2083       token_len = 2;
2084       switch (c2)
2085 	{
2086 	case '.':
2087 	  token->type = OP_OPEN_COLL_ELEM;
2088 	  break;
2089 	case '=':
2090 	  token->type = OP_OPEN_EQUIV_CLASS;
2091 	  break;
2092 	case ':':
2093 	  if (syntax & RE_CHAR_CLASSES)
2094 	    {
2095 	      token->type = OP_OPEN_CHAR_CLASS;
2096 	      break;
2097 	    }
2098 	  /* else fall through.  */
2099 	default:
2100 	  token->type = CHARACTER;
2101 	  token->opr.c = c;
2102 	  token_len = 1;
2103 	  break;
2104 	}
2105       return token_len;
2106     }
2107   switch (c)
2108     {
2109     case '-':
2110       token->type = OP_CHARSET_RANGE;
2111       break;
2112     case ']':
2113       token->type = OP_CLOSE_BRACKET;
2114       break;
2115     case '^':
2116       token->type = OP_NON_MATCH_LIST;
2117       break;
2118     default:
2119       token->type = CHARACTER;
2120     }
2121   return 1;
2122 }
2123 
2124 /* Functions for parser.  */
2125 
2126 /* Entry point of the parser.
2127    Parse the regular expression REGEXP and return the structure tree.
2128    If an error occurs, ERR is set by error code, and return NULL.
2129    This function build the following tree, from regular expression <reg_exp>:
2130 	   CAT
2131 	   / \
2132 	  /   \
2133    <reg_exp>  EOR
2134 
2135    CAT means concatenation.
2136    EOR means end of regular expression.  */
2137 
2138 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2139 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2140        reg_errcode_t *err)
2141 {
2142   re_dfa_t *dfa = preg->buffer;
2143   bin_tree_t *tree, *eor, *root;
2144   re_token_t current_token;
2145   dfa->syntax = syntax;
2146   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2147   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2148   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2149     return NULL;
2150   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2151   if (tree != NULL)
2152     root = create_tree (dfa, tree, eor, CONCAT);
2153   else
2154     root = eor;
2155   if (BE (eor == NULL || root == NULL, 0))
2156     {
2157       *err = REG_ESPACE;
2158       return NULL;
2159     }
2160   return root;
2161 }
2162 
2163 /* This function build the following tree, from regular expression
2164    <branch1>|<branch2>:
2165 	   ALT
2166 	   / \
2167 	  /   \
2168    <branch1> <branch2>
2169 
2170    ALT means alternative, which represents the operator '|'.  */
2171 
2172 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2173 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2174 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2175 {
2176   re_dfa_t *dfa = preg->buffer;
2177   bin_tree_t *tree, *branch = NULL;
2178   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2179   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2180     return NULL;
2181 
2182   while (token->type == OP_ALT)
2183     {
2184       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2185       if (token->type != OP_ALT && token->type != END_OF_RE
2186 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2187 	{
2188 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2189 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2190 	    return NULL;
2191 	}
2192       else
2193 	branch = NULL;
2194       tree = create_tree (dfa, tree, branch, OP_ALT);
2195       if (BE (tree == NULL, 0))
2196 	{
2197 	  *err = REG_ESPACE;
2198 	  return NULL;
2199 	}
2200     }
2201   return tree;
2202 }
2203 
2204 /* This function build the following tree, from regular expression
2205    <exp1><exp2>:
2206 	CAT
2207 	/ \
2208        /   \
2209    <exp1> <exp2>
2210 
2211    CAT means concatenation.  */
2212 
2213 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2214 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2215 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2216 {
2217   bin_tree_t *tree, *expr;
2218   re_dfa_t *dfa = preg->buffer;
2219   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2220   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2221     return NULL;
2222 
2223   while (token->type != OP_ALT && token->type != END_OF_RE
2224 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2225     {
2226       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2227       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2228 	{
2229 	  if (tree != NULL)
2230 	    postorder (tree, free_tree, NULL);
2231 	  return NULL;
2232 	}
2233       if (tree != NULL && expr != NULL)
2234 	{
2235 	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2236 	  if (newtree == NULL)
2237 	    {
2238 	      postorder (expr, free_tree, NULL);
2239 	      postorder (tree, free_tree, NULL);
2240 	      *err = REG_ESPACE;
2241 	      return NULL;
2242 	    }
2243 	  tree = newtree;
2244 	}
2245       else if (tree == NULL)
2246 	tree = expr;
2247       /* Otherwise expr == NULL, we don't need to create new tree.  */
2248     }
2249   return tree;
2250 }
2251 
2252 /* This function build the following tree, from regular expression a*:
2253 	 *
2254 	 |
2255 	 a
2256 */
2257 
2258 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2259 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2260 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2261 {
2262   re_dfa_t *dfa = preg->buffer;
2263   bin_tree_t *tree;
2264   switch (token->type)
2265     {
2266     case CHARACTER:
2267       tree = create_token_tree (dfa, NULL, NULL, token);
2268       if (BE (tree == NULL, 0))
2269 	{
2270 	  *err = REG_ESPACE;
2271 	  return NULL;
2272 	}
2273 #ifdef RE_ENABLE_I18N
2274       if (dfa->mb_cur_max > 1)
2275 	{
2276 	  while (!re_string_eoi (regexp)
2277 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2278 	    {
2279 	      bin_tree_t *mbc_remain;
2280 	      fetch_token (token, regexp, syntax);
2281 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2282 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2283 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2284 		{
2285 		  *err = REG_ESPACE;
2286 		  return NULL;
2287 		}
2288 	    }
2289 	}
2290 #endif
2291       break;
2292     case OP_OPEN_SUBEXP:
2293       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2294       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2295 	return NULL;
2296       break;
2297     case OP_OPEN_BRACKET:
2298       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2299       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2300 	return NULL;
2301       break;
2302     case OP_BACK_REF:
2303       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2304 	{
2305 	  *err = REG_ESUBREG;
2306 	  return NULL;
2307 	}
2308       dfa->used_bkref_map |= 1 << token->opr.idx;
2309       tree = create_token_tree (dfa, NULL, NULL, token);
2310       if (BE (tree == NULL, 0))
2311 	{
2312 	  *err = REG_ESPACE;
2313 	  return NULL;
2314 	}
2315       ++dfa->nbackref;
2316       dfa->has_mb_node = 1;
2317       break;
2318     case OP_OPEN_DUP_NUM:
2319       if (syntax & RE_CONTEXT_INVALID_DUP)
2320 	{
2321 	  *err = REG_BADRPT;
2322 	  return NULL;
2323 	}
2324       /* FALLTHROUGH */
2325     case OP_DUP_ASTERISK:
2326     case OP_DUP_PLUS:
2327     case OP_DUP_QUESTION:
2328       if (syntax & RE_CONTEXT_INVALID_OPS)
2329 	{
2330 	  *err = REG_BADRPT;
2331 	  return NULL;
2332 	}
2333       else if (syntax & RE_CONTEXT_INDEP_OPS)
2334 	{
2335 	  fetch_token (token, regexp, syntax);
2336 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2337 	}
2338       /* else fall through  */
2339     case OP_CLOSE_SUBEXP:
2340       if ((token->type == OP_CLOSE_SUBEXP) &&
2341 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2342 	{
2343 	  *err = REG_ERPAREN;
2344 	  return NULL;
2345 	}
2346       /* else fall through  */
2347     case OP_CLOSE_DUP_NUM:
2348       /* We treat it as a normal character.  */
2349 
2350       /* Then we can these characters as normal characters.  */
2351       token->type = CHARACTER;
2352       /* mb_partial and word_char bits should be initialized already
2353 	 by peek_token.  */
2354       tree = create_token_tree (dfa, NULL, NULL, token);
2355       if (BE (tree == NULL, 0))
2356 	{
2357 	  *err = REG_ESPACE;
2358 	  return NULL;
2359 	}
2360       break;
2361     case ANCHOR:
2362       if ((token->opr.ctx_type
2363 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2364 	  && dfa->word_ops_used == 0)
2365 	init_word_char (dfa);
2366       if (token->opr.ctx_type == WORD_DELIM
2367 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2368 	{
2369 	  bin_tree_t *tree_first, *tree_last;
2370 	  if (token->opr.ctx_type == WORD_DELIM)
2371 	    {
2372 	      token->opr.ctx_type = WORD_FIRST;
2373 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2374 	      token->opr.ctx_type = WORD_LAST;
2375 	    }
2376 	  else
2377 	    {
2378 	      token->opr.ctx_type = INSIDE_WORD;
2379 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2380 	      token->opr.ctx_type = INSIDE_NOTWORD;
2381 	    }
2382 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2383 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2384 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2385 	    {
2386 	      *err = REG_ESPACE;
2387 	      return NULL;
2388 	    }
2389 	}
2390       else
2391 	{
2392 	  tree = create_token_tree (dfa, NULL, NULL, token);
2393 	  if (BE (tree == NULL, 0))
2394 	    {
2395 	      *err = REG_ESPACE;
2396 	      return NULL;
2397 	    }
2398 	}
2399       /* We must return here, since ANCHORs can't be followed
2400 	 by repetition operators.
2401 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2402 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2403       fetch_token (token, regexp, syntax);
2404       return tree;
2405     case OP_PERIOD:
2406       tree = create_token_tree (dfa, NULL, NULL, token);
2407       if (BE (tree == NULL, 0))
2408 	{
2409 	  *err = REG_ESPACE;
2410 	  return NULL;
2411 	}
2412       if (dfa->mb_cur_max > 1)
2413 	dfa->has_mb_node = 1;
2414       break;
2415     case OP_WORD:
2416     case OP_NOTWORD:
2417       tree = build_charclass_op (dfa, regexp->trans,
2418 				 (const unsigned char *) "alnum",
2419 				 (const unsigned char *) "_",
2420 				 token->type == OP_NOTWORD, err);
2421       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2422 	return NULL;
2423       break;
2424     case OP_SPACE:
2425     case OP_NOTSPACE:
2426       tree = build_charclass_op (dfa, regexp->trans,
2427 				 (const unsigned char *) "space",
2428 				 (const unsigned char *) "",
2429 				 token->type == OP_NOTSPACE, err);
2430       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431 	return NULL;
2432       break;
2433     case OP_ALT:
2434     case END_OF_RE:
2435       return NULL;
2436     case BACK_SLASH:
2437       *err = REG_EESCAPE;
2438       return NULL;
2439     default:
2440       /* Must not happen?  */
2441 #ifdef DEBUG
2442       assert (0);
2443 #endif
2444       return NULL;
2445     }
2446   fetch_token (token, regexp, syntax);
2447 
2448   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2449 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2450     {
2451       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2452       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2453 	return NULL;
2454       /* In BRE consecutive duplications are not allowed.  */
2455       if ((syntax & RE_CONTEXT_INVALID_DUP)
2456 	  && (token->type == OP_DUP_ASTERISK
2457 	      || token->type == OP_OPEN_DUP_NUM))
2458 	{
2459 	  *err = REG_BADRPT;
2460 	  return NULL;
2461 	}
2462     }
2463 
2464   return tree;
2465 }
2466 
2467 /* This function build the following tree, from regular expression
2468    (<reg_exp>):
2469 	 SUBEXP
2470 	    |
2471 	<reg_exp>
2472 */
2473 
2474 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2475 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2476 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2477 {
2478   re_dfa_t *dfa = preg->buffer;
2479   bin_tree_t *tree;
2480   size_t cur_nsub;
2481   cur_nsub = preg->re_nsub++;
2482 
2483   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2484 
2485   /* The subexpression may be a null string.  */
2486   if (token->type == OP_CLOSE_SUBEXP)
2487     tree = NULL;
2488   else
2489     {
2490       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2491       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2492 	{
2493 	  if (tree != NULL)
2494 	    postorder (tree, free_tree, NULL);
2495 	  *err = REG_EPAREN;
2496 	}
2497       if (BE (*err != REG_NOERROR, 0))
2498 	return NULL;
2499     }
2500 
2501   if (cur_nsub <= '9' - '1')
2502     dfa->completed_bkref_map |= 1 << cur_nsub;
2503 
2504   tree = create_tree (dfa, tree, NULL, SUBEXP);
2505   if (BE (tree == NULL, 0))
2506     {
2507       *err = REG_ESPACE;
2508       return NULL;
2509     }
2510   tree->token.opr.idx = cur_nsub;
2511   return tree;
2512 }
2513 
2514 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2515 
2516 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2517 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2518 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2519 {
2520   bin_tree_t *tree = NULL, *old_tree = NULL;
2521   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2522   re_token_t start_token = *token;
2523 
2524   if (token->type == OP_OPEN_DUP_NUM)
2525     {
2526       end = 0;
2527       start = fetch_number (regexp, token, syntax);
2528       if (start == REG_MISSING)
2529 	{
2530 	  if (token->type == CHARACTER && token->opr.c == ',')
2531 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2532 	  else
2533 	    {
2534 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2535 	      return NULL;
2536 	    }
2537 	}
2538       if (BE (start != REG_ERROR, 1))
2539 	{
2540 	  /* We treat "{n}" as "{n,n}".  */
2541 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2542 		 : ((token->type == CHARACTER && token->opr.c == ',')
2543 		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2544 	}
2545       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2546 	{
2547 	  /* Invalid sequence.  */
2548 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2549 	    {
2550 	      if (token->type == END_OF_RE)
2551 		*err = REG_EBRACE;
2552 	      else
2553 		*err = REG_BADBR;
2554 
2555 	      return NULL;
2556 	    }
2557 
2558 	  /* If the syntax bit is set, rollback.  */
2559 	  re_string_set_index (regexp, start_idx);
2560 	  *token = start_token;
2561 	  token->type = CHARACTER;
2562 	  /* mb_partial and word_char bits should be already initialized by
2563 	     peek_token.  */
2564 	  return elem;
2565 	}
2566 
2567       if (BE ((end != REG_MISSING && start > end)
2568 	      || token->type != OP_CLOSE_DUP_NUM, 0))
2569 	{
2570 	  /* First number greater than second.  */
2571 	  *err = REG_BADBR;
2572 	  return NULL;
2573 	}
2574 
2575       if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2576 	{
2577 	  *err = REG_ESIZE;
2578 	  return NULL;
2579 	}
2580     }
2581   else
2582     {
2583       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2584       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2585     }
2586 
2587   fetch_token (token, regexp, syntax);
2588 
2589   if (BE (elem == NULL, 0))
2590     return NULL;
2591   if (BE (start == 0 && end == 0, 0))
2592     {
2593       postorder (elem, free_tree, NULL);
2594       return NULL;
2595     }
2596 
2597   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2598   if (BE (start > 0, 0))
2599     {
2600       tree = elem;
2601       for (i = 2; i <= start; ++i)
2602 	{
2603 	  elem = duplicate_tree (elem, dfa);
2604 	  tree = create_tree (dfa, tree, elem, CONCAT);
2605 	  if (BE (elem == NULL || tree == NULL, 0))
2606 	    goto parse_dup_op_espace;
2607 	}
2608 
2609       if (start == end)
2610 	return tree;
2611 
2612       /* Duplicate ELEM before it is marked optional.  */
2613       elem = duplicate_tree (elem, dfa);
2614       old_tree = tree;
2615     }
2616   else
2617     old_tree = NULL;
2618 
2619   if (elem->token.type == SUBEXP)
2620     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2621 
2622   tree = create_tree (dfa, elem, NULL,
2623 		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2624   if (BE (tree == NULL, 0))
2625     goto parse_dup_op_espace;
2626 
2627 /* From gnulib's "intprops.h":
2628    True if the arithmetic type T is signed.  */
2629 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2630 
2631   /* This loop is actually executed only when end != REG_MISSING,
2632      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2633      already created the start+1-th copy.  */
2634   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2635     for (i = start + 2; i <= end; ++i)
2636       {
2637 	elem = duplicate_tree (elem, dfa);
2638 	tree = create_tree (dfa, tree, elem, CONCAT);
2639 	if (BE (elem == NULL || tree == NULL, 0))
2640 	  goto parse_dup_op_espace;
2641 
2642 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2643 	if (BE (tree == NULL, 0))
2644 	  goto parse_dup_op_espace;
2645       }
2646 
2647   if (old_tree)
2648     tree = create_tree (dfa, old_tree, tree, CONCAT);
2649 
2650   return tree;
2651 
2652  parse_dup_op_espace:
2653   *err = REG_ESPACE;
2654   return NULL;
2655 }
2656 
2657 /* Size of the names for collating symbol/equivalence_class/character_class.
2658    I'm not sure, but maybe enough.  */
2659 #define BRACKET_NAME_BUF_SIZE 32
2660 
2661 #ifndef _LIBC
2662   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2663      Build the range expression which starts from START_ELEM, and ends
2664      at END_ELEM.  The result are written to MBCSET and SBCSET.
2665      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2666      mbcset->range_ends, is a pointer argument since we may
2667      update it.  */
2668 
2669 static reg_errcode_t
2670 internal_function
2671 # ifdef RE_ENABLE_I18N
build_range_exp(const reg_syntax_t syntax,bitset_t sbcset,re_charset_t * mbcset,Idx * range_alloc,const bracket_elem_t * start_elem,const bracket_elem_t * end_elem)2672 build_range_exp (const reg_syntax_t syntax,
2673                  bitset_t sbcset,
2674                  re_charset_t *mbcset,
2675                  Idx *range_alloc,
2676                  const bracket_elem_t *start_elem,
2677                  const bracket_elem_t *end_elem)
2678 # else /* not RE_ENABLE_I18N */
2679 build_range_exp (const reg_syntax_t syntax,
2680                  bitset_t sbcset,
2681                  const bracket_elem_t *start_elem,
2682                  const bracket_elem_t *end_elem)
2683 # endif /* not RE_ENABLE_I18N */
2684 {
2685   unsigned int start_ch, end_ch;
2686   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2687   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2688 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2689 	  0))
2690     return REG_ERANGE;
2691 
2692   /* We can handle no multi character collating elements without libc
2693      support.  */
2694   if (BE ((start_elem->type == COLL_SYM
2695 	   && strlen ((char *) start_elem->opr.name) > 1)
2696 	  || (end_elem->type == COLL_SYM
2697 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2698     return REG_ECOLLATE;
2699 
2700 # ifdef RE_ENABLE_I18N
2701   {
2702     wchar_t wc;
2703     wint_t start_wc;
2704     wint_t end_wc;
2705     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2706 
2707     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2708 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2709 		   : 0));
2710     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2711 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2712 		 : 0));
2713     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2714 		? __btowc (start_ch) : start_elem->opr.wch);
2715     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2716 	      ? __btowc (end_ch) : end_elem->opr.wch);
2717     if (start_wc == WEOF || end_wc == WEOF)
2718       return REG_ECOLLATE;
2719     cmp_buf[0] = start_wc;
2720     cmp_buf[4] = end_wc;
2721 
2722     if (BE ((syntax & RE_NO_EMPTY_RANGES)
2723             && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2724       return REG_ERANGE;
2725 
2726     /* Got valid collation sequence values, add them as a new entry.
2727        However, for !_LIBC we have no collation elements: if the
2728        character set is single byte, the single byte character set
2729        that we build below suffices.  parse_bracket_exp passes
2730        no MBCSET if dfa->mb_cur_max == 1.  */
2731     if (mbcset)
2732       {
2733 	/* Check the space of the arrays.  */
2734 	if (BE (*range_alloc == mbcset->nranges, 0))
2735 	  {
2736 	    /* There is not enough space, need realloc.  */
2737 	    wchar_t *new_array_start, *new_array_end;
2738 	    Idx new_nranges;
2739 
2740 	    /* +1 in case of mbcset->nranges is 0.  */
2741 	    new_nranges = 2 * mbcset->nranges + 1;
2742 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2743 	       are NULL if *range_alloc == 0.  */
2744 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2745 					  new_nranges);
2746 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2747 					new_nranges);
2748 
2749 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2750 	      return REG_ESPACE;
2751 
2752 	    mbcset->range_starts = new_array_start;
2753 	    mbcset->range_ends = new_array_end;
2754 	    *range_alloc = new_nranges;
2755 	  }
2756 
2757 	mbcset->range_starts[mbcset->nranges] = start_wc;
2758 	mbcset->range_ends[mbcset->nranges++] = end_wc;
2759       }
2760 
2761     /* Build the table for single byte characters.  */
2762     for (wc = 0; wc < SBC_MAX; ++wc)
2763       {
2764 	cmp_buf[2] = wc;
2765 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2766 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2767 	  bitset_set (sbcset, wc);
2768       }
2769   }
2770 # else /* not RE_ENABLE_I18N */
2771   {
2772     unsigned int ch;
2773     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2774 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2775 		   : 0));
2776     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2777 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2778 		 : 0));
2779     if (start_ch > end_ch)
2780       return REG_ERANGE;
2781     /* Build the table for single byte characters.  */
2782     for (ch = 0; ch < SBC_MAX; ++ch)
2783       if (start_ch <= ch  && ch <= end_ch)
2784 	bitset_set (sbcset, ch);
2785   }
2786 # endif /* not RE_ENABLE_I18N */
2787   return REG_NOERROR;
2788 }
2789 #endif /* not _LIBC */
2790 
2791 #ifndef _LIBC
2792 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2793    Build the collating element which is represented by NAME.
2794    The result are written to MBCSET and SBCSET.
2795    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2796    pointer argument since we may update it.  */
2797 
2798 static reg_errcode_t
2799 internal_function
2800 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2801 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2802 			Idx *coll_sym_alloc, const unsigned char *name)
2803 # else /* not RE_ENABLE_I18N */
2804 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2805 # endif /* not RE_ENABLE_I18N */
2806 {
2807   size_t name_len = strlen ((const char *) name);
2808   if (BE (name_len != 1, 0))
2809     return REG_ECOLLATE;
2810   else
2811     {
2812       bitset_set (sbcset, name[0]);
2813       return REG_NOERROR;
2814     }
2815 }
2816 #endif /* not _LIBC */
2817 
2818 /* This function parse bracket expression like "[abc]", "[a-c]",
2819    "[[.a-a.]]" etc.  */
2820 
2821 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2822 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2823 		   reg_syntax_t syntax, reg_errcode_t *err)
2824 {
2825 #ifdef _LIBC
2826   const unsigned char *collseqmb;
2827   const char *collseqwc;
2828   uint32_t nrules;
2829   int32_t table_size;
2830   const int32_t *symb_table;
2831   const unsigned char *extra;
2832 
2833   /* Local function for parse_bracket_exp used in _LIBC environment.
2834      Seek the collating symbol entry corresponding to NAME.
2835      Return the index of the symbol in the SYMB_TABLE.  */
2836 
2837   auto inline int32_t
2838   __attribute ((always_inline))
2839   seek_collating_symbol_entry (name, name_len)
2840 	 const unsigned char *name;
2841 	 size_t name_len;
2842     {
2843       int32_t hash = elem_hash ((const char *) name, name_len);
2844       int32_t elem = hash % table_size;
2845       if (symb_table[2 * elem] != 0)
2846 	{
2847 	  int32_t second = hash % (table_size - 2) + 1;
2848 
2849 	  do
2850 	    {
2851 	      /* First compare the hashing value.  */
2852 	      if (symb_table[2 * elem] == hash
2853 		  /* Compare the length of the name.  */
2854 		  && name_len == extra[symb_table[2 * elem + 1]]
2855 		  /* Compare the name.  */
2856 		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2857 			     name_len) == 0)
2858 		{
2859 		  /* Yep, this is the entry.  */
2860 		  break;
2861 		}
2862 
2863 	      /* Next entry.  */
2864 	      elem += second;
2865 	    }
2866 	  while (symb_table[2 * elem] != 0);
2867 	}
2868       return elem;
2869     }
2870 
2871   /* Local function for parse_bracket_exp used in _LIBC environment.
2872      Look up the collation sequence value of BR_ELEM.
2873      Return the value if succeeded, UINT_MAX otherwise.  */
2874 
2875   auto inline unsigned int
2876   __attribute ((always_inline))
2877   lookup_collation_sequence_value (br_elem)
2878 	 bracket_elem_t *br_elem;
2879     {
2880       if (br_elem->type == SB_CHAR)
2881 	{
2882 	  /*
2883 	  if (MB_CUR_MAX == 1)
2884 	  */
2885 	  if (nrules == 0)
2886 	    return collseqmb[br_elem->opr.ch];
2887 	  else
2888 	    {
2889 	      wint_t wc = __btowc (br_elem->opr.ch);
2890 	      return __collseq_table_lookup (collseqwc, wc);
2891 	    }
2892 	}
2893       else if (br_elem->type == MB_CHAR)
2894 	{
2895 	  if (nrules != 0)
2896 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2897 	}
2898       else if (br_elem->type == COLL_SYM)
2899 	{
2900 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2901 	  if (nrules != 0)
2902 	    {
2903 	      int32_t elem, idx;
2904 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2905 						  sym_name_len);
2906 	      if (symb_table[2 * elem] != 0)
2907 		{
2908 		  /* We found the entry.  */
2909 		  idx = symb_table[2 * elem + 1];
2910 		  /* Skip the name of collating element name.  */
2911 		  idx += 1 + extra[idx];
2912 		  /* Skip the byte sequence of the collating element.  */
2913 		  idx += 1 + extra[idx];
2914 		  /* Adjust for the alignment.  */
2915 		  idx = (idx + 3) & ~3;
2916 		  /* Skip the multibyte collation sequence value.  */
2917 		  idx += sizeof (unsigned int);
2918 		  /* Skip the wide char sequence of the collating element.  */
2919 		  idx += sizeof (unsigned int) *
2920 		    (1 + *(unsigned int *) (extra + idx));
2921 		  /* Return the collation sequence value.  */
2922 		  return *(unsigned int *) (extra + idx);
2923 		}
2924 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2925 		{
2926 		  /* No valid character.  Match it as a single byte
2927 		     character.  */
2928 		  return collseqmb[br_elem->opr.name[0]];
2929 		}
2930 	    }
2931 	  else if (sym_name_len == 1)
2932 	    return collseqmb[br_elem->opr.name[0]];
2933 	}
2934       return UINT_MAX;
2935     }
2936 
2937   /* Local function for parse_bracket_exp used in _LIBC environment.
2938      Build the range expression which starts from START_ELEM, and ends
2939      at END_ELEM.  The result are written to MBCSET and SBCSET.
2940      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2941      mbcset->range_ends, is a pointer argument since we may
2942      update it.  */
2943 
2944   auto inline reg_errcode_t
2945   __attribute ((always_inline))
2946   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2947 	 re_charset_t *mbcset;
2948 	 Idx *range_alloc;
2949 	 bitset_t sbcset;
2950 	 bracket_elem_t *start_elem, *end_elem;
2951     {
2952       unsigned int ch;
2953       uint32_t start_collseq;
2954       uint32_t end_collseq;
2955 
2956       /* Equivalence Classes and Character Classes can't be a range
2957 	 start/end.  */
2958       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2959 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2960 	      0))
2961 	return REG_ERANGE;
2962 
2963       start_collseq = lookup_collation_sequence_value (start_elem);
2964       end_collseq = lookup_collation_sequence_value (end_elem);
2965       /* Check start/end collation sequence values.  */
2966       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2967 	return REG_ECOLLATE;
2968       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2969 	return REG_ERANGE;
2970 
2971       /* Got valid collation sequence values, add them as a new entry.
2972 	 However, if we have no collation elements, and the character set
2973 	 is single byte, the single byte character set that we
2974 	 build below suffices. */
2975       if (nrules > 0 || dfa->mb_cur_max > 1)
2976 	{
2977 	  /* Check the space of the arrays.  */
2978 	  if (BE (*range_alloc == mbcset->nranges, 0))
2979 	    {
2980 	      /* There is not enough space, need realloc.  */
2981 	      uint32_t *new_array_start;
2982 	      uint32_t *new_array_end;
2983 	      Idx new_nranges;
2984 
2985 	      /* +1 in case of mbcset->nranges is 0.  */
2986 	      new_nranges = 2 * mbcset->nranges + 1;
2987 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2988 					    new_nranges);
2989 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2990 					  new_nranges);
2991 
2992 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2993 		return REG_ESPACE;
2994 
2995 	      mbcset->range_starts = new_array_start;
2996 	      mbcset->range_ends = new_array_end;
2997 	      *range_alloc = new_nranges;
2998 	    }
2999 
3000 	  mbcset->range_starts[mbcset->nranges] = start_collseq;
3001 	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
3002 	}
3003 
3004       /* Build the table for single byte characters.  */
3005       for (ch = 0; ch < SBC_MAX; ch++)
3006 	{
3007 	  uint32_t ch_collseq;
3008 	  /*
3009 	  if (MB_CUR_MAX == 1)
3010 	  */
3011 	  if (nrules == 0)
3012 	    ch_collseq = collseqmb[ch];
3013 	  else
3014 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3015 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3016 	    bitset_set (sbcset, ch);
3017 	}
3018       return REG_NOERROR;
3019     }
3020 
3021   /* Local function for parse_bracket_exp used in _LIBC environment.
3022      Build the collating element which is represented by NAME.
3023      The result are written to MBCSET and SBCSET.
3024      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3025      pointer argument since we may update it.  */
3026 
3027   auto inline reg_errcode_t
3028   __attribute ((always_inline))
3029   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3030 	 re_charset_t *mbcset;
3031 	 Idx *coll_sym_alloc;
3032 	 bitset_t sbcset;
3033 	 const unsigned char *name;
3034     {
3035       int32_t elem, idx;
3036       size_t name_len = strlen ((const char *) name);
3037       if (nrules != 0)
3038 	{
3039 	  elem = seek_collating_symbol_entry (name, name_len);
3040 	  if (symb_table[2 * elem] != 0)
3041 	    {
3042 	      /* We found the entry.  */
3043 	      idx = symb_table[2 * elem + 1];
3044 	      /* Skip the name of collating element name.  */
3045 	      idx += 1 + extra[idx];
3046 	    }
3047 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
3048 	    {
3049 	      /* No valid character, treat it as a normal
3050 		 character.  */
3051 	      bitset_set (sbcset, name[0]);
3052 	      return REG_NOERROR;
3053 	    }
3054 	  else
3055 	    return REG_ECOLLATE;
3056 
3057 	  /* Got valid collation sequence, add it as a new entry.  */
3058 	  /* Check the space of the arrays.  */
3059 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3060 	    {
3061 	      /* Not enough, realloc it.  */
3062 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3063 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3064 	      /* Use realloc since mbcset->coll_syms is NULL
3065 		 if *alloc == 0.  */
3066 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3067 						   new_coll_sym_alloc);
3068 	      if (BE (new_coll_syms == NULL, 0))
3069 		return REG_ESPACE;
3070 	      mbcset->coll_syms = new_coll_syms;
3071 	      *coll_sym_alloc = new_coll_sym_alloc;
3072 	    }
3073 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3074 	  return REG_NOERROR;
3075 	}
3076       else
3077 	{
3078 	  if (BE (name_len != 1, 0))
3079 	    return REG_ECOLLATE;
3080 	  else
3081 	    {
3082 	      bitset_set (sbcset, name[0]);
3083 	      return REG_NOERROR;
3084 	    }
3085 	}
3086     }
3087 #endif
3088 
3089   re_token_t br_token;
3090   re_bitset_ptr_t sbcset;
3091 #ifdef RE_ENABLE_I18N
3092   re_charset_t *mbcset;
3093   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3094   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3095 #endif /* not RE_ENABLE_I18N */
3096   bool non_match = false;
3097   bin_tree_t *work_tree;
3098   int token_len;
3099   bool first_round = true;
3100 #ifdef _LIBC
3101   collseqmb = (const unsigned char *)
3102     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3103   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3104   if (nrules)
3105     {
3106       /*
3107       if (MB_CUR_MAX > 1)
3108       */
3109       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3110       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3111       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3112 						  _NL_COLLATE_SYMB_TABLEMB);
3113       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3114 						   _NL_COLLATE_SYMB_EXTRAMB);
3115     }
3116 #endif
3117   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3118 #ifdef RE_ENABLE_I18N
3119   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3120 #endif /* RE_ENABLE_I18N */
3121 #ifdef RE_ENABLE_I18N
3122   if (BE (sbcset == NULL || mbcset == NULL, 0))
3123 #else
3124   if (BE (sbcset == NULL, 0))
3125 #endif /* RE_ENABLE_I18N */
3126     {
3127       re_free (sbcset);
3128 #ifdef RE_ENABLE_I18N
3129       re_free (mbcset);
3130 #endif
3131       *err = REG_ESPACE;
3132       return NULL;
3133     }
3134 
3135   token_len = peek_token_bracket (token, regexp, syntax);
3136   if (BE (token->type == END_OF_RE, 0))
3137     {
3138       *err = REG_BADPAT;
3139       goto parse_bracket_exp_free_return;
3140     }
3141   if (token->type == OP_NON_MATCH_LIST)
3142     {
3143 #ifdef RE_ENABLE_I18N
3144       mbcset->non_match = 1;
3145 #endif /* not RE_ENABLE_I18N */
3146       non_match = true;
3147       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3148 	bitset_set (sbcset, '\n');
3149       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3150       token_len = peek_token_bracket (token, regexp, syntax);
3151       if (BE (token->type == END_OF_RE, 0))
3152 	{
3153 	  *err = REG_BADPAT;
3154 	  goto parse_bracket_exp_free_return;
3155 	}
3156     }
3157 
3158   /* We treat the first ']' as a normal character.  */
3159   if (token->type == OP_CLOSE_BRACKET)
3160     token->type = CHARACTER;
3161 
3162   while (1)
3163     {
3164       bracket_elem_t start_elem, end_elem;
3165       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3166       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3167       reg_errcode_t ret;
3168       int token_len2 = 0;
3169       bool is_range_exp = false;
3170       re_token_t token2;
3171 
3172       start_elem.opr.name = start_name_buf;
3173       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3174 				   syntax, first_round);
3175       if (BE (ret != REG_NOERROR, 0))
3176 	{
3177 	  *err = ret;
3178 	  goto parse_bracket_exp_free_return;
3179 	}
3180       first_round = false;
3181 
3182       /* Get information about the next token.  We need it in any case.  */
3183       token_len = peek_token_bracket (token, regexp, syntax);
3184 
3185       /* Do not check for ranges if we know they are not allowed.  */
3186       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3187 	{
3188 	  if (BE (token->type == END_OF_RE, 0))
3189 	    {
3190 	      *err = REG_EBRACK;
3191 	      goto parse_bracket_exp_free_return;
3192 	    }
3193 	  if (token->type == OP_CHARSET_RANGE)
3194 	    {
3195 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3196 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3197 	      if (BE (token2.type == END_OF_RE, 0))
3198 		{
3199 		  *err = REG_EBRACK;
3200 		  goto parse_bracket_exp_free_return;
3201 		}
3202 	      if (token2.type == OP_CLOSE_BRACKET)
3203 		{
3204 		  /* We treat the last '-' as a normal character.  */
3205 		  re_string_skip_bytes (regexp, -token_len);
3206 		  token->type = CHARACTER;
3207 		}
3208 	      else
3209 		is_range_exp = true;
3210 	    }
3211 	}
3212 
3213       if (is_range_exp == true)
3214 	{
3215 	  end_elem.opr.name = end_name_buf;
3216 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3217 				       dfa, syntax, true);
3218 	  if (BE (ret != REG_NOERROR, 0))
3219 	    {
3220 	      *err = ret;
3221 	      goto parse_bracket_exp_free_return;
3222 	    }
3223 
3224 	  token_len = peek_token_bracket (token, regexp, syntax);
3225 
3226 #ifdef _LIBC
3227 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3228 				  &start_elem, &end_elem);
3229 #else
3230 # ifdef RE_ENABLE_I18N
3231 	  *err = build_range_exp (syntax, sbcset,
3232 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3233 				  &range_alloc, &start_elem, &end_elem);
3234 # else
3235 	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3236 # endif
3237 #endif /* RE_ENABLE_I18N */
3238 	  if (BE (*err != REG_NOERROR, 0))
3239 	    goto parse_bracket_exp_free_return;
3240 	}
3241       else
3242 	{
3243 	  switch (start_elem.type)
3244 	    {
3245 	    case SB_CHAR:
3246 	      bitset_set (sbcset, start_elem.opr.ch);
3247 	      break;
3248 #ifdef RE_ENABLE_I18N
3249 	    case MB_CHAR:
3250 	      /* Check whether the array has enough space.  */
3251 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3252 		{
3253 		  wchar_t *new_mbchars;
3254 		  /* Not enough, realloc it.  */
3255 		  /* +1 in case of mbcset->nmbchars is 0.  */
3256 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3257 		  /* Use realloc since array is NULL if *alloc == 0.  */
3258 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3259 					    mbchar_alloc);
3260 		  if (BE (new_mbchars == NULL, 0))
3261 		    goto parse_bracket_exp_espace;
3262 		  mbcset->mbchars = new_mbchars;
3263 		}
3264 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3265 	      break;
3266 #endif /* RE_ENABLE_I18N */
3267 	    case EQUIV_CLASS:
3268 	      *err = build_equiv_class (sbcset,
3269 #ifdef RE_ENABLE_I18N
3270 					mbcset, &equiv_class_alloc,
3271 #endif /* RE_ENABLE_I18N */
3272 					start_elem.opr.name);
3273 	      if (BE (*err != REG_NOERROR, 0))
3274 		goto parse_bracket_exp_free_return;
3275 	      break;
3276 	    case COLL_SYM:
3277 	      *err = build_collating_symbol (sbcset,
3278 #ifdef RE_ENABLE_I18N
3279 					     mbcset, &coll_sym_alloc,
3280 #endif /* RE_ENABLE_I18N */
3281 					     start_elem.opr.name);
3282 	      if (BE (*err != REG_NOERROR, 0))
3283 		goto parse_bracket_exp_free_return;
3284 	      break;
3285 	    case CHAR_CLASS:
3286 	      *err = build_charclass (regexp->trans, sbcset,
3287 #ifdef RE_ENABLE_I18N
3288 				      mbcset, &char_class_alloc,
3289 #endif /* RE_ENABLE_I18N */
3290 				      start_elem.opr.name, syntax);
3291 	      if (BE (*err != REG_NOERROR, 0))
3292 	       goto parse_bracket_exp_free_return;
3293 	      break;
3294 	    default:
3295 	      assert (0);
3296 	      break;
3297 	    }
3298 	}
3299       if (BE (token->type == END_OF_RE, 0))
3300 	{
3301 	  *err = REG_EBRACK;
3302 	  goto parse_bracket_exp_free_return;
3303 	}
3304       if (token->type == OP_CLOSE_BRACKET)
3305 	break;
3306     }
3307 
3308   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3309 
3310   /* If it is non-matching list.  */
3311   if (non_match)
3312     bitset_not (sbcset);
3313 
3314 #ifdef RE_ENABLE_I18N
3315   /* Ensure only single byte characters are set.  */
3316   if (dfa->mb_cur_max > 1)
3317     bitset_mask (sbcset, dfa->sb_char);
3318 
3319   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3320       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3321 						     || mbcset->non_match)))
3322     {
3323       bin_tree_t *mbc_tree;
3324       int sbc_idx;
3325       /* Build a tree for complex bracket.  */
3326       dfa->has_mb_node = 1;
3327       br_token.type = COMPLEX_BRACKET;
3328       br_token.opr.mbcset = mbcset;
3329       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3330       if (BE (mbc_tree == NULL, 0))
3331 	goto parse_bracket_exp_espace;
3332       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3333 	if (sbcset[sbc_idx])
3334 	  break;
3335       /* If there are no bits set in sbcset, there is no point
3336 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3337       if (sbc_idx < BITSET_WORDS)
3338 	{
3339 	  /* Build a tree for simple bracket.  */
3340 	  br_token.type = SIMPLE_BRACKET;
3341 	  br_token.opr.sbcset = sbcset;
3342 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3343 	  if (BE (work_tree == NULL, 0))
3344 	    goto parse_bracket_exp_espace;
3345 
3346 	  /* Then join them by ALT node.  */
3347 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3348 	  if (BE (work_tree == NULL, 0))
3349 	    goto parse_bracket_exp_espace;
3350 	}
3351       else
3352 	{
3353 	  re_free (sbcset);
3354 	  work_tree = mbc_tree;
3355 	}
3356     }
3357   else
3358 #endif /* not RE_ENABLE_I18N */
3359     {
3360 #ifdef RE_ENABLE_I18N
3361       free_charset (mbcset);
3362 #endif
3363       /* Build a tree for simple bracket.  */
3364       br_token.type = SIMPLE_BRACKET;
3365       br_token.opr.sbcset = sbcset;
3366       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3367       if (BE (work_tree == NULL, 0))
3368 	goto parse_bracket_exp_espace;
3369     }
3370   return work_tree;
3371 
3372  parse_bracket_exp_espace:
3373   *err = REG_ESPACE;
3374  parse_bracket_exp_free_return:
3375   re_free (sbcset);
3376 #ifdef RE_ENABLE_I18N
3377   free_charset (mbcset);
3378 #endif /* RE_ENABLE_I18N */
3379   return NULL;
3380 }
3381 
3382 /* Parse an element in the bracket expression.  */
3383 
3384 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3385 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3386 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3387 		       reg_syntax_t syntax, bool accept_hyphen)
3388 {
3389 #ifdef RE_ENABLE_I18N
3390   int cur_char_size;
3391   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3392   if (cur_char_size > 1)
3393     {
3394       elem->type = MB_CHAR;
3395       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3396       re_string_skip_bytes (regexp, cur_char_size);
3397       return REG_NOERROR;
3398     }
3399 #endif /* RE_ENABLE_I18N */
3400   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3401   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3402       || token->type == OP_OPEN_EQUIV_CLASS)
3403     return parse_bracket_symbol (elem, regexp, token);
3404   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3405     {
3406       /* A '-' must only appear as anything but a range indicator before
3407 	 the closing bracket.  Everything else is an error.  */
3408       re_token_t token2;
3409       (void) peek_token_bracket (&token2, regexp, syntax);
3410       if (token2.type != OP_CLOSE_BRACKET)
3411 	/* The actual error value is not standardized since this whole
3412 	   case is undefined.  But ERANGE makes good sense.  */
3413 	return REG_ERANGE;
3414     }
3415   elem->type = SB_CHAR;
3416   elem->opr.ch = token->opr.c;
3417   return REG_NOERROR;
3418 }
3419 
3420 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3421    such as [:<character_class>:], [.<collating_element>.], and
3422    [=<equivalent_class>=].  */
3423 
3424 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3425 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3426 		      re_token_t *token)
3427 {
3428   unsigned char ch, delim = token->opr.c;
3429   int i = 0;
3430   if (re_string_eoi(regexp))
3431     return REG_EBRACK;
3432   for (;; ++i)
3433     {
3434       if (i >= BRACKET_NAME_BUF_SIZE)
3435 	return REG_EBRACK;
3436       if (token->type == OP_OPEN_CHAR_CLASS)
3437 	ch = re_string_fetch_byte_case (regexp);
3438       else
3439 	ch = re_string_fetch_byte (regexp);
3440       if (re_string_eoi(regexp))
3441 	return REG_EBRACK;
3442       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3443 	break;
3444       elem->opr.name[i] = ch;
3445     }
3446   re_string_skip_bytes (regexp, 1);
3447   elem->opr.name[i] = '\0';
3448   switch (token->type)
3449     {
3450     case OP_OPEN_COLL_ELEM:
3451       elem->type = COLL_SYM;
3452       break;
3453     case OP_OPEN_EQUIV_CLASS:
3454       elem->type = EQUIV_CLASS;
3455       break;
3456     case OP_OPEN_CHAR_CLASS:
3457       elem->type = CHAR_CLASS;
3458       break;
3459     default:
3460       break;
3461     }
3462   return REG_NOERROR;
3463 }
3464 
3465   /* Helper function for parse_bracket_exp.
3466      Build the equivalence class which is represented by NAME.
3467      The result are written to MBCSET and SBCSET.
3468      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3469      is a pointer argument since we may update it.  */
3470 
3471 static reg_errcode_t
3472 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3473 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3474 		   Idx *equiv_class_alloc, const unsigned char *name)
3475 #else /* not RE_ENABLE_I18N */
3476 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3477 #endif /* not RE_ENABLE_I18N */
3478 {
3479 #ifdef _LIBC
3480   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3481   if (nrules != 0)
3482     {
3483       const int32_t *table, *indirect;
3484       const unsigned char *weights, *extra, *cp;
3485       unsigned char char_buf[2];
3486       int32_t idx1, idx2;
3487       unsigned int ch;
3488       size_t len;
3489       /* This #include defines a local function!  */
3490 # include <locale/weight.h>
3491       /* Calculate the index for equivalence class.  */
3492       cp = name;
3493       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3494       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3495 					       _NL_COLLATE_WEIGHTMB);
3496       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3497 						   _NL_COLLATE_EXTRAMB);
3498       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3499 						_NL_COLLATE_INDIRECTMB);
3500       idx1 = findidx (&cp, -1);
3501       if (BE (idx1 == 0 || *cp != '\0', 0))
3502 	/* This isn't a valid character.  */
3503 	return REG_ECOLLATE;
3504 
3505       /* Build single byte matching table for this equivalence class.  */
3506       len = weights[idx1 & 0xffffff];
3507       for (ch = 0; ch < SBC_MAX; ++ch)
3508 	{
3509 	  char_buf[0] = ch;
3510 	  cp = char_buf;
3511 	  idx2 = findidx (&cp, 1);
3512 /*
3513 	  idx2 = table[ch];
3514 */
3515 	  if (idx2 == 0)
3516 	    /* This isn't a valid character.  */
3517 	    continue;
3518 	  /* Compare only if the length matches and the collation rule
3519 	     index is the same.  */
3520 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3521 	    {
3522 	      int cnt = 0;
3523 
3524 	      while (cnt <= len &&
3525 		     weights[(idx1 & 0xffffff) + 1 + cnt]
3526 		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3527 		++cnt;
3528 
3529 	      if (cnt > len)
3530 		bitset_set (sbcset, ch);
3531 	    }
3532 	}
3533       /* Check whether the array has enough space.  */
3534       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3535 	{
3536 	  /* Not enough, realloc it.  */
3537 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3538 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3539 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3540 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3541 						   int32_t,
3542 						   new_equiv_class_alloc);
3543 	  if (BE (new_equiv_classes == NULL, 0))
3544 	    return REG_ESPACE;
3545 	  mbcset->equiv_classes = new_equiv_classes;
3546 	  *equiv_class_alloc = new_equiv_class_alloc;
3547 	}
3548       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3549     }
3550   else
3551 #endif /* _LIBC */
3552     {
3553       if (BE (strlen ((const char *) name) != 1, 0))
3554 	return REG_ECOLLATE;
3555       bitset_set (sbcset, *name);
3556     }
3557   return REG_NOERROR;
3558 }
3559 
3560   /* Helper function for parse_bracket_exp.
3561      Build the character class which is represented by NAME.
3562      The result are written to MBCSET and SBCSET.
3563      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3564      is a pointer argument since we may update it.  */
3565 
3566 static reg_errcode_t
3567 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3568 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3569 		 re_charset_t *mbcset, Idx *char_class_alloc,
3570 		 const unsigned char *class_name, reg_syntax_t syntax)
3571 #else /* not RE_ENABLE_I18N */
3572 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3573 		 const unsigned char *class_name, reg_syntax_t syntax)
3574 #endif /* not RE_ENABLE_I18N */
3575 {
3576   int i;
3577   const char *name = (const char *) class_name;
3578 
3579   /* In case of REG_ICASE "upper" and "lower" match the both of
3580      upper and lower cases.  */
3581   if ((syntax & RE_ICASE)
3582       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3583     name = "alpha";
3584 
3585 #ifdef RE_ENABLE_I18N
3586   /* Check the space of the arrays.  */
3587   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3588     {
3589       /* Not enough, realloc it.  */
3590       /* +1 in case of mbcset->nchar_classes is 0.  */
3591       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3592       /* Use realloc since array is NULL if *alloc == 0.  */
3593       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3594 					       new_char_class_alloc);
3595       if (BE (new_char_classes == NULL, 0))
3596 	return REG_ESPACE;
3597       mbcset->char_classes = new_char_classes;
3598       *char_class_alloc = new_char_class_alloc;
3599     }
3600   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3601 #endif /* RE_ENABLE_I18N */
3602 
3603 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3604   do {						\
3605     if (BE (trans != NULL, 0))			\
3606       {						\
3607 	for (i = 0; i < SBC_MAX; ++i)		\
3608 	  if (ctype_func (i))			\
3609 	    bitset_set (sbcset, trans[i]);	\
3610       }						\
3611     else					\
3612       {						\
3613 	for (i = 0; i < SBC_MAX; ++i)		\
3614 	  if (ctype_func (i))			\
3615 	    bitset_set (sbcset, i);		\
3616       }						\
3617   } while (0)
3618 
3619   if (strcmp (name, "alnum") == 0)
3620     BUILD_CHARCLASS_LOOP (isalnum);
3621   else if (strcmp (name, "cntrl") == 0)
3622     BUILD_CHARCLASS_LOOP (iscntrl);
3623   else if (strcmp (name, "lower") == 0)
3624     BUILD_CHARCLASS_LOOP (islower);
3625   else if (strcmp (name, "space") == 0)
3626     BUILD_CHARCLASS_LOOP (isspace);
3627   else if (strcmp (name, "alpha") == 0)
3628     BUILD_CHARCLASS_LOOP (isalpha);
3629   else if (strcmp (name, "digit") == 0)
3630     BUILD_CHARCLASS_LOOP (isdigit);
3631   else if (strcmp (name, "print") == 0)
3632     BUILD_CHARCLASS_LOOP (isprint);
3633   else if (strcmp (name, "upper") == 0)
3634     BUILD_CHARCLASS_LOOP (isupper);
3635   else if (strcmp (name, "blank") == 0)
3636     BUILD_CHARCLASS_LOOP (isblank);
3637   else if (strcmp (name, "graph") == 0)
3638     BUILD_CHARCLASS_LOOP (isgraph);
3639   else if (strcmp (name, "punct") == 0)
3640     BUILD_CHARCLASS_LOOP (ispunct);
3641   else if (strcmp (name, "xdigit") == 0)
3642     BUILD_CHARCLASS_LOOP (isxdigit);
3643   else
3644     return REG_ECTYPE;
3645 
3646   return REG_NOERROR;
3647 }
3648 
3649 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,bool non_match,reg_errcode_t * err)3650 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3651 		    const unsigned char *class_name,
3652 		    const unsigned char *extra, bool non_match,
3653 		    reg_errcode_t *err)
3654 {
3655   re_bitset_ptr_t sbcset;
3656 #ifdef RE_ENABLE_I18N
3657   re_charset_t *mbcset;
3658   Idx alloc = 0;
3659 #endif /* not RE_ENABLE_I18N */
3660   reg_errcode_t ret;
3661   re_token_t br_token;
3662   bin_tree_t *tree;
3663 
3664   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3665 #ifdef RE_ENABLE_I18N
3666   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3667 #endif /* RE_ENABLE_I18N */
3668 
3669 #ifdef RE_ENABLE_I18N
3670   if (BE (sbcset == NULL || mbcset == NULL, 0))
3671 #else /* not RE_ENABLE_I18N */
3672   if (BE (sbcset == NULL, 0))
3673 #endif /* not RE_ENABLE_I18N */
3674     {
3675       *err = REG_ESPACE;
3676       return NULL;
3677     }
3678 
3679   if (non_match)
3680     {
3681 #ifdef RE_ENABLE_I18N
3682       mbcset->non_match = 1;
3683 #endif /* not RE_ENABLE_I18N */
3684     }
3685 
3686   /* We don't care the syntax in this case.  */
3687   ret = build_charclass (trans, sbcset,
3688 #ifdef RE_ENABLE_I18N
3689 			 mbcset, &alloc,
3690 #endif /* RE_ENABLE_I18N */
3691 			 class_name, 0);
3692 
3693   if (BE (ret != REG_NOERROR, 0))
3694     {
3695       re_free (sbcset);
3696 #ifdef RE_ENABLE_I18N
3697       free_charset (mbcset);
3698 #endif /* RE_ENABLE_I18N */
3699       *err = ret;
3700       return NULL;
3701     }
3702   /* \w match '_' also.  */
3703   for (; *extra; extra++)
3704     bitset_set (sbcset, *extra);
3705 
3706   /* If it is non-matching list.  */
3707   if (non_match)
3708     bitset_not (sbcset);
3709 
3710 #ifdef RE_ENABLE_I18N
3711   /* Ensure only single byte characters are set.  */
3712   if (dfa->mb_cur_max > 1)
3713     bitset_mask (sbcset, dfa->sb_char);
3714 #endif
3715 
3716   /* Build a tree for simple bracket.  */
3717   br_token.type = SIMPLE_BRACKET;
3718   br_token.opr.sbcset = sbcset;
3719   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3720   if (BE (tree == NULL, 0))
3721     goto build_word_op_espace;
3722 
3723 #ifdef RE_ENABLE_I18N
3724   if (dfa->mb_cur_max > 1)
3725     {
3726       bin_tree_t *mbc_tree;
3727       /* Build a tree for complex bracket.  */
3728       br_token.type = COMPLEX_BRACKET;
3729       br_token.opr.mbcset = mbcset;
3730       dfa->has_mb_node = 1;
3731       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3732       if (BE (mbc_tree == NULL, 0))
3733 	goto build_word_op_espace;
3734       /* Then join them by ALT node.  */
3735       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3736       if (BE (mbc_tree != NULL, 1))
3737 	return tree;
3738     }
3739   else
3740     {
3741       free_charset (mbcset);
3742       return tree;
3743     }
3744 #else /* not RE_ENABLE_I18N */
3745   return tree;
3746 #endif /* not RE_ENABLE_I18N */
3747 
3748  build_word_op_espace:
3749   re_free (sbcset);
3750 #ifdef RE_ENABLE_I18N
3751   free_charset (mbcset);
3752 #endif /* RE_ENABLE_I18N */
3753   *err = REG_ESPACE;
3754   return NULL;
3755 }
3756 
3757 /* This is intended for the expressions like "a{1,3}".
3758    Fetch a number from 'input', and return the number.
3759    Return REG_MISSING if the number field is empty like "{,1}".
3760    Return RE_DUP_MAX + 1 if the number field is too large.
3761    Return REG_ERROR if an error occurred.  */
3762 
3763 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3764 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3765 {
3766   Idx num = REG_MISSING;
3767   unsigned char c;
3768   while (1)
3769     {
3770       fetch_token (token, input, syntax);
3771       c = token->opr.c;
3772       if (BE (token->type == END_OF_RE, 0))
3773 	return REG_ERROR;
3774       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3775 	break;
3776       num = ((token->type != CHARACTER || c < '0' || '9' < c
3777 	      || num == REG_ERROR)
3778 	     ? REG_ERROR
3779 	     : num == REG_MISSING
3780 	     ? c - '0'
3781 	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3782     }
3783   return num;
3784 }
3785 
3786 #ifdef RE_ENABLE_I18N
3787 static void
free_charset(re_charset_t * cset)3788 free_charset (re_charset_t *cset)
3789 {
3790   re_free (cset->mbchars);
3791 # ifdef _LIBC
3792   re_free (cset->coll_syms);
3793   re_free (cset->equiv_classes);
3794   re_free (cset->range_starts);
3795   re_free (cset->range_ends);
3796 # endif
3797   re_free (cset->char_classes);
3798   re_free (cset);
3799 }
3800 #endif /* RE_ENABLE_I18N */
3801 
3802 /* Functions for binary tree operation.  */
3803 
3804 /* Create a tree node.  */
3805 
3806 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3807 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3808 	     re_token_type_t type)
3809 {
3810   re_token_t t;
3811   t.type = type;
3812   return create_token_tree (dfa, left, right, &t);
3813 }
3814 
3815 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3816 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3817 		   const re_token_t *token)
3818 {
3819   bin_tree_t *tree;
3820   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3821     {
3822       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3823 
3824       if (storage == NULL)
3825 	return NULL;
3826       storage->next = dfa->str_tree_storage;
3827       dfa->str_tree_storage = storage;
3828       dfa->str_tree_storage_idx = 0;
3829     }
3830   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3831 
3832   tree->parent = NULL;
3833   tree->left = left;
3834   tree->right = right;
3835   tree->token = *token;
3836   tree->token.duplicated = 0;
3837   tree->token.opt_subexp = 0;
3838   tree->first = NULL;
3839   tree->next = NULL;
3840   tree->node_idx = REG_MISSING;
3841 
3842   if (left != NULL)
3843     left->parent = tree;
3844   if (right != NULL)
3845     right->parent = tree;
3846   return tree;
3847 }
3848 
3849 /* Mark the tree SRC as an optional subexpression.
3850    To be called from preorder or postorder.  */
3851 
3852 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3853 mark_opt_subexp (void *extra, bin_tree_t *node)
3854 {
3855   Idx idx = (Idx) (long) extra;
3856   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3857     node->token.opt_subexp = 1;
3858 
3859   return REG_NOERROR;
3860 }
3861 
3862 /* Free the allocated memory inside NODE. */
3863 
3864 static void
free_token(re_token_t * node)3865 free_token (re_token_t *node)
3866 {
3867 #ifdef RE_ENABLE_I18N
3868   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3869     free_charset (node->opr.mbcset);
3870   else
3871 #endif /* RE_ENABLE_I18N */
3872     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3873       re_free (node->opr.sbcset);
3874 }
3875 
3876 /* Worker function for tree walking.  Free the allocated memory inside NODE
3877    and its children. */
3878 
3879 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3880 free_tree (void *extra, bin_tree_t *node)
3881 {
3882   free_token (&node->token);
3883   return REG_NOERROR;
3884 }
3885 
3886 
3887 /* Duplicate the node SRC, and return new node.  This is a preorder
3888    visit similar to the one implemented by the generic visitor, but
3889    we need more infrastructure to maintain two parallel trees --- so,
3890    it's easier to duplicate.  */
3891 
3892 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3893 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3894 {
3895   const bin_tree_t *node;
3896   bin_tree_t *dup_root;
3897   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3898 
3899   for (node = root; ; )
3900     {
3901       /* Create a new tree and link it back to the current parent.  */
3902       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3903       if (*p_new == NULL)
3904 	return NULL;
3905       (*p_new)->parent = dup_node;
3906       (*p_new)->token.duplicated = 1;
3907       dup_node = *p_new;
3908 
3909       /* Go to the left node, or up and to the right.  */
3910       if (node->left)
3911 	{
3912 	  node = node->left;
3913 	  p_new = &dup_node->left;
3914 	}
3915       else
3916 	{
3917 	  const bin_tree_t *prev = NULL;
3918 	  while (node->right == prev || node->right == NULL)
3919 	    {
3920 	      prev = node;
3921 	      node = node->parent;
3922 	      dup_node = dup_node->parent;
3923 	      if (!node)
3924 		return dup_root;
3925 	    }
3926 	  node = node->right;
3927 	  p_new = &dup_node->right;
3928 	}
3929     }
3930 }
3931