1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 #if defined __TANDEM
21  /* This is currently duplicated from git-compat-utils.h */
22 # ifdef NO_INTPTR_T
23  typedef long intptr_t;
24  typedef unsigned long uintptr_t;
25 # endif
26 #endif
27 
28 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
29 					  size_t length, reg_syntax_t syntax);
30 static void re_compile_fastmap_iter (regex_t *bufp,
31 				     const re_dfastate_t *init_state,
32 				     char *fastmap);
33 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
34 #ifdef RE_ENABLE_I18N
35 static void free_charset (re_charset_t *cset);
36 #endif /* RE_ENABLE_I18N */
37 static void free_workarea_compile (regex_t *preg);
38 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
39 #ifdef RE_ENABLE_I18N
40 static void optimize_utf8 (re_dfa_t *dfa);
41 #endif
42 static reg_errcode_t analyze (regex_t *preg);
43 static reg_errcode_t preorder (bin_tree_t *root,
44 			       reg_errcode_t (fn (void *, bin_tree_t *)),
45 			       void *extra);
46 static reg_errcode_t postorder (bin_tree_t *root,
47 				reg_errcode_t (fn (void *, bin_tree_t *)),
48 				void *extra);
49 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
50 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
51 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
52 				 bin_tree_t *node);
53 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
54 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
55 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
56 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
57 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
58 				   unsigned int constraint);
59 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
60 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
61 					 int node, int root);
62 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
63 static int fetch_number (re_string_t *input, re_token_t *token,
64 			 reg_syntax_t syntax);
65 static int peek_token (re_token_t *token, re_string_t *input,
66 			reg_syntax_t syntax) internal_function;
67 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
68 			  reg_syntax_t syntax, reg_errcode_t *err);
69 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
70 				  re_token_t *token, reg_syntax_t syntax,
71 				  int nest, reg_errcode_t *err);
72 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
73 				 re_token_t *token, reg_syntax_t syntax,
74 				 int nest, reg_errcode_t *err);
75 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
76 				     re_token_t *token, reg_syntax_t syntax,
77 				     int nest, reg_errcode_t *err);
78 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
79 				  re_token_t *token, reg_syntax_t syntax,
80 				  int nest, reg_errcode_t *err);
81 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
82 				 re_dfa_t *dfa, re_token_t *token,
83 				 reg_syntax_t syntax, reg_errcode_t *err);
84 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
85 				      re_token_t *token, reg_syntax_t syntax,
86 				      reg_errcode_t *err);
87 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
88 					    re_string_t *regexp,
89 					    re_token_t *token, int token_len,
90 					    re_dfa_t *dfa,
91 					    reg_syntax_t syntax,
92 					    int accept_hyphen);
93 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
94 					  re_string_t *regexp,
95 					  re_token_t *token);
96 #ifdef RE_ENABLE_I18N
97 static reg_errcode_t build_equiv_class (bitset_t sbcset,
98 					re_charset_t *mbcset,
99 					int *equiv_class_alloc,
100 					const unsigned char *name);
101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
102 				      bitset_t sbcset,
103 				      re_charset_t *mbcset,
104 				      int *char_class_alloc,
105 				      const char *class_name,
106 				      reg_syntax_t syntax);
107 #else  /* not RE_ENABLE_I18N */
108 static reg_errcode_t build_equiv_class (bitset_t sbcset,
109 					const unsigned char *name);
110 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
111 				      bitset_t sbcset,
112 				      const char *class_name,
113 				      reg_syntax_t syntax);
114 #endif /* not RE_ENABLE_I18N */
115 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
116 				       RE_TRANSLATE_TYPE trans,
117 				       const char *class_name,
118 				       const char *extra,
119 				       int non_match, reg_errcode_t *err);
120 static bin_tree_t *create_tree (re_dfa_t *dfa,
121 				bin_tree_t *left, bin_tree_t *right,
122 				re_token_type_t type);
123 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
124 				      bin_tree_t *left, bin_tree_t *right,
125 				      const re_token_t *token);
126 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
127 static void free_token (re_token_t *node);
128 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
129 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
130 
131 /* This table gives an error message for each of the error codes listed
132    in regex.h.  Obviously the order here has to be same as there.
133    POSIX doesn't require that we do anything for REG_NOERROR,
134    but why not be nice?  */
135 
136 const char __re_error_msgid[] attribute_hidden =
137   {
138 #define REG_NOERROR_IDX	0
139     gettext_noop ("Success")	/* REG_NOERROR */
140     "\0"
141 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
142     gettext_noop ("No match")	/* REG_NOMATCH */
143     "\0"
144 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
145     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
146     "\0"
147 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
148     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
149     "\0"
150 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
151     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
152     "\0"
153 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
154     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
155     "\0"
156 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
157     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
158     "\0"
159 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
160     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
161     "\0"
162 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
163     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
164     "\0"
165 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
166     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
167     "\0"
168 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
169     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
170     "\0"
171 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
172     gettext_noop ("Invalid range end")	/* REG_ERANGE */
173     "\0"
174 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
175     gettext_noop ("Memory exhausted") /* REG_ESPACE */
176     "\0"
177 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
178     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
179     "\0"
180 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
181     gettext_noop ("Premature end of regular expression") /* REG_EEND */
182     "\0"
183 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
184     gettext_noop ("Regular expression too big") /* REG_ESIZE */
185     "\0"
186 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
187     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
188   };
189 
190 const size_t __re_error_msgid_idx[] attribute_hidden =
191   {
192     REG_NOERROR_IDX,
193     REG_NOMATCH_IDX,
194     REG_BADPAT_IDX,
195     REG_ECOLLATE_IDX,
196     REG_ECTYPE_IDX,
197     REG_EESCAPE_IDX,
198     REG_ESUBREG_IDX,
199     REG_EBRACK_IDX,
200     REG_EPAREN_IDX,
201     REG_EBRACE_IDX,
202     REG_BADBR_IDX,
203     REG_ERANGE_IDX,
204     REG_ESPACE_IDX,
205     REG_BADRPT_IDX,
206     REG_EEND_IDX,
207     REG_ESIZE_IDX,
208     REG_ERPAREN_IDX
209   };
210 
211 /* Entry points for GNU code.  */
212 
213 
214 #ifdef ZOS_USS
215 
216 /* For ZOS USS we must define btowc */
217 
218 wchar_t
btowc(int c)219 btowc (int c)
220 {
221    wchar_t wtmp[2];
222    char tmp[2];
223 
224    tmp[0] = c;
225    tmp[1] = 0;
226 
227    mbtowc (wtmp, tmp, 1);
228    return wtmp[0];
229 }
230 #endif
231 
232 /* re_compile_pattern is the GNU regular expression compiler: it
233    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
234    Returns 0 if the pattern was valid, otherwise an error string.
235 
236    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
237    are set in BUFP on entry.  */
238 
239 const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)240 re_compile_pattern (const char *pattern,
241 		    size_t length,
242 		    struct re_pattern_buffer *bufp)
243 {
244   reg_errcode_t ret;
245 
246   /* And GNU code determines whether or not to get register information
247      by passing null for the REGS argument to re_match, etc., not by
248      setting no_sub, unless RE_NO_SUB is set.  */
249   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
250 
251   /* Match anchors at newline.  */
252   bufp->newline_anchor = 1;
253 
254   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
255 
256   if (!ret)
257     return NULL;
258   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
259 }
260 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)261 weak_alias (__re_compile_pattern, re_compile_pattern)
262 #endif
263 
264 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
265    also be assigned to arbitrarily: each pattern buffer stores its own
266    syntax, so it can be changed between regex compilations.  */
267 /* This has no initializer because initialized variables in Emacs
268    become read-only after dumping.  */
269 reg_syntax_t re_syntax_options;
270 
271 
272 /* Specify the precise syntax of regexps for compilation.  This provides
273    for compatibility for various utilities which historically have
274    different, incompatible syntaxes.
275 
276    The argument SYNTAX is a bit mask comprised of the various bits
277    defined in regex.h.  We return the old syntax.  */
278 
279 reg_syntax_t
280 re_set_syntax (reg_syntax_t syntax)
281 {
282   reg_syntax_t ret = re_syntax_options;
283 
284   re_syntax_options = syntax;
285   return ret;
286 }
287 #ifdef _LIBC
weak_alias(__re_set_syntax,re_set_syntax)288 weak_alias (__re_set_syntax, re_set_syntax)
289 #endif
290 
291 int
292 re_compile_fastmap (struct re_pattern_buffer *bufp)
293 {
294   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
295   char *fastmap = bufp->fastmap;
296 
297   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
298   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
299   if (dfa->init_state != dfa->init_state_word)
300     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
301   if (dfa->init_state != dfa->init_state_nl)
302     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
303   if (dfa->init_state != dfa->init_state_begbuf)
304     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
305   bufp->fastmap_accurate = 1;
306   return 0;
307 }
308 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)309 weak_alias (__re_compile_fastmap, re_compile_fastmap)
310 #endif
311 
312 static inline void
313 __attribute ((always_inline))
314 re_set_fastmap (char *fastmap, int icase, int ch)
315 {
316   fastmap[ch] = 1;
317   if (icase)
318     fastmap[tolower (ch)] = 1;
319 }
320 
321 /* Helper function for re_compile_fastmap.
322    Compile fastmap for the initial_state INIT_STATE.  */
323 
324 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)325 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
326 			 char *fastmap)
327 {
328   volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
329   int node_cnt;
330   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
331   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
332     {
333       int node = init_state->nodes.elems[node_cnt];
334       re_token_type_t type = dfa->nodes[node].type;
335 
336       if (type == CHARACTER)
337 	{
338 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
339 #ifdef RE_ENABLE_I18N
340 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
341 	    {
342 	      unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
343 	      wchar_t wc;
344 	      mbstate_t state;
345 
346 	      p = buf;
347 	      *p++ = dfa->nodes[node].opr.c;
348 	      while (++node < dfa->nodes_len
349 		     && dfa->nodes[node].type == CHARACTER
350 		     && dfa->nodes[node].mb_partial)
351 		*p++ = dfa->nodes[node].opr.c;
352 	      memset (&state, '\0', sizeof (state));
353 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
354 			     &state) == p - buf
355 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
356 		      != (size_t) -1))
357 		re_set_fastmap (fastmap, 0, buf[0]);
358 	      re_free (buf);
359 	    }
360 #endif
361 	}
362       else if (type == SIMPLE_BRACKET)
363 	{
364 	  int i, ch;
365 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
366 	    {
367 	      int j;
368 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
369 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
370 		if (w & ((bitset_word_t) 1 << j))
371 		  re_set_fastmap (fastmap, icase, ch);
372 	    }
373 	}
374 #ifdef RE_ENABLE_I18N
375       else if (type == COMPLEX_BRACKET)
376 	{
377 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
378 	  int i;
379 
380 # ifdef _LIBC
381 	  /* See if we have to try all bytes which start multiple collation
382 	     elements.
383 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
384 		  collation element, and don't catch 'b' since 'b' is
385 		  the only collation element which starts from 'b' (and
386 		  it is caught by SIMPLE_BRACKET).  */
387 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
388 		  && (cset->ncoll_syms || cset->nranges))
389 		{
390 		  const int32_t *table = (const int32_t *)
391 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
392 		  for (i = 0; i < SBC_MAX; ++i)
393 		    if (table[i] < 0)
394 		      re_set_fastmap (fastmap, icase, i);
395 		}
396 # endif /* _LIBC */
397 
398 	  /* See if we have to start the match at all multibyte characters,
399 	     i.e. where we would not find an invalid sequence.  This only
400 	     applies to multibyte character sets; for single byte character
401 	     sets, the SIMPLE_BRACKET again suffices.  */
402 	  if (dfa->mb_cur_max > 1
403 	      && (cset->nchar_classes || cset->non_match || cset->nranges
404 # ifdef _LIBC
405 		  || cset->nequiv_classes
406 # endif /* _LIBC */
407 		 ))
408 	    {
409 	      unsigned char c = 0;
410 	      do
411 		{
412 		  mbstate_t mbs;
413 		  memset (&mbs, 0, sizeof (mbs));
414 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
415 		    re_set_fastmap (fastmap, false, (int) c);
416 		}
417 	      while (++c != 0);
418 	    }
419 
420 	  else
421 	    {
422 	      /* ... Else catch all bytes which can start the mbchars.  */
423 	      for (i = 0; i < cset->nmbchars; ++i)
424 		{
425 		  char buf[256];
426 		  mbstate_t state;
427 		  memset (&state, '\0', sizeof (state));
428 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
429 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
430 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
431 		    {
432 		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
433 			  != (size_t) -1)
434 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
435 		    }
436 		}
437 	    }
438 	}
439 #endif /* RE_ENABLE_I18N */
440       else if (type == OP_PERIOD
441 #ifdef RE_ENABLE_I18N
442 	       || type == OP_UTF8_PERIOD
443 #endif /* RE_ENABLE_I18N */
444 	       || type == END_OF_RE)
445 	{
446 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
447 	  if (type == END_OF_RE)
448 	    bufp->can_be_null = 1;
449 	  return;
450 	}
451     }
452 }
453 
454 /* Entry point for POSIX code.  */
455 /* regcomp takes a regular expression as a string and compiles it.
456 
457    PREG is a regex_t *.  We do not expect any fields to be initialized,
458    since POSIX says we shouldn't.  Thus, we set
459 
460      `buffer' to the compiled pattern;
461      `used' to the length of the compiled pattern;
462      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
463        REG_EXTENDED bit in CFLAGS is set; otherwise, to
464        RE_SYNTAX_POSIX_BASIC;
465      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
466      `fastmap' to an allocated space for the fastmap;
467      `fastmap_accurate' to zero;
468      `re_nsub' to the number of subexpressions in PATTERN.
469 
470    PATTERN is the address of the pattern string.
471 
472    CFLAGS is a series of bits which affect compilation.
473 
474      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
475      use POSIX basic syntax.
476 
477      If REG_NEWLINE is set, then . and [^...] don't match newline.
478      Also, regexec will try a match beginning after every newline.
479 
480      If REG_ICASE is set, then we considers upper- and lowercase
481      versions of letters to be equivalent when matching.
482 
483      If REG_NOSUB is set, then when PREG is passed to regexec, that
484      routine will report only success or failure, and nothing about the
485      registers.
486 
487    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
488    the return codes and their meanings.)  */
489 
490 int
regcomp(regex_t * __restrict preg,const char * __restrict pattern,int cflags)491 regcomp (regex_t *__restrict preg,
492 	 const char *__restrict pattern,
493 	 int cflags)
494 {
495   reg_errcode_t ret;
496   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
497 			 : RE_SYNTAX_POSIX_BASIC);
498 
499   preg->buffer = NULL;
500   preg->allocated = 0;
501   preg->used = 0;
502 
503   /* Try to allocate space for the fastmap.  */
504   preg->fastmap = re_malloc (char, SBC_MAX);
505   if (BE (preg->fastmap == NULL, 0))
506     return REG_ESPACE;
507 
508   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
509 
510   /* If REG_NEWLINE is set, newlines are treated differently.  */
511   if (cflags & REG_NEWLINE)
512     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
513       syntax &= ~RE_DOT_NEWLINE;
514       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
515       /* It also changes the matching behavior.  */
516       preg->newline_anchor = 1;
517     }
518   else
519     preg->newline_anchor = 0;
520   preg->no_sub = !!(cflags & REG_NOSUB);
521   preg->translate = NULL;
522 
523   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
524 
525   /* POSIX doesn't distinguish between an unmatched open-group and an
526      unmatched close-group: both are REG_EPAREN.  */
527   if (ret == REG_ERPAREN)
528     ret = REG_EPAREN;
529 
530   /* We have already checked preg->fastmap != NULL.  */
531   if (BE (ret == REG_NOERROR, 1))
532     /* Compute the fastmap now, since regexec cannot modify the pattern
533        buffer.  This function never fails in this implementation.  */
534     (void) re_compile_fastmap (preg);
535   else
536     {
537       /* Some error occurred while compiling the expression.  */
538       re_free (preg->fastmap);
539       preg->fastmap = NULL;
540     }
541 
542   return (int) ret;
543 }
544 #ifdef _LIBC
weak_alias(__regcomp,regcomp)545 weak_alias (__regcomp, regcomp)
546 #endif
547 
548 /* Returns a message corresponding to an error code, ERRCODE, returned
549    from either regcomp or regexec.   We don't use PREG here.  */
550 
551 size_t
552 regerror(int errcode, const regex_t *__restrict preg,
553 	 char *__restrict errbuf, size_t errbuf_size)
554 {
555   const char *msg;
556   size_t msg_size;
557 
558   if (BE (errcode < 0
559 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
560 			       / sizeof (__re_error_msgid_idx[0])), 0))
561     /* Only error codes returned by the rest of the code should be passed
562        to this routine.  If we are given anything else, or if other regex
563        code generates an invalid error code, then the program has a bug.
564        Dump core so we can fix it.  */
565     abort ();
566 
567   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
568 
569   msg_size = strlen (msg) + 1; /* Includes the null.  */
570 
571   if (BE (errbuf_size != 0, 1))
572     {
573       if (BE (msg_size > errbuf_size, 0))
574 	{
575 	  memcpy (errbuf, msg, errbuf_size - 1);
576 	  errbuf[errbuf_size - 1] = 0;
577 	}
578       else
579 	memcpy (errbuf, msg, msg_size);
580     }
581 
582   return msg_size;
583 }
584 #ifdef _LIBC
585 weak_alias (__regerror, regerror)
586 #endif
587 
588 
589 #ifdef RE_ENABLE_I18N
590 /* This static array is used for the map to single-byte characters when
591    UTF-8 is used.  Otherwise we would allocate memory just to initialize
592    it the same all the time.  UTF-8 is the preferred encoding so this is
593    a worthwhile optimization.  */
594 #if __GNUC__ >= 3
595 static const bitset_t utf8_sb_map = {
596   /* Set the first 128 bits.  */
597   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
598 };
599 #else /* ! (__GNUC__ >= 3) */
600 static bitset_t utf8_sb_map;
601 #endif /* __GNUC__ >= 3 */
602 #endif /* RE_ENABLE_I18N */
603 
604 
605 static void
free_dfa_content(re_dfa_t * dfa)606 free_dfa_content (re_dfa_t *dfa)
607 {
608   int i, j;
609 
610   if (dfa->nodes)
611     for (i = 0; i < dfa->nodes_len; ++i)
612       free_token (dfa->nodes + i);
613   re_free (dfa->nexts);
614   for (i = 0; i < dfa->nodes_len; ++i)
615     {
616       if (dfa->eclosures != NULL)
617 	re_node_set_free (dfa->eclosures + i);
618       if (dfa->inveclosures != NULL)
619 	re_node_set_free (dfa->inveclosures + i);
620       if (dfa->edests != NULL)
621 	re_node_set_free (dfa->edests + i);
622     }
623   re_free (dfa->edests);
624   re_free (dfa->eclosures);
625   re_free (dfa->inveclosures);
626   re_free (dfa->nodes);
627 
628   if (dfa->state_table)
629     for (i = 0; i <= dfa->state_hash_mask; ++i)
630       {
631 	struct re_state_table_entry *entry = dfa->state_table + i;
632 	for (j = 0; j < entry->num; ++j)
633 	  {
634 	    re_dfastate_t *state = entry->array[j];
635 	    free_state (state);
636 	  }
637 	re_free (entry->array);
638       }
639   re_free (dfa->state_table);
640 #ifdef RE_ENABLE_I18N
641   if (dfa->sb_char != utf8_sb_map)
642     re_free (dfa->sb_char);
643 #endif
644   re_free (dfa->subexp_map);
645 #ifdef DEBUG
646   re_free (dfa->re_str);
647 #endif
648 
649   re_free (dfa);
650 }
651 
652 
653 /* Free dynamically allocated space used by PREG.  */
654 
655 void
regfree(regex_t * preg)656 regfree (regex_t *preg)
657 {
658   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
659   if (BE (dfa != NULL, 1))
660     free_dfa_content (dfa);
661   preg->buffer = NULL;
662   preg->allocated = 0;
663 
664   re_free (preg->fastmap);
665   preg->fastmap = NULL;
666 
667   re_free (preg->translate);
668   preg->translate = NULL;
669 }
670 #ifdef _LIBC
671 weak_alias (__regfree, regfree)
672 #endif
673 
674 /* Entry points compatible with 4.2 BSD regex library.  We don't define
675    them unless specifically requested.  */
676 
677 #if defined _REGEX_RE_COMP || defined _LIBC
678 
679 /* BSD has one and only one pattern buffer.  */
680 static struct re_pattern_buffer re_comp_buf;
681 
682 char *
683 # ifdef _LIBC
684 /* Make these definitions weak in libc, so POSIX programs can redefine
685    these names if they don't use our functions, and still use
686    regcomp/regexec above without link errors.  */
687 weak_function
688 # endif
re_comp(s)689 re_comp (s)
690      const char *s;
691 {
692   reg_errcode_t ret;
693   char *fastmap;
694 
695   if (!s)
696     {
697       if (!re_comp_buf.buffer)
698 	return gettext ("No previous regular expression");
699       return 0;
700     }
701 
702   if (re_comp_buf.buffer)
703     {
704       fastmap = re_comp_buf.fastmap;
705       re_comp_buf.fastmap = NULL;
706       __regfree (&re_comp_buf);
707       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
708       re_comp_buf.fastmap = fastmap;
709     }
710 
711   if (re_comp_buf.fastmap == NULL)
712     {
713       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
714       if (re_comp_buf.fastmap == NULL)
715 	return (char *) gettext (__re_error_msgid
716 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
717     }
718 
719   /* Since `re_exec' always passes NULL for the `regs' argument, we
720      don't need to initialize the pattern buffer fields which affect it.  */
721 
722   /* Match anchors at newlines.  */
723   re_comp_buf.newline_anchor = 1;
724 
725   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
726 
727   if (!ret)
728     return NULL;
729 
730   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
731   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
732 }
733 
734 #ifdef _LIBC
libc_freeres_fn(free_mem)735 libc_freeres_fn (free_mem)
736 {
737   __regfree (&re_comp_buf);
738 }
739 #endif
740 
741 #endif /* _REGEX_RE_COMP */
742 
743 /* Internal entry point.
744    Compile the regular expression PATTERN, whose length is LENGTH.
745    SYNTAX indicate regular expression's syntax.  */
746 
747 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)748 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
749 		     reg_syntax_t syntax)
750 {
751   reg_errcode_t err = REG_NOERROR;
752   re_dfa_t *dfa;
753   re_string_t regexp;
754 
755   /* Initialize the pattern buffer.  */
756   preg->fastmap_accurate = 0;
757   preg->syntax = syntax;
758   preg->not_bol = preg->not_eol = 0;
759   preg->used = 0;
760   preg->re_nsub = 0;
761   preg->can_be_null = 0;
762   preg->regs_allocated = REGS_UNALLOCATED;
763 
764   /* Initialize the dfa.  */
765   dfa = (re_dfa_t *) preg->buffer;
766   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
767     {
768       /* If zero allocated, but buffer is non-null, try to realloc
769 	 enough space.  This loses if buffer's address is bogus, but
770 	 that is the user's responsibility.  If ->buffer is NULL this
771 	 is a simple allocation.  */
772       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
773       if (dfa == NULL)
774 	return REG_ESPACE;
775       preg->allocated = sizeof (re_dfa_t);
776       preg->buffer = (unsigned char *) dfa;
777     }
778   preg->used = sizeof (re_dfa_t);
779 
780   err = init_dfa (dfa, length);
781   if (BE (err != REG_NOERROR, 0))
782     {
783       free_dfa_content (dfa);
784       preg->buffer = NULL;
785       preg->allocated = 0;
786       return err;
787     }
788 #ifdef DEBUG
789   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
790   dfa->re_str = re_malloc (char, length + 1);
791   strncpy (dfa->re_str, pattern, length + 1);
792 #endif
793 
794   __libc_lock_init (dfa->lock);
795 
796   err = re_string_construct (&regexp, pattern, length, preg->translate,
797 			     syntax & RE_ICASE, dfa);
798   if (BE (err != REG_NOERROR, 0))
799     {
800     re_compile_internal_free_return:
801       free_workarea_compile (preg);
802       re_string_destruct (&regexp);
803       free_dfa_content (dfa);
804       preg->buffer = NULL;
805       preg->allocated = 0;
806       return err;
807     }
808 
809   /* Parse the regular expression, and build a structure tree.  */
810   preg->re_nsub = 0;
811   dfa->str_tree = parse (&regexp, preg, syntax, &err);
812   if (BE (dfa->str_tree == NULL, 0))
813     goto re_compile_internal_free_return;
814 
815   /* Analyze the tree and create the nfa.  */
816   err = analyze (preg);
817   if (BE (err != REG_NOERROR, 0))
818     goto re_compile_internal_free_return;
819 
820 #ifdef RE_ENABLE_I18N
821   /* If possible, do searching in single byte encoding to speed things up.  */
822   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
823     optimize_utf8 (dfa);
824 #endif
825 
826   /* Then create the initial state of the dfa.  */
827   err = create_initial_state (dfa);
828 
829   /* Release work areas.  */
830   free_workarea_compile (preg);
831   re_string_destruct (&regexp);
832 
833   if (BE (err != REG_NOERROR, 0))
834     {
835       free_dfa_content (dfa);
836       preg->buffer = NULL;
837       preg->allocated = 0;
838     }
839 
840   return err;
841 }
842 
843 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
844    as the initial length of some arrays.  */
845 
846 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)847 init_dfa (re_dfa_t *dfa, size_t pat_len)
848 {
849   unsigned int table_size;
850 #ifndef _LIBC
851   char *codeset_name;
852 #endif
853 
854   memset (dfa, '\0', sizeof (re_dfa_t));
855 
856   /* Force allocation of str_tree_storage the first time.  */
857   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
858 
859   /* Avoid overflows.  */
860   if (pat_len == SIZE_MAX)
861     return REG_ESPACE;
862 
863   dfa->nodes_alloc = pat_len + 1;
864   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
865 
866   /*  table_size = 2 ^ ceil(log pat_len) */
867   for (table_size = 1; ; table_size <<= 1)
868     if (table_size > pat_len)
869       break;
870 
871   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
872   dfa->state_hash_mask = table_size - 1;
873 
874   dfa->mb_cur_max = MB_CUR_MAX;
875 #ifdef _LIBC
876   if (dfa->mb_cur_max == 6
877       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
878     dfa->is_utf8 = 1;
879   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
880 		       != 0);
881 #else
882 # ifdef HAVE_LANGINFO_CODESET
883   codeset_name = nl_langinfo (CODESET);
884 # else
885   codeset_name = getenv ("LC_ALL");
886   if (codeset_name == NULL || codeset_name[0] == '\0')
887     codeset_name = getenv ("LC_CTYPE");
888   if (codeset_name == NULL || codeset_name[0] == '\0')
889     codeset_name = getenv ("LANG");
890   if (codeset_name == NULL)
891     codeset_name = "";
892   else if (strchr (codeset_name, '.') !=  NULL)
893     codeset_name = strchr (codeset_name, '.') + 1;
894 # endif
895 
896   /* strcasecmp isn't a standard interface. brute force check */
897 #if 0
898   if (strcasecmp (codeset_name, "UTF-8") == 0
899       || strcasecmp (codeset_name, "UTF8") == 0)
900     dfa->is_utf8 = 1;
901 #else
902   if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
903       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
904       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
905       && (codeset_name[3] == '-'
906           ? codeset_name[4] == '8' && codeset_name[5] == '\0'
907           : codeset_name[3] == '8' && codeset_name[4] == '\0'))
908     dfa->is_utf8 = 1;
909 #endif
910 
911   /* We check exhaustively in the loop below if this charset is a
912      superset of ASCII.  */
913   dfa->map_notascii = 0;
914 #endif
915 
916 #ifdef RE_ENABLE_I18N
917   if (dfa->mb_cur_max > 1)
918     {
919       if (dfa->is_utf8)
920         {
921 #if !defined(__GNUC__) || __GNUC__ < 3
922 	  static short utf8_sb_map_inited = 0;
923 
924 	  if (! utf8_sb_map_inited)
925 	    {
926 		int i;
927 
928 	  	utf8_sb_map_inited = 0;
929 		for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
930 		  utf8_sb_map[i] = BITSET_WORD_MAX;
931 	    }
932 #endif
933 	  dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
934 	}
935       else
936 	{
937 	  int i, j, ch;
938 
939 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
940 	  if (BE (dfa->sb_char == NULL, 0))
941 	    return REG_ESPACE;
942 
943 	  /* Set the bits corresponding to single byte chars.  */
944 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
945 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
946 	      {
947 		wint_t wch = __btowc (ch);
948 		if (wch != WEOF)
949 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
950 # ifndef _LIBC
951 		if (isascii (ch) && wch != ch)
952 		  dfa->map_notascii = 1;
953 # endif
954 	      }
955 	}
956     }
957 #endif
958 
959   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
960     return REG_ESPACE;
961   return REG_NOERROR;
962 }
963 
964 /* Initialize WORD_CHAR table, which indicate which character is
965    "word".  In this case "word" means that it is the word construction
966    character used by some operators like "\<", "\>", etc.  */
967 
968 static void
969 internal_function
init_word_char(re_dfa_t * dfa)970 init_word_char (re_dfa_t *dfa)
971 {
972   int i, j, ch;
973   dfa->word_ops_used = 1;
974   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
975     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
976       if (isalnum (ch) || ch == '_')
977 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
978 }
979 
980 /* Free the work area which are only used while compiling.  */
981 
982 static void
free_workarea_compile(regex_t * preg)983 free_workarea_compile (regex_t *preg)
984 {
985   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
986   bin_tree_storage_t *storage, *next;
987   for (storage = dfa->str_tree_storage; storage; storage = next)
988     {
989       next = storage->next;
990       re_free (storage);
991     }
992   dfa->str_tree_storage = NULL;
993   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
994   dfa->str_tree = NULL;
995   re_free (dfa->org_indices);
996   dfa->org_indices = NULL;
997 }
998 
999 /* Create initial states for all contexts.  */
1000 
1001 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)1002 create_initial_state (re_dfa_t *dfa)
1003 {
1004   int first, i;
1005   reg_errcode_t err;
1006   re_node_set init_nodes;
1007 
1008   /* Initial states have the epsilon closure of the node which is
1009      the first node of the regular expression.  */
1010   first = dfa->str_tree->first->node_idx;
1011   dfa->init_node = first;
1012   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1013   if (BE (err != REG_NOERROR, 0))
1014     return err;
1015 
1016   /* The back-references which are in initial states can epsilon transit,
1017      since in this case all of the subexpressions can be null.
1018      Then we add epsilon closures of the nodes which are the next nodes of
1019      the back-references.  */
1020   if (dfa->nbackref > 0)
1021     for (i = 0; i < init_nodes.nelem; ++i)
1022       {
1023 	int node_idx = init_nodes.elems[i];
1024 	re_token_type_t type = dfa->nodes[node_idx].type;
1025 
1026 	int clexp_idx;
1027 	if (type != OP_BACK_REF)
1028 	  continue;
1029 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1030 	  {
1031 	    re_token_t *clexp_node;
1032 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1033 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1034 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1035 	      break;
1036 	  }
1037 	if (clexp_idx == init_nodes.nelem)
1038 	  continue;
1039 
1040 	if (type == OP_BACK_REF)
1041 	  {
1042 	    int dest_idx = dfa->edests[node_idx].elems[0];
1043 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1044 	      {
1045 		reg_errcode_t err = re_node_set_merge (&init_nodes,
1046 						       dfa->eclosures
1047 						       + dest_idx);
1048 		if (err != REG_NOERROR)
1049 		  return err;
1050 		i = 0;
1051 	      }
1052 	  }
1053       }
1054 
1055   /* It must be the first time to invoke acquire_state.  */
1056   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1057   /* We don't check ERR here, since the initial state must not be NULL.  */
1058   if (BE (dfa->init_state == NULL, 0))
1059     return err;
1060   if (dfa->init_state->has_constraint)
1061     {
1062       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1063 						       CONTEXT_WORD);
1064       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1065 						     CONTEXT_NEWLINE);
1066       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1067 							 &init_nodes,
1068 							 CONTEXT_NEWLINE
1069 							 | CONTEXT_BEGBUF);
1070       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1071 	      || dfa->init_state_begbuf == NULL, 0))
1072 	return err;
1073     }
1074   else
1075     dfa->init_state_word = dfa->init_state_nl
1076       = dfa->init_state_begbuf = dfa->init_state;
1077 
1078   re_node_set_free (&init_nodes);
1079   return REG_NOERROR;
1080 }
1081 
1082 #ifdef RE_ENABLE_I18N
1083 /* If it is possible to do searching in single byte encoding instead of UTF-8
1084    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1085    DFA nodes where needed.  */
1086 
1087 static void
optimize_utf8(re_dfa_t * dfa)1088 optimize_utf8 (re_dfa_t *dfa)
1089 {
1090   int node, i, mb_chars = 0, has_period = 0;
1091 
1092   for (node = 0; node < dfa->nodes_len; ++node)
1093     switch (dfa->nodes[node].type)
1094       {
1095       case CHARACTER:
1096 	if (dfa->nodes[node].opr.c >= 0x80)
1097 	  mb_chars = 1;
1098 	break;
1099       case ANCHOR:
1100 	switch (dfa->nodes[node].opr.ctx_type)
1101 	  {
1102 	  case LINE_FIRST:
1103 	  case LINE_LAST:
1104 	  case BUF_FIRST:
1105 	  case BUF_LAST:
1106 	    break;
1107 	  default:
1108 	    /* Word anchors etc. cannot be handled.  It's okay to test
1109 	       opr.ctx_type since constraints (for all DFA nodes) are
1110 	       created by ORing one or more opr.ctx_type values.  */
1111 	    return;
1112 	  }
1113 	break;
1114       case OP_PERIOD:
1115 	has_period = 1;
1116 	break;
1117       case OP_BACK_REF:
1118       case OP_ALT:
1119       case END_OF_RE:
1120       case OP_DUP_ASTERISK:
1121       case OP_OPEN_SUBEXP:
1122       case OP_CLOSE_SUBEXP:
1123 	break;
1124       case COMPLEX_BRACKET:
1125 	return;
1126       case SIMPLE_BRACKET:
1127 	/* Just double check.  The non-ASCII range starts at 0x80.  */
1128 	assert (0x80 % BITSET_WORD_BITS == 0);
1129 	for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1130 	  if (dfa->nodes[node].opr.sbcset[i])
1131 	    return;
1132 	break;
1133       default:
1134 	abort ();
1135       }
1136 
1137   if (mb_chars || has_period)
1138     for (node = 0; node < dfa->nodes_len; ++node)
1139       {
1140 	if (dfa->nodes[node].type == CHARACTER
1141 	    && dfa->nodes[node].opr.c >= 0x80)
1142 	  dfa->nodes[node].mb_partial = 0;
1143 	else if (dfa->nodes[node].type == OP_PERIOD)
1144 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1145       }
1146 
1147   /* The search can be in single byte locale.  */
1148   dfa->mb_cur_max = 1;
1149   dfa->is_utf8 = 0;
1150   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1151 }
1152 #endif
1153 
1154 /* Analyze the structure tree, and calculate "first", "next", "edest",
1155    "eclosure", and "inveclosure".  */
1156 
1157 static reg_errcode_t
analyze(regex_t * preg)1158 analyze (regex_t *preg)
1159 {
1160   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1161   reg_errcode_t ret;
1162 
1163   /* Allocate arrays.  */
1164   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1165   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1166   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1167   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1168   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1169 	  || dfa->eclosures == NULL, 0))
1170     return REG_ESPACE;
1171 
1172   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1173   if (dfa->subexp_map != NULL)
1174     {
1175       int i;
1176       for (i = 0; i < preg->re_nsub; i++)
1177 	dfa->subexp_map[i] = i;
1178       preorder (dfa->str_tree, optimize_subexps, dfa);
1179       for (i = 0; i < preg->re_nsub; i++)
1180 	if (dfa->subexp_map[i] != i)
1181 	  break;
1182       if (i == preg->re_nsub)
1183 	{
1184 	  free (dfa->subexp_map);
1185 	  dfa->subexp_map = NULL;
1186 	}
1187     }
1188 
1189   ret = postorder (dfa->str_tree, lower_subexps, preg);
1190   if (BE (ret != REG_NOERROR, 0))
1191     return ret;
1192   ret = postorder (dfa->str_tree, calc_first, dfa);
1193   if (BE (ret != REG_NOERROR, 0))
1194     return ret;
1195   preorder (dfa->str_tree, calc_next, dfa);
1196   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1197   if (BE (ret != REG_NOERROR, 0))
1198     return ret;
1199   ret = calc_eclosure (dfa);
1200   if (BE (ret != REG_NOERROR, 0))
1201     return ret;
1202 
1203   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1204      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1205   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1206       || dfa->nbackref)
1207     {
1208       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1209       if (BE (dfa->inveclosures == NULL, 0))
1210 	return REG_ESPACE;
1211       ret = calc_inveclosure (dfa);
1212     }
1213 
1214   return ret;
1215 }
1216 
1217 /* Our parse trees are very unbalanced, so we cannot use a stack to
1218    implement parse tree visits.  Instead, we use parent pointers and
1219    some hairy code in these two functions.  */
1220 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1221 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1222 	   void *extra)
1223 {
1224   bin_tree_t *node, *prev;
1225 
1226   for (node = root; ; )
1227     {
1228       /* Descend down the tree, preferably to the left (or to the right
1229 	 if that's the only child).  */
1230       while (node->left || node->right)
1231 	if (node->left)
1232 	  node = node->left;
1233 	else
1234 	  node = node->right;
1235 
1236       do
1237 	{
1238 	  reg_errcode_t err = fn (extra, node);
1239 	  if (BE (err != REG_NOERROR, 0))
1240 	    return err;
1241 	  if (node->parent == NULL)
1242 	    return REG_NOERROR;
1243 	  prev = node;
1244 	  node = node->parent;
1245 	}
1246       /* Go up while we have a node that is reached from the right.  */
1247       while (node->right == prev || node->right == NULL);
1248       node = node->right;
1249     }
1250 }
1251 
1252 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1253 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1254 	  void *extra)
1255 {
1256   bin_tree_t *node;
1257 
1258   for (node = root; ; )
1259     {
1260       reg_errcode_t err = fn (extra, node);
1261       if (BE (err != REG_NOERROR, 0))
1262 	return err;
1263 
1264       /* Go to the left node, or up and to the right.  */
1265       if (node->left)
1266 	node = node->left;
1267       else
1268 	{
1269 	  bin_tree_t *prev = NULL;
1270 	  while (node->right == prev || node->right == NULL)
1271 	    {
1272 	      prev = node;
1273 	      node = node->parent;
1274 	      if (!node)
1275 		return REG_NOERROR;
1276 	    }
1277 	  node = node->right;
1278 	}
1279     }
1280 }
1281 
1282 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1283    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1284    backreferences as well.  Requires a preorder visit.  */
1285 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1286 optimize_subexps (void *extra, bin_tree_t *node)
1287 {
1288   re_dfa_t *dfa = (re_dfa_t *) extra;
1289 
1290   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1291     {
1292       int idx = node->token.opr.idx;
1293       node->token.opr.idx = dfa->subexp_map[idx];
1294       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1295     }
1296 
1297   else if (node->token.type == SUBEXP
1298 	   && node->left && node->left->token.type == SUBEXP)
1299     {
1300       int other_idx = node->left->token.opr.idx;
1301 
1302       node->left = node->left->left;
1303       if (node->left)
1304 	node->left->parent = node;
1305 
1306       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1307       if (other_idx < BITSET_WORD_BITS)
1308 	  dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1309     }
1310 
1311   return REG_NOERROR;
1312 }
1313 
1314 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1315    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1316 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1317 lower_subexps (void *extra, bin_tree_t *node)
1318 {
1319   regex_t *preg = (regex_t *) extra;
1320   reg_errcode_t err = REG_NOERROR;
1321 
1322   if (node->left && node->left->token.type == SUBEXP)
1323     {
1324       node->left = lower_subexp (&err, preg, node->left);
1325       if (node->left)
1326 	node->left->parent = node;
1327     }
1328   if (node->right && node->right->token.type == SUBEXP)
1329     {
1330       node->right = lower_subexp (&err, preg, node->right);
1331       if (node->right)
1332 	node->right->parent = node;
1333     }
1334 
1335   return err;
1336 }
1337 
1338 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1339 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1340 {
1341   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1342   bin_tree_t *body = node->left;
1343   bin_tree_t *op, *cls, *tree1, *tree;
1344 
1345   if (preg->no_sub
1346       /* We do not optimize empty subexpressions, because otherwise we may
1347 	 have bad CONCAT nodes with NULL children.  This is obviously not
1348 	 very common, so we do not lose much.  An example that triggers
1349 	 this case is the sed "script" /\(\)/x.  */
1350       && node->left != NULL
1351       && (node->token.opr.idx >= BITSET_WORD_BITS
1352 	  || !(dfa->used_bkref_map
1353 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1354     return node->left;
1355 
1356   /* Convert the SUBEXP node to the concatenation of an
1357      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1358   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1359   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1360   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1361   tree = create_tree (dfa, op, tree1, CONCAT);
1362   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1363     {
1364       *err = REG_ESPACE;
1365       return NULL;
1366     }
1367 
1368   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1369   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1370   return tree;
1371 }
1372 
1373 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1374    nodes.  Requires a postorder visit.  */
1375 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1376 calc_first (void *extra, bin_tree_t *node)
1377 {
1378   re_dfa_t *dfa = (re_dfa_t *) extra;
1379   if (node->token.type == CONCAT)
1380     {
1381       node->first = node->left->first;
1382       node->node_idx = node->left->node_idx;
1383     }
1384   else
1385     {
1386       node->first = node;
1387       node->node_idx = re_dfa_add_node (dfa, node->token);
1388       if (BE (node->node_idx == -1, 0))
1389 	return REG_ESPACE;
1390       if (node->token.type == ANCHOR)
1391 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1392     }
1393   return REG_NOERROR;
1394 }
1395 
1396 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1397 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1398 calc_next (void *extra, bin_tree_t *node)
1399 {
1400   switch (node->token.type)
1401     {
1402     case OP_DUP_ASTERISK:
1403       node->left->next = node;
1404       break;
1405     case CONCAT:
1406       node->left->next = node->right->first;
1407       node->right->next = node->next;
1408       break;
1409     default:
1410       if (node->left)
1411 	node->left->next = node->next;
1412       if (node->right)
1413 	node->right->next = node->next;
1414       break;
1415     }
1416   return REG_NOERROR;
1417 }
1418 
1419 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1420 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1421 link_nfa_nodes (void *extra, bin_tree_t *node)
1422 {
1423   re_dfa_t *dfa = (re_dfa_t *) extra;
1424   int idx = node->node_idx;
1425   reg_errcode_t err = REG_NOERROR;
1426 
1427   switch (node->token.type)
1428     {
1429     case CONCAT:
1430       break;
1431 
1432     case END_OF_RE:
1433       assert (node->next == NULL);
1434       break;
1435 
1436     case OP_DUP_ASTERISK:
1437     case OP_ALT:
1438       {
1439 	int left, right;
1440 	dfa->has_plural_match = 1;
1441 	if (node->left != NULL)
1442 	  left = node->left->first->node_idx;
1443 	else
1444 	  left = node->next->node_idx;
1445 	if (node->right != NULL)
1446 	  right = node->right->first->node_idx;
1447 	else
1448 	  right = node->next->node_idx;
1449 	assert (left > -1);
1450 	assert (right > -1);
1451 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1452       }
1453       break;
1454 
1455     case ANCHOR:
1456     case OP_OPEN_SUBEXP:
1457     case OP_CLOSE_SUBEXP:
1458       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1459       break;
1460 
1461     case OP_BACK_REF:
1462       dfa->nexts[idx] = node->next->node_idx;
1463       if (node->token.type == OP_BACK_REF)
1464 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1465       break;
1466 
1467     default:
1468       assert (!IS_EPSILON_NODE (node->token.type));
1469       dfa->nexts[idx] = node->next->node_idx;
1470       break;
1471     }
1472 
1473   return err;
1474 }
1475 
1476 /* Duplicate the epsilon closure of the node ROOT_NODE.
1477    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1478    to their own constraint.  */
1479 
1480 static reg_errcode_t
1481 internal_function
duplicate_node_closure(re_dfa_t * dfa,int top_org_node,int top_clone_node,int root_node,unsigned int init_constraint)1482 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1483 			int root_node, unsigned int init_constraint)
1484 {
1485   int org_node, clone_node, ret;
1486   unsigned int constraint = init_constraint;
1487   for (org_node = top_org_node, clone_node = top_clone_node;;)
1488     {
1489       int org_dest, clone_dest;
1490       if (dfa->nodes[org_node].type == OP_BACK_REF)
1491 	{
1492 	  /* If the back reference epsilon-transit, its destination must
1493 	     also have the constraint.  Then duplicate the epsilon closure
1494 	     of the destination of the back reference, and store it in
1495 	     edests of the back reference.  */
1496 	  org_dest = dfa->nexts[org_node];
1497 	  re_node_set_empty (dfa->edests + clone_node);
1498 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1499 	  if (BE (clone_dest == -1, 0))
1500 	    return REG_ESPACE;
1501 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1502 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1503 	  if (BE (ret < 0, 0))
1504 	    return REG_ESPACE;
1505 	}
1506       else if (dfa->edests[org_node].nelem == 0)
1507 	{
1508 	  /* In case of the node can't epsilon-transit, don't duplicate the
1509 	     destination and store the original destination as the
1510 	     destination of the node.  */
1511 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1512 	  break;
1513 	}
1514       else if (dfa->edests[org_node].nelem == 1)
1515 	{
1516 	  /* In case of the node can epsilon-transit, and it has only one
1517 	     destination.  */
1518 	  org_dest = dfa->edests[org_node].elems[0];
1519 	  re_node_set_empty (dfa->edests + clone_node);
1520 	  /* If the node is root_node itself, it means the epsilon clsoure
1521 	     has a loop.   Then tie it to the destination of the root_node.  */
1522 	  if (org_node == root_node && clone_node != org_node)
1523 	    {
1524 	      ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1525 	      if (BE (ret < 0, 0))
1526 		return REG_ESPACE;
1527 	      break;
1528 	    }
1529 	  /* In case of the node has another constraint, add it.  */
1530 	  constraint |= dfa->nodes[org_node].constraint;
1531 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1532 	  if (BE (clone_dest == -1, 0))
1533 	    return REG_ESPACE;
1534 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1535 	  if (BE (ret < 0, 0))
1536 	    return REG_ESPACE;
1537 	}
1538       else /* dfa->edests[org_node].nelem == 2 */
1539 	{
1540 	  /* In case of the node can epsilon-transit, and it has two
1541 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1542 	  org_dest = dfa->edests[org_node].elems[0];
1543 	  re_node_set_empty (dfa->edests + clone_node);
1544 	  /* Search for a duplicated node which satisfies the constraint.  */
1545 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1546 	  if (clone_dest == -1)
1547 	    {
1548 	      /* There is no such duplicated node, create a new one.  */
1549 	      reg_errcode_t err;
1550 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1551 	      if (BE (clone_dest == -1, 0))
1552 		return REG_ESPACE;
1553 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554 	      if (BE (ret < 0, 0))
1555 		return REG_ESPACE;
1556 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1557 					    root_node, constraint);
1558 	      if (BE (err != REG_NOERROR, 0))
1559 		return err;
1560 	    }
1561 	  else
1562 	    {
1563 	      /* There is a duplicated node which satisfies the constraint,
1564 		 use it to avoid infinite loop.  */
1565 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566 	      if (BE (ret < 0, 0))
1567 		return REG_ESPACE;
1568 	    }
1569 
1570 	  org_dest = dfa->edests[org_node].elems[1];
1571 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1572 	  if (BE (clone_dest == -1, 0))
1573 	    return REG_ESPACE;
1574 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1575 	  if (BE (ret < 0, 0))
1576 	    return REG_ESPACE;
1577 	}
1578       org_node = org_dest;
1579       clone_node = clone_dest;
1580     }
1581   return REG_NOERROR;
1582 }
1583 
1584 /* Search for a node which is duplicated from the node ORG_NODE, and
1585    satisfies the constraint CONSTRAINT.  */
1586 
1587 static int
search_duplicated_node(const re_dfa_t * dfa,int org_node,unsigned int constraint)1588 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1589 			unsigned int constraint)
1590 {
1591   int idx;
1592   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1593     {
1594       if (org_node == dfa->org_indices[idx]
1595 	  && constraint == dfa->nodes[idx].constraint)
1596 	return idx; /* Found.  */
1597     }
1598   return -1; /* Not found.  */
1599 }
1600 
1601 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1602    Return the index of the new node, or -1 if insufficient storage is
1603    available.  */
1604 
1605 static int
duplicate_node(re_dfa_t * dfa,int org_idx,unsigned int constraint)1606 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1607 {
1608   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1609   if (BE (dup_idx != -1, 1))
1610     {
1611       dfa->nodes[dup_idx].constraint = constraint;
1612       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1613       dfa->nodes[dup_idx].duplicated = 1;
1614 
1615       /* Store the index of the original node.  */
1616       dfa->org_indices[dup_idx] = org_idx;
1617     }
1618   return dup_idx;
1619 }
1620 
1621 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1622 calc_inveclosure (re_dfa_t *dfa)
1623 {
1624   int src, idx, ret;
1625   for (idx = 0; idx < dfa->nodes_len; ++idx)
1626     re_node_set_init_empty (dfa->inveclosures + idx);
1627 
1628   for (src = 0; src < dfa->nodes_len; ++src)
1629     {
1630       int *elems = dfa->eclosures[src].elems;
1631       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1632 	{
1633 	  ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1634 	  if (BE (ret == -1, 0))
1635 	    return REG_ESPACE;
1636 	}
1637     }
1638 
1639   return REG_NOERROR;
1640 }
1641 
1642 /* Calculate "eclosure" for all the node in DFA.  */
1643 
1644 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1645 calc_eclosure (re_dfa_t *dfa)
1646 {
1647   int node_idx, incomplete;
1648 #ifdef DEBUG
1649   assert (dfa->nodes_len > 0);
1650 #endif
1651   incomplete = 0;
1652   /* For each nodes, calculate epsilon closure.  */
1653   for (node_idx = 0; ; ++node_idx)
1654     {
1655       reg_errcode_t err;
1656       re_node_set eclosure_elem;
1657       if (node_idx == dfa->nodes_len)
1658 	{
1659 	  if (!incomplete)
1660 	    break;
1661 	  incomplete = 0;
1662 	  node_idx = 0;
1663 	}
1664 
1665 #ifdef DEBUG
1666       assert (dfa->eclosures[node_idx].nelem != -1);
1667 #endif
1668 
1669       /* If we have already calculated, skip it.  */
1670       if (dfa->eclosures[node_idx].nelem != 0)
1671 	continue;
1672       /* Calculate epsilon closure of `node_idx'.  */
1673       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1674       if (BE (err != REG_NOERROR, 0))
1675 	return err;
1676 
1677       if (dfa->eclosures[node_idx].nelem == 0)
1678 	{
1679 	  incomplete = 1;
1680 	  re_node_set_free (&eclosure_elem);
1681 	}
1682     }
1683   return REG_NOERROR;
1684 }
1685 
1686 /* Calculate epsilon closure of NODE.  */
1687 
1688 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,int node,int root)1689 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1690 {
1691   reg_errcode_t err;
1692   int i;
1693   re_node_set eclosure;
1694   int ret;
1695   int incomplete = 0;
1696   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1697   if (BE (err != REG_NOERROR, 0))
1698     return err;
1699 
1700   /* This indicates that we are calculating this node now.
1701      We reference this value to avoid infinite loop.  */
1702   dfa->eclosures[node].nelem = -1;
1703 
1704   /* If the current node has constraints, duplicate all nodes
1705      since they must inherit the constraints.  */
1706   if (dfa->nodes[node].constraint
1707       && dfa->edests[node].nelem
1708       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1709     {
1710       err = duplicate_node_closure (dfa, node, node, node,
1711 				    dfa->nodes[node].constraint);
1712       if (BE (err != REG_NOERROR, 0))
1713 	return err;
1714     }
1715 
1716   /* Expand each epsilon destination nodes.  */
1717   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1718     for (i = 0; i < dfa->edests[node].nelem; ++i)
1719       {
1720 	re_node_set eclosure_elem;
1721 	int edest = dfa->edests[node].elems[i];
1722 	/* If calculating the epsilon closure of `edest' is in progress,
1723 	   return intermediate result.  */
1724 	if (dfa->eclosures[edest].nelem == -1)
1725 	  {
1726 	    incomplete = 1;
1727 	    continue;
1728 	  }
1729 	/* If we haven't calculated the epsilon closure of `edest' yet,
1730 	   calculate now. Otherwise use calculated epsilon closure.  */
1731 	if (dfa->eclosures[edest].nelem == 0)
1732 	  {
1733 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1734 	    if (BE (err != REG_NOERROR, 0))
1735 	      return err;
1736 	  }
1737 	else
1738 	  eclosure_elem = dfa->eclosures[edest];
1739 	/* Merge the epsilon closure of `edest'.  */
1740 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1741 	if (BE (err != REG_NOERROR, 0))
1742 	  return err;
1743 	/* If the epsilon closure of `edest' is incomplete,
1744 	   the epsilon closure of this node is also incomplete.  */
1745 	if (dfa->eclosures[edest].nelem == 0)
1746 	  {
1747 	    incomplete = 1;
1748 	    re_node_set_free (&eclosure_elem);
1749 	  }
1750       }
1751 
1752   /* An epsilon closure includes itself.  */
1753   ret = re_node_set_insert (&eclosure, node);
1754   if (BE (ret < 0, 0))
1755     return REG_ESPACE;
1756   if (incomplete && !root)
1757     dfa->eclosures[node].nelem = 0;
1758   else
1759     dfa->eclosures[node] = eclosure;
1760   *new_set = eclosure;
1761   return REG_NOERROR;
1762 }
1763 
1764 /* Functions for token which are used in the parser.  */
1765 
1766 /* Fetch a token from INPUT.
1767    We must not use this function inside bracket expressions.  */
1768 
1769 static void
1770 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1771 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1772 {
1773   re_string_skip_bytes (input, peek_token (result, input, syntax));
1774 }
1775 
1776 /* Peek a token from INPUT, and return the length of the token.
1777    We must not use this function inside bracket expressions.  */
1778 
1779 static int
1780 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1781 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1782 {
1783   unsigned char c;
1784 
1785   if (re_string_eoi (input))
1786     {
1787       token->type = END_OF_RE;
1788       return 0;
1789     }
1790 
1791   c = re_string_peek_byte (input, 0);
1792   token->opr.c = c;
1793 
1794   token->word_char = 0;
1795 #ifdef RE_ENABLE_I18N
1796   token->mb_partial = 0;
1797   if (input->mb_cur_max > 1 &&
1798       !re_string_first_byte (input, re_string_cur_idx (input)))
1799     {
1800       token->type = CHARACTER;
1801       token->mb_partial = 1;
1802       return 1;
1803     }
1804 #endif
1805   if (c == '\\')
1806     {
1807       unsigned char c2;
1808       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1809 	{
1810 	  token->type = BACK_SLASH;
1811 	  return 1;
1812 	}
1813 
1814       c2 = re_string_peek_byte_case (input, 1);
1815       token->opr.c = c2;
1816       token->type = CHARACTER;
1817 #ifdef RE_ENABLE_I18N
1818       if (input->mb_cur_max > 1)
1819 	{
1820 	  wint_t wc = re_string_wchar_at (input,
1821 					  re_string_cur_idx (input) + 1);
1822 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1823 	}
1824       else
1825 #endif
1826 	token->word_char = IS_WORD_CHAR (c2) != 0;
1827 
1828       switch (c2)
1829 	{
1830 	case '|':
1831 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1832 	    token->type = OP_ALT;
1833 	  break;
1834 	case '1': case '2': case '3': case '4': case '5':
1835 	case '6': case '7': case '8': case '9':
1836 	  if (!(syntax & RE_NO_BK_REFS))
1837 	    {
1838 	      token->type = OP_BACK_REF;
1839 	      token->opr.idx = c2 - '1';
1840 	    }
1841 	  break;
1842 	case '<':
1843 	  if (!(syntax & RE_NO_GNU_OPS))
1844 	    {
1845 	      token->type = ANCHOR;
1846 	      token->opr.ctx_type = WORD_FIRST;
1847 	    }
1848 	  break;
1849 	case '>':
1850 	  if (!(syntax & RE_NO_GNU_OPS))
1851 	    {
1852 	      token->type = ANCHOR;
1853 	      token->opr.ctx_type = WORD_LAST;
1854 	    }
1855 	  break;
1856 	case 'b':
1857 	  if (!(syntax & RE_NO_GNU_OPS))
1858 	    {
1859 	      token->type = ANCHOR;
1860 	      token->opr.ctx_type = WORD_DELIM;
1861 	    }
1862 	  break;
1863 	case 'B':
1864 	  if (!(syntax & RE_NO_GNU_OPS))
1865 	    {
1866 	      token->type = ANCHOR;
1867 	      token->opr.ctx_type = NOT_WORD_DELIM;
1868 	    }
1869 	  break;
1870 	case 'w':
1871 	  if (!(syntax & RE_NO_GNU_OPS))
1872 	    token->type = OP_WORD;
1873 	  break;
1874 	case 'W':
1875 	  if (!(syntax & RE_NO_GNU_OPS))
1876 	    token->type = OP_NOTWORD;
1877 	  break;
1878 	case 's':
1879 	  if (!(syntax & RE_NO_GNU_OPS))
1880 	    token->type = OP_SPACE;
1881 	  break;
1882 	case 'S':
1883 	  if (!(syntax & RE_NO_GNU_OPS))
1884 	    token->type = OP_NOTSPACE;
1885 	  break;
1886 	case '`':
1887 	  if (!(syntax & RE_NO_GNU_OPS))
1888 	    {
1889 	      token->type = ANCHOR;
1890 	      token->opr.ctx_type = BUF_FIRST;
1891 	    }
1892 	  break;
1893 	case '\'':
1894 	  if (!(syntax & RE_NO_GNU_OPS))
1895 	    {
1896 	      token->type = ANCHOR;
1897 	      token->opr.ctx_type = BUF_LAST;
1898 	    }
1899 	  break;
1900 	case '(':
1901 	  if (!(syntax & RE_NO_BK_PARENS))
1902 	    token->type = OP_OPEN_SUBEXP;
1903 	  break;
1904 	case ')':
1905 	  if (!(syntax & RE_NO_BK_PARENS))
1906 	    token->type = OP_CLOSE_SUBEXP;
1907 	  break;
1908 	case '+':
1909 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1910 	    token->type = OP_DUP_PLUS;
1911 	  break;
1912 	case '?':
1913 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914 	    token->type = OP_DUP_QUESTION;
1915 	  break;
1916 	case '{':
1917 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1918 	    token->type = OP_OPEN_DUP_NUM;
1919 	  break;
1920 	case '}':
1921 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922 	    token->type = OP_CLOSE_DUP_NUM;
1923 	  break;
1924 	default:
1925 	  break;
1926 	}
1927       return 2;
1928     }
1929 
1930   token->type = CHARACTER;
1931 #ifdef RE_ENABLE_I18N
1932   if (input->mb_cur_max > 1)
1933     {
1934       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1935       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1936     }
1937   else
1938 #endif
1939     token->word_char = IS_WORD_CHAR (token->opr.c);
1940 
1941   switch (c)
1942     {
1943     case '\n':
1944       if (syntax & RE_NEWLINE_ALT)
1945 	token->type = OP_ALT;
1946       break;
1947     case '|':
1948       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1949 	token->type = OP_ALT;
1950       break;
1951     case '*':
1952       token->type = OP_DUP_ASTERISK;
1953       break;
1954     case '+':
1955       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1956 	token->type = OP_DUP_PLUS;
1957       break;
1958     case '?':
1959       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960 	token->type = OP_DUP_QUESTION;
1961       break;
1962     case '{':
1963       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1964 	token->type = OP_OPEN_DUP_NUM;
1965       break;
1966     case '}':
1967       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968 	token->type = OP_CLOSE_DUP_NUM;
1969       break;
1970     case '(':
1971       if (syntax & RE_NO_BK_PARENS)
1972 	token->type = OP_OPEN_SUBEXP;
1973       break;
1974     case ')':
1975       if (syntax & RE_NO_BK_PARENS)
1976 	token->type = OP_CLOSE_SUBEXP;
1977       break;
1978     case '[':
1979       token->type = OP_OPEN_BRACKET;
1980       break;
1981     case '.':
1982       token->type = OP_PERIOD;
1983       break;
1984     case '^':
1985       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1986 	  re_string_cur_idx (input) != 0)
1987 	{
1988 	  char prev = re_string_peek_byte (input, -1);
1989 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1990 	    break;
1991 	}
1992       token->type = ANCHOR;
1993       token->opr.ctx_type = LINE_FIRST;
1994       break;
1995     case '$':
1996       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1997 	  re_string_cur_idx (input) + 1 != re_string_length (input))
1998 	{
1999 	  re_token_t next;
2000 	  re_string_skip_bytes (input, 1);
2001 	  peek_token (&next, input, syntax);
2002 	  re_string_skip_bytes (input, -1);
2003 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2004 	    break;
2005 	}
2006       token->type = ANCHOR;
2007       token->opr.ctx_type = LINE_LAST;
2008       break;
2009     default:
2010       break;
2011     }
2012   return 1;
2013 }
2014 
2015 /* Peek a token from INPUT, and return the length of the token.
2016    We must not use this function out of bracket expressions.  */
2017 
2018 static int
2019 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)2020 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2021 {
2022   unsigned char c;
2023   if (re_string_eoi (input))
2024     {
2025       token->type = END_OF_RE;
2026       return 0;
2027     }
2028   c = re_string_peek_byte (input, 0);
2029   token->opr.c = c;
2030 
2031 #ifdef RE_ENABLE_I18N
2032   if (input->mb_cur_max > 1 &&
2033       !re_string_first_byte (input, re_string_cur_idx (input)))
2034     {
2035       token->type = CHARACTER;
2036       return 1;
2037     }
2038 #endif /* RE_ENABLE_I18N */
2039 
2040   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2041       && re_string_cur_idx (input) + 1 < re_string_length (input))
2042     {
2043       /* In this case, '\' escape a character.  */
2044       unsigned char c2;
2045       re_string_skip_bytes (input, 1);
2046       c2 = re_string_peek_byte (input, 0);
2047       token->opr.c = c2;
2048       token->type = CHARACTER;
2049       return 1;
2050     }
2051   if (c == '[') /* '[' is a special char in a bracket exps.  */
2052     {
2053       unsigned char c2;
2054       int token_len;
2055       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2056 	c2 = re_string_peek_byte (input, 1);
2057       else
2058 	c2 = 0;
2059       token->opr.c = c2;
2060       token_len = 2;
2061       switch (c2)
2062 	{
2063 	case '.':
2064 	  token->type = OP_OPEN_COLL_ELEM;
2065 	  break;
2066 	case '=':
2067 	  token->type = OP_OPEN_EQUIV_CLASS;
2068 	  break;
2069 	case ':':
2070 	  if (syntax & RE_CHAR_CLASSES)
2071 	    {
2072 	      token->type = OP_OPEN_CHAR_CLASS;
2073 	      break;
2074 	    }
2075 	  /* else fall through.  */
2076 	default:
2077 	  token->type = CHARACTER;
2078 	  token->opr.c = c;
2079 	  token_len = 1;
2080 	  break;
2081 	}
2082       return token_len;
2083     }
2084   switch (c)
2085     {
2086     case '-':
2087       token->type = OP_CHARSET_RANGE;
2088       break;
2089     case ']':
2090       token->type = OP_CLOSE_BRACKET;
2091       break;
2092     case '^':
2093       token->type = OP_NON_MATCH_LIST;
2094       break;
2095     default:
2096       token->type = CHARACTER;
2097     }
2098   return 1;
2099 }
2100 
2101 /* Functions for parser.  */
2102 
2103 /* Entry point of the parser.
2104    Parse the regular expression REGEXP and return the structure tree.
2105    If an error has occurred, ERR is set by error code, and return NULL.
2106    This function build the following tree, from regular expression <reg_exp>:
2107 	   CAT
2108 	   / \
2109 	  /   \
2110    <reg_exp>  EOR
2111 
2112    CAT means concatenation.
2113    EOR means end of regular expression.  */
2114 
2115 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2116 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2117        reg_errcode_t *err)
2118 {
2119   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2120   bin_tree_t *tree, *eor, *root;
2121   re_token_t current_token;
2122   dfa->syntax = syntax;
2123   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2124   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2125   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2126     return NULL;
2127   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2128   if (tree != NULL)
2129     root = create_tree (dfa, tree, eor, CONCAT);
2130   else
2131     root = eor;
2132   if (BE (eor == NULL || root == NULL, 0))
2133     {
2134       *err = REG_ESPACE;
2135       return NULL;
2136     }
2137   return root;
2138 }
2139 
2140 /* This function build the following tree, from regular expression
2141    <branch1>|<branch2>:
2142 	   ALT
2143 	   / \
2144 	  /   \
2145    <branch1> <branch2>
2146 
2147    ALT means alternative, which represents the operator `|'.  */
2148 
2149 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2150 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2151 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2152 {
2153   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2154   bin_tree_t *tree, *branch = NULL;
2155   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2156   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2157     return NULL;
2158 
2159   while (token->type == OP_ALT)
2160     {
2161       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2162       if (token->type != OP_ALT && token->type != END_OF_RE
2163 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2164 	{
2165 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2166 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2167 	    return NULL;
2168 	}
2169       else
2170 	branch = NULL;
2171       tree = create_tree (dfa, tree, branch, OP_ALT);
2172       if (BE (tree == NULL, 0))
2173 	{
2174 	  *err = REG_ESPACE;
2175 	  return NULL;
2176 	}
2177     }
2178   return tree;
2179 }
2180 
2181 /* This function build the following tree, from regular expression
2182    <exp1><exp2>:
2183 	CAT
2184 	/ \
2185        /   \
2186    <exp1> <exp2>
2187 
2188    CAT means concatenation.  */
2189 
2190 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2191 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2192 	      reg_syntax_t syntax, int nest, reg_errcode_t *err)
2193 {
2194   bin_tree_t *tree, *exp;
2195   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2196   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2197   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2198     return NULL;
2199 
2200   while (token->type != OP_ALT && token->type != END_OF_RE
2201 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2202     {
2203       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2204       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2205 	{
2206 	  return NULL;
2207 	}
2208       if (tree != NULL && exp != NULL)
2209 	{
2210 	  tree = create_tree (dfa, tree, exp, CONCAT);
2211 	  if (tree == NULL)
2212 	    {
2213 	      *err = REG_ESPACE;
2214 	      return NULL;
2215 	    }
2216 	}
2217       else if (tree == NULL)
2218 	tree = exp;
2219       /* Otherwise exp == NULL, we don't need to create new tree.  */
2220     }
2221   return tree;
2222 }
2223 
2224 /* This function build the following tree, from regular expression a*:
2225 	 *
2226 	 |
2227 	 a
2228 */
2229 
2230 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2231 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2232 		  reg_syntax_t syntax, int nest, reg_errcode_t *err)
2233 {
2234   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2235   bin_tree_t *tree;
2236   switch (token->type)
2237     {
2238     case CHARACTER:
2239       tree = create_token_tree (dfa, NULL, NULL, token);
2240       if (BE (tree == NULL, 0))
2241 	{
2242 	  *err = REG_ESPACE;
2243 	  return NULL;
2244 	}
2245 #ifdef RE_ENABLE_I18N
2246       if (dfa->mb_cur_max > 1)
2247 	{
2248 	  while (!re_string_eoi (regexp)
2249 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2250 	    {
2251 	      bin_tree_t *mbc_remain;
2252 	      fetch_token (token, regexp, syntax);
2253 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2254 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2255 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2256 		{
2257 		  *err = REG_ESPACE;
2258 		  return NULL;
2259 		}
2260 	    }
2261 	}
2262 #endif
2263       break;
2264     case OP_OPEN_SUBEXP:
2265       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2266       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267 	return NULL;
2268       break;
2269     case OP_OPEN_BRACKET:
2270       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2271       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2272 	return NULL;
2273       break;
2274     case OP_BACK_REF:
2275       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2276 	{
2277 	  *err = REG_ESUBREG;
2278 	  return NULL;
2279 	}
2280       dfa->used_bkref_map |= 1 << token->opr.idx;
2281       tree = create_token_tree (dfa, NULL, NULL, token);
2282       if (BE (tree == NULL, 0))
2283 	{
2284 	  *err = REG_ESPACE;
2285 	  return NULL;
2286 	}
2287       ++dfa->nbackref;
2288       dfa->has_mb_node = 1;
2289       break;
2290     case OP_OPEN_DUP_NUM:
2291       if (syntax & RE_CONTEXT_INVALID_DUP)
2292 	{
2293 	  *err = REG_BADRPT;
2294 	  return NULL;
2295 	}
2296       /* FALLTHROUGH */
2297     case OP_DUP_ASTERISK:
2298     case OP_DUP_PLUS:
2299     case OP_DUP_QUESTION:
2300       if (syntax & RE_CONTEXT_INVALID_OPS)
2301 	{
2302 	  *err = REG_BADRPT;
2303 	  return NULL;
2304 	}
2305       else if (syntax & RE_CONTEXT_INDEP_OPS)
2306 	{
2307 	  fetch_token (token, regexp, syntax);
2308 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2309 	}
2310       /* else fall through  */
2311     case OP_CLOSE_SUBEXP:
2312       if ((token->type == OP_CLOSE_SUBEXP) &&
2313 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2314 	{
2315 	  *err = REG_ERPAREN;
2316 	  return NULL;
2317 	}
2318       /* else fall through  */
2319     case OP_CLOSE_DUP_NUM:
2320       /* We treat it as a normal character.  */
2321 
2322       /* Then we can these characters as normal characters.  */
2323       token->type = CHARACTER;
2324       /* mb_partial and word_char bits should be initialized already
2325 	 by peek_token.  */
2326       tree = create_token_tree (dfa, NULL, NULL, token);
2327       if (BE (tree == NULL, 0))
2328 	{
2329 	  *err = REG_ESPACE;
2330 	  return NULL;
2331 	}
2332       break;
2333     case ANCHOR:
2334       if ((token->opr.ctx_type
2335 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2336 	  && dfa->word_ops_used == 0)
2337 	init_word_char (dfa);
2338       if (token->opr.ctx_type == WORD_DELIM
2339 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2340 	{
2341 	  bin_tree_t *tree_first, *tree_last;
2342 	  if (token->opr.ctx_type == WORD_DELIM)
2343 	    {
2344 	      token->opr.ctx_type = WORD_FIRST;
2345 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2346 	      token->opr.ctx_type = WORD_LAST;
2347 	    }
2348 	  else
2349 	    {
2350 	      token->opr.ctx_type = INSIDE_WORD;
2351 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2352 	      token->opr.ctx_type = INSIDE_NOTWORD;
2353 	    }
2354 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2355 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2356 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2357 	    {
2358 	      *err = REG_ESPACE;
2359 	      return NULL;
2360 	    }
2361 	}
2362       else
2363 	{
2364 	  tree = create_token_tree (dfa, NULL, NULL, token);
2365 	  if (BE (tree == NULL, 0))
2366 	    {
2367 	      *err = REG_ESPACE;
2368 	      return NULL;
2369 	    }
2370 	}
2371       /* We must return here, since ANCHORs can't be followed
2372 	 by repetition operators.
2373 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2374 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2375       fetch_token (token, regexp, syntax);
2376       return tree;
2377     case OP_PERIOD:
2378       tree = create_token_tree (dfa, NULL, NULL, token);
2379       if (BE (tree == NULL, 0))
2380 	{
2381 	  *err = REG_ESPACE;
2382 	  return NULL;
2383 	}
2384       if (dfa->mb_cur_max > 1)
2385 	dfa->has_mb_node = 1;
2386       break;
2387     case OP_WORD:
2388     case OP_NOTWORD:
2389       tree = build_charclass_op (dfa, regexp->trans,
2390 				 "alnum",
2391 				 "_",
2392 				 token->type == OP_NOTWORD, err);
2393       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2394 	return NULL;
2395       break;
2396     case OP_SPACE:
2397     case OP_NOTSPACE:
2398       tree = build_charclass_op (dfa, regexp->trans,
2399 				 "space",
2400 				 "",
2401 				 token->type == OP_NOTSPACE, err);
2402       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2403 	return NULL;
2404       break;
2405     case OP_ALT:
2406     case END_OF_RE:
2407       return NULL;
2408     case BACK_SLASH:
2409       *err = REG_EESCAPE;
2410       return NULL;
2411     default:
2412       /* Must not happen?  */
2413 #ifdef DEBUG
2414       assert (0);
2415 #endif
2416       return NULL;
2417     }
2418   fetch_token (token, regexp, syntax);
2419 
2420   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2421 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2422     {
2423       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2424       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2425 	return NULL;
2426       /* In BRE consecutive duplications are not allowed.  */
2427       if ((syntax & RE_CONTEXT_INVALID_DUP)
2428 	  && (token->type == OP_DUP_ASTERISK
2429 	      || token->type == OP_OPEN_DUP_NUM))
2430 	{
2431 	  *err = REG_BADRPT;
2432 	  return NULL;
2433 	}
2434     }
2435 
2436   return tree;
2437 }
2438 
2439 /* This function build the following tree, from regular expression
2440    (<reg_exp>):
2441 	 SUBEXP
2442 	    |
2443 	<reg_exp>
2444 */
2445 
2446 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2447 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2448 	       reg_syntax_t syntax, int nest, reg_errcode_t *err)
2449 {
2450   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2451   bin_tree_t *tree;
2452   size_t cur_nsub;
2453   cur_nsub = preg->re_nsub++;
2454 
2455   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2456 
2457   /* The subexpression may be a null string.  */
2458   if (token->type == OP_CLOSE_SUBEXP)
2459     tree = NULL;
2460   else
2461     {
2462       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2463       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2464 	*err = REG_EPAREN;
2465       if (BE (*err != REG_NOERROR, 0))
2466 	return NULL;
2467     }
2468 
2469   if (cur_nsub <= '9' - '1')
2470     dfa->completed_bkref_map |= 1 << cur_nsub;
2471 
2472   tree = create_tree (dfa, tree, NULL, SUBEXP);
2473   if (BE (tree == NULL, 0))
2474     {
2475       *err = REG_ESPACE;
2476       return NULL;
2477     }
2478   tree->token.opr.idx = cur_nsub;
2479   return tree;
2480 }
2481 
2482 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2483 
2484 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2485 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2486 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2487 {
2488   bin_tree_t *tree = NULL, *old_tree = NULL;
2489   int i, start, end, start_idx = re_string_cur_idx (regexp);
2490 #ifndef RE_TOKEN_INIT_BUG
2491   re_token_t start_token = *token;
2492 #else
2493   re_token_t start_token;
2494 
2495   memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2496 #endif
2497 
2498   if (token->type == OP_OPEN_DUP_NUM)
2499     {
2500       end = 0;
2501       start = fetch_number (regexp, token, syntax);
2502       if (start == -1)
2503 	{
2504 	  if (token->type == CHARACTER && token->opr.c == ',')
2505 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2506 	  else
2507 	    {
2508 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2509 	      return NULL;
2510 	    }
2511 	}
2512       if (BE (start != -2, 1))
2513 	{
2514 	  /* We treat "{n}" as "{n,n}".  */
2515 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2516 		 : ((token->type == CHARACTER && token->opr.c == ',')
2517 		    ? fetch_number (regexp, token, syntax) : -2));
2518 	}
2519       if (BE (start == -2 || end == -2, 0))
2520 	{
2521 	  /* Invalid sequence.  */
2522 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2523 	    {
2524 	      if (token->type == END_OF_RE)
2525 		*err = REG_EBRACE;
2526 	      else
2527 		*err = REG_BADBR;
2528 
2529 	      return NULL;
2530 	    }
2531 
2532 	  /* If the syntax bit is set, rollback.  */
2533 	  re_string_set_index (regexp, start_idx);
2534 	  *token = start_token;
2535 	  token->type = CHARACTER;
2536 	  /* mb_partial and word_char bits should be already initialized by
2537 	     peek_token.  */
2538 	  return elem;
2539 	}
2540 
2541       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2542 	{
2543 	  /* First number greater than second.  */
2544 	  *err = REG_BADBR;
2545 	  return NULL;
2546 	}
2547     }
2548   else
2549     {
2550       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2551       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2552     }
2553 
2554   fetch_token (token, regexp, syntax);
2555 
2556   if (BE (elem == NULL, 0))
2557     return NULL;
2558   if (BE (start == 0 && end == 0, 0))
2559     {
2560       postorder (elem, free_tree, NULL);
2561       return NULL;
2562     }
2563 
2564   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2565   if (BE (start > 0, 0))
2566     {
2567       tree = elem;
2568       for (i = 2; i <= start; ++i)
2569 	{
2570 	  elem = duplicate_tree (elem, dfa);
2571 	  tree = create_tree (dfa, tree, elem, CONCAT);
2572 	  if (BE (elem == NULL || tree == NULL, 0))
2573 	    goto parse_dup_op_espace;
2574 	}
2575 
2576       if (start == end)
2577 	return tree;
2578 
2579       /* Duplicate ELEM before it is marked optional.  */
2580       elem = duplicate_tree (elem, dfa);
2581       old_tree = tree;
2582     }
2583   else
2584     old_tree = NULL;
2585 
2586   if (elem->token.type == SUBEXP)
2587     postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2588 
2589   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2590   if (BE (tree == NULL, 0))
2591     goto parse_dup_op_espace;
2592 
2593   /* This loop is actually executed only when end != -1,
2594      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2595      already created the start+1-th copy.  */
2596   for (i = start + 2; i <= end; ++i)
2597     {
2598       elem = duplicate_tree (elem, dfa);
2599       tree = create_tree (dfa, tree, elem, CONCAT);
2600       if (BE (elem == NULL || tree == NULL, 0))
2601 	goto parse_dup_op_espace;
2602 
2603       tree = create_tree (dfa, tree, NULL, OP_ALT);
2604       if (BE (tree == NULL, 0))
2605 	goto parse_dup_op_espace;
2606     }
2607 
2608   if (old_tree)
2609     tree = create_tree (dfa, old_tree, tree, CONCAT);
2610 
2611   return tree;
2612 
2613  parse_dup_op_espace:
2614   *err = REG_ESPACE;
2615   return NULL;
2616 }
2617 
2618 /* Size of the names for collating symbol/equivalence_class/character_class.
2619    I'm not sure, but maybe enough.  */
2620 #define BRACKET_NAME_BUF_SIZE 32
2621 
2622 #ifndef _LIBC
2623   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2624      Build the range expression which starts from START_ELEM, and ends
2625      at END_ELEM.  The result are written to MBCSET and SBCSET.
2626      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2627      mbcset->range_ends, is a pointer argument since we may
2628      update it.  */
2629 
2630 static reg_errcode_t
2631 internal_function
2632 # ifdef RE_ENABLE_I18N
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,int * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2633 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2634 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2635 # else /* not RE_ENABLE_I18N */
2636 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2637 		 bracket_elem_t *end_elem)
2638 # endif /* not RE_ENABLE_I18N */
2639 {
2640   unsigned int start_ch, end_ch;
2641   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2642   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2643 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2644 	  0))
2645     return REG_ERANGE;
2646 
2647   /* We can handle no multi character collating elements without libc
2648      support.  */
2649   if (BE ((start_elem->type == COLL_SYM
2650 	   && strlen ((char *) start_elem->opr.name) > 1)
2651 	  || (end_elem->type == COLL_SYM
2652 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2653     return REG_ECOLLATE;
2654 
2655 # ifdef RE_ENABLE_I18N
2656   {
2657     wchar_t wc;
2658     wint_t start_wc;
2659     wint_t end_wc;
2660     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2661 
2662     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2663 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2664 		   : 0));
2665     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2666 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2667 		 : 0));
2668 #ifdef GAWK
2669     /*
2670      * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2671      * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2672      * unsigned, so we don't have sign extension problems.
2673      */
2674     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2675 		? start_ch : start_elem->opr.wch);
2676     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2677 	      ? end_ch : end_elem->opr.wch);
2678 #else
2679     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2680 		? __btowc (start_ch) : start_elem->opr.wch);
2681     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2682 	      ? __btowc (end_ch) : end_elem->opr.wch);
2683 #endif
2684     if (start_wc == WEOF || end_wc == WEOF)
2685       return REG_ECOLLATE;
2686     cmp_buf[0] = start_wc;
2687     cmp_buf[4] = end_wc;
2688     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2689       return REG_ERANGE;
2690 
2691     /* Got valid collation sequence values, add them as a new entry.
2692        However, for !_LIBC we have no collation elements: if the
2693        character set is single byte, the single byte character set
2694        that we build below suffices.  parse_bracket_exp passes
2695        no MBCSET if dfa->mb_cur_max == 1.  */
2696     if (mbcset)
2697       {
2698 	/* Check the space of the arrays.  */
2699 	if (BE (*range_alloc == mbcset->nranges, 0))
2700 	  {
2701 	    /* There is not enough space, need realloc.  */
2702 	    wchar_t *new_array_start, *new_array_end;
2703 	    int new_nranges;
2704 
2705 	    /* +1 in case of mbcset->nranges is 0.  */
2706 	    new_nranges = 2 * mbcset->nranges + 1;
2707 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2708 	       are NULL if *range_alloc == 0.  */
2709 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2710 					  new_nranges);
2711 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2712 					new_nranges);
2713 
2714 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2715 	      return REG_ESPACE;
2716 
2717 	    mbcset->range_starts = new_array_start;
2718 	    mbcset->range_ends = new_array_end;
2719 	    *range_alloc = new_nranges;
2720 	  }
2721 
2722 	mbcset->range_starts[mbcset->nranges] = start_wc;
2723 	mbcset->range_ends[mbcset->nranges++] = end_wc;
2724       }
2725 
2726     /* Build the table for single byte characters.  */
2727     for (wc = 0; wc < SBC_MAX; ++wc)
2728       {
2729 	cmp_buf[2] = wc;
2730 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2731 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2732 	  bitset_set (sbcset, wc);
2733       }
2734   }
2735 # else /* not RE_ENABLE_I18N */
2736   {
2737     unsigned int ch;
2738     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2739 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2740 		   : 0));
2741     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2742 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2743 		 : 0));
2744     if (start_ch > end_ch)
2745       return REG_ERANGE;
2746     /* Build the table for single byte characters.  */
2747     for (ch = 0; ch < SBC_MAX; ++ch)
2748       if (start_ch <= ch  && ch <= end_ch)
2749 	bitset_set (sbcset, ch);
2750   }
2751 # endif /* not RE_ENABLE_I18N */
2752   return REG_NOERROR;
2753 }
2754 #endif /* not _LIBC */
2755 
2756 #ifndef _LIBC
2757 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2758    Build the collating element which is represented by NAME.
2759    The result are written to MBCSET and SBCSET.
2760    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2761    pointer argument since we may update it.  */
2762 
2763 static reg_errcode_t
2764 internal_function
2765 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,int * coll_sym_alloc,const unsigned char * name)2766 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2767 			int *coll_sym_alloc, const unsigned char *name)
2768 # else /* not RE_ENABLE_I18N */
2769 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2770 # endif /* not RE_ENABLE_I18N */
2771 {
2772   size_t name_len = strlen ((const char *) name);
2773   if (BE (name_len != 1, 0))
2774     return REG_ECOLLATE;
2775   else
2776     {
2777       bitset_set (sbcset, name[0]);
2778       return REG_NOERROR;
2779     }
2780 }
2781 #endif /* not _LIBC */
2782 
2783 /* This function parse bracket expression like "[abc]", "[a-c]",
2784    "[[.a-a.]]" etc.  */
2785 
2786 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2787 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2788 		   reg_syntax_t syntax, reg_errcode_t *err)
2789 {
2790 #ifdef _LIBC
2791   const unsigned char *collseqmb;
2792   const char *collseqwc;
2793   uint32_t nrules;
2794   int32_t table_size;
2795   const int32_t *symb_table;
2796   const unsigned char *extra;
2797 
2798   /* Local function for parse_bracket_exp used in _LIBC environment.
2799      Seek the collating symbol entry correspondings to NAME.
2800      Return the index of the symbol in the SYMB_TABLE.  */
2801 
2802   auto inline int32_t
2803   __attribute ((always_inline))
2804   seek_collating_symbol_entry (name, name_len)
2805 	 const unsigned char *name;
2806 	 size_t name_len;
2807     {
2808       int32_t hash = elem_hash ((const char *) name, name_len);
2809       int32_t elem = hash % table_size;
2810       if (symb_table[2 * elem] != 0)
2811 	{
2812 	  int32_t second = hash % (table_size - 2) + 1;
2813 
2814 	  do
2815 	    {
2816 	      /* First compare the hashing value.  */
2817 	      if (symb_table[2 * elem] == hash
2818 		  /* Compare the length of the name.  */
2819 		  && name_len == extra[symb_table[2 * elem + 1]]
2820 		  /* Compare the name.  */
2821 		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2822 			     name_len) == 0)
2823 		{
2824 		  /* Yep, this is the entry.  */
2825 		  break;
2826 		}
2827 
2828 	      /* Next entry.  */
2829 	      elem += second;
2830 	    }
2831 	  while (symb_table[2 * elem] != 0);
2832 	}
2833       return elem;
2834     }
2835 
2836   /* Local function for parse_bracket_exp used in _LIBC environment.
2837      Look up the collation sequence value of BR_ELEM.
2838      Return the value if succeeded, UINT_MAX otherwise.  */
2839 
2840   auto inline unsigned int
2841   __attribute ((always_inline))
2842   lookup_collation_sequence_value (br_elem)
2843 	 bracket_elem_t *br_elem;
2844     {
2845       if (br_elem->type == SB_CHAR)
2846 	{
2847 	  /*
2848 	  if (MB_CUR_MAX == 1)
2849 	  */
2850 	  if (nrules == 0)
2851 	    return collseqmb[br_elem->opr.ch];
2852 	  else
2853 	    {
2854 	      wint_t wc = __btowc (br_elem->opr.ch);
2855 	      return __collseq_table_lookup (collseqwc, wc);
2856 	    }
2857 	}
2858       else if (br_elem->type == MB_CHAR)
2859 	{
2860 	  if (nrules != 0)
2861 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2862 	}
2863       else if (br_elem->type == COLL_SYM)
2864 	{
2865 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2866 	  if (nrules != 0)
2867 	    {
2868 	      int32_t elem, idx;
2869 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2870 						  sym_name_len);
2871 	      if (symb_table[2 * elem] != 0)
2872 		{
2873 		  /* We found the entry.  */
2874 		  idx = symb_table[2 * elem + 1];
2875 		  /* Skip the name of collating element name.  */
2876 		  idx += 1 + extra[idx];
2877 		  /* Skip the byte sequence of the collating element.  */
2878 		  idx += 1 + extra[idx];
2879 		  /* Adjust for the alignment.  */
2880 		  idx = (idx + 3) & ~3;
2881 		  /* Skip the multibyte collation sequence value.  */
2882 		  idx += sizeof (unsigned int);
2883 		  /* Skip the wide char sequence of the collating element.  */
2884 		  idx += sizeof (unsigned int) *
2885 		    (1 + *(unsigned int *) (extra + idx));
2886 		  /* Return the collation sequence value.  */
2887 		  return *(unsigned int *) (extra + idx);
2888 		}
2889 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2890 		{
2891 		  /* No valid character.  Match it as a single byte
2892 		     character.  */
2893 		  return collseqmb[br_elem->opr.name[0]];
2894 		}
2895 	    }
2896 	  else if (sym_name_len == 1)
2897 	    return collseqmb[br_elem->opr.name[0]];
2898 	}
2899       return UINT_MAX;
2900     }
2901 
2902   /* Local function for parse_bracket_exp used in _LIBC environment.
2903      Build the range expression which starts from START_ELEM, and ends
2904      at END_ELEM.  The result are written to MBCSET and SBCSET.
2905      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2906      mbcset->range_ends, is a pointer argument since we may
2907      update it.  */
2908 
2909   auto inline reg_errcode_t
2910   __attribute ((always_inline))
2911   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2912 	 re_charset_t *mbcset;
2913 	 int *range_alloc;
2914 	 bitset_t sbcset;
2915 	 bracket_elem_t *start_elem, *end_elem;
2916     {
2917       unsigned int ch;
2918       uint32_t start_collseq;
2919       uint32_t end_collseq;
2920 
2921       /* Equivalence Classes and Character Classes can't be a range
2922 	 start/end.  */
2923       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2924 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2925 	      0))
2926 	return REG_ERANGE;
2927 
2928       start_collseq = lookup_collation_sequence_value (start_elem);
2929       end_collseq = lookup_collation_sequence_value (end_elem);
2930       /* Check start/end collation sequence values.  */
2931       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2932 	return REG_ECOLLATE;
2933       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2934 	return REG_ERANGE;
2935 
2936       /* Got valid collation sequence values, add them as a new entry.
2937 	 However, if we have no collation elements, and the character set
2938 	 is single byte, the single byte character set that we
2939 	 build below suffices. */
2940       if (nrules > 0 || dfa->mb_cur_max > 1)
2941 	{
2942 	  /* Check the space of the arrays.  */
2943 	  if (BE (*range_alloc == mbcset->nranges, 0))
2944 	    {
2945 	      /* There is not enough space, need realloc.  */
2946 	      uint32_t *new_array_start;
2947 	      uint32_t *new_array_end;
2948 	      int new_nranges;
2949 
2950 	      /* +1 in case of mbcset->nranges is 0.  */
2951 	      new_nranges = 2 * mbcset->nranges + 1;
2952 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2953 					    new_nranges);
2954 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2955 					  new_nranges);
2956 
2957 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2958 		return REG_ESPACE;
2959 
2960 	      mbcset->range_starts = new_array_start;
2961 	      mbcset->range_ends = new_array_end;
2962 	      *range_alloc = new_nranges;
2963 	    }
2964 
2965 	  mbcset->range_starts[mbcset->nranges] = start_collseq;
2966 	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
2967 	}
2968 
2969       /* Build the table for single byte characters.  */
2970       for (ch = 0; ch < SBC_MAX; ch++)
2971 	{
2972 	  uint32_t ch_collseq;
2973 	  /*
2974 	  if (MB_CUR_MAX == 1)
2975 	  */
2976 	  if (nrules == 0)
2977 	    ch_collseq = collseqmb[ch];
2978 	  else
2979 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2980 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2981 	    bitset_set (sbcset, ch);
2982 	}
2983       return REG_NOERROR;
2984     }
2985 
2986   /* Local function for parse_bracket_exp used in _LIBC environment.
2987      Build the collating element which is represented by NAME.
2988      The result are written to MBCSET and SBCSET.
2989      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2990      pointer argument since we may update it.  */
2991 
2992   auto inline reg_errcode_t
2993   __attribute ((always_inline))
2994   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2995 	 re_charset_t *mbcset;
2996 	 int *coll_sym_alloc;
2997 	 bitset_t sbcset;
2998 	 const unsigned char *name;
2999     {
3000       int32_t elem, idx;
3001       size_t name_len = strlen ((const char *) name);
3002       if (nrules != 0)
3003 	{
3004 	  elem = seek_collating_symbol_entry (name, name_len);
3005 	  if (symb_table[2 * elem] != 0)
3006 	    {
3007 	      /* We found the entry.  */
3008 	      idx = symb_table[2 * elem + 1];
3009 	      /* Skip the name of collating element name.  */
3010 	      idx += 1 + extra[idx];
3011 	    }
3012 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
3013 	    {
3014 	      /* No valid character, treat it as a normal
3015 		 character.  */
3016 	      bitset_set (sbcset, name[0]);
3017 	      return REG_NOERROR;
3018 	    }
3019 	  else
3020 	    return REG_ECOLLATE;
3021 
3022 	  /* Got valid collation sequence, add it as a new entry.  */
3023 	  /* Check the space of the arrays.  */
3024 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3025 	    {
3026 	      /* Not enough, realloc it.  */
3027 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3028 	      int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3029 	      /* Use realloc since mbcset->coll_syms is NULL
3030 		 if *alloc == 0.  */
3031 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3032 						   new_coll_sym_alloc);
3033 	      if (BE (new_coll_syms == NULL, 0))
3034 		return REG_ESPACE;
3035 	      mbcset->coll_syms = new_coll_syms;
3036 	      *coll_sym_alloc = new_coll_sym_alloc;
3037 	    }
3038 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3039 	  return REG_NOERROR;
3040 	}
3041       else
3042 	{
3043 	  if (BE (name_len != 1, 0))
3044 	    return REG_ECOLLATE;
3045 	  else
3046 	    {
3047 	      bitset_set (sbcset, name[0]);
3048 	      return REG_NOERROR;
3049 	    }
3050 	}
3051     }
3052 #endif
3053 
3054   re_token_t br_token;
3055   re_bitset_ptr_t sbcset;
3056 #ifdef RE_ENABLE_I18N
3057   re_charset_t *mbcset;
3058   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3059   int equiv_class_alloc = 0, char_class_alloc = 0;
3060 #endif /* not RE_ENABLE_I18N */
3061   int non_match = 0;
3062   bin_tree_t *work_tree;
3063   int token_len;
3064   int first_round = 1;
3065 #ifdef _LIBC
3066   collseqmb = (const unsigned char *)
3067     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3068   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3069   if (nrules)
3070     {
3071       /*
3072       if (MB_CUR_MAX > 1)
3073       */
3074       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3075       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3076       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3077 						  _NL_COLLATE_SYMB_TABLEMB);
3078       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3079 						   _NL_COLLATE_SYMB_EXTRAMB);
3080     }
3081 #endif
3082   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3083 #ifdef RE_ENABLE_I18N
3084   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3085 #endif /* RE_ENABLE_I18N */
3086 #ifdef RE_ENABLE_I18N
3087   if (BE (sbcset == NULL || mbcset == NULL, 0))
3088 #else
3089   if (BE (sbcset == NULL, 0))
3090 #endif /* RE_ENABLE_I18N */
3091     {
3092       *err = REG_ESPACE;
3093       return NULL;
3094     }
3095 
3096   token_len = peek_token_bracket (token, regexp, syntax);
3097   if (BE (token->type == END_OF_RE, 0))
3098     {
3099       *err = REG_BADPAT;
3100       goto parse_bracket_exp_free_return;
3101     }
3102   if (token->type == OP_NON_MATCH_LIST)
3103     {
3104 #ifdef RE_ENABLE_I18N
3105       mbcset->non_match = 1;
3106 #endif /* not RE_ENABLE_I18N */
3107       non_match = 1;
3108       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3109 	bitset_set (sbcset, '\n');
3110       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3111       token_len = peek_token_bracket (token, regexp, syntax);
3112       if (BE (token->type == END_OF_RE, 0))
3113 	{
3114 	  *err = REG_BADPAT;
3115 	  goto parse_bracket_exp_free_return;
3116 	}
3117     }
3118 
3119   /* We treat the first ']' as a normal character.  */
3120   if (token->type == OP_CLOSE_BRACKET)
3121     token->type = CHARACTER;
3122 
3123   while (1)
3124     {
3125       bracket_elem_t start_elem, end_elem;
3126       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3127       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3128       reg_errcode_t ret;
3129       int token_len2 = 0, is_range_exp = 0;
3130       re_token_t token2;
3131 
3132       start_elem.opr.name = start_name_buf;
3133       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3134 				   syntax, first_round);
3135       if (BE (ret != REG_NOERROR, 0))
3136 	{
3137 	  *err = ret;
3138 	  goto parse_bracket_exp_free_return;
3139 	}
3140       first_round = 0;
3141 
3142       /* Get information about the next token.  We need it in any case.  */
3143       token_len = peek_token_bracket (token, regexp, syntax);
3144 
3145       /* Do not check for ranges if we know they are not allowed.  */
3146       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3147 	{
3148 	  if (BE (token->type == END_OF_RE, 0))
3149 	    {
3150 	      *err = REG_EBRACK;
3151 	      goto parse_bracket_exp_free_return;
3152 	    }
3153 	  if (token->type == OP_CHARSET_RANGE)
3154 	    {
3155 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3156 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3157 	      if (BE (token2.type == END_OF_RE, 0))
3158 		{
3159 		  *err = REG_EBRACK;
3160 		  goto parse_bracket_exp_free_return;
3161 		}
3162 	      if (token2.type == OP_CLOSE_BRACKET)
3163 		{
3164 		  /* We treat the last '-' as a normal character.  */
3165 		  re_string_skip_bytes (regexp, -token_len);
3166 		  token->type = CHARACTER;
3167 		}
3168 	      else
3169 		is_range_exp = 1;
3170 	    }
3171 	}
3172 
3173       if (is_range_exp == 1)
3174 	{
3175 	  end_elem.opr.name = end_name_buf;
3176 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3177 				       dfa, syntax, 1);
3178 	  if (BE (ret != REG_NOERROR, 0))
3179 	    {
3180 	      *err = ret;
3181 	      goto parse_bracket_exp_free_return;
3182 	    }
3183 
3184 	  token_len = peek_token_bracket (token, regexp, syntax);
3185 
3186 #ifdef _LIBC
3187 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3188 				  &start_elem, &end_elem);
3189 #else
3190 # ifdef RE_ENABLE_I18N
3191 	  *err = build_range_exp (sbcset,
3192 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3193 				  &range_alloc, &start_elem, &end_elem);
3194 # else
3195 	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3196 # endif
3197 #endif /* RE_ENABLE_I18N */
3198 	  if (BE (*err != REG_NOERROR, 0))
3199 	    goto parse_bracket_exp_free_return;
3200 	}
3201       else
3202 	{
3203 	  switch (start_elem.type)
3204 	    {
3205 	    case SB_CHAR:
3206 	      bitset_set (sbcset, start_elem.opr.ch);
3207 	      break;
3208 #ifdef RE_ENABLE_I18N
3209 	    case MB_CHAR:
3210 	      /* Check whether the array has enough space.  */
3211 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3212 		{
3213 		  wchar_t *new_mbchars;
3214 		  /* Not enough, realloc it.  */
3215 		  /* +1 in case of mbcset->nmbchars is 0.  */
3216 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3217 		  /* Use realloc since array is NULL if *alloc == 0.  */
3218 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3219 					    mbchar_alloc);
3220 		  if (BE (new_mbchars == NULL, 0))
3221 		    goto parse_bracket_exp_espace;
3222 		  mbcset->mbchars = new_mbchars;
3223 		}
3224 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3225 	      break;
3226 #endif /* RE_ENABLE_I18N */
3227 	    case EQUIV_CLASS:
3228 	      *err = build_equiv_class (sbcset,
3229 #ifdef RE_ENABLE_I18N
3230 					mbcset, &equiv_class_alloc,
3231 #endif /* RE_ENABLE_I18N */
3232 					start_elem.opr.name);
3233 	      if (BE (*err != REG_NOERROR, 0))
3234 		goto parse_bracket_exp_free_return;
3235 	      break;
3236 	    case COLL_SYM:
3237 	      *err = build_collating_symbol (sbcset,
3238 #ifdef RE_ENABLE_I18N
3239 					     mbcset, &coll_sym_alloc,
3240 #endif /* RE_ENABLE_I18N */
3241 					     start_elem.opr.name);
3242 	      if (BE (*err != REG_NOERROR, 0))
3243 		goto parse_bracket_exp_free_return;
3244 	      break;
3245 	    case CHAR_CLASS:
3246 	      *err = build_charclass (regexp->trans, sbcset,
3247 #ifdef RE_ENABLE_I18N
3248 				      mbcset, &char_class_alloc,
3249 #endif /* RE_ENABLE_I18N */
3250 				      (const char *) start_elem.opr.name, syntax);
3251 	      if (BE (*err != REG_NOERROR, 0))
3252 	       goto parse_bracket_exp_free_return;
3253 	      break;
3254 	    default:
3255 	      assert (0);
3256 	      break;
3257 	    }
3258 	}
3259       if (BE (token->type == END_OF_RE, 0))
3260 	{
3261 	  *err = REG_EBRACK;
3262 	  goto parse_bracket_exp_free_return;
3263 	}
3264       if (token->type == OP_CLOSE_BRACKET)
3265 	break;
3266     }
3267 
3268   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3269 
3270   /* If it is non-matching list.  */
3271   if (non_match)
3272     bitset_not (sbcset);
3273 
3274 #ifdef RE_ENABLE_I18N
3275   /* Ensure only single byte characters are set.  */
3276   if (dfa->mb_cur_max > 1)
3277     bitset_mask (sbcset, dfa->sb_char);
3278 
3279   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3280       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3281 						     || mbcset->non_match)))
3282     {
3283       bin_tree_t *mbc_tree;
3284       int sbc_idx;
3285       /* Build a tree for complex bracket.  */
3286       dfa->has_mb_node = 1;
3287       br_token.type = COMPLEX_BRACKET;
3288       br_token.opr.mbcset = mbcset;
3289       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3290       if (BE (mbc_tree == NULL, 0))
3291 	goto parse_bracket_exp_espace;
3292       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3293 	if (sbcset[sbc_idx])
3294 	  break;
3295       /* If there are no bits set in sbcset, there is no point
3296 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3297       if (sbc_idx < BITSET_WORDS)
3298 	{
3299 	  /* Build a tree for simple bracket.  */
3300 	  br_token.type = SIMPLE_BRACKET;
3301 	  br_token.opr.sbcset = sbcset;
3302 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3303 	  if (BE (work_tree == NULL, 0))
3304 	    goto parse_bracket_exp_espace;
3305 
3306 	  /* Then join them by ALT node.  */
3307 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3308 	  if (BE (work_tree == NULL, 0))
3309 	    goto parse_bracket_exp_espace;
3310 	}
3311       else
3312 	{
3313 	  re_free (sbcset);
3314 	  work_tree = mbc_tree;
3315 	}
3316     }
3317   else
3318 #endif /* not RE_ENABLE_I18N */
3319     {
3320 #ifdef RE_ENABLE_I18N
3321       free_charset (mbcset);
3322 #endif
3323       /* Build a tree for simple bracket.  */
3324       br_token.type = SIMPLE_BRACKET;
3325       br_token.opr.sbcset = sbcset;
3326       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3327       if (BE (work_tree == NULL, 0))
3328 	goto parse_bracket_exp_espace;
3329     }
3330   return work_tree;
3331 
3332  parse_bracket_exp_espace:
3333   *err = REG_ESPACE;
3334  parse_bracket_exp_free_return:
3335   re_free (sbcset);
3336 #ifdef RE_ENABLE_I18N
3337   free_charset (mbcset);
3338 #endif /* RE_ENABLE_I18N */
3339   return NULL;
3340 }
3341 
3342 /* Parse an element in the bracket expression.  */
3343 
3344 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,int accept_hyphen)3345 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3346 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3347 		       reg_syntax_t syntax, int accept_hyphen)
3348 {
3349 #ifdef RE_ENABLE_I18N
3350   int cur_char_size;
3351   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3352   if (cur_char_size > 1)
3353     {
3354       elem->type = MB_CHAR;
3355       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3356       re_string_skip_bytes (regexp, cur_char_size);
3357       return REG_NOERROR;
3358     }
3359 #endif /* RE_ENABLE_I18N */
3360   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3361   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3362       || token->type == OP_OPEN_EQUIV_CLASS)
3363     return parse_bracket_symbol (elem, regexp, token);
3364   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3365     {
3366       /* A '-' must only appear as anything but a range indicator before
3367 	 the closing bracket.  Everything else is an error.  */
3368       re_token_t token2;
3369       (void) peek_token_bracket (&token2, regexp, syntax);
3370       if (token2.type != OP_CLOSE_BRACKET)
3371 	/* The actual error value is not standardized since this whole
3372 	   case is undefined.  But ERANGE makes good sense.  */
3373 	return REG_ERANGE;
3374     }
3375   elem->type = SB_CHAR;
3376   elem->opr.ch = token->opr.c;
3377   return REG_NOERROR;
3378 }
3379 
3380 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3381    such as [:<character_class>:], [.<collating_element>.], and
3382    [=<equivalent_class>=].  */
3383 
3384 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3385 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3386 		      re_token_t *token)
3387 {
3388   unsigned char ch, delim = token->opr.c;
3389   int i = 0;
3390   if (re_string_eoi(regexp))
3391     return REG_EBRACK;
3392   for (;; ++i)
3393     {
3394       if (i >= BRACKET_NAME_BUF_SIZE)
3395 	return REG_EBRACK;
3396       if (token->type == OP_OPEN_CHAR_CLASS)
3397 	ch = re_string_fetch_byte_case (regexp);
3398       else
3399 	ch = re_string_fetch_byte (regexp);
3400       if (re_string_eoi(regexp))
3401 	return REG_EBRACK;
3402       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3403 	break;
3404       elem->opr.name[i] = ch;
3405     }
3406   re_string_skip_bytes (regexp, 1);
3407   elem->opr.name[i] = '\0';
3408   switch (token->type)
3409     {
3410     case OP_OPEN_COLL_ELEM:
3411       elem->type = COLL_SYM;
3412       break;
3413     case OP_OPEN_EQUIV_CLASS:
3414       elem->type = EQUIV_CLASS;
3415       break;
3416     case OP_OPEN_CHAR_CLASS:
3417       elem->type = CHAR_CLASS;
3418       break;
3419     default:
3420       break;
3421     }
3422   return REG_NOERROR;
3423 }
3424 
3425   /* Helper function for parse_bracket_exp.
3426      Build the equivalence class which is represented by NAME.
3427      The result are written to MBCSET and SBCSET.
3428      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3429      is a pointer argument since we may update it.  */
3430 
3431 static reg_errcode_t
3432 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,int * equiv_class_alloc,const unsigned char * name)3433 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3434 		   int *equiv_class_alloc, const unsigned char *name)
3435 #else /* not RE_ENABLE_I18N */
3436 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3437 #endif /* not RE_ENABLE_I18N */
3438 {
3439 #ifdef _LIBC
3440   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3441   if (nrules != 0)
3442     {
3443       const int32_t *table, *indirect;
3444       const unsigned char *weights, *extra, *cp;
3445       unsigned char char_buf[2];
3446       int32_t idx1, idx2;
3447       unsigned int ch;
3448       size_t len;
3449       /* This #include defines a local function!  */
3450 # include <locale/weight.h>
3451       /* Calculate the index for equivalence class.  */
3452       cp = name;
3453       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3454       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3455 					       _NL_COLLATE_WEIGHTMB);
3456       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3457 						   _NL_COLLATE_EXTRAMB);
3458       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3459 						_NL_COLLATE_INDIRECTMB);
3460       idx1 = findidx (&cp);
3461       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3462 	/* This isn't a valid character.  */
3463 	return REG_ECOLLATE;
3464 
3465       /* Build single byte matching table for this equivalence class.  */
3466       char_buf[1] = (unsigned char) '\0';
3467       len = weights[idx1 & 0xffffff];
3468       for (ch = 0; ch < SBC_MAX; ++ch)
3469 	{
3470 	  char_buf[0] = ch;
3471 	  cp = char_buf;
3472 	  idx2 = findidx (&cp);
3473 /*
3474 	  idx2 = table[ch];
3475 */
3476 	  if (idx2 == 0)
3477 	    /* This isn't a valid character.  */
3478 	    continue;
3479 	  /* Compare only if the length matches and the collation rule
3480 	     index is the same.  */
3481 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3482 	    {
3483 	      int cnt = 0;
3484 
3485 	      while (cnt <= len &&
3486 		     weights[(idx1 & 0xffffff) + 1 + cnt]
3487 		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3488 		++cnt;
3489 
3490 	      if (cnt > len)
3491 		bitset_set (sbcset, ch);
3492 	    }
3493 	}
3494       /* Check whether the array has enough space.  */
3495       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3496 	{
3497 	  /* Not enough, realloc it.  */
3498 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3499 	  int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3500 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3501 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3502 						   int32_t,
3503 						   new_equiv_class_alloc);
3504 	  if (BE (new_equiv_classes == NULL, 0))
3505 	    return REG_ESPACE;
3506 	  mbcset->equiv_classes = new_equiv_classes;
3507 	  *equiv_class_alloc = new_equiv_class_alloc;
3508 	}
3509       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3510     }
3511   else
3512 #endif /* _LIBC */
3513     {
3514       if (BE (strlen ((const char *) name) != 1, 0))
3515 	return REG_ECOLLATE;
3516       bitset_set (sbcset, *name);
3517     }
3518   return REG_NOERROR;
3519 }
3520 
3521   /* Helper function for parse_bracket_exp.
3522      Build the character class which is represented by NAME.
3523      The result are written to MBCSET and SBCSET.
3524      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3525      is a pointer argument since we may update it.  */
3526 
3527 static reg_errcode_t
3528 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,int * char_class_alloc,const char * class_name,reg_syntax_t syntax)3529 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3530 		 re_charset_t *mbcset, int *char_class_alloc,
3531 		 const char *class_name, reg_syntax_t syntax)
3532 #else /* not RE_ENABLE_I18N */
3533 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3534 		 const char *class_name, reg_syntax_t syntax)
3535 #endif /* not RE_ENABLE_I18N */
3536 {
3537   int i;
3538 
3539   /* In case of REG_ICASE "upper" and "lower" match the both of
3540      upper and lower cases.  */
3541   if ((syntax & RE_ICASE)
3542       && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3543     class_name = "alpha";
3544 
3545 #ifdef RE_ENABLE_I18N
3546   /* Check the space of the arrays.  */
3547   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3548     {
3549       /* Not enough, realloc it.  */
3550       /* +1 in case of mbcset->nchar_classes is 0.  */
3551       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3552       /* Use realloc since array is NULL if *alloc == 0.  */
3553       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3554 					       new_char_class_alloc);
3555       if (BE (new_char_classes == NULL, 0))
3556 	return REG_ESPACE;
3557       mbcset->char_classes = new_char_classes;
3558       *char_class_alloc = new_char_class_alloc;
3559     }
3560   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3561 #endif /* RE_ENABLE_I18N */
3562 
3563 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3564   do {						\
3565     if (BE (trans != NULL, 0))			\
3566       {						\
3567 	for (i = 0; i < SBC_MAX; ++i)		\
3568   	  if (ctype_func (i))			\
3569 	    bitset_set (sbcset, trans[i]);	\
3570       }						\
3571     else					\
3572       {						\
3573 	for (i = 0; i < SBC_MAX; ++i)		\
3574   	  if (ctype_func (i))			\
3575 	    bitset_set (sbcset, i);		\
3576       }						\
3577   } while (0)
3578 
3579   if (strcmp (class_name, "alnum") == 0)
3580     BUILD_CHARCLASS_LOOP (isalnum);
3581   else if (strcmp (class_name, "cntrl") == 0)
3582     BUILD_CHARCLASS_LOOP (iscntrl);
3583   else if (strcmp (class_name, "lower") == 0)
3584     BUILD_CHARCLASS_LOOP (islower);
3585   else if (strcmp (class_name, "space") == 0)
3586     BUILD_CHARCLASS_LOOP (isspace);
3587   else if (strcmp (class_name, "alpha") == 0)
3588     BUILD_CHARCLASS_LOOP (isalpha);
3589   else if (strcmp (class_name, "digit") == 0)
3590     BUILD_CHARCLASS_LOOP (isdigit);
3591   else if (strcmp (class_name, "print") == 0)
3592     BUILD_CHARCLASS_LOOP (isprint);
3593   else if (strcmp (class_name, "upper") == 0)
3594     BUILD_CHARCLASS_LOOP (isupper);
3595   else if (strcmp (class_name, "blank") == 0)
3596 #ifndef GAWK
3597     BUILD_CHARCLASS_LOOP (isblank);
3598 #else
3599     /* see comments above */
3600     BUILD_CHARCLASS_LOOP (is_blank);
3601 #endif
3602   else if (strcmp (class_name, "graph") == 0)
3603     BUILD_CHARCLASS_LOOP (isgraph);
3604   else if (strcmp (class_name, "punct") == 0)
3605     BUILD_CHARCLASS_LOOP (ispunct);
3606   else if (strcmp (class_name, "xdigit") == 0)
3607     BUILD_CHARCLASS_LOOP (isxdigit);
3608   else
3609     return REG_ECTYPE;
3610 
3611   return REG_NOERROR;
3612 }
3613 
3614 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const char * class_name,const char * extra,int non_match,reg_errcode_t * err)3615 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3616 		    const char *class_name,
3617 		    const char *extra, int non_match,
3618 		    reg_errcode_t *err)
3619 {
3620   re_bitset_ptr_t sbcset;
3621 #ifdef RE_ENABLE_I18N
3622   re_charset_t *mbcset;
3623   int alloc = 0;
3624 #endif /* not RE_ENABLE_I18N */
3625   reg_errcode_t ret;
3626   re_token_t br_token;
3627   bin_tree_t *tree;
3628 
3629   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3630 #ifdef RE_ENABLE_I18N
3631   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3632 #endif /* RE_ENABLE_I18N */
3633 
3634 #ifdef RE_ENABLE_I18N
3635   if (BE (sbcset == NULL || mbcset == NULL, 0))
3636 #else /* not RE_ENABLE_I18N */
3637   if (BE (sbcset == NULL, 0))
3638 #endif /* not RE_ENABLE_I18N */
3639     {
3640       *err = REG_ESPACE;
3641       return NULL;
3642     }
3643 
3644   if (non_match)
3645     {
3646 #ifdef RE_ENABLE_I18N
3647       mbcset->non_match = 1;
3648 #endif /* not RE_ENABLE_I18N */
3649     }
3650 
3651   /* We don't care the syntax in this case.  */
3652   ret = build_charclass (trans, sbcset,
3653 #ifdef RE_ENABLE_I18N
3654 			 mbcset, &alloc,
3655 #endif /* RE_ENABLE_I18N */
3656 			 class_name, 0);
3657 
3658   if (BE (ret != REG_NOERROR, 0))
3659     {
3660       re_free (sbcset);
3661 #ifdef RE_ENABLE_I18N
3662       free_charset (mbcset);
3663 #endif /* RE_ENABLE_I18N */
3664       *err = ret;
3665       return NULL;
3666     }
3667   /* \w match '_' also.  */
3668   for (; *extra; extra++)
3669     bitset_set (sbcset, *extra);
3670 
3671   /* If it is non-matching list.  */
3672   if (non_match)
3673     bitset_not (sbcset);
3674 
3675 #ifdef RE_ENABLE_I18N
3676   /* Ensure only single byte characters are set.  */
3677   if (dfa->mb_cur_max > 1)
3678     bitset_mask (sbcset, dfa->sb_char);
3679 #endif
3680 
3681   /* Build a tree for simple bracket.  */
3682   br_token.type = SIMPLE_BRACKET;
3683   br_token.opr.sbcset = sbcset;
3684   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3685   if (BE (tree == NULL, 0))
3686     goto build_word_op_espace;
3687 
3688 #ifdef RE_ENABLE_I18N
3689   if (dfa->mb_cur_max > 1)
3690     {
3691       bin_tree_t *mbc_tree;
3692       /* Build a tree for complex bracket.  */
3693       br_token.type = COMPLEX_BRACKET;
3694       br_token.opr.mbcset = mbcset;
3695       dfa->has_mb_node = 1;
3696       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3697       if (BE (mbc_tree == NULL, 0))
3698 	goto build_word_op_espace;
3699       /* Then join them by ALT node.  */
3700       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3701       if (BE (mbc_tree != NULL, 1))
3702 	return tree;
3703     }
3704   else
3705     {
3706       free_charset (mbcset);
3707       return tree;
3708     }
3709 #else /* not RE_ENABLE_I18N */
3710   return tree;
3711 #endif /* not RE_ENABLE_I18N */
3712 
3713  build_word_op_espace:
3714   re_free (sbcset);
3715 #ifdef RE_ENABLE_I18N
3716   free_charset (mbcset);
3717 #endif /* RE_ENABLE_I18N */
3718   *err = REG_ESPACE;
3719   return NULL;
3720 }
3721 
3722 /* This is intended for the expressions like "a{1,3}".
3723    Fetch a number from `input', and return the number.
3724    Return -1, if the number field is empty like "{,1}".
3725    Return -2, if an error has occurred.  */
3726 
3727 static int
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3728 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3729 {
3730   int num = -1;
3731   unsigned char c;
3732   while (1)
3733     {
3734       fetch_token (token, input, syntax);
3735       c = token->opr.c;
3736       if (BE (token->type == END_OF_RE, 0))
3737 	return -2;
3738       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3739 	break;
3740       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3741 	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3742       num = (num > RE_DUP_MAX) ? -2 : num;
3743     }
3744   return num;
3745 }
3746 
3747 #ifdef RE_ENABLE_I18N
3748 static void
free_charset(re_charset_t * cset)3749 free_charset (re_charset_t *cset)
3750 {
3751   re_free (cset->mbchars);
3752 # ifdef _LIBC
3753   re_free (cset->coll_syms);
3754   re_free (cset->equiv_classes);
3755   re_free (cset->range_starts);
3756   re_free (cset->range_ends);
3757 # endif
3758   re_free (cset->char_classes);
3759   re_free (cset);
3760 }
3761 #endif /* RE_ENABLE_I18N */
3762 
3763 /* Functions for binary tree operation.  */
3764 
3765 /* Create a tree node.  */
3766 
3767 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3768 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3769 	     re_token_type_t type)
3770 {
3771   re_token_t t;
3772   t.type = type;
3773   return create_token_tree (dfa, left, right, &t);
3774 }
3775 
3776 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3777 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3778 		   const re_token_t *token)
3779 {
3780   bin_tree_t *tree;
3781   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3782     {
3783       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3784 
3785       if (storage == NULL)
3786 	return NULL;
3787       storage->next = dfa->str_tree_storage;
3788       dfa->str_tree_storage = storage;
3789       dfa->str_tree_storage_idx = 0;
3790     }
3791   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3792 
3793   tree->parent = NULL;
3794   tree->left = left;
3795   tree->right = right;
3796   tree->token = *token;
3797   tree->token.duplicated = 0;
3798   tree->token.opt_subexp = 0;
3799   tree->first = NULL;
3800   tree->next = NULL;
3801   tree->node_idx = -1;
3802 
3803   if (left != NULL)
3804     left->parent = tree;
3805   if (right != NULL)
3806     right->parent = tree;
3807   return tree;
3808 }
3809 
3810 /* Mark the tree SRC as an optional subexpression.
3811    To be called from preorder or postorder.  */
3812 
3813 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3814 mark_opt_subexp (void *extra, bin_tree_t *node)
3815 {
3816   int idx = (int) (intptr_t) extra;
3817   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3818     node->token.opt_subexp = 1;
3819 
3820   return REG_NOERROR;
3821 }
3822 
3823 /* Free the allocated memory inside NODE. */
3824 
3825 static void
free_token(re_token_t * node)3826 free_token (re_token_t *node)
3827 {
3828 #ifdef RE_ENABLE_I18N
3829   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3830     free_charset (node->opr.mbcset);
3831   else
3832 #endif /* RE_ENABLE_I18N */
3833     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3834       re_free (node->opr.sbcset);
3835 }
3836 
3837 /* Worker function for tree walking.  Free the allocated memory inside NODE
3838    and its children. */
3839 
3840 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3841 free_tree (void *extra, bin_tree_t *node)
3842 {
3843   free_token (&node->token);
3844   return REG_NOERROR;
3845 }
3846 
3847 
3848 /* Duplicate the node SRC, and return new node.  This is a preorder
3849    visit similar to the one implemented by the generic visitor, but
3850    we need more infrastructure to maintain two parallel trees --- so,
3851    it's easier to duplicate.  */
3852 
3853 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3854 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3855 {
3856   const bin_tree_t *node;
3857   bin_tree_t *dup_root;
3858   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3859 
3860   for (node = root; ; )
3861     {
3862       /* Create a new tree and link it back to the current parent.  */
3863       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3864       if (*p_new == NULL)
3865 	return NULL;
3866       (*p_new)->parent = dup_node;
3867       (*p_new)->token.duplicated = 1;
3868       dup_node = *p_new;
3869 
3870       /* Go to the left node, or up and to the right.  */
3871       if (node->left)
3872 	{
3873 	  node = node->left;
3874 	  p_new = &dup_node->left;
3875 	}
3876       else
3877 	{
3878 	  const bin_tree_t *prev = NULL;
3879 	  while (node->right == prev || node->right == NULL)
3880 	    {
3881 	      prev = node;
3882 	      node = node->parent;
3883 	      dup_node = dup_node->parent;
3884 	      if (!node)
3885 		return dup_root;
3886 	    }
3887 	  node = node->right;
3888 	  p_new = &dup_node->right;
3889 	}
3890     }
3891 }
3892