1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2017 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
23 
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 					  size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 				     const re_dfastate_t *init_state,
28 				     char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 			       reg_errcode_t (fn (void *, bin_tree_t *)),
41 			       void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 				reg_errcode_t (fn (void *, bin_tree_t *)),
44 				void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 				 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 				   unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 					 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 			 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 			reg_syntax_t syntax) internal_function;
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 			  reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 				  re_token_t *token, reg_syntax_t syntax,
67 				  Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 				 re_token_t *token, reg_syntax_t syntax,
70 				 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 				     re_token_t *token, reg_syntax_t syntax,
73 				     Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 				  re_token_t *token, reg_syntax_t syntax,
76 				  Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 				 re_dfa_t *dfa, re_token_t *token,
79 				 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 				      re_token_t *token, reg_syntax_t syntax,
82 				      reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 					    re_string_t *regexp,
85 					    re_token_t *token, int token_len,
86 					    re_dfa_t *dfa,
87 					    reg_syntax_t syntax,
88 					    bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 					  re_string_t *regexp,
91 					  re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 					re_charset_t *mbcset,
95 					Idx *equiv_class_alloc,
96 					const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 				      bitset_t sbcset,
99 				      re_charset_t *mbcset,
100 				      Idx *char_class_alloc,
101 				      const char *class_name,
102 				      reg_syntax_t syntax);
103 #else  /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 					const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 				      bitset_t sbcset,
108 				      const char *class_name,
109 				      reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 				       RE_TRANSLATE_TYPE trans,
113 				       const char *class_name,
114 				       const char *extra,
115 				       bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 				bin_tree_t *left, bin_tree_t *right,
118 				re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 				      bin_tree_t *left, bin_tree_t *right,
121 				      const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126 
127 /* This table gives an error message for each of the error codes listed
128    in regex.h.  Obviously the order here has to be same as there.
129    POSIX doesn't require that we do anything for REG_NOERROR,
130    but why not be nice?  */
131 
132 static const char __re_error_msgid[] =
133   {
134 #define REG_NOERROR_IDX	0
135     gettext_noop ("Success")	/* REG_NOERROR */
136     "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138     gettext_noop ("No match")	/* REG_NOMATCH */
139     "\0"
140 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142     "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145     "\0"
146 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148     "\0"
149 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151     "\0"
152 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154     "\0"
155 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
156     gettext_noop ("Unmatched [, [^, [:, [., or [=")	/* REG_EBRACK */
157     "\0"
158 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160     "\0"
161 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163     "\0"
164 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166     "\0"
167 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168     gettext_noop ("Invalid range end")	/* REG_ERANGE */
169     "\0"
170 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
172     "\0"
173 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175     "\0"
176 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
178     "\0"
179 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
181     "\0"
182 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184   };
185 
186 static const size_t __re_error_msgid_idx[] =
187   {
188     REG_NOERROR_IDX,
189     REG_NOMATCH_IDX,
190     REG_BADPAT_IDX,
191     REG_ECOLLATE_IDX,
192     REG_ECTYPE_IDX,
193     REG_EESCAPE_IDX,
194     REG_ESUBREG_IDX,
195     REG_EBRACK_IDX,
196     REG_EPAREN_IDX,
197     REG_EBRACE_IDX,
198     REG_BADBR_IDX,
199     REG_ERANGE_IDX,
200     REG_ESPACE_IDX,
201     REG_BADRPT_IDX,
202     REG_EEND_IDX,
203     REG_ESIZE_IDX,
204     REG_ERPAREN_IDX
205   };
206 
207 /* Entry points for GNU code.  */
208 
209 /* re_compile_pattern is the GNU regular expression compiler: it
210    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211    Returns 0 if the pattern was valid, otherwise an error string.
212 
213    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214    are set in BUFP on entry.  */
215 
216 const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)217 re_compile_pattern (const char *pattern, size_t length,
218 		    struct re_pattern_buffer *bufp)
219 {
220   reg_errcode_t ret;
221 
222   /* And GNU code determines whether or not to get register information
223      by passing null for the REGS argument to re_match, etc., not by
224      setting no_sub, unless RE_NO_SUB is set.  */
225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226 
227   /* Match anchors at newline.  */
228   bufp->newline_anchor = 1;
229 
230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231 
232   if (!ret)
233     return NULL;
234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235 }
236 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)237 weak_alias (__re_compile_pattern, re_compile_pattern)
238 #endif
239 
240 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
241    also be assigned to arbitrarily: each pattern buffer stores its own
242    syntax, so it can be changed between regex compilations.  */
243 /* This has no initializer because initialized variables in Emacs
244    become read-only after dumping.  */
245 reg_syntax_t re_syntax_options;
246 
247 
248 /* Specify the precise syntax of regexps for compilation.  This provides
249    for compatibility for various utilities which historically have
250    different, incompatible syntaxes.
251 
252    The argument SYNTAX is a bit mask comprised of the various bits
253    defined in regex.h.  We return the old syntax.  */
254 
255 reg_syntax_t
256 re_set_syntax (reg_syntax_t syntax)
257 {
258   reg_syntax_t ret = re_syntax_options;
259 
260   re_syntax_options = syntax;
261   return ret;
262 }
263 #ifdef _LIBC
weak_alias(__re_set_syntax,re_set_syntax)264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
266 
267 int
268 re_compile_fastmap (struct re_pattern_buffer *bufp)
269 {
270   re_dfa_t *dfa = bufp->buffer;
271   char *fastmap = bufp->fastmap;
272 
273   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275   if (dfa->init_state != dfa->init_state_word)
276     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277   if (dfa->init_state != dfa->init_state_nl)
278     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279   if (dfa->init_state != dfa->init_state_begbuf)
280     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281   bufp->fastmap_accurate = 1;
282   return 0;
283 }
284 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
286 #endif
287 
288 static inline void
289 __attribute__ ((always_inline))
290 re_set_fastmap (char *fastmap, bool icase, int ch)
291 {
292   fastmap[ch] = 1;
293   if (icase)
294     fastmap[tolower (ch)] = 1;
295 }
296 
297 /* Helper function for re_compile_fastmap.
298    Compile fastmap for the initial_state INIT_STATE.  */
299 
300 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 			 char *fastmap)
303 {
304   re_dfa_t *dfa = bufp->buffer;
305   Idx node_cnt;
306   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
308     {
309       Idx node = init_state->nodes.elems[node_cnt];
310       re_token_type_t type = dfa->nodes[node].type;
311 
312       if (type == CHARACTER)
313 	{
314 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315 #ifdef RE_ENABLE_I18N
316 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
317 	    {
318 	      unsigned char buf[MB_LEN_MAX];
319 	      unsigned char *p;
320 	      wchar_t wc;
321 	      mbstate_t state;
322 
323 	      p = buf;
324 	      *p++ = dfa->nodes[node].opr.c;
325 	      while (++node < dfa->nodes_len
326 		     &&	dfa->nodes[node].type == CHARACTER
327 		     && dfa->nodes[node].mb_partial)
328 		*p++ = dfa->nodes[node].opr.c;
329 	      memset (&state, '\0', sizeof (state));
330 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 			     &state) == p - buf
332 		  && (__wcrtomb ((char *) buf, __towlower (wc), &state)
333 		      != (size_t) -1))
334 		re_set_fastmap (fastmap, false, buf[0]);
335 	    }
336 #endif
337 	}
338       else if (type == SIMPLE_BRACKET)
339 	{
340 	  int i, ch;
341 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 	    {
343 	      int j;
344 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 		if (w & ((bitset_word_t) 1 << j))
347 		  re_set_fastmap (fastmap, icase, ch);
348 	    }
349 	}
350 #ifdef RE_ENABLE_I18N
351       else if (type == COMPLEX_BRACKET)
352 	{
353 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
354 	  Idx i;
355 
356 # ifdef _LIBC
357 	  /* See if we have to try all bytes which start multiple collation
358 	     elements.
359 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 		  collation element, and don't catch 'b' since 'b' is
361 		  the only collation element which starts from 'b' (and
362 		  it is caught by SIMPLE_BRACKET).  */
363 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364 		  && (cset->ncoll_syms || cset->nranges))
365 		{
366 		  const int32_t *table = (const int32_t *)
367 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
368 		  for (i = 0; i < SBC_MAX; ++i)
369 		    if (table[i] < 0)
370 		      re_set_fastmap (fastmap, icase, i);
371 		}
372 # endif /* _LIBC */
373 
374 	  /* See if we have to start the match at all multibyte characters,
375 	     i.e. where we would not find an invalid sequence.  This only
376 	     applies to multibyte character sets; for single byte character
377 	     sets, the SIMPLE_BRACKET again suffices.  */
378 	  if (dfa->mb_cur_max > 1
379 	      && (cset->nchar_classes || cset->non_match || cset->nranges
380 # ifdef _LIBC
381 		  || cset->nequiv_classes
382 # endif /* _LIBC */
383 		 ))
384 	    {
385 	      unsigned char c = 0;
386 	      do
387 		{
388 		  mbstate_t mbs;
389 		  memset (&mbs, 0, sizeof (mbs));
390 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391 		    re_set_fastmap (fastmap, false, (int) c);
392 		}
393 	      while (++c != 0);
394 	    }
395 
396 	  else
397 	    {
398 	      /* ... Else catch all bytes which can start the mbchars.  */
399 	      for (i = 0; i < cset->nmbchars; ++i)
400 		{
401 		  char buf[256];
402 		  mbstate_t state;
403 		  memset (&state, '\0', sizeof (state));
404 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
407 		    {
408 		      if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
409 			  != (size_t) -1)
410 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
411 		    }
412 		}
413 	    }
414 	}
415 #endif /* RE_ENABLE_I18N */
416       else if (type == OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 	       || type == OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 	       || type == END_OF_RE)
421 	{
422 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 	  if (type == END_OF_RE)
424 	    bufp->can_be_null = 1;
425 	  return;
426 	}
427     }
428 }
429 
430 /* Entry point for POSIX code.  */
431 /* regcomp takes a regular expression as a string and compiles it.
432 
433    PREG is a regex_t *.  We do not expect any fields to be initialized,
434    since POSIX says we shouldn't.  Thus, we set
435 
436      'buffer' to the compiled pattern;
437      'used' to the length of the compiled pattern;
438      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439        REG_EXTENDED bit in CFLAGS is set; otherwise, to
440        RE_SYNTAX_POSIX_BASIC;
441      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
442      'fastmap' to an allocated space for the fastmap;
443      'fastmap_accurate' to zero;
444      're_nsub' to the number of subexpressions in PATTERN.
445 
446    PATTERN is the address of the pattern string.
447 
448    CFLAGS is a series of bits which affect compilation.
449 
450      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451      use POSIX basic syntax.
452 
453      If REG_NEWLINE is set, then . and [^...] don't match newline.
454      Also, regexec will try a match beginning after every newline.
455 
456      If REG_ICASE is set, then we considers upper- and lowercase
457      versions of letters to be equivalent when matching.
458 
459      If REG_NOSUB is set, then when PREG is passed to regexec, that
460      routine will report only success or failure, and nothing about the
461      registers.
462 
463    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
464    the return codes and their meanings.)  */
465 
466 int
regcomp(regex_t * _Restrict_ preg,const char * _Restrict_ pattern,int cflags)467 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
468 {
469   reg_errcode_t ret;
470   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
471 			 : RE_SYNTAX_POSIX_BASIC);
472 
473   preg->buffer = NULL;
474   preg->allocated = 0;
475   preg->used = 0;
476 
477   /* Try to allocate space for the fastmap.  */
478   preg->fastmap = re_malloc (char, SBC_MAX);
479   if (BE (preg->fastmap == NULL, 0))
480     return REG_ESPACE;
481 
482   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
483 
484   /* If REG_NEWLINE is set, newlines are treated differently.  */
485   if (cflags & REG_NEWLINE)
486     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
487       syntax &= ~RE_DOT_NEWLINE;
488       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489       /* It also changes the matching behavior.  */
490       preg->newline_anchor = 1;
491     }
492   else
493     preg->newline_anchor = 0;
494   preg->no_sub = !!(cflags & REG_NOSUB);
495   preg->translate = NULL;
496 
497   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
498 
499   /* POSIX doesn't distinguish between an unmatched open-group and an
500      unmatched close-group: both are REG_EPAREN.  */
501   if (ret == REG_ERPAREN)
502     ret = REG_EPAREN;
503 
504   /* We have already checked preg->fastmap != NULL.  */
505   if (BE (ret == REG_NOERROR, 1))
506     /* Compute the fastmap now, since regexec cannot modify the pattern
507        buffer.  This function never fails in this implementation.  */
508     (void) re_compile_fastmap (preg);
509   else
510     {
511       /* Some error occurred while compiling the expression.  */
512       re_free (preg->fastmap);
513       preg->fastmap = NULL;
514     }
515 
516   return (int) ret;
517 }
518 #ifdef _LIBC
weak_alias(__regcomp,regcomp)519 weak_alias (__regcomp, regcomp)
520 #endif
521 
522 /* Returns a message corresponding to an error code, ERRCODE, returned
523    from either regcomp or regexec.   We don't use PREG here.  */
524 
525 size_t
526 regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
527 	  size_t errbuf_size)
528 {
529   const char *msg;
530   size_t msg_size;
531 
532   if (BE (errcode < 0
533 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
534 			       / sizeof (__re_error_msgid_idx[0])), 0))
535     /* Only error codes returned by the rest of the code should be passed
536        to this routine.  If we are given anything else, or if other regex
537        code generates an invalid error code, then the program has a bug.
538        Dump core so we can fix it.  */
539     abort ();
540 
541   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
542 
543   msg_size = strlen (msg) + 1; /* Includes the null.  */
544 
545   if (BE (errbuf_size != 0, 1))
546     {
547       size_t cpy_size = msg_size;
548       if (BE (msg_size > errbuf_size, 0))
549 	{
550 	  cpy_size = errbuf_size - 1;
551 	  errbuf[cpy_size] = '\0';
552 	}
553       memcpy (errbuf, msg, cpy_size);
554     }
555 
556   return msg_size;
557 }
558 #ifdef _LIBC
559 weak_alias (__regerror, regerror)
560 #endif
561 
562 
563 #ifdef RE_ENABLE_I18N
564 /* This static array is used for the map to single-byte characters when
565    UTF-8 is used.  Otherwise we would allocate memory just to initialize
566    it the same all the time.  UTF-8 is the preferred encoding so this is
567    a worthwhile optimization.  */
568 static const bitset_t utf8_sb_map =
569 {
570   /* Set the first 128 bits.  */
571 # if defined __GNUC__ && !defined __STRICT_ANSI__
572   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
573 # else
574 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
575 #   error "bitset_word_t is narrower than 32 bits"
576 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
577   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
578 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
579   BITSET_WORD_MAX, BITSET_WORD_MAX,
580 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
581   BITSET_WORD_MAX,
582 #  endif
583   (BITSET_WORD_MAX
584    >> (SBC_MAX % BITSET_WORD_BITS == 0
585        ? 0
586        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
587 # endif
588 };
589 #endif
590 
591 
592 static void
free_dfa_content(re_dfa_t * dfa)593 free_dfa_content (re_dfa_t *dfa)
594 {
595   Idx i, j;
596 
597   if (dfa->nodes)
598     for (i = 0; i < dfa->nodes_len; ++i)
599       free_token (dfa->nodes + i);
600   re_free (dfa->nexts);
601   for (i = 0; i < dfa->nodes_len; ++i)
602     {
603       if (dfa->eclosures != NULL)
604 	re_node_set_free (dfa->eclosures + i);
605       if (dfa->inveclosures != NULL)
606 	re_node_set_free (dfa->inveclosures + i);
607       if (dfa->edests != NULL)
608 	re_node_set_free (dfa->edests + i);
609     }
610   re_free (dfa->edests);
611   re_free (dfa->eclosures);
612   re_free (dfa->inveclosures);
613   re_free (dfa->nodes);
614 
615   if (dfa->state_table)
616     for (i = 0; i <= dfa->state_hash_mask; ++i)
617       {
618 	struct re_state_table_entry *entry = dfa->state_table + i;
619 	for (j = 0; j < entry->num; ++j)
620 	  {
621 	    re_dfastate_t *state = entry->array[j];
622 	    free_state (state);
623 	  }
624 	re_free (entry->array);
625       }
626   re_free (dfa->state_table);
627 #ifdef RE_ENABLE_I18N
628   if (dfa->sb_char != utf8_sb_map)
629     re_free (dfa->sb_char);
630 #endif
631   re_free (dfa->subexp_map);
632 #ifdef DEBUG
633   re_free (dfa->re_str);
634 #endif
635 
636   re_free (dfa);
637 }
638 
639 
640 /* Free dynamically allocated space used by PREG.  */
641 
642 void
regfree(regex_t * preg)643 regfree (regex_t *preg)
644 {
645   re_dfa_t *dfa = preg->buffer;
646   if (BE (dfa != NULL, 1))
647     {
648       lock_fini (dfa->lock);
649       free_dfa_content (dfa);
650     }
651   preg->buffer = NULL;
652   preg->allocated = 0;
653 
654   re_free (preg->fastmap);
655   preg->fastmap = NULL;
656 
657   re_free (preg->translate);
658   preg->translate = NULL;
659 }
660 #ifdef _LIBC
661 weak_alias (__regfree, regfree)
662 #endif
663 
664 /* Entry points compatible with 4.2 BSD regex library.  We don't define
665    them unless specifically requested.  */
666 
667 #if defined _REGEX_RE_COMP || defined _LIBC
668 
669 /* BSD has one and only one pattern buffer.  */
670 static struct re_pattern_buffer re_comp_buf;
671 
672 char *
673 # ifdef _LIBC
674 /* Make these definitions weak in libc, so POSIX programs can redefine
675    these names if they don't use our functions, and still use
676    regcomp/regexec above without link errors.  */
677 weak_function
678 # endif
re_comp(const char * s)679 re_comp (const char *s)
680 {
681   reg_errcode_t ret;
682   char *fastmap;
683 
684   if (!s)
685     {
686       if (!re_comp_buf.buffer)
687 	return gettext ("No previous regular expression");
688       return 0;
689     }
690 
691   if (re_comp_buf.buffer)
692     {
693       fastmap = re_comp_buf.fastmap;
694       re_comp_buf.fastmap = NULL;
695       __regfree (&re_comp_buf);
696       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
697       re_comp_buf.fastmap = fastmap;
698     }
699 
700   if (re_comp_buf.fastmap == NULL)
701     {
702       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
703       if (re_comp_buf.fastmap == NULL)
704 	return (char *) gettext (__re_error_msgid
705 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
706     }
707 
708   /* Since 're_exec' always passes NULL for the 'regs' argument, we
709      don't need to initialize the pattern buffer fields which affect it.  */
710 
711   /* Match anchors at newlines.  */
712   re_comp_buf.newline_anchor = 1;
713 
714   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
715 
716   if (!ret)
717     return NULL;
718 
719   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
720   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
721 }
722 
723 #ifdef _LIBC
libc_freeres_fn(free_mem)724 libc_freeres_fn (free_mem)
725 {
726   __regfree (&re_comp_buf);
727 }
728 #endif
729 
730 #endif /* _REGEX_RE_COMP */
731 
732 /* Internal entry point.
733    Compile the regular expression PATTERN, whose length is LENGTH.
734    SYNTAX indicate regular expression's syntax.  */
735 
736 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)737 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
738 		     reg_syntax_t syntax)
739 {
740   reg_errcode_t err = REG_NOERROR;
741   re_dfa_t *dfa;
742   re_string_t regexp;
743 
744   /* Initialize the pattern buffer.  */
745   preg->fastmap_accurate = 0;
746   preg->syntax = syntax;
747   preg->not_bol = preg->not_eol = 0;
748   preg->used = 0;
749   preg->re_nsub = 0;
750   preg->can_be_null = 0;
751   preg->regs_allocated = REGS_UNALLOCATED;
752 
753   /* Initialize the dfa.  */
754   dfa = preg->buffer;
755   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
756     {
757       /* If zero allocated, but buffer is non-null, try to realloc
758 	 enough space.  This loses if buffer's address is bogus, but
759 	 that is the user's responsibility.  If ->buffer is NULL this
760 	 is a simple allocation.  */
761       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
762       if (dfa == NULL)
763 	return REG_ESPACE;
764       preg->allocated = sizeof (re_dfa_t);
765       preg->buffer = dfa;
766     }
767   preg->used = sizeof (re_dfa_t);
768 
769   err = init_dfa (dfa, length);
770   if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
771     err = REG_ESPACE;
772   if (BE (err != REG_NOERROR, 0))
773     {
774       free_dfa_content (dfa);
775       preg->buffer = NULL;
776       preg->allocated = 0;
777       return err;
778     }
779 #ifdef DEBUG
780   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
781   dfa->re_str = re_malloc (char, length + 1);
782   strncpy (dfa->re_str, pattern, length + 1);
783 #endif
784 
785   err = re_string_construct (&regexp, pattern, length, preg->translate,
786 			     (syntax & RE_ICASE) != 0, dfa);
787   if (BE (err != REG_NOERROR, 0))
788     {
789     re_compile_internal_free_return:
790       free_workarea_compile (preg);
791       re_string_destruct (&regexp);
792       lock_fini (dfa->lock);
793       free_dfa_content (dfa);
794       preg->buffer = NULL;
795       preg->allocated = 0;
796       return err;
797     }
798 
799   /* Parse the regular expression, and build a structure tree.  */
800   preg->re_nsub = 0;
801   dfa->str_tree = parse (&regexp, preg, syntax, &err);
802   if (BE (dfa->str_tree == NULL, 0))
803     goto re_compile_internal_free_return;
804 
805   /* Analyze the tree and create the nfa.  */
806   err = analyze (preg);
807   if (BE (err != REG_NOERROR, 0))
808     goto re_compile_internal_free_return;
809 
810 #ifdef RE_ENABLE_I18N
811   /* If possible, do searching in single byte encoding to speed things up.  */
812   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
813     optimize_utf8 (dfa);
814 #endif
815 
816   /* Then create the initial state of the dfa.  */
817   err = create_initial_state (dfa);
818 
819   /* Release work areas.  */
820   free_workarea_compile (preg);
821   re_string_destruct (&regexp);
822 
823   if (BE (err != REG_NOERROR, 0))
824     {
825       lock_fini (dfa->lock);
826       free_dfa_content (dfa);
827       preg->buffer = NULL;
828       preg->allocated = 0;
829     }
830 
831   return err;
832 }
833 
834 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
835    as the initial length of some arrays.  */
836 
837 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)838 init_dfa (re_dfa_t *dfa, size_t pat_len)
839 {
840   __re_size_t table_size;
841 #ifndef _LIBC
842   const char *codeset_name;
843 #endif
844 #ifdef RE_ENABLE_I18N
845   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
846 #else
847   size_t max_i18n_object_size = 0;
848 #endif
849   size_t max_object_size =
850     MAX (sizeof (struct re_state_table_entry),
851 	 MAX (sizeof (re_token_t),
852 	      MAX (sizeof (re_node_set),
853 		   MAX (sizeof (regmatch_t),
854 			max_i18n_object_size))));
855 
856   memset (dfa, '\0', sizeof (re_dfa_t));
857 
858   /* Force allocation of str_tree_storage the first time.  */
859   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
860 
861   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
862      calculation below, and for similar doubling calculations
863      elsewhere.  And it's <= rather than <, because some of the
864      doubling calculations add 1 afterwards.  */
865   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
866     return REG_ESPACE;
867 
868   dfa->nodes_alloc = pat_len + 1;
869   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
870 
871   /*  table_size = 2 ^ ceil(log pat_len) */
872   for (table_size = 1; ; table_size <<= 1)
873     if (table_size > pat_len)
874       break;
875 
876   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
877   dfa->state_hash_mask = table_size - 1;
878 
879   dfa->mb_cur_max = MB_CUR_MAX;
880 #ifdef _LIBC
881   if (dfa->mb_cur_max == 6
882       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
883     dfa->is_utf8 = 1;
884   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
885 		       != 0);
886 #else
887   codeset_name = nl_langinfo (CODESET);
888   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
889       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
890       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
891       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
892     dfa->is_utf8 = 1;
893 
894   /* We check exhaustively in the loop below if this charset is a
895      superset of ASCII.  */
896   dfa->map_notascii = 0;
897 #endif
898 
899 #ifdef RE_ENABLE_I18N
900   if (dfa->mb_cur_max > 1)
901     {
902       if (dfa->is_utf8)
903 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
904       else
905 	{
906 	  int i, j, ch;
907 
908 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
909 	  if (BE (dfa->sb_char == NULL, 0))
910 	    return REG_ESPACE;
911 
912 	  /* Set the bits corresponding to single byte chars.  */
913 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
914 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
915 	      {
916 		wint_t wch = __btowc (ch);
917 		if (wch != WEOF)
918 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
919 # ifndef _LIBC
920 		if (isascii (ch) && wch != ch)
921 		  dfa->map_notascii = 1;
922 # endif
923 	      }
924 	}
925     }
926 #endif
927 
928   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
929     return REG_ESPACE;
930   return REG_NOERROR;
931 }
932 
933 /* Initialize WORD_CHAR table, which indicate which character is
934    "word".  In this case "word" means that it is the word construction
935    character used by some operators like "\<", "\>", etc.  */
936 
937 static void
938 internal_function
init_word_char(re_dfa_t * dfa)939 init_word_char (re_dfa_t *dfa)
940 {
941   int i = 0;
942   int j;
943   int ch = 0;
944   dfa->word_ops_used = 1;
945   if (BE (dfa->map_notascii == 0, 1))
946     {
947       bitset_word_t bits0 = 0x00000000;
948       bitset_word_t bits1 = 0x03ff0000;
949       bitset_word_t bits2 = 0x87fffffe;
950       bitset_word_t bits3 = 0x07fffffe;
951       if (BITSET_WORD_BITS == 64)
952 	{
953 	  dfa->word_char[0] = bits1 << 31 << 1 | bits0;
954 	  dfa->word_char[1] = bits3 << 31 << 1 | bits2;
955 	  i = 2;
956 	}
957       else if (BITSET_WORD_BITS == 32)
958 	{
959 	  dfa->word_char[0] = bits0;
960 	  dfa->word_char[1] = bits1;
961 	  dfa->word_char[2] = bits2;
962 	  dfa->word_char[3] = bits3;
963 	  i = 4;
964 	}
965       else
966         goto general_case;
967       ch = 128;
968 
969       if (BE (dfa->is_utf8, 1))
970 	{
971 	  memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
972 	  return;
973 	}
974     }
975 
976  general_case:
977   for (; i < BITSET_WORDS; ++i)
978     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
979       if (isalnum (ch) || ch == '_')
980 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
981 }
982 
983 /* Free the work area which are only used while compiling.  */
984 
985 static void
free_workarea_compile(regex_t * preg)986 free_workarea_compile (regex_t *preg)
987 {
988   re_dfa_t *dfa = preg->buffer;
989   bin_tree_storage_t *storage, *next;
990   for (storage = dfa->str_tree_storage; storage; storage = next)
991     {
992       next = storage->next;
993       re_free (storage);
994     }
995   dfa->str_tree_storage = NULL;
996   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
997   dfa->str_tree = NULL;
998   re_free (dfa->org_indices);
999   dfa->org_indices = NULL;
1000 }
1001 
1002 /* Create initial states for all contexts.  */
1003 
1004 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)1005 create_initial_state (re_dfa_t *dfa)
1006 {
1007   Idx first, i;
1008   reg_errcode_t err;
1009   re_node_set init_nodes;
1010 
1011   /* Initial states have the epsilon closure of the node which is
1012      the first node of the regular expression.  */
1013   first = dfa->str_tree->first->node_idx;
1014   dfa->init_node = first;
1015   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1016   if (BE (err != REG_NOERROR, 0))
1017     return err;
1018 
1019   /* The back-references which are in initial states can epsilon transit,
1020      since in this case all of the subexpressions can be null.
1021      Then we add epsilon closures of the nodes which are the next nodes of
1022      the back-references.  */
1023   if (dfa->nbackref > 0)
1024     for (i = 0; i < init_nodes.nelem; ++i)
1025       {
1026 	Idx node_idx = init_nodes.elems[i];
1027 	re_token_type_t type = dfa->nodes[node_idx].type;
1028 
1029 	Idx clexp_idx;
1030 	if (type != OP_BACK_REF)
1031 	  continue;
1032 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1033 	  {
1034 	    re_token_t *clexp_node;
1035 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1036 	    if (clexp_node->type == OP_CLOSE_SUBEXP
1037 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1038 	      break;
1039 	  }
1040 	if (clexp_idx == init_nodes.nelem)
1041 	  continue;
1042 
1043 	if (type == OP_BACK_REF)
1044 	  {
1045 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
1046 	    if (!re_node_set_contains (&init_nodes, dest_idx))
1047 	      {
1048 		reg_errcode_t merge_err
1049                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1050 		if (merge_err != REG_NOERROR)
1051 		  return merge_err;
1052 		i = 0;
1053 	      }
1054 	  }
1055       }
1056 
1057   /* It must be the first time to invoke acquire_state.  */
1058   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1059   /* We don't check ERR here, since the initial state must not be NULL.  */
1060   if (BE (dfa->init_state == NULL, 0))
1061     return err;
1062   if (dfa->init_state->has_constraint)
1063     {
1064       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1065 						       CONTEXT_WORD);
1066       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1067 						     CONTEXT_NEWLINE);
1068       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1069 							 &init_nodes,
1070 							 CONTEXT_NEWLINE
1071 							 | CONTEXT_BEGBUF);
1072       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1073 	      || dfa->init_state_begbuf == NULL, 0))
1074 	return err;
1075     }
1076   else
1077     dfa->init_state_word = dfa->init_state_nl
1078       = dfa->init_state_begbuf = dfa->init_state;
1079 
1080   re_node_set_free (&init_nodes);
1081   return REG_NOERROR;
1082 }
1083 
1084 #ifdef RE_ENABLE_I18N
1085 /* If it is possible to do searching in single byte encoding instead of UTF-8
1086    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1087    DFA nodes where needed.  */
1088 
1089 static void
optimize_utf8(re_dfa_t * dfa)1090 optimize_utf8 (re_dfa_t *dfa)
1091 {
1092   Idx node;
1093   int i;
1094   bool mb_chars = false;
1095   bool has_period = false;
1096 
1097   for (node = 0; node < dfa->nodes_len; ++node)
1098     switch (dfa->nodes[node].type)
1099       {
1100       case CHARACTER:
1101 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1102 	  mb_chars = true;
1103 	break;
1104       case ANCHOR:
1105 	switch (dfa->nodes[node].opr.ctx_type)
1106 	  {
1107 	  case LINE_FIRST:
1108 	  case LINE_LAST:
1109 	  case BUF_FIRST:
1110 	  case BUF_LAST:
1111 	    break;
1112 	  default:
1113 	    /* Word anchors etc. cannot be handled.  It's okay to test
1114 	       opr.ctx_type since constraints (for all DFA nodes) are
1115 	       created by ORing one or more opr.ctx_type values.  */
1116 	    return;
1117 	  }
1118 	break;
1119       case OP_PERIOD:
1120 	has_period = true;
1121 	break;
1122       case OP_BACK_REF:
1123       case OP_ALT:
1124       case END_OF_RE:
1125       case OP_DUP_ASTERISK:
1126       case OP_OPEN_SUBEXP:
1127       case OP_CLOSE_SUBEXP:
1128 	break;
1129       case COMPLEX_BRACKET:
1130 	return;
1131       case SIMPLE_BRACKET:
1132 	/* Just double check.  */
1133 	{
1134 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1135 			? 0
1136 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1137 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1138 	    {
1139 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1140 		return;
1141 	      rshift = 0;
1142 	    }
1143 	}
1144 	break;
1145       default:
1146 	abort ();
1147       }
1148 
1149   if (mb_chars || has_period)
1150     for (node = 0; node < dfa->nodes_len; ++node)
1151       {
1152 	if (dfa->nodes[node].type == CHARACTER
1153 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
1154 	  dfa->nodes[node].mb_partial = 0;
1155 	else if (dfa->nodes[node].type == OP_PERIOD)
1156 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1157       }
1158 
1159   /* The search can be in single byte locale.  */
1160   dfa->mb_cur_max = 1;
1161   dfa->is_utf8 = 0;
1162   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1163 }
1164 #endif
1165 
1166 /* Analyze the structure tree, and calculate "first", "next", "edest",
1167    "eclosure", and "inveclosure".  */
1168 
1169 static reg_errcode_t
analyze(regex_t * preg)1170 analyze (regex_t *preg)
1171 {
1172   re_dfa_t *dfa = preg->buffer;
1173   reg_errcode_t ret;
1174 
1175   /* Allocate arrays.  */
1176   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1177   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1178   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1179   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1180   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1181 	  || dfa->eclosures == NULL, 0))
1182     return REG_ESPACE;
1183 
1184   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1185   if (dfa->subexp_map != NULL)
1186     {
1187       Idx i;
1188       for (i = 0; i < preg->re_nsub; i++)
1189 	dfa->subexp_map[i] = i;
1190       preorder (dfa->str_tree, optimize_subexps, dfa);
1191       for (i = 0; i < preg->re_nsub; i++)
1192 	if (dfa->subexp_map[i] != i)
1193 	  break;
1194       if (i == preg->re_nsub)
1195 	{
1196 	  free (dfa->subexp_map);
1197 	  dfa->subexp_map = NULL;
1198 	}
1199     }
1200 
1201   ret = postorder (dfa->str_tree, lower_subexps, preg);
1202   if (BE (ret != REG_NOERROR, 0))
1203     return ret;
1204   ret = postorder (dfa->str_tree, calc_first, dfa);
1205   if (BE (ret != REG_NOERROR, 0))
1206     return ret;
1207   preorder (dfa->str_tree, calc_next, dfa);
1208   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1209   if (BE (ret != REG_NOERROR, 0))
1210     return ret;
1211   ret = calc_eclosure (dfa);
1212   if (BE (ret != REG_NOERROR, 0))
1213     return ret;
1214 
1215   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1216      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1217   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1218       || dfa->nbackref)
1219     {
1220       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1221       if (BE (dfa->inveclosures == NULL, 0))
1222 	return REG_ESPACE;
1223       ret = calc_inveclosure (dfa);
1224     }
1225 
1226   return ret;
1227 }
1228 
1229 /* Our parse trees are very unbalanced, so we cannot use a stack to
1230    implement parse tree visits.  Instead, we use parent pointers and
1231    some hairy code in these two functions.  */
1232 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1233 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1234 	   void *extra)
1235 {
1236   bin_tree_t *node, *prev;
1237 
1238   for (node = root; ; )
1239     {
1240       /* Descend down the tree, preferably to the left (or to the right
1241 	 if that's the only child).  */
1242       while (node->left || node->right)
1243 	if (node->left)
1244 	  node = node->left;
1245 	else
1246 	  node = node->right;
1247 
1248       do
1249 	{
1250 	  reg_errcode_t err = fn (extra, node);
1251 	  if (BE (err != REG_NOERROR, 0))
1252 	    return err;
1253 	  if (node->parent == NULL)
1254 	    return REG_NOERROR;
1255 	  prev = node;
1256 	  node = node->parent;
1257 	}
1258       /* Go up while we have a node that is reached from the right.  */
1259       while (node->right == prev || node->right == NULL);
1260       node = node->right;
1261     }
1262 }
1263 
1264 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1265 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1266 	  void *extra)
1267 {
1268   bin_tree_t *node;
1269 
1270   for (node = root; ; )
1271     {
1272       reg_errcode_t err = fn (extra, node);
1273       if (BE (err != REG_NOERROR, 0))
1274 	return err;
1275 
1276       /* Go to the left node, or up and to the right.  */
1277       if (node->left)
1278 	node = node->left;
1279       else
1280 	{
1281 	  bin_tree_t *prev = NULL;
1282 	  while (node->right == prev || node->right == NULL)
1283 	    {
1284 	      prev = node;
1285 	      node = node->parent;
1286 	      if (!node)
1287 		return REG_NOERROR;
1288 	    }
1289 	  node = node->right;
1290 	}
1291     }
1292 }
1293 
1294 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1295    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1296    backreferences as well.  Requires a preorder visit.  */
1297 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1298 optimize_subexps (void *extra, bin_tree_t *node)
1299 {
1300   re_dfa_t *dfa = (re_dfa_t *) extra;
1301 
1302   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1303     {
1304       int idx = node->token.opr.idx;
1305       node->token.opr.idx = dfa->subexp_map[idx];
1306       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1307     }
1308 
1309   else if (node->token.type == SUBEXP
1310 	   && node->left && node->left->token.type == SUBEXP)
1311     {
1312       Idx other_idx = node->left->token.opr.idx;
1313 
1314       node->left = node->left->left;
1315       if (node->left)
1316 	node->left->parent = node;
1317 
1318       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1319       if (other_idx < BITSET_WORD_BITS)
1320 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1321     }
1322 
1323   return REG_NOERROR;
1324 }
1325 
1326 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1327    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1328 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1329 lower_subexps (void *extra, bin_tree_t *node)
1330 {
1331   regex_t *preg = (regex_t *) extra;
1332   reg_errcode_t err = REG_NOERROR;
1333 
1334   if (node->left && node->left->token.type == SUBEXP)
1335     {
1336       node->left = lower_subexp (&err, preg, node->left);
1337       if (node->left)
1338 	node->left->parent = node;
1339     }
1340   if (node->right && node->right->token.type == SUBEXP)
1341     {
1342       node->right = lower_subexp (&err, preg, node->right);
1343       if (node->right)
1344 	node->right->parent = node;
1345     }
1346 
1347   return err;
1348 }
1349 
1350 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1351 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1352 {
1353   re_dfa_t *dfa = preg->buffer;
1354   bin_tree_t *body = node->left;
1355   bin_tree_t *op, *cls, *tree1, *tree;
1356 
1357   if (preg->no_sub
1358       /* We do not optimize empty subexpressions, because otherwise we may
1359 	 have bad CONCAT nodes with NULL children.  This is obviously not
1360 	 very common, so we do not lose much.  An example that triggers
1361 	 this case is the sed "script" /\(\)/x.  */
1362       && node->left != NULL
1363       && (node->token.opr.idx >= BITSET_WORD_BITS
1364 	  || !(dfa->used_bkref_map
1365 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1366     return node->left;
1367 
1368   /* Convert the SUBEXP node to the concatenation of an
1369      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1370   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1371   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1372   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1373   tree = create_tree (dfa, op, tree1, CONCAT);
1374   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1375     {
1376       *err = REG_ESPACE;
1377       return NULL;
1378     }
1379 
1380   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1381   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1382   return tree;
1383 }
1384 
1385 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1386    nodes.  Requires a postorder visit.  */
1387 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1388 calc_first (void *extra, bin_tree_t *node)
1389 {
1390   re_dfa_t *dfa = (re_dfa_t *) extra;
1391   if (node->token.type == CONCAT)
1392     {
1393       node->first = node->left->first;
1394       node->node_idx = node->left->node_idx;
1395     }
1396   else
1397     {
1398       node->first = node;
1399       node->node_idx = re_dfa_add_node (dfa, node->token);
1400       if (BE (node->node_idx == -1, 0))
1401 	return REG_ESPACE;
1402       if (node->token.type == ANCHOR)
1403 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1404     }
1405   return REG_NOERROR;
1406 }
1407 
1408 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1409 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1410 calc_next (void *extra, bin_tree_t *node)
1411 {
1412   switch (node->token.type)
1413     {
1414     case OP_DUP_ASTERISK:
1415       node->left->next = node;
1416       break;
1417     case CONCAT:
1418       node->left->next = node->right->first;
1419       node->right->next = node->next;
1420       break;
1421     default:
1422       if (node->left)
1423 	node->left->next = node->next;
1424       if (node->right)
1425 	node->right->next = node->next;
1426       break;
1427     }
1428   return REG_NOERROR;
1429 }
1430 
1431 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1432 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1433 link_nfa_nodes (void *extra, bin_tree_t *node)
1434 {
1435   re_dfa_t *dfa = (re_dfa_t *) extra;
1436   Idx idx = node->node_idx;
1437   reg_errcode_t err = REG_NOERROR;
1438 
1439   switch (node->token.type)
1440     {
1441     case CONCAT:
1442       break;
1443 
1444     case END_OF_RE:
1445       assert (node->next == NULL);
1446       break;
1447 
1448     case OP_DUP_ASTERISK:
1449     case OP_ALT:
1450       {
1451 	Idx left, right;
1452 	dfa->has_plural_match = 1;
1453 	if (node->left != NULL)
1454 	  left = node->left->first->node_idx;
1455 	else
1456 	  left = node->next->node_idx;
1457 	if (node->right != NULL)
1458 	  right = node->right->first->node_idx;
1459 	else
1460 	  right = node->next->node_idx;
1461 	assert (left > -1);
1462 	assert (right > -1);
1463 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1464       }
1465       break;
1466 
1467     case ANCHOR:
1468     case OP_OPEN_SUBEXP:
1469     case OP_CLOSE_SUBEXP:
1470       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1471       break;
1472 
1473     case OP_BACK_REF:
1474       dfa->nexts[idx] = node->next->node_idx;
1475       if (node->token.type == OP_BACK_REF)
1476 	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1477       break;
1478 
1479     default:
1480       assert (!IS_EPSILON_NODE (node->token.type));
1481       dfa->nexts[idx] = node->next->node_idx;
1482       break;
1483     }
1484 
1485   return err;
1486 }
1487 
1488 /* Duplicate the epsilon closure of the node ROOT_NODE.
1489    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1490    to their own constraint.  */
1491 
1492 static reg_errcode_t
1493 internal_function
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1494 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1495 			Idx root_node, unsigned int init_constraint)
1496 {
1497   Idx org_node, clone_node;
1498   bool ok;
1499   unsigned int constraint = init_constraint;
1500   for (org_node = top_org_node, clone_node = top_clone_node;;)
1501     {
1502       Idx org_dest, clone_dest;
1503       if (dfa->nodes[org_node].type == OP_BACK_REF)
1504 	{
1505 	  /* If the back reference epsilon-transit, its destination must
1506 	     also have the constraint.  Then duplicate the epsilon closure
1507 	     of the destination of the back reference, and store it in
1508 	     edests of the back reference.  */
1509 	  org_dest = dfa->nexts[org_node];
1510 	  re_node_set_empty (dfa->edests + clone_node);
1511 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1512 	  if (BE (clone_dest == -1, 0))
1513 	    return REG_ESPACE;
1514 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1515 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1516 	  if (BE (! ok, 0))
1517 	    return REG_ESPACE;
1518 	}
1519       else if (dfa->edests[org_node].nelem == 0)
1520 	{
1521 	  /* In case of the node can't epsilon-transit, don't duplicate the
1522 	     destination and store the original destination as the
1523 	     destination of the node.  */
1524 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1525 	  break;
1526 	}
1527       else if (dfa->edests[org_node].nelem == 1)
1528 	{
1529 	  /* In case of the node can epsilon-transit, and it has only one
1530 	     destination.  */
1531 	  org_dest = dfa->edests[org_node].elems[0];
1532 	  re_node_set_empty (dfa->edests + clone_node);
1533 	  /* If the node is root_node itself, it means the epsilon closure
1534 	     has a loop.  Then tie it to the destination of the root_node.  */
1535 	  if (org_node == root_node && clone_node != org_node)
1536 	    {
1537 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1538 	      if (BE (! ok, 0))
1539 	        return REG_ESPACE;
1540 	      break;
1541 	    }
1542 	  /* In case the node has another constraint, append it.  */
1543 	  constraint |= dfa->nodes[org_node].constraint;
1544 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1545 	  if (BE (clone_dest == -1, 0))
1546 	    return REG_ESPACE;
1547 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548 	  if (BE (! ok, 0))
1549 	    return REG_ESPACE;
1550 	}
1551       else /* dfa->edests[org_node].nelem == 2 */
1552 	{
1553 	  /* In case of the node can epsilon-transit, and it has two
1554 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1555 	  org_dest = dfa->edests[org_node].elems[0];
1556 	  re_node_set_empty (dfa->edests + clone_node);
1557 	  /* Search for a duplicated node which satisfies the constraint.  */
1558 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1559 	  if (clone_dest == -1)
1560 	    {
1561 	      /* There is no such duplicated node, create a new one.  */
1562 	      reg_errcode_t err;
1563 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1564 	      if (BE (clone_dest == -1, 0))
1565 		return REG_ESPACE;
1566 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567 	      if (BE (! ok, 0))
1568 		return REG_ESPACE;
1569 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1570 					    root_node, constraint);
1571 	      if (BE (err != REG_NOERROR, 0))
1572 		return err;
1573 	    }
1574 	  else
1575 	    {
1576 	      /* There is a duplicated node which satisfies the constraint,
1577 		 use it to avoid infinite loop.  */
1578 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1579 	      if (BE (! ok, 0))
1580 		return REG_ESPACE;
1581 	    }
1582 
1583 	  org_dest = dfa->edests[org_node].elems[1];
1584 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1585 	  if (BE (clone_dest == -1, 0))
1586 	    return REG_ESPACE;
1587 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1588 	  if (BE (! ok, 0))
1589 	    return REG_ESPACE;
1590 	}
1591       org_node = org_dest;
1592       clone_node = clone_dest;
1593     }
1594   return REG_NOERROR;
1595 }
1596 
1597 /* Search for a node which is duplicated from the node ORG_NODE, and
1598    satisfies the constraint CONSTRAINT.  */
1599 
1600 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1601 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1602 			unsigned int constraint)
1603 {
1604   Idx idx;
1605   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1606     {
1607       if (org_node == dfa->org_indices[idx]
1608 	  && constraint == dfa->nodes[idx].constraint)
1609 	return idx; /* Found.  */
1610     }
1611   return -1; /* Not found.  */
1612 }
1613 
1614 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1615    Return the index of the new node, or -1 if insufficient storage is
1616    available.  */
1617 
1618 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1619 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1620 {
1621   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1622   if (BE (dup_idx != -1, 1))
1623     {
1624       dfa->nodes[dup_idx].constraint = constraint;
1625       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1626       dfa->nodes[dup_idx].duplicated = 1;
1627 
1628       /* Store the index of the original node.  */
1629       dfa->org_indices[dup_idx] = org_idx;
1630     }
1631   return dup_idx;
1632 }
1633 
1634 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1635 calc_inveclosure (re_dfa_t *dfa)
1636 {
1637   Idx src, idx;
1638   bool ok;
1639   for (idx = 0; idx < dfa->nodes_len; ++idx)
1640     re_node_set_init_empty (dfa->inveclosures + idx);
1641 
1642   for (src = 0; src < dfa->nodes_len; ++src)
1643     {
1644       Idx *elems = dfa->eclosures[src].elems;
1645       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1646 	{
1647 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1648 	  if (BE (! ok, 0))
1649 	    return REG_ESPACE;
1650 	}
1651     }
1652 
1653   return REG_NOERROR;
1654 }
1655 
1656 /* Calculate "eclosure" for all the node in DFA.  */
1657 
1658 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1659 calc_eclosure (re_dfa_t *dfa)
1660 {
1661   Idx node_idx;
1662   bool incomplete;
1663 #ifdef DEBUG
1664   assert (dfa->nodes_len > 0);
1665 #endif
1666   incomplete = false;
1667   /* For each nodes, calculate epsilon closure.  */
1668   for (node_idx = 0; ; ++node_idx)
1669     {
1670       reg_errcode_t err;
1671       re_node_set eclosure_elem;
1672       if (node_idx == dfa->nodes_len)
1673 	{
1674 	  if (!incomplete)
1675 	    break;
1676 	  incomplete = false;
1677 	  node_idx = 0;
1678 	}
1679 
1680 #ifdef DEBUG
1681       assert (dfa->eclosures[node_idx].nelem != -1);
1682 #endif
1683 
1684       /* If we have already calculated, skip it.  */
1685       if (dfa->eclosures[node_idx].nelem != 0)
1686 	continue;
1687       /* Calculate epsilon closure of 'node_idx'.  */
1688       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1689       if (BE (err != REG_NOERROR, 0))
1690 	return err;
1691 
1692       if (dfa->eclosures[node_idx].nelem == 0)
1693 	{
1694 	  incomplete = true;
1695 	  re_node_set_free (&eclosure_elem);
1696 	}
1697     }
1698   return REG_NOERROR;
1699 }
1700 
1701 /* Calculate epsilon closure of NODE.  */
1702 
1703 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1704 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1705 {
1706   reg_errcode_t err;
1707   Idx i;
1708   re_node_set eclosure;
1709   bool ok;
1710   bool incomplete = false;
1711   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1712   if (BE (err != REG_NOERROR, 0))
1713     return err;
1714 
1715   /* This indicates that we are calculating this node now.
1716      We reference this value to avoid infinite loop.  */
1717   dfa->eclosures[node].nelem = -1;
1718 
1719   /* If the current node has constraints, duplicate all nodes
1720      since they must inherit the constraints.  */
1721   if (dfa->nodes[node].constraint
1722       && dfa->edests[node].nelem
1723       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1724     {
1725       err = duplicate_node_closure (dfa, node, node, node,
1726 				    dfa->nodes[node].constraint);
1727       if (BE (err != REG_NOERROR, 0))
1728 	return err;
1729     }
1730 
1731   /* Expand each epsilon destination nodes.  */
1732   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1733     for (i = 0; i < dfa->edests[node].nelem; ++i)
1734       {
1735 	re_node_set eclosure_elem;
1736 	Idx edest = dfa->edests[node].elems[i];
1737 	/* If calculating the epsilon closure of 'edest' is in progress,
1738 	   return intermediate result.  */
1739 	if (dfa->eclosures[edest].nelem == -1)
1740 	  {
1741 	    incomplete = true;
1742 	    continue;
1743 	  }
1744 	/* If we haven't calculated the epsilon closure of 'edest' yet,
1745 	   calculate now. Otherwise use calculated epsilon closure.  */
1746 	if (dfa->eclosures[edest].nelem == 0)
1747 	  {
1748 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1749 	    if (BE (err != REG_NOERROR, 0))
1750 	      return err;
1751 	  }
1752 	else
1753 	  eclosure_elem = dfa->eclosures[edest];
1754 	/* Merge the epsilon closure of 'edest'.  */
1755 	err = re_node_set_merge (&eclosure, &eclosure_elem);
1756 	if (BE (err != REG_NOERROR, 0))
1757 	  return err;
1758 	/* If the epsilon closure of 'edest' is incomplete,
1759 	   the epsilon closure of this node is also incomplete.  */
1760 	if (dfa->eclosures[edest].nelem == 0)
1761 	  {
1762 	    incomplete = true;
1763 	    re_node_set_free (&eclosure_elem);
1764 	  }
1765       }
1766 
1767   /* An epsilon closure includes itself.  */
1768   ok = re_node_set_insert (&eclosure, node);
1769   if (BE (! ok, 0))
1770     return REG_ESPACE;
1771   if (incomplete && !root)
1772     dfa->eclosures[node].nelem = 0;
1773   else
1774     dfa->eclosures[node] = eclosure;
1775   *new_set = eclosure;
1776   return REG_NOERROR;
1777 }
1778 
1779 /* Functions for token which are used in the parser.  */
1780 
1781 /* Fetch a token from INPUT.
1782    We must not use this function inside bracket expressions.  */
1783 
1784 static void
1785 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1786 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1787 {
1788   re_string_skip_bytes (input, peek_token (result, input, syntax));
1789 }
1790 
1791 /* Peek a token from INPUT, and return the length of the token.
1792    We must not use this function inside bracket expressions.  */
1793 
1794 static int
1795 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1796 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1797 {
1798   unsigned char c;
1799 
1800   if (re_string_eoi (input))
1801     {
1802       token->type = END_OF_RE;
1803       return 0;
1804     }
1805 
1806   c = re_string_peek_byte (input, 0);
1807   token->opr.c = c;
1808 
1809   token->word_char = 0;
1810 #ifdef RE_ENABLE_I18N
1811   token->mb_partial = 0;
1812   if (input->mb_cur_max > 1 &&
1813       !re_string_first_byte (input, re_string_cur_idx (input)))
1814     {
1815       token->type = CHARACTER;
1816       token->mb_partial = 1;
1817       return 1;
1818     }
1819 #endif
1820   if (c == '\\')
1821     {
1822       unsigned char c2;
1823       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1824 	{
1825 	  token->type = BACK_SLASH;
1826 	  return 1;
1827 	}
1828 
1829       c2 = re_string_peek_byte_case (input, 1);
1830       token->opr.c = c2;
1831       token->type = CHARACTER;
1832 #ifdef RE_ENABLE_I18N
1833       if (input->mb_cur_max > 1)
1834 	{
1835 	  wint_t wc = re_string_wchar_at (input,
1836 					  re_string_cur_idx (input) + 1);
1837 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1838 	}
1839       else
1840 #endif
1841 	token->word_char = IS_WORD_CHAR (c2) != 0;
1842 
1843       switch (c2)
1844 	{
1845 	case '|':
1846 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1847 	    token->type = OP_ALT;
1848 	  break;
1849 	case '1': case '2': case '3': case '4': case '5':
1850 	case '6': case '7': case '8': case '9':
1851 	  if (!(syntax & RE_NO_BK_REFS))
1852 	    {
1853 	      token->type = OP_BACK_REF;
1854 	      token->opr.idx = c2 - '1';
1855 	    }
1856 	  break;
1857 	case '<':
1858 	  if (!(syntax & RE_NO_GNU_OPS))
1859 	    {
1860 	      token->type = ANCHOR;
1861 	      token->opr.ctx_type = WORD_FIRST;
1862 	    }
1863 	  break;
1864 	case '>':
1865 	  if (!(syntax & RE_NO_GNU_OPS))
1866 	    {
1867 	      token->type = ANCHOR;
1868 	      token->opr.ctx_type = WORD_LAST;
1869 	    }
1870 	  break;
1871 	case 'b':
1872 	  if (!(syntax & RE_NO_GNU_OPS))
1873 	    {
1874 	      token->type = ANCHOR;
1875 	      token->opr.ctx_type = WORD_DELIM;
1876 	    }
1877 	  break;
1878 	case 'B':
1879 	  if (!(syntax & RE_NO_GNU_OPS))
1880 	    {
1881 	      token->type = ANCHOR;
1882 	      token->opr.ctx_type = NOT_WORD_DELIM;
1883 	    }
1884 	  break;
1885 	case 'w':
1886 	  if (!(syntax & RE_NO_GNU_OPS))
1887 	    token->type = OP_WORD;
1888 	  break;
1889 	case 'W':
1890 	  if (!(syntax & RE_NO_GNU_OPS))
1891 	    token->type = OP_NOTWORD;
1892 	  break;
1893 	case 's':
1894 	  if (!(syntax & RE_NO_GNU_OPS))
1895 	    token->type = OP_SPACE;
1896 	  break;
1897 	case 'S':
1898 	  if (!(syntax & RE_NO_GNU_OPS))
1899 	    token->type = OP_NOTSPACE;
1900 	  break;
1901 	case '`':
1902 	  if (!(syntax & RE_NO_GNU_OPS))
1903 	    {
1904 	      token->type = ANCHOR;
1905 	      token->opr.ctx_type = BUF_FIRST;
1906 	    }
1907 	  break;
1908 	case '\'':
1909 	  if (!(syntax & RE_NO_GNU_OPS))
1910 	    {
1911 	      token->type = ANCHOR;
1912 	      token->opr.ctx_type = BUF_LAST;
1913 	    }
1914 	  break;
1915 	case '(':
1916 	  if (!(syntax & RE_NO_BK_PARENS))
1917 	    token->type = OP_OPEN_SUBEXP;
1918 	  break;
1919 	case ')':
1920 	  if (!(syntax & RE_NO_BK_PARENS))
1921 	    token->type = OP_CLOSE_SUBEXP;
1922 	  break;
1923 	case '+':
1924 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1925 	    token->type = OP_DUP_PLUS;
1926 	  break;
1927 	case '?':
1928 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1929 	    token->type = OP_DUP_QUESTION;
1930 	  break;
1931 	case '{':
1932 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1933 	    token->type = OP_OPEN_DUP_NUM;
1934 	  break;
1935 	case '}':
1936 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1937 	    token->type = OP_CLOSE_DUP_NUM;
1938 	  break;
1939 	default:
1940 	  break;
1941 	}
1942       return 2;
1943     }
1944 
1945   token->type = CHARACTER;
1946 #ifdef RE_ENABLE_I18N
1947   if (input->mb_cur_max > 1)
1948     {
1949       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1950       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1951     }
1952   else
1953 #endif
1954     token->word_char = IS_WORD_CHAR (token->opr.c);
1955 
1956   switch (c)
1957     {
1958     case '\n':
1959       if (syntax & RE_NEWLINE_ALT)
1960 	token->type = OP_ALT;
1961       break;
1962     case '|':
1963       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1964 	token->type = OP_ALT;
1965       break;
1966     case '*':
1967       token->type = OP_DUP_ASTERISK;
1968       break;
1969     case '+':
1970       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1971 	token->type = OP_DUP_PLUS;
1972       break;
1973     case '?':
1974       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1975 	token->type = OP_DUP_QUESTION;
1976       break;
1977     case '{':
1978       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1979 	token->type = OP_OPEN_DUP_NUM;
1980       break;
1981     case '}':
1982       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1983 	token->type = OP_CLOSE_DUP_NUM;
1984       break;
1985     case '(':
1986       if (syntax & RE_NO_BK_PARENS)
1987 	token->type = OP_OPEN_SUBEXP;
1988       break;
1989     case ')':
1990       if (syntax & RE_NO_BK_PARENS)
1991 	token->type = OP_CLOSE_SUBEXP;
1992       break;
1993     case '[':
1994       token->type = OP_OPEN_BRACKET;
1995       break;
1996     case '.':
1997       token->type = OP_PERIOD;
1998       break;
1999     case '^':
2000       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2001 	  re_string_cur_idx (input) != 0)
2002 	{
2003 	  char prev = re_string_peek_byte (input, -1);
2004 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2005 	    break;
2006 	}
2007       token->type = ANCHOR;
2008       token->opr.ctx_type = LINE_FIRST;
2009       break;
2010     case '$':
2011       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2012 	  re_string_cur_idx (input) + 1 != re_string_length (input))
2013 	{
2014 	  re_token_t next;
2015 	  re_string_skip_bytes (input, 1);
2016 	  peek_token (&next, input, syntax);
2017 	  re_string_skip_bytes (input, -1);
2018 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2019 	    break;
2020 	}
2021       token->type = ANCHOR;
2022       token->opr.ctx_type = LINE_LAST;
2023       break;
2024     default:
2025       break;
2026     }
2027   return 1;
2028 }
2029 
2030 /* Peek a token from INPUT, and return the length of the token.
2031    We must not use this function out of bracket expressions.  */
2032 
2033 static int
2034 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)2035 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2036 {
2037   unsigned char c;
2038   if (re_string_eoi (input))
2039     {
2040       token->type = END_OF_RE;
2041       return 0;
2042     }
2043   c = re_string_peek_byte (input, 0);
2044   token->opr.c = c;
2045 
2046 #ifdef RE_ENABLE_I18N
2047   if (input->mb_cur_max > 1 &&
2048       !re_string_first_byte (input, re_string_cur_idx (input)))
2049     {
2050       token->type = CHARACTER;
2051       return 1;
2052     }
2053 #endif /* RE_ENABLE_I18N */
2054 
2055   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2056       && re_string_cur_idx (input) + 1 < re_string_length (input))
2057     {
2058       /* In this case, '\' escape a character.  */
2059       unsigned char c2;
2060       re_string_skip_bytes (input, 1);
2061       c2 = re_string_peek_byte (input, 0);
2062       token->opr.c = c2;
2063       token->type = CHARACTER;
2064       return 1;
2065     }
2066   if (c == '[') /* '[' is a special char in a bracket exps.  */
2067     {
2068       unsigned char c2;
2069       int token_len;
2070       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2071 	c2 = re_string_peek_byte (input, 1);
2072       else
2073 	c2 = 0;
2074       token->opr.c = c2;
2075       token_len = 2;
2076       switch (c2)
2077 	{
2078 	case '.':
2079 	  token->type = OP_OPEN_COLL_ELEM;
2080 	  break;
2081 	case '=':
2082 	  token->type = OP_OPEN_EQUIV_CLASS;
2083 	  break;
2084 	case ':':
2085 	  if (syntax & RE_CHAR_CLASSES)
2086 	    {
2087 	      token->type = OP_OPEN_CHAR_CLASS;
2088 	      break;
2089 	    }
2090 	  /* else fall through.  */
2091 	default:
2092 	  token->type = CHARACTER;
2093 	  token->opr.c = c;
2094 	  token_len = 1;
2095 	  break;
2096 	}
2097       return token_len;
2098     }
2099   switch (c)
2100     {
2101     case '-':
2102       token->type = OP_CHARSET_RANGE;
2103       break;
2104     case ']':
2105       token->type = OP_CLOSE_BRACKET;
2106       break;
2107     case '^':
2108       token->type = OP_NON_MATCH_LIST;
2109       break;
2110     default:
2111       token->type = CHARACTER;
2112     }
2113   return 1;
2114 }
2115 
2116 /* Functions for parser.  */
2117 
2118 /* Entry point of the parser.
2119    Parse the regular expression REGEXP and return the structure tree.
2120    If an error occurs, ERR is set by error code, and return NULL.
2121    This function build the following tree, from regular expression <reg_exp>:
2122 	   CAT
2123 	   / \
2124 	  /   \
2125    <reg_exp>  EOR
2126 
2127    CAT means concatenation.
2128    EOR means end of regular expression.  */
2129 
2130 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2131 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2132        reg_errcode_t *err)
2133 {
2134   re_dfa_t *dfa = preg->buffer;
2135   bin_tree_t *tree, *eor, *root;
2136   re_token_t current_token;
2137   dfa->syntax = syntax;
2138   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2139   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2140   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2141     return NULL;
2142   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2143   if (tree != NULL)
2144     root = create_tree (dfa, tree, eor, CONCAT);
2145   else
2146     root = eor;
2147   if (BE (eor == NULL || root == NULL, 0))
2148     {
2149       *err = REG_ESPACE;
2150       return NULL;
2151     }
2152   return root;
2153 }
2154 
2155 /* This function build the following tree, from regular expression
2156    <branch1>|<branch2>:
2157 	   ALT
2158 	   / \
2159 	  /   \
2160    <branch1> <branch2>
2161 
2162    ALT means alternative, which represents the operator '|'.  */
2163 
2164 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2165 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2166 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2167 {
2168   re_dfa_t *dfa = preg->buffer;
2169   bin_tree_t *tree, *branch = NULL;
2170   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2171   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2172   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2173     return NULL;
2174 
2175   while (token->type == OP_ALT)
2176     {
2177       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2178       if (token->type != OP_ALT && token->type != END_OF_RE
2179 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2180 	{
2181 	  bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2182 	  dfa->completed_bkref_map = initial_bkref_map;
2183 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2184 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2185 	    {
2186 	      if (tree != NULL)
2187 		postorder (tree, free_tree, NULL);
2188 	      return NULL;
2189 	    }
2190 	  dfa->completed_bkref_map |= accumulated_bkref_map;
2191 	}
2192       else
2193 	branch = NULL;
2194       tree = create_tree (dfa, tree, branch, OP_ALT);
2195       if (BE (tree == NULL, 0))
2196 	{
2197 	  *err = REG_ESPACE;
2198 	  return NULL;
2199 	}
2200     }
2201   return tree;
2202 }
2203 
2204 /* This function build the following tree, from regular expression
2205    <exp1><exp2>:
2206 	CAT
2207 	/ \
2208        /   \
2209    <exp1> <exp2>
2210 
2211    CAT means concatenation.  */
2212 
2213 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2214 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2215 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2216 {
2217   bin_tree_t *tree, *expr;
2218   re_dfa_t *dfa = preg->buffer;
2219   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2220   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2221     return NULL;
2222 
2223   while (token->type != OP_ALT && token->type != END_OF_RE
2224 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2225     {
2226       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2227       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2228 	{
2229 	  if (tree != NULL)
2230 	    postorder (tree, free_tree, NULL);
2231 	  return NULL;
2232 	}
2233       if (tree != NULL && expr != NULL)
2234 	{
2235 	  bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2236 	  if (newtree == NULL)
2237 	    {
2238 	      postorder (expr, free_tree, NULL);
2239 	      postorder (tree, free_tree, NULL);
2240 	      *err = REG_ESPACE;
2241 	      return NULL;
2242 	    }
2243 	  tree = newtree;
2244 	}
2245       else if (tree == NULL)
2246 	tree = expr;
2247       /* Otherwise expr == NULL, we don't need to create new tree.  */
2248     }
2249   return tree;
2250 }
2251 
2252 /* This function build the following tree, from regular expression a*:
2253 	 *
2254 	 |
2255 	 a
2256 */
2257 
2258 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2259 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2260 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2261 {
2262   re_dfa_t *dfa = preg->buffer;
2263   bin_tree_t *tree;
2264   switch (token->type)
2265     {
2266     case CHARACTER:
2267       tree = create_token_tree (dfa, NULL, NULL, token);
2268       if (BE (tree == NULL, 0))
2269 	{
2270 	  *err = REG_ESPACE;
2271 	  return NULL;
2272 	}
2273 #ifdef RE_ENABLE_I18N
2274       if (dfa->mb_cur_max > 1)
2275 	{
2276 	  while (!re_string_eoi (regexp)
2277 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2278 	    {
2279 	      bin_tree_t *mbc_remain;
2280 	      fetch_token (token, regexp, syntax);
2281 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2282 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2283 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2284 		{
2285 		  *err = REG_ESPACE;
2286 		  return NULL;
2287 		}
2288 	    }
2289 	}
2290 #endif
2291       break;
2292     case OP_OPEN_SUBEXP:
2293       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2294       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2295 	return NULL;
2296       break;
2297     case OP_OPEN_BRACKET:
2298       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2299       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2300 	return NULL;
2301       break;
2302     case OP_BACK_REF:
2303       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2304 	{
2305 	  *err = REG_ESUBREG;
2306 	  return NULL;
2307 	}
2308       dfa->used_bkref_map |= 1 << token->opr.idx;
2309       tree = create_token_tree (dfa, NULL, NULL, token);
2310       if (BE (tree == NULL, 0))
2311 	{
2312 	  *err = REG_ESPACE;
2313 	  return NULL;
2314 	}
2315       ++dfa->nbackref;
2316       dfa->has_mb_node = 1;
2317       break;
2318     case OP_OPEN_DUP_NUM:
2319       if (syntax & RE_CONTEXT_INVALID_DUP)
2320 	{
2321 	  *err = REG_BADRPT;
2322 	  return NULL;
2323 	}
2324       /* FALLTHROUGH */
2325     case OP_DUP_ASTERISK:
2326     case OP_DUP_PLUS:
2327     case OP_DUP_QUESTION:
2328       if (syntax & RE_CONTEXT_INVALID_OPS)
2329 	{
2330 	  *err = REG_BADRPT;
2331 	  return NULL;
2332 	}
2333       else if (syntax & RE_CONTEXT_INDEP_OPS)
2334 	{
2335 	  fetch_token (token, regexp, syntax);
2336 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2337 	}
2338       /* else fall through  */
2339     case OP_CLOSE_SUBEXP:
2340       if ((token->type == OP_CLOSE_SUBEXP) &&
2341 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2342 	{
2343 	  *err = REG_ERPAREN;
2344 	  return NULL;
2345 	}
2346       /* else fall through  */
2347     case OP_CLOSE_DUP_NUM:
2348       /* We treat it as a normal character.  */
2349 
2350       /* Then we can these characters as normal characters.  */
2351       token->type = CHARACTER;
2352       /* mb_partial and word_char bits should be initialized already
2353 	 by peek_token.  */
2354       tree = create_token_tree (dfa, NULL, NULL, token);
2355       if (BE (tree == NULL, 0))
2356 	{
2357 	  *err = REG_ESPACE;
2358 	  return NULL;
2359 	}
2360       break;
2361     case ANCHOR:
2362       if ((token->opr.ctx_type
2363 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2364 	  && dfa->word_ops_used == 0)
2365 	init_word_char (dfa);
2366       if (token->opr.ctx_type == WORD_DELIM
2367 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2368 	{
2369 	  bin_tree_t *tree_first, *tree_last;
2370 	  if (token->opr.ctx_type == WORD_DELIM)
2371 	    {
2372 	      token->opr.ctx_type = WORD_FIRST;
2373 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2374 	      token->opr.ctx_type = WORD_LAST;
2375 	    }
2376 	  else
2377 	    {
2378 	      token->opr.ctx_type = INSIDE_WORD;
2379 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2380 	      token->opr.ctx_type = INSIDE_NOTWORD;
2381 	    }
2382 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2383 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2384 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2385 	    {
2386 	      *err = REG_ESPACE;
2387 	      return NULL;
2388 	    }
2389 	}
2390       else
2391 	{
2392 	  tree = create_token_tree (dfa, NULL, NULL, token);
2393 	  if (BE (tree == NULL, 0))
2394 	    {
2395 	      *err = REG_ESPACE;
2396 	      return NULL;
2397 	    }
2398 	}
2399       /* We must return here, since ANCHORs can't be followed
2400 	 by repetition operators.
2401 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2402 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2403       fetch_token (token, regexp, syntax);
2404       return tree;
2405     case OP_PERIOD:
2406       tree = create_token_tree (dfa, NULL, NULL, token);
2407       if (BE (tree == NULL, 0))
2408 	{
2409 	  *err = REG_ESPACE;
2410 	  return NULL;
2411 	}
2412       if (dfa->mb_cur_max > 1)
2413 	dfa->has_mb_node = 1;
2414       break;
2415     case OP_WORD:
2416     case OP_NOTWORD:
2417       tree = build_charclass_op (dfa, regexp->trans,
2418 				 "alnum",
2419 				 "_",
2420 				 token->type == OP_NOTWORD, err);
2421       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2422 	return NULL;
2423       break;
2424     case OP_SPACE:
2425     case OP_NOTSPACE:
2426       tree = build_charclass_op (dfa, regexp->trans,
2427 				 "space",
2428 				 "",
2429 				 token->type == OP_NOTSPACE, err);
2430       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431 	return NULL;
2432       break;
2433     case OP_ALT:
2434     case END_OF_RE:
2435       return NULL;
2436     case BACK_SLASH:
2437       *err = REG_EESCAPE;
2438       return NULL;
2439     default:
2440       /* Must not happen?  */
2441 #ifdef DEBUG
2442       assert (0);
2443 #endif
2444       return NULL;
2445     }
2446   fetch_token (token, regexp, syntax);
2447 
2448   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2449 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2450     {
2451       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2452 					   syntax, err);
2453       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2454 	{
2455 	  if (tree != NULL)
2456 	    postorder (tree, free_tree, NULL);
2457 	  return NULL;
2458 	}
2459       tree = dup_tree;
2460       /* In BRE consecutive duplications are not allowed.  */
2461       if ((syntax & RE_CONTEXT_INVALID_DUP)
2462 	  && (token->type == OP_DUP_ASTERISK
2463 	      || token->type == OP_OPEN_DUP_NUM))
2464 	{
2465 	  if (tree != NULL)
2466 	    postorder (tree, free_tree, NULL);
2467 	  *err = REG_BADRPT;
2468 	  return NULL;
2469 	}
2470     }
2471 
2472   return tree;
2473 }
2474 
2475 /* This function build the following tree, from regular expression
2476    (<reg_exp>):
2477 	 SUBEXP
2478 	    |
2479 	<reg_exp>
2480 */
2481 
2482 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2483 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2484 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2485 {
2486   re_dfa_t *dfa = preg->buffer;
2487   bin_tree_t *tree;
2488   size_t cur_nsub;
2489   cur_nsub = preg->re_nsub++;
2490 
2491   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2492 
2493   /* The subexpression may be a null string.  */
2494   if (token->type == OP_CLOSE_SUBEXP)
2495     tree = NULL;
2496   else
2497     {
2498       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2499       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2500 	{
2501 	  if (tree != NULL)
2502 	    postorder (tree, free_tree, NULL);
2503 	  *err = REG_EPAREN;
2504 	}
2505       if (BE (*err != REG_NOERROR, 0))
2506 	return NULL;
2507     }
2508 
2509   if (cur_nsub <= '9' - '1')
2510     dfa->completed_bkref_map |= 1 << cur_nsub;
2511 
2512   tree = create_tree (dfa, tree, NULL, SUBEXP);
2513   if (BE (tree == NULL, 0))
2514     {
2515       *err = REG_ESPACE;
2516       return NULL;
2517     }
2518   tree->token.opr.idx = cur_nsub;
2519   return tree;
2520 }
2521 
2522 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2523 
2524 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2525 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2526 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2527 {
2528   bin_tree_t *tree = NULL, *old_tree = NULL;
2529   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2530   re_token_t start_token = *token;
2531 
2532   if (token->type == OP_OPEN_DUP_NUM)
2533     {
2534       end = 0;
2535       start = fetch_number (regexp, token, syntax);
2536       if (start == -1)
2537 	{
2538 	  if (token->type == CHARACTER && token->opr.c == ',')
2539 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2540 	  else
2541 	    {
2542 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2543 	      return NULL;
2544 	    }
2545 	}
2546       if (BE (start != -2, 1))
2547 	{
2548 	  /* We treat "{n}" as "{n,n}".  */
2549 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2550 		 : ((token->type == CHARACTER && token->opr.c == ',')
2551 		    ? fetch_number (regexp, token, syntax) : -2));
2552 	}
2553       if (BE (start == -2 || end == -2, 0))
2554 	{
2555 	  /* Invalid sequence.  */
2556 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2557 	    {
2558 	      if (token->type == END_OF_RE)
2559 		*err = REG_EBRACE;
2560 	      else
2561 		*err = REG_BADBR;
2562 
2563 	      return NULL;
2564 	    }
2565 
2566 	  /* If the syntax bit is set, rollback.  */
2567 	  re_string_set_index (regexp, start_idx);
2568 	  *token = start_token;
2569 	  token->type = CHARACTER;
2570 	  /* mb_partial and word_char bits should be already initialized by
2571 	     peek_token.  */
2572 	  return elem;
2573 	}
2574 
2575       if (BE ((end != -1 && start > end)
2576 	      || token->type != OP_CLOSE_DUP_NUM, 0))
2577 	{
2578 	  /* First number greater than second.  */
2579 	  *err = REG_BADBR;
2580 	  return NULL;
2581 	}
2582 
2583       if (BE (RE_DUP_MAX < (end == -1 ? start : end), 0))
2584 	{
2585 	  *err = REG_ESIZE;
2586 	  return NULL;
2587 	}
2588     }
2589   else
2590     {
2591       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2592       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2593     }
2594 
2595   fetch_token (token, regexp, syntax);
2596 
2597   if (BE (elem == NULL, 0))
2598     return NULL;
2599   if (BE (start == 0 && end == 0, 0))
2600     {
2601       postorder (elem, free_tree, NULL);
2602       return NULL;
2603     }
2604 
2605   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2606   if (BE (start > 0, 0))
2607     {
2608       tree = elem;
2609       for (i = 2; i <= start; ++i)
2610 	{
2611 	  elem = duplicate_tree (elem, dfa);
2612 	  tree = create_tree (dfa, tree, elem, CONCAT);
2613 	  if (BE (elem == NULL || tree == NULL, 0))
2614 	    goto parse_dup_op_espace;
2615 	}
2616 
2617       if (start == end)
2618 	return tree;
2619 
2620       /* Duplicate ELEM before it is marked optional.  */
2621       elem = duplicate_tree (elem, dfa);
2622       if (BE (elem == NULL, 0))
2623         goto parse_dup_op_espace;
2624       old_tree = tree;
2625     }
2626   else
2627     old_tree = NULL;
2628 
2629   if (elem->token.type == SUBEXP)
2630     {
2631       uintptr_t subidx = elem->token.opr.idx;
2632       postorder (elem, mark_opt_subexp, (void *) subidx);
2633     }
2634 
2635   tree = create_tree (dfa, elem, NULL,
2636 		      (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2637   if (BE (tree == NULL, 0))
2638     goto parse_dup_op_espace;
2639 
2640 /* From gnulib's "intprops.h":
2641    True if the arithmetic type T is signed.  */
2642 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2643 
2644   /* This loop is actually executed only when end != -1,
2645      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2646      already created the start+1-th copy.  */
2647   if (TYPE_SIGNED (Idx) || end != -1)
2648     for (i = start + 2; i <= end; ++i)
2649       {
2650 	elem = duplicate_tree (elem, dfa);
2651 	tree = create_tree (dfa, tree, elem, CONCAT);
2652 	if (BE (elem == NULL || tree == NULL, 0))
2653 	  goto parse_dup_op_espace;
2654 
2655 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2656 	if (BE (tree == NULL, 0))
2657 	  goto parse_dup_op_espace;
2658       }
2659 
2660   if (old_tree)
2661     tree = create_tree (dfa, old_tree, tree, CONCAT);
2662 
2663   return tree;
2664 
2665  parse_dup_op_espace:
2666   *err = REG_ESPACE;
2667   return NULL;
2668 }
2669 
2670 /* Size of the names for collating symbol/equivalence_class/character_class.
2671    I'm not sure, but maybe enough.  */
2672 #define BRACKET_NAME_BUF_SIZE 32
2673 
2674 #ifndef _LIBC
2675 
2676 # ifdef RE_ENABLE_I18N
2677 /* Convert the byte B to the corresponding wide character.  In a
2678    unibyte locale, treat B as itself if it is an encoding error.
2679    In a multibyte locale, return WEOF if B is an encoding error.  */
2680 static wint_t
parse_byte(unsigned char b,re_charset_t * mbcset)2681 parse_byte (unsigned char b, re_charset_t *mbcset)
2682 {
2683   wint_t wc = __btowc (b);
2684   return wc == WEOF && !mbcset ? b : wc;
2685 }
2686 #endif
2687 
2688   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2689      Build the range expression which starts from START_ELEM, and ends
2690      at END_ELEM.  The result are written to MBCSET and SBCSET.
2691      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2692      mbcset->range_ends, is a pointer argument since we may
2693      update it.  */
2694 
2695 static reg_errcode_t
2696 internal_function
2697 # ifdef RE_ENABLE_I18N
build_range_exp(const reg_syntax_t syntax,bitset_t sbcset,re_charset_t * mbcset,Idx * range_alloc,const bracket_elem_t * start_elem,const bracket_elem_t * end_elem)2698 build_range_exp (const reg_syntax_t syntax,
2699                  bitset_t sbcset,
2700                  re_charset_t *mbcset,
2701                  Idx *range_alloc,
2702                  const bracket_elem_t *start_elem,
2703                  const bracket_elem_t *end_elem)
2704 # else /* not RE_ENABLE_I18N */
2705 build_range_exp (const reg_syntax_t syntax,
2706                  bitset_t sbcset,
2707                  const bracket_elem_t *start_elem,
2708                  const bracket_elem_t *end_elem)
2709 # endif /* not RE_ENABLE_I18N */
2710 {
2711   unsigned int start_ch, end_ch;
2712   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2713   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2714 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2715 	  0))
2716     return REG_ERANGE;
2717 
2718   /* We can handle no multi character collating elements without libc
2719      support.  */
2720   if (BE ((start_elem->type == COLL_SYM
2721 	   && strlen ((char *) start_elem->opr.name) > 1)
2722 	  || (end_elem->type == COLL_SYM
2723 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2724     return REG_ECOLLATE;
2725 
2726 # ifdef RE_ENABLE_I18N
2727   {
2728     wchar_t wc;
2729     wint_t start_wc;
2730     wint_t end_wc;
2731 
2732     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2733 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2734 		   : 0));
2735     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2736 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2737 		 : 0));
2738     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2739 		? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2740     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2741 	      ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2742     if (start_wc == WEOF || end_wc == WEOF)
2743       return REG_ECOLLATE;
2744     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2745       return REG_ERANGE;
2746 
2747     /* Got valid collation sequence values, add them as a new entry.
2748        However, for !_LIBC we have no collation elements: if the
2749        character set is single byte, the single byte character set
2750        that we build below suffices.  parse_bracket_exp passes
2751        no MBCSET if dfa->mb_cur_max == 1.  */
2752     if (mbcset)
2753       {
2754 	/* Check the space of the arrays.  */
2755 	if (BE (*range_alloc == mbcset->nranges, 0))
2756 	  {
2757 	    /* There is not enough space, need realloc.  */
2758 	    wchar_t *new_array_start, *new_array_end;
2759 	    Idx new_nranges;
2760 
2761 	    /* +1 in case of mbcset->nranges is 0.  */
2762 	    new_nranges = 2 * mbcset->nranges + 1;
2763 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2764 	       are NULL if *range_alloc == 0.  */
2765 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2766 					  new_nranges);
2767 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2768 					new_nranges);
2769 
2770 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2771 	      {
2772 		re_free (new_array_start);
2773 		re_free (new_array_end);
2774 		return REG_ESPACE;
2775 	      }
2776 
2777 	    mbcset->range_starts = new_array_start;
2778 	    mbcset->range_ends = new_array_end;
2779 	    *range_alloc = new_nranges;
2780 	  }
2781 
2782 	mbcset->range_starts[mbcset->nranges] = start_wc;
2783 	mbcset->range_ends[mbcset->nranges++] = end_wc;
2784       }
2785 
2786     /* Build the table for single byte characters.  */
2787     for (wc = 0; wc < SBC_MAX; ++wc)
2788       {
2789 	if (start_wc <= wc && wc <= end_wc)
2790 	  bitset_set (sbcset, wc);
2791       }
2792   }
2793 # else /* not RE_ENABLE_I18N */
2794   {
2795     unsigned int ch;
2796     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2797 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2798 		   : 0));
2799     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2800 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2801 		 : 0));
2802     if (start_ch > end_ch)
2803       return REG_ERANGE;
2804     /* Build the table for single byte characters.  */
2805     for (ch = 0; ch < SBC_MAX; ++ch)
2806       if (start_ch <= ch  && ch <= end_ch)
2807 	bitset_set (sbcset, ch);
2808   }
2809 # endif /* not RE_ENABLE_I18N */
2810   return REG_NOERROR;
2811 }
2812 #endif /* not _LIBC */
2813 
2814 #ifndef _LIBC
2815 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2816    Build the collating element which is represented by NAME.
2817    The result are written to MBCSET and SBCSET.
2818    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2819    pointer argument since we may update it.  */
2820 
2821 static reg_errcode_t
2822 internal_function
2823 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2824 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2825 			Idx *coll_sym_alloc, const unsigned char *name)
2826 # else /* not RE_ENABLE_I18N */
2827 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2828 # endif /* not RE_ENABLE_I18N */
2829 {
2830   size_t name_len = strlen ((const char *) name);
2831   if (BE (name_len != 1, 0))
2832     return REG_ECOLLATE;
2833   else
2834     {
2835       bitset_set (sbcset, name[0]);
2836       return REG_NOERROR;
2837     }
2838 }
2839 #endif /* not _LIBC */
2840 
2841 /* This function parse bracket expression like "[abc]", "[a-c]",
2842    "[[.a-a.]]" etc.  */
2843 
2844 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2845 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2846 		   reg_syntax_t syntax, reg_errcode_t *err)
2847 {
2848 #ifdef _LIBC
2849   const unsigned char *collseqmb;
2850   const char *collseqwc;
2851   uint32_t nrules;
2852   int32_t table_size;
2853   const int32_t *symb_table;
2854   const unsigned char *extra;
2855 
2856   /* Local function for parse_bracket_exp used in _LIBC environment.
2857      Seek the collating symbol entry corresponding to NAME.
2858      Return the index of the symbol in the SYMB_TABLE,
2859      or -1 if not found.  */
2860 
2861   auto inline int32_t
2862   __attribute__ ((always_inline))
2863   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2864     {
2865       int32_t elem;
2866 
2867       for (elem = 0; elem < table_size; elem++)
2868 	if (symb_table[2 * elem] != 0)
2869 	  {
2870 	    int32_t idx = symb_table[2 * elem + 1];
2871 	    /* Skip the name of collating element name.  */
2872 	    idx += 1 + extra[idx];
2873 	    if (/* Compare the length of the name.  */
2874 		name_len == extra[idx]
2875 		/* Compare the name.  */
2876 		&& memcmp (name, &extra[idx + 1], name_len) == 0)
2877 	      /* Yep, this is the entry.  */
2878 	      return elem;
2879 	  }
2880       return -1;
2881     }
2882 
2883   /* Local function for parse_bracket_exp used in _LIBC environment.
2884      Look up the collation sequence value of BR_ELEM.
2885      Return the value if succeeded, UINT_MAX otherwise.  */
2886 
2887   auto inline unsigned int
2888   __attribute__ ((always_inline))
2889   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2890     {
2891       if (br_elem->type == SB_CHAR)
2892 	{
2893 	  /*
2894 	  if (MB_CUR_MAX == 1)
2895 	  */
2896 	  if (nrules == 0)
2897 	    return collseqmb[br_elem->opr.ch];
2898 	  else
2899 	    {
2900 	      wint_t wc = __btowc (br_elem->opr.ch);
2901 	      return __collseq_table_lookup (collseqwc, wc);
2902 	    }
2903 	}
2904       else if (br_elem->type == MB_CHAR)
2905 	{
2906 	  if (nrules != 0)
2907 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2908 	}
2909       else if (br_elem->type == COLL_SYM)
2910 	{
2911 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2912 	  if (nrules != 0)
2913 	    {
2914 	      int32_t elem, idx;
2915 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2916 						  sym_name_len);
2917 	      if (elem != -1)
2918 		{
2919 		  /* We found the entry.  */
2920 		  idx = symb_table[2 * elem + 1];
2921 		  /* Skip the name of collating element name.  */
2922 		  idx += 1 + extra[idx];
2923 		  /* Skip the byte sequence of the collating element.  */
2924 		  idx += 1 + extra[idx];
2925 		  /* Adjust for the alignment.  */
2926 		  idx = (idx + 3) & ~3;
2927 		  /* Skip the multibyte collation sequence value.  */
2928 		  idx += sizeof (unsigned int);
2929 		  /* Skip the wide char sequence of the collating element.  */
2930 		  idx += sizeof (unsigned int) *
2931 		    (1 + *(unsigned int *) (extra + idx));
2932 		  /* Return the collation sequence value.  */
2933 		  return *(unsigned int *) (extra + idx);
2934 		}
2935 	      else if (sym_name_len == 1)
2936 		{
2937 		  /* No valid character.  Match it as a single byte
2938 		     character.  */
2939 		  return collseqmb[br_elem->opr.name[0]];
2940 		}
2941 	    }
2942 	  else if (sym_name_len == 1)
2943 	    return collseqmb[br_elem->opr.name[0]];
2944 	}
2945       return UINT_MAX;
2946     }
2947 
2948   /* Local function for parse_bracket_exp used in _LIBC environment.
2949      Build the range expression which starts from START_ELEM, and ends
2950      at END_ELEM.  The result are written to MBCSET and SBCSET.
2951      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2952      mbcset->range_ends, is a pointer argument since we may
2953      update it.  */
2954 
2955   auto inline reg_errcode_t
2956   __attribute__ ((always_inline))
2957   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2958 		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2959     {
2960       unsigned int ch;
2961       uint32_t start_collseq;
2962       uint32_t end_collseq;
2963 
2964       /* Equivalence Classes and Character Classes can't be a range
2965 	 start/end.  */
2966       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2967 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2968 	      0))
2969 	return REG_ERANGE;
2970 
2971       /* FIXME: Implement rational ranges here, too.  */
2972       start_collseq = lookup_collation_sequence_value (start_elem);
2973       end_collseq = lookup_collation_sequence_value (end_elem);
2974       /* Check start/end collation sequence values.  */
2975       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2976 	return REG_ECOLLATE;
2977       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2978 	return REG_ERANGE;
2979 
2980       /* Got valid collation sequence values, add them as a new entry.
2981 	 However, if we have no collation elements, and the character set
2982 	 is single byte, the single byte character set that we
2983 	 build below suffices. */
2984       if (nrules > 0 || dfa->mb_cur_max > 1)
2985 	{
2986 	  /* Check the space of the arrays.  */
2987 	  if (BE (*range_alloc == mbcset->nranges, 0))
2988 	    {
2989 	      /* There is not enough space, need realloc.  */
2990 	      uint32_t *new_array_start;
2991 	      uint32_t *new_array_end;
2992 	      Idx new_nranges;
2993 
2994 	      /* +1 in case of mbcset->nranges is 0.  */
2995 	      new_nranges = 2 * mbcset->nranges + 1;
2996 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2997 					    new_nranges);
2998 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2999 					  new_nranges);
3000 
3001 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3002 		return REG_ESPACE;
3003 
3004 	      mbcset->range_starts = new_array_start;
3005 	      mbcset->range_ends = new_array_end;
3006 	      *range_alloc = new_nranges;
3007 	    }
3008 
3009 	  mbcset->range_starts[mbcset->nranges] = start_collseq;
3010 	  mbcset->range_ends[mbcset->nranges++] = end_collseq;
3011 	}
3012 
3013       /* Build the table for single byte characters.  */
3014       for (ch = 0; ch < SBC_MAX; ch++)
3015 	{
3016 	  uint32_t ch_collseq;
3017 	  /*
3018 	  if (MB_CUR_MAX == 1)
3019 	  */
3020 	  if (nrules == 0)
3021 	    ch_collseq = collseqmb[ch];
3022 	  else
3023 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3024 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3025 	    bitset_set (sbcset, ch);
3026 	}
3027       return REG_NOERROR;
3028     }
3029 
3030   /* Local function for parse_bracket_exp used in _LIBC environment.
3031      Build the collating element which is represented by NAME.
3032      The result are written to MBCSET and SBCSET.
3033      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3034      pointer argument since we may update it.  */
3035 
3036   auto inline reg_errcode_t
3037   __attribute__ ((always_inline))
3038   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3039 			  Idx *coll_sym_alloc, const unsigned char *name)
3040     {
3041       int32_t elem, idx;
3042       size_t name_len = strlen ((const char *) name);
3043       if (nrules != 0)
3044 	{
3045 	  elem = seek_collating_symbol_entry (name, name_len);
3046 	  if (elem != -1)
3047 	    {
3048 	      /* We found the entry.  */
3049 	      idx = symb_table[2 * elem + 1];
3050 	      /* Skip the name of collating element name.  */
3051 	      idx += 1 + extra[idx];
3052 	    }
3053 	  else if (name_len == 1)
3054 	    {
3055 	      /* No valid character, treat it as a normal
3056 		 character.  */
3057 	      bitset_set (sbcset, name[0]);
3058 	      return REG_NOERROR;
3059 	    }
3060 	  else
3061 	    return REG_ECOLLATE;
3062 
3063 	  /* Got valid collation sequence, add it as a new entry.  */
3064 	  /* Check the space of the arrays.  */
3065 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3066 	    {
3067 	      /* Not enough, realloc it.  */
3068 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
3069 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3070 	      /* Use realloc since mbcset->coll_syms is NULL
3071 		 if *alloc == 0.  */
3072 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3073 						   new_coll_sym_alloc);
3074 	      if (BE (new_coll_syms == NULL, 0))
3075 		return REG_ESPACE;
3076 	      mbcset->coll_syms = new_coll_syms;
3077 	      *coll_sym_alloc = new_coll_sym_alloc;
3078 	    }
3079 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3080 	  return REG_NOERROR;
3081 	}
3082       else
3083 	{
3084 	  if (BE (name_len != 1, 0))
3085 	    return REG_ECOLLATE;
3086 	  else
3087 	    {
3088 	      bitset_set (sbcset, name[0]);
3089 	      return REG_NOERROR;
3090 	    }
3091 	}
3092     }
3093 #endif
3094 
3095   re_token_t br_token;
3096   re_bitset_ptr_t sbcset;
3097 #ifdef RE_ENABLE_I18N
3098   re_charset_t *mbcset;
3099   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3100   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3101 #endif /* not RE_ENABLE_I18N */
3102   bool non_match = false;
3103   bin_tree_t *work_tree;
3104   int token_len;
3105   bool first_round = true;
3106 #ifdef _LIBC
3107   collseqmb = (const unsigned char *)
3108     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3109   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3110   if (nrules)
3111     {
3112       /*
3113       if (MB_CUR_MAX > 1)
3114       */
3115       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3116       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3117       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3118 						  _NL_COLLATE_SYMB_TABLEMB);
3119       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3120 						   _NL_COLLATE_SYMB_EXTRAMB);
3121     }
3122 #endif
3123   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3124 #ifdef RE_ENABLE_I18N
3125   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3126 #endif /* RE_ENABLE_I18N */
3127 #ifdef RE_ENABLE_I18N
3128   if (BE (sbcset == NULL || mbcset == NULL, 0))
3129 #else
3130   if (BE (sbcset == NULL, 0))
3131 #endif /* RE_ENABLE_I18N */
3132     {
3133       re_free (sbcset);
3134 #ifdef RE_ENABLE_I18N
3135       re_free (mbcset);
3136 #endif
3137       *err = REG_ESPACE;
3138       return NULL;
3139     }
3140 
3141   token_len = peek_token_bracket (token, regexp, syntax);
3142   if (BE (token->type == END_OF_RE, 0))
3143     {
3144       *err = REG_BADPAT;
3145       goto parse_bracket_exp_free_return;
3146     }
3147   if (token->type == OP_NON_MATCH_LIST)
3148     {
3149 #ifdef RE_ENABLE_I18N
3150       mbcset->non_match = 1;
3151 #endif /* not RE_ENABLE_I18N */
3152       non_match = true;
3153       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3154 	bitset_set (sbcset, '\n');
3155       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3156       token_len = peek_token_bracket (token, regexp, syntax);
3157       if (BE (token->type == END_OF_RE, 0))
3158 	{
3159 	  *err = REG_BADPAT;
3160 	  goto parse_bracket_exp_free_return;
3161 	}
3162     }
3163 
3164   /* We treat the first ']' as a normal character.  */
3165   if (token->type == OP_CLOSE_BRACKET)
3166     token->type = CHARACTER;
3167 
3168   while (1)
3169     {
3170       bracket_elem_t start_elem, end_elem;
3171       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3172       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3173       reg_errcode_t ret;
3174       int token_len2 = 0;
3175       bool is_range_exp = false;
3176       re_token_t token2;
3177 
3178       start_elem.opr.name = start_name_buf;
3179       start_elem.type = COLL_SYM;
3180       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3181 				   syntax, first_round);
3182       if (BE (ret != REG_NOERROR, 0))
3183 	{
3184 	  *err = ret;
3185 	  goto parse_bracket_exp_free_return;
3186 	}
3187       first_round = false;
3188 
3189       /* Get information about the next token.  We need it in any case.  */
3190       token_len = peek_token_bracket (token, regexp, syntax);
3191 
3192       /* Do not check for ranges if we know they are not allowed.  */
3193       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3194 	{
3195 	  if (BE (token->type == END_OF_RE, 0))
3196 	    {
3197 	      *err = REG_EBRACK;
3198 	      goto parse_bracket_exp_free_return;
3199 	    }
3200 	  if (token->type == OP_CHARSET_RANGE)
3201 	    {
3202 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3203 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3204 	      if (BE (token2.type == END_OF_RE, 0))
3205 		{
3206 		  *err = REG_EBRACK;
3207 		  goto parse_bracket_exp_free_return;
3208 		}
3209 	      if (token2.type == OP_CLOSE_BRACKET)
3210 		{
3211 		  /* We treat the last '-' as a normal character.  */
3212 		  re_string_skip_bytes (regexp, -token_len);
3213 		  token->type = CHARACTER;
3214 		}
3215 	      else
3216 		is_range_exp = true;
3217 	    }
3218 	}
3219 
3220       if (is_range_exp == true)
3221 	{
3222 	  end_elem.opr.name = end_name_buf;
3223 	  end_elem.type = COLL_SYM;
3224 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3225 				       dfa, syntax, true);
3226 	  if (BE (ret != REG_NOERROR, 0))
3227 	    {
3228 	      *err = ret;
3229 	      goto parse_bracket_exp_free_return;
3230 	    }
3231 
3232 	  token_len = peek_token_bracket (token, regexp, syntax);
3233 
3234 #ifdef _LIBC
3235 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3236 				  &start_elem, &end_elem);
3237 #else
3238 # ifdef RE_ENABLE_I18N
3239 	  *err = build_range_exp (syntax, sbcset,
3240 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3241 				  &range_alloc, &start_elem, &end_elem);
3242 # else
3243 	  *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3244 # endif
3245 #endif /* RE_ENABLE_I18N */
3246 	  if (BE (*err != REG_NOERROR, 0))
3247 	    goto parse_bracket_exp_free_return;
3248 	}
3249       else
3250 	{
3251 	  switch (start_elem.type)
3252 	    {
3253 	    case SB_CHAR:
3254 	      bitset_set (sbcset, start_elem.opr.ch);
3255 	      break;
3256 #ifdef RE_ENABLE_I18N
3257 	    case MB_CHAR:
3258 	      /* Check whether the array has enough space.  */
3259 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3260 		{
3261 		  wchar_t *new_mbchars;
3262 		  /* Not enough, realloc it.  */
3263 		  /* +1 in case of mbcset->nmbchars is 0.  */
3264 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
3265 		  /* Use realloc since array is NULL if *alloc == 0.  */
3266 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3267 					    mbchar_alloc);
3268 		  if (BE (new_mbchars == NULL, 0))
3269 		    goto parse_bracket_exp_espace;
3270 		  mbcset->mbchars = new_mbchars;
3271 		}
3272 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3273 	      break;
3274 #endif /* RE_ENABLE_I18N */
3275 	    case EQUIV_CLASS:
3276 	      *err = build_equiv_class (sbcset,
3277 #ifdef RE_ENABLE_I18N
3278 					mbcset, &equiv_class_alloc,
3279 #endif /* RE_ENABLE_I18N */
3280 					start_elem.opr.name);
3281 	      if (BE (*err != REG_NOERROR, 0))
3282 		goto parse_bracket_exp_free_return;
3283 	      break;
3284 	    case COLL_SYM:
3285 	      *err = build_collating_symbol (sbcset,
3286 #ifdef RE_ENABLE_I18N
3287 					     mbcset, &coll_sym_alloc,
3288 #endif /* RE_ENABLE_I18N */
3289 					     start_elem.opr.name);
3290 	      if (BE (*err != REG_NOERROR, 0))
3291 		goto parse_bracket_exp_free_return;
3292 	      break;
3293 	    case CHAR_CLASS:
3294 	      *err = build_charclass (regexp->trans, sbcset,
3295 #ifdef RE_ENABLE_I18N
3296 				      mbcset, &char_class_alloc,
3297 #endif /* RE_ENABLE_I18N */
3298 				      (const char *) start_elem.opr.name,
3299 				      syntax);
3300 	      if (BE (*err != REG_NOERROR, 0))
3301 	       goto parse_bracket_exp_free_return;
3302 	      break;
3303 	    default:
3304 	      assert (0);
3305 	      break;
3306 	    }
3307 	}
3308       if (BE (token->type == END_OF_RE, 0))
3309 	{
3310 	  *err = REG_EBRACK;
3311 	  goto parse_bracket_exp_free_return;
3312 	}
3313       if (token->type == OP_CLOSE_BRACKET)
3314 	break;
3315     }
3316 
3317   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3318 
3319   /* If it is non-matching list.  */
3320   if (non_match)
3321     bitset_not (sbcset);
3322 
3323 #ifdef RE_ENABLE_I18N
3324   /* Ensure only single byte characters are set.  */
3325   if (dfa->mb_cur_max > 1)
3326     bitset_mask (sbcset, dfa->sb_char);
3327 
3328   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3329       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3330 						     || mbcset->non_match)))
3331     {
3332       bin_tree_t *mbc_tree;
3333       int sbc_idx;
3334       /* Build a tree for complex bracket.  */
3335       dfa->has_mb_node = 1;
3336       br_token.type = COMPLEX_BRACKET;
3337       br_token.opr.mbcset = mbcset;
3338       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3339       if (BE (mbc_tree == NULL, 0))
3340 	goto parse_bracket_exp_espace;
3341       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3342 	if (sbcset[sbc_idx])
3343 	  break;
3344       /* If there are no bits set in sbcset, there is no point
3345 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3346       if (sbc_idx < BITSET_WORDS)
3347 	{
3348 	  /* Build a tree for simple bracket.  */
3349 	  br_token.type = SIMPLE_BRACKET;
3350 	  br_token.opr.sbcset = sbcset;
3351 	  work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3352 	  if (BE (work_tree == NULL, 0))
3353 	    goto parse_bracket_exp_espace;
3354 
3355 	  /* Then join them by ALT node.  */
3356 	  work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3357 	  if (BE (work_tree == NULL, 0))
3358 	    goto parse_bracket_exp_espace;
3359 	}
3360       else
3361 	{
3362 	  re_free (sbcset);
3363 	  work_tree = mbc_tree;
3364 	}
3365     }
3366   else
3367 #endif /* not RE_ENABLE_I18N */
3368     {
3369 #ifdef RE_ENABLE_I18N
3370       free_charset (mbcset);
3371 #endif
3372       /* Build a tree for simple bracket.  */
3373       br_token.type = SIMPLE_BRACKET;
3374       br_token.opr.sbcset = sbcset;
3375       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3376       if (BE (work_tree == NULL, 0))
3377 	goto parse_bracket_exp_espace;
3378     }
3379   return work_tree;
3380 
3381  parse_bracket_exp_espace:
3382   *err = REG_ESPACE;
3383  parse_bracket_exp_free_return:
3384   re_free (sbcset);
3385 #ifdef RE_ENABLE_I18N
3386   free_charset (mbcset);
3387 #endif /* RE_ENABLE_I18N */
3388   return NULL;
3389 }
3390 
3391 /* Parse an element in the bracket expression.  */
3392 
3393 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3394 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3395 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3396 		       reg_syntax_t syntax, bool accept_hyphen)
3397 {
3398 #ifdef RE_ENABLE_I18N
3399   int cur_char_size;
3400   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3401   if (cur_char_size > 1)
3402     {
3403       elem->type = MB_CHAR;
3404       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3405       re_string_skip_bytes (regexp, cur_char_size);
3406       return REG_NOERROR;
3407     }
3408 #endif /* RE_ENABLE_I18N */
3409   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3410   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3411       || token->type == OP_OPEN_EQUIV_CLASS)
3412     return parse_bracket_symbol (elem, regexp, token);
3413   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3414     {
3415       /* A '-' must only appear as anything but a range indicator before
3416 	 the closing bracket.  Everything else is an error.  */
3417       re_token_t token2;
3418       (void) peek_token_bracket (&token2, regexp, syntax);
3419       if (token2.type != OP_CLOSE_BRACKET)
3420 	/* The actual error value is not standardized since this whole
3421 	   case is undefined.  But ERANGE makes good sense.  */
3422 	return REG_ERANGE;
3423     }
3424   elem->type = SB_CHAR;
3425   elem->opr.ch = token->opr.c;
3426   return REG_NOERROR;
3427 }
3428 
3429 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3430    such as [:<character_class>:], [.<collating_element>.], and
3431    [=<equivalent_class>=].  */
3432 
3433 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3434 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3435 		      re_token_t *token)
3436 {
3437   unsigned char ch, delim = token->opr.c;
3438   int i = 0;
3439   if (re_string_eoi(regexp))
3440     return REG_EBRACK;
3441   for (;; ++i)
3442     {
3443       if (i >= BRACKET_NAME_BUF_SIZE)
3444 	return REG_EBRACK;
3445       if (token->type == OP_OPEN_CHAR_CLASS)
3446 	ch = re_string_fetch_byte_case (regexp);
3447       else
3448 	ch = re_string_fetch_byte (regexp);
3449       if (re_string_eoi(regexp))
3450 	return REG_EBRACK;
3451       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3452 	break;
3453       elem->opr.name[i] = ch;
3454     }
3455   re_string_skip_bytes (regexp, 1);
3456   elem->opr.name[i] = '\0';
3457   switch (token->type)
3458     {
3459     case OP_OPEN_COLL_ELEM:
3460       elem->type = COLL_SYM;
3461       break;
3462     case OP_OPEN_EQUIV_CLASS:
3463       elem->type = EQUIV_CLASS;
3464       break;
3465     case OP_OPEN_CHAR_CLASS:
3466       elem->type = CHAR_CLASS;
3467       break;
3468     default:
3469       break;
3470     }
3471   return REG_NOERROR;
3472 }
3473 
3474   /* Helper function for parse_bracket_exp.
3475      Build the equivalence class which is represented by NAME.
3476      The result are written to MBCSET and SBCSET.
3477      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3478      is a pointer argument since we may update it.  */
3479 
3480 static reg_errcode_t
3481 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3482 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3483 		   Idx *equiv_class_alloc, const unsigned char *name)
3484 #else /* not RE_ENABLE_I18N */
3485 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3486 #endif /* not RE_ENABLE_I18N */
3487 {
3488 #ifdef _LIBC
3489   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3490   if (nrules != 0)
3491     {
3492       const int32_t *table, *indirect;
3493       const unsigned char *weights, *extra, *cp;
3494       unsigned char char_buf[2];
3495       int32_t idx1, idx2;
3496       unsigned int ch;
3497       size_t len;
3498       /* Calculate the index for equivalence class.  */
3499       cp = name;
3500       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3501       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3502 					       _NL_COLLATE_WEIGHTMB);
3503       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3504 						   _NL_COLLATE_EXTRAMB);
3505       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3506 						_NL_COLLATE_INDIRECTMB);
3507       idx1 = findidx (table, indirect, extra, &cp, -1);
3508       if (BE (idx1 == 0 || *cp != '\0', 0))
3509 	/* This isn't a valid character.  */
3510 	return REG_ECOLLATE;
3511 
3512       /* Build single byte matching table for this equivalence class.  */
3513       len = weights[idx1 & 0xffffff];
3514       for (ch = 0; ch < SBC_MAX; ++ch)
3515 	{
3516 	  char_buf[0] = ch;
3517 	  cp = char_buf;
3518 	  idx2 = findidx (table, indirect, extra, &cp, 1);
3519 /*
3520 	  idx2 = table[ch];
3521 */
3522 	  if (idx2 == 0)
3523 	    /* This isn't a valid character.  */
3524 	    continue;
3525 	  /* Compare only if the length matches and the collation rule
3526 	     index is the same.  */
3527 	  if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3528 	    {
3529 	      int cnt = 0;
3530 
3531 	      while (cnt <= len &&
3532 		     weights[(idx1 & 0xffffff) + 1 + cnt]
3533 		     == weights[(idx2 & 0xffffff) + 1 + cnt])
3534 		++cnt;
3535 
3536 	      if (cnt > len)
3537 		bitset_set (sbcset, ch);
3538 	    }
3539 	}
3540       /* Check whether the array has enough space.  */
3541       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3542 	{
3543 	  /* Not enough, realloc it.  */
3544 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
3545 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3546 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3547 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3548 						   int32_t,
3549 						   new_equiv_class_alloc);
3550 	  if (BE (new_equiv_classes == NULL, 0))
3551 	    return REG_ESPACE;
3552 	  mbcset->equiv_classes = new_equiv_classes;
3553 	  *equiv_class_alloc = new_equiv_class_alloc;
3554 	}
3555       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3556     }
3557   else
3558 #endif /* _LIBC */
3559     {
3560       if (BE (strlen ((const char *) name) != 1, 0))
3561 	return REG_ECOLLATE;
3562       bitset_set (sbcset, *name);
3563     }
3564   return REG_NOERROR;
3565 }
3566 
3567   /* Helper function for parse_bracket_exp.
3568      Build the character class which is represented by NAME.
3569      The result are written to MBCSET and SBCSET.
3570      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3571      is a pointer argument since we may update it.  */
3572 
3573 static reg_errcode_t
3574 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const char * class_name,reg_syntax_t syntax)3575 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3576 		 re_charset_t *mbcset, Idx *char_class_alloc,
3577 		 const char *class_name, reg_syntax_t syntax)
3578 #else /* not RE_ENABLE_I18N */
3579 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3580 		 const char *class_name, reg_syntax_t syntax)
3581 #endif /* not RE_ENABLE_I18N */
3582 {
3583   int i;
3584   const char *name = class_name;
3585 
3586   /* In case of REG_ICASE "upper" and "lower" match the both of
3587      upper and lower cases.  */
3588   if ((syntax & RE_ICASE)
3589       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3590     name = "alpha";
3591 
3592 #ifdef RE_ENABLE_I18N
3593   /* Check the space of the arrays.  */
3594   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3595     {
3596       /* Not enough, realloc it.  */
3597       /* +1 in case of mbcset->nchar_classes is 0.  */
3598       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3599       /* Use realloc since array is NULL if *alloc == 0.  */
3600       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3601 					       new_char_class_alloc);
3602       if (BE (new_char_classes == NULL, 0))
3603 	return REG_ESPACE;
3604       mbcset->char_classes = new_char_classes;
3605       *char_class_alloc = new_char_class_alloc;
3606     }
3607   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3608 #endif /* RE_ENABLE_I18N */
3609 
3610 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3611   do {						\
3612     if (BE (trans != NULL, 0))			\
3613       {						\
3614 	for (i = 0; i < SBC_MAX; ++i)		\
3615 	  if (ctype_func (i))			\
3616 	    bitset_set (sbcset, trans[i]);	\
3617       }						\
3618     else					\
3619       {						\
3620 	for (i = 0; i < SBC_MAX; ++i)		\
3621 	  if (ctype_func (i))			\
3622 	    bitset_set (sbcset, i);		\
3623       }						\
3624   } while (0)
3625 
3626   if (strcmp (name, "alnum") == 0)
3627     BUILD_CHARCLASS_LOOP (isalnum);
3628   else if (strcmp (name, "cntrl") == 0)
3629     BUILD_CHARCLASS_LOOP (iscntrl);
3630   else if (strcmp (name, "lower") == 0)
3631     BUILD_CHARCLASS_LOOP (islower);
3632   else if (strcmp (name, "space") == 0)
3633     BUILD_CHARCLASS_LOOP (isspace);
3634   else if (strcmp (name, "alpha") == 0)
3635     BUILD_CHARCLASS_LOOP (isalpha);
3636   else if (strcmp (name, "digit") == 0)
3637     BUILD_CHARCLASS_LOOP (isdigit);
3638   else if (strcmp (name, "print") == 0)
3639     BUILD_CHARCLASS_LOOP (isprint);
3640   else if (strcmp (name, "upper") == 0)
3641     BUILD_CHARCLASS_LOOP (isupper);
3642   else if (strcmp (name, "blank") == 0)
3643     BUILD_CHARCLASS_LOOP (isblank);
3644   else if (strcmp (name, "graph") == 0)
3645     BUILD_CHARCLASS_LOOP (isgraph);
3646   else if (strcmp (name, "punct") == 0)
3647     BUILD_CHARCLASS_LOOP (ispunct);
3648   else if (strcmp (name, "xdigit") == 0)
3649     BUILD_CHARCLASS_LOOP (isxdigit);
3650   else
3651     return REG_ECTYPE;
3652 
3653   return REG_NOERROR;
3654 }
3655 
3656 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const char * class_name,const char * extra,bool non_match,reg_errcode_t * err)3657 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3658 		    const char *class_name,
3659 		    const char *extra, bool non_match,
3660 		    reg_errcode_t *err)
3661 {
3662   re_bitset_ptr_t sbcset;
3663 #ifdef RE_ENABLE_I18N
3664   re_charset_t *mbcset;
3665   Idx alloc = 0;
3666 #endif /* not RE_ENABLE_I18N */
3667   reg_errcode_t ret;
3668   re_token_t br_token;
3669   bin_tree_t *tree;
3670 
3671   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3672   if (BE (sbcset == NULL, 0))
3673     {
3674       *err = REG_ESPACE;
3675       return NULL;
3676     }
3677 #ifdef RE_ENABLE_I18N
3678   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3679   if (BE (mbcset == NULL, 0))
3680     {
3681       re_free (sbcset);
3682       *err = REG_ESPACE;
3683       return NULL;
3684     }
3685   mbcset->non_match = non_match;
3686 #endif /* RE_ENABLE_I18N */
3687 
3688   /* We don't care the syntax in this case.  */
3689   ret = build_charclass (trans, sbcset,
3690 #ifdef RE_ENABLE_I18N
3691 			 mbcset, &alloc,
3692 #endif /* RE_ENABLE_I18N */
3693 			 class_name, 0);
3694 
3695   if (BE (ret != REG_NOERROR, 0))
3696     {
3697       re_free (sbcset);
3698 #ifdef RE_ENABLE_I18N
3699       free_charset (mbcset);
3700 #endif /* RE_ENABLE_I18N */
3701       *err = ret;
3702       return NULL;
3703     }
3704   /* \w match '_' also.  */
3705   for (; *extra; extra++)
3706     bitset_set (sbcset, *extra);
3707 
3708   /* If it is non-matching list.  */
3709   if (non_match)
3710     bitset_not (sbcset);
3711 
3712 #ifdef RE_ENABLE_I18N
3713   /* Ensure only single byte characters are set.  */
3714   if (dfa->mb_cur_max > 1)
3715     bitset_mask (sbcset, dfa->sb_char);
3716 #endif
3717 
3718   /* Build a tree for simple bracket.  */
3719 #if defined GCC_LINT || defined lint
3720   memset (&br_token, 0, sizeof br_token);
3721 #endif
3722   br_token.type = SIMPLE_BRACKET;
3723   br_token.opr.sbcset = sbcset;
3724   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3725   if (BE (tree == NULL, 0))
3726     goto build_word_op_espace;
3727 
3728 #ifdef RE_ENABLE_I18N
3729   if (dfa->mb_cur_max > 1)
3730     {
3731       bin_tree_t *mbc_tree;
3732       /* Build a tree for complex bracket.  */
3733       br_token.type = COMPLEX_BRACKET;
3734       br_token.opr.mbcset = mbcset;
3735       dfa->has_mb_node = 1;
3736       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3737       if (BE (mbc_tree == NULL, 0))
3738 	goto build_word_op_espace;
3739       /* Then join them by ALT node.  */
3740       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3741       if (BE (mbc_tree != NULL, 1))
3742 	return tree;
3743     }
3744   else
3745     {
3746       free_charset (mbcset);
3747       return tree;
3748     }
3749 #else /* not RE_ENABLE_I18N */
3750   return tree;
3751 #endif /* not RE_ENABLE_I18N */
3752 
3753  build_word_op_espace:
3754   re_free (sbcset);
3755 #ifdef RE_ENABLE_I18N
3756   free_charset (mbcset);
3757 #endif /* RE_ENABLE_I18N */
3758   *err = REG_ESPACE;
3759   return NULL;
3760 }
3761 
3762 /* This is intended for the expressions like "a{1,3}".
3763    Fetch a number from 'input', and return the number.
3764    Return -1 if the number field is empty like "{,1}".
3765    Return RE_DUP_MAX + 1 if the number field is too large.
3766    Return -2 if an error occurred.  */
3767 
3768 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3769 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3770 {
3771   Idx num = -1;
3772   unsigned char c;
3773   while (1)
3774     {
3775       fetch_token (token, input, syntax);
3776       c = token->opr.c;
3777       if (BE (token->type == END_OF_RE, 0))
3778 	return -2;
3779       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3780 	break;
3781       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3782 	     ? -2
3783 	     : num == -1
3784 	     ? c - '0'
3785 	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3786     }
3787   return num;
3788 }
3789 
3790 #ifdef RE_ENABLE_I18N
3791 static void
free_charset(re_charset_t * cset)3792 free_charset (re_charset_t *cset)
3793 {
3794   re_free (cset->mbchars);
3795 # ifdef _LIBC
3796   re_free (cset->coll_syms);
3797   re_free (cset->equiv_classes);
3798   re_free (cset->range_starts);
3799   re_free (cset->range_ends);
3800 # endif
3801   re_free (cset->char_classes);
3802   re_free (cset);
3803 }
3804 #endif /* RE_ENABLE_I18N */
3805 
3806 /* Functions for binary tree operation.  */
3807 
3808 /* Create a tree node.  */
3809 
3810 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3811 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3812 	     re_token_type_t type)
3813 {
3814   re_token_t t;
3815 #if defined GCC_LINT || defined lint
3816   memset (&t, 0, sizeof t);
3817 #endif
3818   t.type = type;
3819   return create_token_tree (dfa, left, right, &t);
3820 }
3821 
3822 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3823 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3824 		   const re_token_t *token)
3825 {
3826   bin_tree_t *tree;
3827   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3828     {
3829       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3830 
3831       if (storage == NULL)
3832 	return NULL;
3833       storage->next = dfa->str_tree_storage;
3834       dfa->str_tree_storage = storage;
3835       dfa->str_tree_storage_idx = 0;
3836     }
3837   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3838 
3839   tree->parent = NULL;
3840   tree->left = left;
3841   tree->right = right;
3842   tree->token = *token;
3843   tree->token.duplicated = 0;
3844   tree->token.opt_subexp = 0;
3845   tree->first = NULL;
3846   tree->next = NULL;
3847   tree->node_idx = -1;
3848 
3849   if (left != NULL)
3850     left->parent = tree;
3851   if (right != NULL)
3852     right->parent = tree;
3853   return tree;
3854 }
3855 
3856 /* Mark the tree SRC as an optional subexpression.
3857    To be called from preorder or postorder.  */
3858 
3859 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3860 mark_opt_subexp (void *extra, bin_tree_t *node)
3861 {
3862   Idx idx = (uintptr_t) extra;
3863   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3864     node->token.opt_subexp = 1;
3865 
3866   return REG_NOERROR;
3867 }
3868 
3869 /* Free the allocated memory inside NODE. */
3870 
3871 static void
free_token(re_token_t * node)3872 free_token (re_token_t *node)
3873 {
3874 #ifdef RE_ENABLE_I18N
3875   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3876     free_charset (node->opr.mbcset);
3877   else
3878 #endif /* RE_ENABLE_I18N */
3879     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3880       re_free (node->opr.sbcset);
3881 }
3882 
3883 /* Worker function for tree walking.  Free the allocated memory inside NODE
3884    and its children. */
3885 
3886 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3887 free_tree (void *extra, bin_tree_t *node)
3888 {
3889   free_token (&node->token);
3890   return REG_NOERROR;
3891 }
3892 
3893 
3894 /* Duplicate the node SRC, and return new node.  This is a preorder
3895    visit similar to the one implemented by the generic visitor, but
3896    we need more infrastructure to maintain two parallel trees --- so,
3897    it's easier to duplicate.  */
3898 
3899 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3900 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3901 {
3902   const bin_tree_t *node;
3903   bin_tree_t *dup_root;
3904   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3905 
3906   for (node = root; ; )
3907     {
3908       /* Create a new tree and link it back to the current parent.  */
3909       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3910       if (*p_new == NULL)
3911 	return NULL;
3912       (*p_new)->parent = dup_node;
3913       (*p_new)->token.duplicated = 1;
3914       dup_node = *p_new;
3915 
3916       /* Go to the left node, or up and to the right.  */
3917       if (node->left)
3918 	{
3919 	  node = node->left;
3920 	  p_new = &dup_node->left;
3921 	}
3922       else
3923 	{
3924 	  const bin_tree_t *prev = NULL;
3925 	  while (node->right == prev || node->right == NULL)
3926 	    {
3927 	      prev = node;
3928 	      node = node->parent;
3929 	      dup_node = dup_node->parent;
3930 	      if (!node)
3931 		return dup_root;
3932 	    }
3933 	  node = node->right;
3934 	  p_new = &dup_node->right;
3935 	}
3936     }
3937 }
3938