1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public License
17    along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>.
18 */
19 
20 #include <assert.h>
21 #include <ctype.h>
22 #include <limits.h>
23 #include <locale.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 
28 #if defined(_WIN32) && !defined (MB_CUR_MAX)
29 #define MB_CUR_MAX 2
30 #endif
31 
32 #if defined HAVE_WCHAR_H || defined _LIBC
33 # include <wchar.h>
34 #endif /* HAVE_WCHAR_H || _LIBC */
35 #if defined HAVE_WCTYPE_H || defined _LIBC
36 # include <wctype.h>
37 #endif /* HAVE_WCTYPE_H || _LIBC */
38 
39 /* In case that the system doesn't have isblank().  */
40 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
41 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
42 #endif
43 
44 #ifdef _LIBC
45 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
46 #  define _RE_DEFINE_LOCALE_FUNCTIONS 1
47 #   include <locale/localeinfo.h>
48 #   include <locale/elem-hash.h>
49 #   include <locale/coll-lookup.h>
50 # endif
51 #endif
52 
53 /* This is for other GNU distributions with internationalized messages.  */
54 #if HAVE_LIBINTL_H || defined _LIBC
55 # include <libintl.h>
56 # ifdef _LIBC
57 #  undef gettext
58 #  define gettext(msgid) \
59   INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
60 # endif
61 #else
62 # define gettext(msgid) (msgid)
63 #endif
64 
65 #ifndef gettext_noop
66 /* This define is so xgettext can find the internationalizable
67    strings.  */
68 # define gettext_noop(String) String
69 #endif
70 
71 #include "_regex.h" /* gnupg */
72 #include "regex_internal.h"
73 
74 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
75                                           int length, reg_syntax_t syntax);
76 static void re_compile_fastmap_iter (regex_t *bufp,
77                                      const re_dfastate_t *init_state,
78                                      char *fastmap);
79 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
80 static reg_errcode_t init_word_char (re_dfa_t *dfa);
81 #ifdef RE_ENABLE_I18N
82 static void free_charset (re_charset_t *cset);
83 #endif /* RE_ENABLE_I18N */
84 static void free_workarea_compile (regex_t *preg);
85 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
86 static reg_errcode_t analyze (re_dfa_t *dfa);
87 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
88 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
89 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
90 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
91 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
92                                      unsigned int constraint);
93 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
94 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
95                                          int node, int root);
96 static void calc_inveclosure (re_dfa_t *dfa);
97 static int fetch_number (re_string_t *input, re_token_t *token,
98                          reg_syntax_t syntax);
99 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
100 static int peek_token (re_token_t *token, re_string_t *input,
101                         reg_syntax_t syntax);
102 static int peek_token_bracket (re_token_t *token, re_string_t *input,
103                                reg_syntax_t syntax);
104 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
105                           reg_syntax_t syntax, reg_errcode_t *err);
106 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
107                                   re_token_t *token, reg_syntax_t syntax,
108                                   int nest, reg_errcode_t *err);
109 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
110                                  re_token_t *token, reg_syntax_t syntax,
111                                  int nest, reg_errcode_t *err);
112 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
113                                      re_token_t *token, reg_syntax_t syntax,
114                                      int nest, reg_errcode_t *err);
115 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
116                                   re_token_t *token, reg_syntax_t syntax,
117                                   int nest, reg_errcode_t *err);
118 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
119                                  re_dfa_t *dfa, re_token_t *token,
120                                  reg_syntax_t syntax, reg_errcode_t *err);
121 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
122                                       re_token_t *token, reg_syntax_t syntax,
123                                       reg_errcode_t *err);
124 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
125                                             re_string_t *regexp,
126                                             re_token_t *token, int token_len,
127                                             re_dfa_t *dfa,
128                                             reg_syntax_t syntax);
129 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
130                                           re_string_t *regexp,
131                                           re_token_t *token);
132 #ifndef _LIBC
133 # ifdef RE_ENABLE_I18N
134 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
135                                       re_charset_t *mbcset, int *range_alloc,
136                                       bracket_elem_t *start_elem,
137                                       bracket_elem_t *end_elem);
138 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
139                                              re_charset_t *mbcset,
140                                              int *coll_syxmalloc,
141                                              const unsigned char *name);
142 # else /* not RE_ENABLE_I18N */
143 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
144                                       bracket_elem_t *start_elem,
145                                       bracket_elem_t *end_elem);
146 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
147                                              const unsigned char *name);
148 # endif /* not RE_ENABLE_I18N */
149 #endif /* not _LIBC */
150 #ifdef RE_ENABLE_I18N
151 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
152                                         re_charset_t *mbcset,
153                                         int *equiv_class_alloc,
154                                         const unsigned char *name);
155 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
156                                       re_charset_t *mbcset,
157                                       int *char_class_alloc,
158                                       const unsigned char *class_name,
159                                       reg_syntax_t syntax);
160 #else  /* not RE_ENABLE_I18N */
161 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
162                                         const unsigned char *name);
163 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
164                                       const unsigned char *class_name,
165                                       reg_syntax_t syntax);
166 #endif /* not RE_ENABLE_I18N */
167 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
168 static void free_bin_tree (bin_tree_t *tree);
169 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
170                                 re_token_type_t type, int index);
171 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
172 
173 /* This table gives an error message for each of the error codes listed
174    in regex.h.  Obviously the order here has to be same as there.
175    POSIX doesn't require that we do anything for REG_NOERROR,
176    but why not be nice?  */
177 
178 const char __re_error_msgid[] attribute_hidden =
179   {
180 #define REG_NOERROR_IDX	0
181     gettext_noop ("Success")	/* REG_NOERROR */
182     "\0"
183 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
184     gettext_noop ("No match")	/* REG_NOMATCH */
185     "\0"
186 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
187     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
188     "\0"
189 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
190     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
191     "\0"
192 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
193     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
194     "\0"
195 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
196     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
197     "\0"
198 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
199     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
200     "\0"
201 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
202     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
203     "\0"
204 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
205     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
206     "\0"
207 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
208     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
209     "\0"
210 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
211     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
212     "\0"
213 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
214     gettext_noop ("Invalid range end")	/* REG_ERANGE */
215     "\0"
216 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
217     gettext_noop ("Memory exhausted") /* REG_ESPACE */
218     "\0"
219 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
220     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
221     "\0"
222 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
223     gettext_noop ("Premature end of regular expression") /* REG_EEND */
224     "\0"
225 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
226     gettext_noop ("Regular expression too big") /* REG_ESIZE */
227     "\0"
228 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
229     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
230   };
231 
232 const size_t __re_error_msgid_idx[] attribute_hidden =
233   {
234     REG_NOERROR_IDX,
235     REG_NOMATCH_IDX,
236     REG_BADPAT_IDX,
237     REG_ECOLLATE_IDX,
238     REG_ECTYPE_IDX,
239     REG_EESCAPE_IDX,
240     REG_ESUBREG_IDX,
241     REG_EBRACK_IDX,
242     REG_EPAREN_IDX,
243     REG_EBRACE_IDX,
244     REG_BADBR_IDX,
245     REG_ERANGE_IDX,
246     REG_ESPACE_IDX,
247     REG_BADRPT_IDX,
248     REG_EEND_IDX,
249     REG_ESIZE_IDX,
250     REG_ERPAREN_IDX
251   };
252 
253 /* Entry points for GNU code.  */
254 
255 /* re_compile_pattern is the GNU regular expression compiler: it
256    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
257    Returns 0 if the pattern was valid, otherwise an error string.
258 
259    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
260    are set in BUFP on entry.  */
261 
262 const char *
re_compile_pattern(pattern,length,bufp)263 re_compile_pattern (pattern, length, bufp)
264     const char *pattern;
265     size_t length;
266     struct re_pattern_buffer *bufp;
267 {
268   reg_errcode_t ret;
269 
270   /* GNU code is written to assume at least RE_NREGS registers will be set
271      (and at least one extra will be -1).  */
272   bufp->regs_allocated = REGS_UNALLOCATED;
273 
274   /* And GNU code determines whether or not to get register information
275      by passing null for the REGS argument to re_match, etc., not by
276      setting no_sub.  */
277   bufp->no_sub = 0;
278 
279   /* Match anchors at newline.  */
280   bufp->newline_anchor = 1;
281 
282   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
283 
284   if (!ret)
285     return NULL;
286   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
287 }
288 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)289 weak_alias (__re_compile_pattern, re_compile_pattern)
290 #endif
291 
292 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
293    also be assigned to arbitrarily: each pattern buffer stores its own
294    syntax, so it can be changed between regex compilations.  */
295 /* This has no initializer because initialized variables in Emacs
296    become read-only after dumping.  */
297 reg_syntax_t re_syntax_options;
298 
299 
300 /* Specify the precise syntax of regexps for compilation.  This provides
301    for compatibility for various utilities which historically have
302    different, incompatible syntaxes.
303 
304    The argument SYNTAX is a bit mask comprised of the various bits
305    defined in regex.h.  We return the old syntax.  */
306 
307 reg_syntax_t
308 re_set_syntax (syntax)
309     reg_syntax_t syntax;
310 {
311   reg_syntax_t ret = re_syntax_options;
312 
313   re_syntax_options = syntax;
314   return ret;
315 }
316 #ifdef _LIBC
317 weak_alias (__re_set_syntax, re_set_syntax)
318 #endif
319 
320 int
321 re_compile_fastmap (bufp)
322     struct re_pattern_buffer *bufp;
323 {
324   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
325   char *fastmap = bufp->fastmap;
326 
327   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
328   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
329   if (dfa->init_state != dfa->init_state_word)
330     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
331   if (dfa->init_state != dfa->init_state_nl)
332     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
333   if (dfa->init_state != dfa->init_state_begbuf)
334     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
335   bufp->fastmap_accurate = 1;
336   return 0;
337 }
338 #ifdef _LIBC
339 weak_alias (__re_compile_fastmap, re_compile_fastmap)
340 #endif
341 
342 /* Helper function for re_compile_fastmap.
343    Compile fastmap for the initial_state INIT_STATE.  */
344 
345 static void
346 re_compile_fastmap_iter (bufp, init_state, fastmap)
347      regex_t *bufp;
348      const re_dfastate_t *init_state;
349      char *fastmap;
350 {
351   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
352   int node_cnt;
353   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
354     {
355       int node = init_state->nodes.elems[node_cnt];
356       re_token_type_t type = dfa->nodes[node].type;
357       if (type == OP_CONTEXT_NODE)
358         {
359           node = dfa->nodes[node].opr.ctx_info->entity;
360           type = dfa->nodes[node].type;
361         }
362 
363       if (type == CHARACTER)
364         fastmap[dfa->nodes[node].opr.c] = 1;
365       else if (type == SIMPLE_BRACKET)
366         {
367           int i, j, ch;
368           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
369             for (j = 0; j < UINT_BITS; ++j, ++ch)
370               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
371                 fastmap[ch] = 1;
372         }
373 #ifdef RE_ENABLE_I18N
374       else if (type == COMPLEX_BRACKET)
375         {
376           int i;
377           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
378           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
379               || cset->nranges || cset->nchar_classes)
380             {
381 # ifdef _LIBC
382               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
383                 {
384                   /* In this case we want to catch the bytes which are
385                      the first byte of any collation elements.
386                      e.g. In da_DK, we want to catch 'a' since "aa"
387                           is a valid collation element, and don't catch
388                           'b' since 'b' is the only collation element
389                           which starts from 'b'.  */
390                   int j, ch;
391                   const int32_t *table = (const int32_t *)
392                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
393                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
394                     for (j = 0; j < UINT_BITS; ++j, ++ch)
395                       if (table[ch] < 0)
396                         fastmap[ch] = 1;
397                 }
398 # else
399               if (MB_CUR_MAX > 1)
400                 for (i = 0; i < SBC_MAX; ++i)
401                   if (__btowc (i) == WEOF)
402                     fastmap[i] = 1;
403 # endif /* not _LIBC */
404             }
405           for (i = 0; i < cset->nmbchars; ++i)
406             {
407               char buf[256];
408               wctomb (buf, cset->mbchars[i]);
409               fastmap[*(unsigned char *) buf] = 1;
410             }
411         }
412 #endif /* RE_ENABLE_I18N */
413       else if (type == END_OF_RE || type == OP_PERIOD
414 #ifdef RE_ENABLE_I18N
415                || type == COMPLEX_BRACKET
416 #endif /* RE_ENABLE_I18N */
417               )
418         {
419           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
420           if (type == END_OF_RE)
421             bufp->can_be_null = 1;
422           return;
423         }
424     }
425 }
426 
427 /* Entry point for POSIX code.  */
428 /* regcomp takes a regular expression as a string and compiles it.
429 
430    PREG is a regex_t *.  We do not expect any fields to be initialized,
431    since POSIX says we shouldn't.  Thus, we set
432 
433      `buffer' to the compiled pattern;
434      `used' to the length of the compiled pattern;
435      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
436        REG_EXTENDED bit in CFLAGS is set; otherwise, to
437        RE_SYNTAX_POSIX_BASIC;
438      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
439      `fastmap' to an allocated space for the fastmap;
440      `fastmap_accurate' to zero;
441      `re_nsub' to the number of subexpressions in PATTERN.
442 
443    PATTERN is the address of the pattern string.
444 
445    CFLAGS is a series of bits which affect compilation.
446 
447      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
448      use POSIX basic syntax.
449 
450      If REG_NEWLINE is set, then . and [^...] don't match newline.
451      Also, regexec will try a match beginning after every newline.
452 
453      If REG_ICASE is set, then we considers upper- and lowercase
454      versions of letters to be equivalent when matching.
455 
456      If REG_NOSUB is set, then when PREG is passed to regexec, that
457      routine will report only success or failure, and nothing about the
458      registers.
459 
460    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
461    the return codes and their meanings.)  */
462 
463 int
regcomp(preg,pattern,cflags)464 regcomp (preg, pattern, cflags)
465     regex_t *__restrict preg;
466     const char *__restrict pattern;
467     int cflags;
468 {
469   reg_errcode_t ret;
470   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
471                          : RE_SYNTAX_POSIX_BASIC);
472 
473   preg->buffer = NULL;
474   preg->allocated = 0;
475   preg->used = 0;
476 
477   /* Try to allocate space for the fastmap.  */
478   preg->fastmap = re_malloc (char, SBC_MAX);
479   if (BE (preg->fastmap == NULL, 0))
480     return REG_ESPACE;
481 
482   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
483 
484   /* If REG_NEWLINE is set, newlines are treated differently.  */
485   if (cflags & REG_NEWLINE)
486     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
487       syntax &= ~RE_DOT_NEWLINE;
488       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489       /* It also changes the matching behavior.  */
490       preg->newline_anchor = 1;
491     }
492   else
493     preg->newline_anchor = 0;
494   preg->no_sub = !!(cflags & REG_NOSUB);
495   preg->translate = NULL;
496 
497   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
498 
499   /* POSIX doesn't distinguish between an unmatched open-group and an
500      unmatched close-group: both are REG_EPAREN.  */
501   if (ret == REG_ERPAREN)
502     ret = REG_EPAREN;
503 
504   /* We have already checked preg->fastmap != NULL.  */
505   if (BE (ret == REG_NOERROR, 1))
506     {
507       /* Compute the fastmap now, since regexec cannot modify the pattern
508 	 buffer.  */
509       if (BE (re_compile_fastmap (preg) == -2, 0))
510 	{
511 	  /* Some error occurred while computing the fastmap, just forget
512 	     about it.  */
513 	  re_free (preg->fastmap);
514 	  preg->fastmap = NULL;
515 	}
516     }
517 
518   return (int) ret;
519 }
520 #ifdef _LIBC
521 weak_alias (__regcomp, regcomp)
522 #endif
523 
524 /* Returns a message corresponding to an error code, ERRCODE, returned
525    from either regcomp or regexec.   We don't use PREG here.  */
526 
527 size_t
528 regerror (errcode, preg, errbuf, errbuf_size)
529     int errcode;
530     const regex_t *preg;
531     char *errbuf;
532     size_t errbuf_size;
533 {
534   const char *msg;
535   size_t msg_size;
536 
537   if (BE (errcode < 0
538           || errcode >= (int) (sizeof (__re_error_msgid_idx)
539                                / sizeof (__re_error_msgid_idx[0])), 0))
540     /* Only error codes returned by the rest of the code should be passed
541        to this routine.  If we are given anything else, or if other regex
542        code generates an invalid error code, then the program has a bug.
543        Dump core so we can fix it.  */
544     abort ();
545 
546   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
547 
548   msg_size = strlen (msg) + 1; /* Includes the null.  */
549 
550   if (BE (errbuf_size != 0, 1))
551     {
552       if (BE (msg_size > errbuf_size, 0))
553         {
554 #if defined HAVE_MEMPCPY || defined _LIBC
555 	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
556 #else
557           memcpy (errbuf, msg, errbuf_size - 1);
558           errbuf[errbuf_size - 1] = 0;
559 #endif
560         }
561       else
562         memcpy (errbuf, msg, msg_size);
563     }
564 
565   return msg_size;
566 }
567 #ifdef _LIBC
568 weak_alias (__regerror, regerror)
569 #endif
570 
571 /* Free dynamically allocated space used by PREG.  */
572 
573 void
574 regfree (preg)
575     regex_t *preg;
576 {
577   int i, j;
578   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
579   if (BE (dfa != NULL, 1))
580     {
581       re_free (dfa->subexps);
582 
583       for (i = 0; i < dfa->nodes_len; ++i)
584         {
585           re_token_t *node = dfa->nodes + i;
586 #ifdef RE_ENABLE_I18N
587           if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
588             free_charset (node->opr.mbcset);
589           else
590 #endif /* RE_ENABLE_I18N */
591           if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
592             re_free (node->opr.sbcset);
593           else if (node->type == OP_CONTEXT_NODE)
594             {
595               if (dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF)
596                 {
597                   if (node->opr.ctx_info->bkref_eclosure != NULL)
598                     re_node_set_free (node->opr.ctx_info->bkref_eclosure);
599                   re_free (node->opr.ctx_info->bkref_eclosure);
600                 }
601               re_free (node->opr.ctx_info);
602             }
603         }
604       re_free (dfa->firsts);
605       re_free (dfa->nexts);
606       for (i = 0; i < dfa->nodes_len; ++i)
607         {
608           if (dfa->eclosures != NULL)
609             re_node_set_free (dfa->eclosures + i);
610           if (dfa->inveclosures != NULL)
611             re_node_set_free (dfa->inveclosures + i);
612           if (dfa->edests != NULL)
613             re_node_set_free (dfa->edests + i);
614         }
615       re_free (dfa->edests);
616       re_free (dfa->eclosures);
617       re_free (dfa->inveclosures);
618       re_free (dfa->nodes);
619 
620       for (i = 0; i <= dfa->state_hash_mask; ++i)
621         {
622           struct re_state_table_entry *entry = dfa->state_table + i;
623           for (j = 0; j < entry->num; ++j)
624             {
625               re_dfastate_t *state = entry->array[j];
626               if (state->entrance_nodes != &state->nodes)
627                 {
628                   re_node_set_free (state->entrance_nodes);
629                   re_free (state->entrance_nodes);
630                 }
631               re_node_set_free (&state->nodes);
632               re_free (state->trtable);
633               re_free (state->trtable_search);
634               re_free (state);
635             }
636           re_free (entry->array);
637         }
638       re_free (dfa->state_table);
639 
640       if (dfa->word_char != NULL)
641         re_free (dfa->word_char);
642 #ifdef DEBUG
643       re_free (dfa->re_str);
644 #endif
645       re_free (dfa);
646     }
647   re_free (preg->fastmap);
648 }
649 #ifdef _LIBC
650 weak_alias (__regfree, regfree)
651 #endif
652 
653 /* Entry points compatible with 4.2 BSD regex library.  We don't define
654    them unless specifically requested.  */
655 
656 #if defined _REGEX_RE_COMP || defined _LIBC
657 
658 /* BSD has one and only one pattern buffer.  */
659 static struct re_pattern_buffer re_comp_buf;
660 
661 char *
662 # ifdef _LIBC
663 /* Make these definitions weak in libc, so POSIX programs can redefine
664    these names if they don't use our functions, and still use
665    regcomp/regexec above without link errors.  */
666 weak_function
667 # endif
re_comp(s)668 re_comp (s)
669      const char *s;
670 {
671   reg_errcode_t ret;
672 
673   if (!s)
674     {
675       if (!re_comp_buf.buffer)
676 	return gettext ("No previous regular expression");
677       return 0;
678     }
679 
680   if (!re_comp_buf.buffer)
681     {
682       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
683       if (re_comp_buf.fastmap == NULL)
684 	return (char *) gettext (__re_error_msgid
685 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
686     }
687 
688   /* Since `re_exec' always passes NULL for the `regs' argument, we
689      don't need to initialize the pattern buffer fields which affect it.  */
690 
691   /* Match anchors at newlines.  */
692   re_comp_buf.newline_anchor = 1;
693 
694   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
695 
696   if (!ret)
697     return NULL;
698 
699   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
700   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
701 }
702 #endif /* _REGEX_RE_COMP */
703 
704 /* Internal entry point.
705    Compile the regular expression PATTERN, whose length is LENGTH.
706    SYNTAX indicate regular expression's syntax.  */
707 
708 static reg_errcode_t
re_compile_internal(preg,pattern,length,syntax)709 re_compile_internal (preg, pattern, length, syntax)
710      regex_t *preg;
711      const char * pattern;
712      int length;
713      reg_syntax_t syntax;
714 {
715   reg_errcode_t err = REG_NOERROR;
716   re_dfa_t *dfa;
717   re_string_t regexp;
718 
719   /* Initialize the pattern buffer.  */
720   preg->fastmap_accurate = 0;
721   preg->syntax = syntax;
722   preg->not_bol = preg->not_eol = 0;
723   preg->used = 0;
724   preg->re_nsub = 0;
725 
726   /* Initialize the dfa.  */
727   dfa = (re_dfa_t *) preg->buffer;
728   if (preg->allocated < sizeof (re_dfa_t))
729     {
730       /* If zero allocated, but buffer is non-null, try to realloc
731 	 enough space.  This loses if buffer's address is bogus, but
732 	 that is the user's responsibility.  If ->buffer is NULL this
733 	 is a simple allocation.  */
734       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
735       if (dfa == NULL)
736 	return REG_ESPACE;
737       preg->allocated = sizeof (re_dfa_t);
738     }
739   preg->buffer = (unsigned char *) dfa;
740   preg->used = sizeof (re_dfa_t);
741 
742   err = init_dfa (dfa, length);
743   if (BE (err != REG_NOERROR, 0))
744     {
745       re_free (dfa);
746       preg->buffer = NULL;
747       return err;
748     }
749 #ifdef DEBUG
750   dfa->re_str = re_malloc (char, length + 1);
751   strncpy (dfa->re_str, pattern, length + 1);
752 #endif
753 
754   err = re_string_construct (&regexp, pattern, length, preg->translate,
755                              syntax & RE_ICASE);
756   if (BE (err != REG_NOERROR, 0))
757     {
758       re_free (dfa);
759       preg->buffer = NULL;
760       return err;
761     }
762 
763   /* Parse the regular expression, and build a structure tree.  */
764   preg->re_nsub = 0;
765   dfa->str_tree = parse (&regexp, preg, syntax, &err);
766   if (BE (dfa->str_tree == NULL, 0))
767     goto re_compile_internal_free_return;
768 
769   /* Analyze the tree and collect information which is necessary to
770      create the dfa.  */
771   err = analyze (dfa);
772   if (BE (err != REG_NOERROR, 0))
773     goto re_compile_internal_free_return;
774 
775   /* Then create the initial state of the dfa.  */
776   err = create_initial_state (dfa);
777   if (BE (err != REG_NOERROR, 0))
778     goto re_compile_internal_free_return;
779 
780  re_compile_internal_free_return:
781   /* Release work areas.  */
782   free_workarea_compile (preg);
783   re_string_destruct (&regexp);
784 
785   return err;
786 }
787 
788 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
789    as the initial length of some arrays.  */
790 
791 static reg_errcode_t
init_dfa(dfa,pat_len)792 init_dfa (dfa, pat_len)
793      re_dfa_t *dfa;
794      int pat_len;
795 {
796   int table_size;
797 
798   memset (dfa, '\0', sizeof (re_dfa_t));
799 
800   dfa->nodes_alloc = pat_len + 1;
801   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
802 
803   dfa->states_alloc = pat_len + 1;
804 
805   /*  table_size = 2 ^ ceil(log pat_len) */
806   for (table_size = 1; table_size > 0; table_size <<= 1)
807     if (table_size > pat_len)
808       break;
809 
810   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
811   dfa->state_hash_mask = table_size - 1;
812 
813   dfa->subexps_alloc = 1;
814   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
815   dfa->word_char = NULL;
816 
817   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
818           || dfa->subexps == NULL, 0))
819     {
820       /* We don't bother to free anything which was allocated.  Very
821 	 soon the process will go down anyway.  */
822       dfa->subexps = NULL;
823       dfa->state_table = NULL;
824       dfa->nodes = NULL;
825       return REG_ESPACE;
826     }
827   return REG_NOERROR;
828 }
829 
830 /* Initialize WORD_CHAR table, which indicate which character is
831    "word".  In this case "word" means that it is the word construction
832    character used by some operators like "\<", "\>", etc.  */
833 
834 static reg_errcode_t
init_word_char(dfa)835 init_word_char (dfa)
836      re_dfa_t *dfa;
837 {
838   int i, j, ch;
839   dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
840   if (BE (dfa->word_char == NULL, 0))
841     return REG_ESPACE;
842   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
843     for (j = 0; j < UINT_BITS; ++j, ++ch)
844       if (isalnum (ch) || ch == '_')
845         dfa->word_char[i] |= 1 << j;
846   return REG_NOERROR;
847 }
848 
849 /* Free the work area which are only used while compiling.  */
850 
851 static void
free_workarea_compile(preg)852 free_workarea_compile (preg)
853      regex_t *preg;
854 {
855   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
856   free_bin_tree (dfa->str_tree);
857   dfa->str_tree = NULL;
858 }
859 
860 /* Create initial states for all contexts.  */
861 
862 static reg_errcode_t
create_initial_state(dfa)863 create_initial_state (dfa)
864      re_dfa_t *dfa;
865 {
866   int first, i;
867   reg_errcode_t err;
868   re_node_set init_nodes;
869 
870   /* Initial states have the epsilon closure of the node which is
871      the first node of the regular expression.  */
872   first = dfa->str_tree->first;
873   dfa->init_node = first;
874   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
875   if (BE (err != REG_NOERROR, 0))
876     return err;
877 
878   /* The back-references which are in initial states can epsilon transit,
879      since in this case all of the subexpressions can be null.
880      Then we add epsilon closures of the nodes which are the next nodes of
881      the back-references.  */
882   if (dfa->nbackref > 0)
883     for (i = 0; i < init_nodes.nelem; ++i)
884       {
885         int node_idx = init_nodes.elems[i];
886         re_token_type_t type = dfa->nodes[node_idx].type;
887 
888         int clexp_idx;
889         int entity = (type != OP_CONTEXT_NODE ? node_idx
890                       : dfa->nodes[node_idx].opr.ctx_info->entity);
891         if ((type != OP_CONTEXT_NODE
892              || (dfa->nodes[entity].type != OP_BACK_REF))
893             && (type != OP_BACK_REF))
894           continue;
895         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
896           {
897             re_token_t *clexp_node;
898             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
899             if (clexp_node->type == OP_CLOSE_SUBEXP
900                 && clexp_node->opr.idx + 1 == dfa->nodes[entity].opr.idx)
901               break;
902           }
903         if (clexp_idx == init_nodes.nelem)
904           continue;
905 
906         if (type == OP_CONTEXT_NODE
907             && (dfa->nodes[dfa->nodes[node_idx].opr.ctx_info->entity].type
908                 == OP_BACK_REF))
909           {
910             int prev_nelem = init_nodes.nelem;
911             re_node_set_merge (&init_nodes,
912                            dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure);
913             if (prev_nelem < init_nodes.nelem)
914               i = 0;
915           }
916         else if (type == OP_BACK_REF)
917           {
918             int next_idx = dfa->nexts[node_idx];
919             if (!re_node_set_contains (&init_nodes, next_idx))
920               {
921                 re_node_set_merge (&init_nodes, dfa->eclosures + next_idx);
922                 i = 0;
923               }
924           }
925       }
926 
927   /* It must be the first time to invoke acquire_state.  */
928   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
929   /* We don't check ERR here, since the initial state must not be NULL.  */
930   if (BE (dfa->init_state == NULL, 0))
931     return err;
932   if (dfa->init_state->has_constraint)
933     {
934       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
935                                                        CONTEXT_WORD);
936       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
937                                                      CONTEXT_NEWLINE);
938       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
939                                                          &init_nodes,
940                                                          CONTEXT_NEWLINE
941                                                          | CONTEXT_BEGBUF);
942       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
943               || dfa->init_state_begbuf == NULL, 0))
944         return err;
945     }
946   else
947     dfa->init_state_word = dfa->init_state_nl
948       = dfa->init_state_begbuf = dfa->init_state;
949 
950   re_node_set_free (&init_nodes);
951   return REG_NOERROR;
952 }
953 
954 /* Analyze the structure tree, and calculate "first", "next", "edest",
955    "eclosure", and "inveclosure".  */
956 
957 static reg_errcode_t
analyze(dfa)958 analyze (dfa)
959      re_dfa_t *dfa;
960 {
961   int i;
962   reg_errcode_t ret;
963 
964   /* Allocate arrays.  */
965   dfa->firsts = re_malloc (int, dfa->nodes_alloc);
966   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
967   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
968   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
969   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
970   if (BE (dfa->firsts == NULL || dfa->nexts == NULL || dfa->edests == NULL
971           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
972     return REG_ESPACE;
973   /* Initialize them.  */
974   for (i = 0; i < dfa->nodes_len; ++i)
975     {
976       dfa->firsts[i] = -1;
977       dfa->nexts[i] = -1;
978       re_node_set_init_empty (dfa->edests + i);
979       re_node_set_init_empty (dfa->eclosures + i);
980       re_node_set_init_empty (dfa->inveclosures + i);
981     }
982 
983   ret = analyze_tree (dfa, dfa->str_tree);
984   if (BE (ret == REG_NOERROR, 1))
985     {
986       ret = calc_eclosure (dfa);
987       if (ret == REG_NOERROR)
988 	calc_inveclosure (dfa);
989     }
990   return ret;
991 }
992 
993 /* Helper functions for analyze.
994    This function calculate "first", "next", and "edest" for the subtree
995    whose root is NODE.  */
996 
997 static reg_errcode_t
analyze_tree(dfa,node)998 analyze_tree (dfa, node)
999      re_dfa_t *dfa;
1000      bin_tree_t *node;
1001 {
1002   reg_errcode_t ret;
1003   if (node->first == -1)
1004     calc_first (dfa, node);
1005   if (node->next == -1)
1006     calc_next (dfa, node);
1007   if (node->eclosure.nelem == 0)
1008     calc_epsdest (dfa, node);
1009   /* Calculate "first" etc. for the left child.  */
1010   if (node->left != NULL)
1011     {
1012       ret = analyze_tree (dfa, node->left);
1013       if (BE (ret != REG_NOERROR, 0))
1014         return ret;
1015     }
1016   /* Calculate "first" etc. for the right child.  */
1017   if (node->right != NULL)
1018     {
1019       ret = analyze_tree (dfa, node->right);
1020       if (BE (ret != REG_NOERROR, 0))
1021         return ret;
1022     }
1023   return REG_NOERROR;
1024 }
1025 
1026 /* Calculate "first" for the node NODE.  */
1027 static void
calc_first(dfa,node)1028 calc_first (dfa, node)
1029      re_dfa_t *dfa;
1030      bin_tree_t *node;
1031 {
1032   int idx, type;
1033   idx = node->node_idx;
1034   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1035 
1036   switch (type)
1037     {
1038 #ifdef DEBUG
1039     case OP_OPEN_BRACKET:
1040     case OP_CLOSE_BRACKET:
1041     case OP_OPEN_DUP_NUM:
1042     case OP_CLOSE_DUP_NUM:
1043     case OP_NON_MATCH_LIST:
1044     case OP_OPEN_COLL_ELEM:
1045     case OP_CLOSE_COLL_ELEM:
1046     case OP_OPEN_EQUIV_CLASS:
1047     case OP_CLOSE_EQUIV_CLASS:
1048     case OP_OPEN_CHAR_CLASS:
1049     case OP_CLOSE_CHAR_CLASS:
1050       /* These must not be appeared here.  */
1051       assert (0);
1052 #endif
1053     case END_OF_RE:
1054     case CHARACTER:
1055     case OP_PERIOD:
1056     case OP_DUP_ASTERISK:
1057     case OP_DUP_QUESTION:
1058 #ifdef RE_ENABLE_I18N
1059     case COMPLEX_BRACKET:
1060 #endif /* RE_ENABLE_I18N */
1061     case SIMPLE_BRACKET:
1062     case OP_BACK_REF:
1063     case ANCHOR:
1064     case OP_OPEN_SUBEXP:
1065     case OP_CLOSE_SUBEXP:
1066       node->first = idx;
1067       break;
1068     case OP_DUP_PLUS:
1069 #ifdef DEBUG
1070       assert (node->left != NULL);
1071 #endif
1072       if (node->left->first == -1)
1073         calc_first (dfa, node->left);
1074       node->first = node->left->first;
1075       break;
1076     case OP_ALT:
1077       node->first = idx;
1078       break;
1079       /* else fall through */
1080     default:
1081 #ifdef DEBUG
1082       assert (node->left != NULL);
1083 #endif
1084       if (node->left->first == -1)
1085         calc_first (dfa, node->left);
1086       node->first = node->left->first;
1087       break;
1088     }
1089   if (node->type == 0)
1090     dfa->firsts[idx] = node->first;
1091 }
1092 
1093 /* Calculate "next" for the node NODE.  */
1094 
1095 static void
calc_next(dfa,node)1096 calc_next (dfa, node)
1097      re_dfa_t *dfa;
1098      bin_tree_t *node;
1099 {
1100   int idx, type;
1101   bin_tree_t *parent = node->parent;
1102   if (parent == NULL)
1103     {
1104       node->next = -1;
1105       idx = node->node_idx;
1106       if (node->type == 0)
1107         dfa->nexts[idx] = node->next;
1108       return;
1109     }
1110 
1111   idx = parent->node_idx;
1112   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1113 
1114   switch (type)
1115     {
1116     case OP_DUP_ASTERISK:
1117     case OP_DUP_PLUS:
1118       node->next = idx;
1119       break;
1120     case CONCAT:
1121       if (parent->left == node)
1122         {
1123           if (parent->right->first == -1)
1124             calc_first (dfa, parent->right);
1125           node->next = parent->right->first;
1126           break;
1127         }
1128       /* else fall through */
1129     default:
1130       if (parent->next == -1)
1131         calc_next (dfa, parent);
1132       node->next = parent->next;
1133       break;
1134     }
1135   idx = node->node_idx;
1136   if (node->type == 0)
1137     dfa->nexts[idx] = node->next;
1138 }
1139 
1140 /* Calculate "edest" for the node NODE.  */
1141 
1142 static void
calc_epsdest(dfa,node)1143 calc_epsdest (dfa, node)
1144      re_dfa_t *dfa;
1145      bin_tree_t *node;
1146 {
1147   int idx;
1148   idx = node->node_idx;
1149   if (node->type == 0)
1150     {
1151       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1152           || dfa->nodes[idx].type == OP_DUP_PLUS
1153           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1154         {
1155           if (node->left->first == -1)
1156             calc_first (dfa, node->left);
1157           if (node->next == -1)
1158             calc_next (dfa, node);
1159           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1160                               node->next);
1161         }
1162       else if (dfa->nodes[idx].type == OP_ALT)
1163         {
1164           int left, right;
1165           if (node->left != NULL)
1166             {
1167               if (node->left->first == -1)
1168                 calc_first (dfa, node->left);
1169               left = node->left->first;
1170             }
1171           else
1172             {
1173               if (node->next == -1)
1174                 calc_next (dfa, node);
1175               left = node->next;
1176             }
1177           if (node->right != NULL)
1178             {
1179               if (node->right->first == -1)
1180                 calc_first (dfa, node->right);
1181               right = node->right->first;
1182             }
1183           else
1184             {
1185               if (node->next == -1)
1186                 calc_next (dfa, node);
1187               right = node->next;
1188             }
1189           re_node_set_init_2 (dfa->edests + idx, left, right);
1190         }
1191       else if (dfa->nodes[idx].type == ANCHOR
1192                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1193                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP)
1194         re_node_set_init_1 (dfa->edests + idx, node->next);
1195     }
1196 }
1197 
1198 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1199    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1200    otherwise return the error code.  */
1201 
1202 static reg_errcode_t
duplicate_node(new_idx,dfa,org_idx,constraint)1203 duplicate_node (new_idx, dfa, org_idx, constraint)
1204      re_dfa_t *dfa;
1205      int *new_idx, org_idx;
1206      unsigned int constraint;
1207 {
1208   re_token_t dup;
1209   int dup_idx;
1210   reg_errcode_t err;
1211 
1212   dup.type = OP_CONTEXT_NODE;
1213   if (dfa->nodes[org_idx].type == OP_CONTEXT_NODE)
1214     {
1215       /* If the node whose index is ORG_IDX is the same as the intended
1216          node, use it.  */
1217       if (dfa->nodes[org_idx].constraint == constraint)
1218         {
1219           *new_idx = org_idx;
1220           return REG_NOERROR;
1221         }
1222       dup.constraint = constraint |
1223         dfa->nodes[org_idx].constraint;
1224     }
1225   else
1226     dup.constraint = constraint;
1227 
1228   /* In case that `entity' points OP_CONTEXT_NODE,
1229      we correct `entity' to real entity in calc_inveclosures(). */
1230   dup.opr.ctx_info = malloc (sizeof (*dup.opr.ctx_info));
1231   dup_idx = re_dfa_add_node (dfa, dup, 1);
1232   if (BE (dup.opr.ctx_info == NULL || dup_idx == -1, 0))
1233     return REG_ESPACE;
1234   dup.opr.ctx_info->entity = org_idx;
1235   dup.opr.ctx_info->bkref_eclosure = NULL;
1236 
1237   dfa->nodes[dup_idx].duplicated = 1;
1238   dfa->firsts[dup_idx] = dfa->firsts[org_idx];
1239   dfa->nexts[dup_idx] = dfa->nexts[org_idx];
1240   err = re_node_set_init_copy (dfa->edests + dup_idx, dfa->edests + org_idx);
1241   if (BE (err != REG_NOERROR, 0))
1242     return err;
1243   /* Since we don't duplicate epsilon nodes, epsilon closure have
1244      only itself.  */
1245   err = re_node_set_init_1 (dfa->eclosures + dup_idx, dup_idx);
1246   if (BE (err != REG_NOERROR, 0))
1247     return err;
1248   err = re_node_set_init_1 (dfa->inveclosures + dup_idx, dup_idx);
1249   if (BE (err != REG_NOERROR, 0))
1250     return err;
1251   /* Then we must update inveclosure for this node.
1252      We process them at last part of calc_eclosure(),
1253      since we don't complete to calculate them here.  */
1254 
1255   *new_idx = dup_idx;
1256   return REG_NOERROR;
1257 }
1258 
1259 static void
calc_inveclosure(dfa)1260 calc_inveclosure (dfa)
1261      re_dfa_t *dfa;
1262 {
1263   int src, idx, dest, entity;
1264   for (src = 0; src < dfa->nodes_len; ++src)
1265     {
1266       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1267         {
1268           dest = dfa->eclosures[src].elems[idx];
1269           re_node_set_insert (dfa->inveclosures + dest, src);
1270         }
1271 
1272       entity = src;
1273       while (dfa->nodes[entity].type == OP_CONTEXT_NODE)
1274         {
1275           entity = dfa->nodes[entity].opr.ctx_info->entity;
1276           re_node_set_merge (dfa->inveclosures + src,
1277                              dfa->inveclosures + entity);
1278           dfa->nodes[src].opr.ctx_info->entity = entity;
1279         }
1280     }
1281 }
1282 
1283 /* Calculate "eclosure" for all the node in DFA.  */
1284 
1285 static reg_errcode_t
calc_eclosure(dfa)1286 calc_eclosure (dfa)
1287      re_dfa_t *dfa;
1288 {
1289   int idx, node_idx, max, incomplete = 0;
1290 #ifdef DEBUG
1291   assert (dfa->nodes_len > 0);
1292 #endif
1293   /* For each nodes, calculate epsilon closure.  */
1294   for (node_idx = 0, max = dfa->nodes_len; ; ++node_idx)
1295     {
1296       reg_errcode_t err;
1297       re_node_set eclosure_elem;
1298       if (node_idx == max)
1299         {
1300           if (!incomplete)
1301             break;
1302           incomplete = 0;
1303           node_idx = 0;
1304         }
1305 
1306 #ifdef DEBUG
1307       assert (dfa->nodes[node_idx].type != OP_CONTEXT_NODE);
1308       assert (dfa->eclosures[node_idx].nelem != -1);
1309 #endif
1310       /* If we have already calculated, skip it.  */
1311       if (dfa->eclosures[node_idx].nelem != 0)
1312         continue;
1313       /* Calculate epsilon closure of `node_idx'.  */
1314       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1315       if (BE (err != REG_NOERROR, 0))
1316         return err;
1317 
1318       if (dfa->eclosures[node_idx].nelem == 0)
1319         {
1320           incomplete = 1;
1321           re_node_set_free (&eclosure_elem);
1322         }
1323     }
1324 
1325   /* for duplicated nodes.  */
1326   for (idx = max; idx < dfa->nodes_len; ++idx)
1327     {
1328       int entity, i, constraint;
1329       re_node_set *bkref_eclosure;
1330       entity = dfa->nodes[idx].opr.ctx_info->entity;
1331       re_node_set_merge (dfa->inveclosures + idx, dfa->inveclosures + entity);
1332       if (dfa->nodes[entity].type != OP_BACK_REF)
1333         continue;
1334 
1335       /* If the node is backreference, duplicate the epsilon closure of
1336          the next node. Since it may epsilon transit.  */
1337       /* Note: duplicate_node() may realloc dfa->eclosures, etc.  */
1338       bkref_eclosure = re_malloc (re_node_set, 1);
1339       if (BE (bkref_eclosure == NULL, 0))
1340 	return REG_ESPACE;
1341       re_node_set_init_empty (bkref_eclosure);
1342       constraint = dfa->nodes[idx].constraint;
1343       for (i = 0; i < dfa->eclosures[dfa->nexts[idx]].nelem; ++i)
1344         {
1345           int dest_node_idx = dfa->eclosures[dfa->nexts[idx]].elems[i];
1346           if (!IS_EPSILON_NODE (dfa->nodes[dest_node_idx].type))
1347             {
1348               reg_errcode_t err;
1349               err = duplicate_node (&dest_node_idx, dfa, dest_node_idx,
1350                                     constraint);
1351               if (BE (err != REG_NOERROR, 0))
1352                 return err;
1353             }
1354           re_node_set_insert (bkref_eclosure, dest_node_idx);
1355         }
1356       dfa->nodes[idx].opr.ctx_info->bkref_eclosure = bkref_eclosure;
1357     }
1358 
1359   return REG_NOERROR;
1360 }
1361 
1362 /* Calculate epsilon closure of NODE.  */
1363 
1364 static reg_errcode_t
calc_eclosure_iter(new_set,dfa,node,root)1365 calc_eclosure_iter (new_set, dfa, node, root)
1366      re_node_set *new_set;
1367      re_dfa_t *dfa;
1368      int node, root;
1369 {
1370   reg_errcode_t err;
1371   unsigned int constraint;
1372   int i, max, incomplete = 0;
1373   re_node_set eclosure;
1374   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1375   if (BE (err != REG_NOERROR, 0))
1376     return err;
1377 
1378   /* This indicates that we are calculating this node now.
1379      We reference this value to avoid infinite loop.  */
1380   dfa->eclosures[node].nelem = -1;
1381 
1382   constraint = ((dfa->nodes[node].type == ANCHOR)
1383                 ? dfa->nodes[node].opr.ctx_type : 0);
1384 
1385   /* Expand each epsilon destination nodes.  */
1386   if (dfa->edests[node].nelem != 0)
1387     for (i = 0; i < dfa->edests[node].nelem; ++i)
1388       {
1389         re_node_set eclosure_elem;
1390         int edest = dfa->edests[node].elems[i];
1391         /* If calculating the epsilon closure of `edest' is in progress,
1392            return intermediate result.  */
1393         if (dfa->eclosures[edest].nelem == -1)
1394           {
1395             incomplete = 1;
1396             continue;
1397           }
1398         /* If we haven't calculated the epsilon closure of `edest' yet,
1399            calculate now. Otherwise use calculated epsilon closure.  */
1400         if (dfa->eclosures[edest].nelem == 0)
1401           {
1402             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1403             if (BE (err != REG_NOERROR, 0))
1404               return err;
1405           }
1406         else
1407           eclosure_elem = dfa->eclosures[edest];
1408         /* Merge the epsilon closure of `edest'.  */
1409         re_node_set_merge (&eclosure, &eclosure_elem);
1410         /* If the epsilon closure of `edest' is incomplete,
1411            the epsilon closure of this node is also incomplete.  */
1412         if (dfa->eclosures[edest].nelem == 0)
1413           {
1414             incomplete = 1;
1415             re_node_set_free (&eclosure_elem);
1416           }
1417       }
1418 
1419   /* If the current node has constraints, duplicate all non-epsilon nodes.
1420      Since they must inherit the constraints.  */
1421   if (constraint)
1422     for (i = 0, max = eclosure.nelem; i < max; ++i)
1423       {
1424         int dest = eclosure.elems[i];
1425         if (!IS_EPSILON_NODE (dfa->nodes[dest].type))
1426           {
1427             int dup_dest;
1428             reg_errcode_t err;
1429             err = duplicate_node (&dup_dest, dfa, dest, constraint);
1430             if (BE (err != REG_NOERROR, 0))
1431               return err;
1432             if (dest != dup_dest)
1433               {
1434                 re_node_set_remove_at (&eclosure, i--);
1435                 re_node_set_insert (&eclosure, dup_dest);
1436                 --max;
1437               }
1438           }
1439       }
1440 
1441   /* Epsilon closures include itself.  */
1442   re_node_set_insert (&eclosure, node);
1443   if (incomplete && !root)
1444     dfa->eclosures[node].nelem = 0;
1445   else
1446     dfa->eclosures[node] = eclosure;
1447   *new_set = eclosure;
1448   return REG_NOERROR;
1449 }
1450 
1451 /* Functions for token which are used in the parser.  */
1452 
1453 /* Fetch a token from INPUT.
1454    We must not use this function inside bracket expressions.  */
1455 
1456 static re_token_t
fetch_token(input,syntax)1457 fetch_token (input, syntax)
1458      re_string_t *input;
1459      reg_syntax_t syntax;
1460 {
1461   re_token_t token;
1462   int consumed_byte;
1463   consumed_byte = peek_token (&token, input, syntax);
1464   re_string_skip_bytes (input, consumed_byte);
1465   return token;
1466 }
1467 
1468 /* Peek a token from INPUT, and return the length of the token.
1469    We must not use this function inside bracket expressions.  */
1470 
1471 static int
peek_token(token,input,syntax)1472 peek_token (token, input, syntax)
1473      re_token_t *token;
1474      re_string_t *input;
1475      reg_syntax_t syntax;
1476 {
1477   unsigned char c;
1478 
1479   if (re_string_eoi (input))
1480     {
1481       token->type = END_OF_RE;
1482       return 0;
1483     }
1484 
1485   c = re_string_peek_byte (input, 0);
1486   token->opr.c = c;
1487 
1488 #ifdef RE_ENABLE_I18N
1489   token->mb_partial = 0;
1490   if (MB_CUR_MAX > 1 &&
1491       !re_string_first_byte (input, re_string_cur_idx (input)))
1492     {
1493       token->type = CHARACTER;
1494       token->mb_partial = 1;
1495       return 1;
1496     }
1497 #endif
1498   if (c == '\\')
1499     {
1500       unsigned char c2;
1501       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1502         {
1503           token->type = BACK_SLASH;
1504           return 1;
1505         }
1506 
1507       c2 = re_string_peek_byte_case (input, 1);
1508       token->opr.c = c2;
1509       token->type = CHARACTER;
1510       switch (c2)
1511         {
1512         case '|':
1513           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1514             token->type = OP_ALT;
1515           break;
1516         case '1': case '2': case '3': case '4': case '5':
1517         case '6': case '7': case '8': case '9':
1518           if (!(syntax & RE_NO_BK_REFS))
1519             {
1520               token->type = OP_BACK_REF;
1521               token->opr.idx = c2 - '0';
1522             }
1523           break;
1524         case '<':
1525           if (!(syntax & RE_NO_GNU_OPS))
1526             {
1527               token->type = ANCHOR;
1528               token->opr.idx = WORD_FIRST;
1529             }
1530           break;
1531         case '>':
1532           if (!(syntax & RE_NO_GNU_OPS))
1533             {
1534               token->type = ANCHOR;
1535               token->opr.idx = WORD_LAST;
1536             }
1537           break;
1538         case 'b':
1539           if (!(syntax & RE_NO_GNU_OPS))
1540             {
1541               token->type = ANCHOR;
1542               token->opr.idx = WORD_DELIM;
1543             }
1544           break;
1545         case 'B':
1546           if (!(syntax & RE_NO_GNU_OPS))
1547             {
1548               token->type = ANCHOR;
1549               token->opr.idx = INSIDE_WORD;
1550             }
1551           break;
1552         case 'w':
1553           if (!(syntax & RE_NO_GNU_OPS))
1554             token->type = OP_WORD;
1555           break;
1556         case 'W':
1557           if (!(syntax & RE_NO_GNU_OPS))
1558             token->type = OP_NOTWORD;
1559           break;
1560         case '`':
1561           if (!(syntax & RE_NO_GNU_OPS))
1562             {
1563               token->type = ANCHOR;
1564               token->opr.idx = BUF_FIRST;
1565             }
1566           break;
1567         case '\'':
1568           if (!(syntax & RE_NO_GNU_OPS))
1569             {
1570               token->type = ANCHOR;
1571               token->opr.idx = BUF_LAST;
1572             }
1573           break;
1574         case '(':
1575           if (!(syntax & RE_NO_BK_PARENS))
1576             token->type = OP_OPEN_SUBEXP;
1577           break;
1578         case ')':
1579           if (!(syntax & RE_NO_BK_PARENS))
1580             token->type = OP_CLOSE_SUBEXP;
1581           break;
1582         case '+':
1583           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1584             token->type = OP_DUP_PLUS;
1585           break;
1586         case '?':
1587           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1588             token->type = OP_DUP_QUESTION;
1589           break;
1590         case '{':
1591           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1592             token->type = OP_OPEN_DUP_NUM;
1593           break;
1594         case '}':
1595           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1596             token->type = OP_CLOSE_DUP_NUM;
1597           break;
1598         default:
1599           break;
1600         }
1601       return 2;
1602     }
1603 
1604   token->type = CHARACTER;
1605   switch (c)
1606     {
1607     case '\n':
1608       if (syntax & RE_NEWLINE_ALT)
1609         token->type = OP_ALT;
1610       break;
1611     case '|':
1612       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1613         token->type = OP_ALT;
1614       break;
1615     case '*':
1616       token->type = OP_DUP_ASTERISK;
1617       break;
1618     case '+':
1619       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1620         token->type = OP_DUP_PLUS;
1621       break;
1622     case '?':
1623       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1624         token->type = OP_DUP_QUESTION;
1625       break;
1626     case '{':
1627       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1628         token->type = OP_OPEN_DUP_NUM;
1629       break;
1630     case '}':
1631       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1632         token->type = OP_CLOSE_DUP_NUM;
1633       break;
1634     case '(':
1635       if (syntax & RE_NO_BK_PARENS)
1636         token->type = OP_OPEN_SUBEXP;
1637       break;
1638     case ')':
1639       if (syntax & RE_NO_BK_PARENS)
1640         token->type = OP_CLOSE_SUBEXP;
1641       break;
1642     case '[':
1643       token->type = OP_OPEN_BRACKET;
1644       break;
1645     case '.':
1646       token->type = OP_PERIOD;
1647       break;
1648     case '^':
1649       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1650           re_string_cur_idx (input) != 0)
1651         {
1652           char prev = re_string_peek_byte (input, -1);
1653           if (prev != '|' && prev != '(' &&
1654               (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1655             break;
1656         }
1657       token->type = ANCHOR;
1658       token->opr.idx = LINE_FIRST;
1659       break;
1660     case '$':
1661       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1662           re_string_cur_idx (input) + 1 != re_string_length (input))
1663         {
1664           re_token_t next;
1665           re_string_skip_bytes (input, 1);
1666           peek_token (&next, input, syntax);
1667           re_string_skip_bytes (input, -1);
1668           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1669             break;
1670         }
1671       token->type = ANCHOR;
1672       token->opr.idx = LINE_LAST;
1673       break;
1674     default:
1675       break;
1676     }
1677   return 1;
1678 }
1679 
1680 /* Peek a token from INPUT, and return the length of the token.
1681    We must not use this function out of bracket expressions.  */
1682 
1683 static int
peek_token_bracket(token,input,syntax)1684 peek_token_bracket (token, input, syntax)
1685      re_token_t *token;
1686      re_string_t *input;
1687      reg_syntax_t syntax;
1688 {
1689   unsigned char c;
1690   if (re_string_eoi (input))
1691     {
1692       token->type = END_OF_RE;
1693       return 0;
1694     }
1695   c = re_string_peek_byte (input, 0);
1696   token->opr.c = c;
1697 
1698 #ifdef RE_ENABLE_I18N
1699   if (MB_CUR_MAX > 1 &&
1700       !re_string_first_byte (input, re_string_cur_idx (input)))
1701     {
1702       token->type = CHARACTER;
1703       return 1;
1704     }
1705 #endif /* RE_ENABLE_I18N */
1706 
1707   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1708     {
1709       /* In this case, '\' escape a character.  */
1710       unsigned char c2;
1711       c2 = re_string_peek_byte (input, 1);
1712       token->opr.c = c2;
1713       token->type = CHARACTER;
1714       return 1;
1715     }
1716   if (c == '[') /* '[' is a special char in a bracket exps.  */
1717     {
1718       unsigned char c2;
1719       int token_len;
1720       c2 = re_string_peek_byte (input, 1);
1721       token->opr.c = c2;
1722       token_len = 2;
1723       switch (c2)
1724         {
1725         case '.':
1726           token->type = OP_OPEN_COLL_ELEM;
1727           break;
1728         case '=':
1729           token->type = OP_OPEN_EQUIV_CLASS;
1730           break;
1731         case ':':
1732           if (syntax & RE_CHAR_CLASSES)
1733             {
1734               token->type = OP_OPEN_CHAR_CLASS;
1735               break;
1736             }
1737           /* else fall through.  */
1738         default:
1739           token->type = CHARACTER;
1740           token->opr.c = c;
1741           token_len = 1;
1742           break;
1743         }
1744       return token_len;
1745     }
1746   switch (c)
1747     {
1748     case '-':
1749       token->type = OP_CHARSET_RANGE;
1750       break;
1751     case ']':
1752       token->type = OP_CLOSE_BRACKET;
1753       break;
1754     case '^':
1755       token->type = OP_NON_MATCH_LIST;
1756       break;
1757     default:
1758       token->type = CHARACTER;
1759     }
1760   return 1;
1761 }
1762 
1763 /* Functions for parser.  */
1764 
1765 /* Entry point of the parser.
1766    Parse the regular expression REGEXP and return the structure tree.
1767    If an error is occurred, ERR is set by error code, and return NULL.
1768    This function build the following tree, from regular expression <reg_exp>:
1769            CAT
1770            / \
1771           /   \
1772    <reg_exp>  EOR
1773 
1774    CAT means concatenation.
1775    EOR means end of regular expression.  */
1776 
1777 static bin_tree_t *
parse(regexp,preg,syntax,err)1778 parse (regexp, preg, syntax, err)
1779      re_string_t *regexp;
1780      regex_t *preg;
1781      reg_syntax_t syntax;
1782      reg_errcode_t *err;
1783 {
1784   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1785   bin_tree_t *tree, *eor, *root;
1786   re_token_t current_token;
1787   int new_idx;
1788   current_token = fetch_token (regexp, syntax);
1789   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1790   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1791     return NULL;
1792   new_idx = re_dfa_add_node (dfa, current_token, 0);
1793   eor = create_tree (NULL, NULL, 0, new_idx);
1794   if (tree != NULL)
1795     root = create_tree (tree, eor, CONCAT, 0);
1796   else
1797     root = eor;
1798   if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1799     return *err = REG_ESPACE, NULL;
1800   return root;
1801 }
1802 
1803 /* This function build the following tree, from regular expression
1804    <branch1>|<branch2>:
1805            ALT
1806            / \
1807           /   \
1808    <branch1> <branch2>
1809 
1810    ALT means alternative, which represents the operator `|'.  */
1811 
1812 static bin_tree_t *
parse_reg_exp(regexp,preg,token,syntax,nest,err)1813 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1814      re_string_t *regexp;
1815      regex_t *preg;
1816      re_token_t *token;
1817      reg_syntax_t syntax;
1818      int nest;
1819      reg_errcode_t *err;
1820 {
1821   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1822   bin_tree_t *tree, *branch = NULL;
1823   int new_idx;
1824   tree = parse_branch (regexp, preg, token, syntax, nest, err);
1825   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1826     return NULL;
1827 
1828   while (token->type == OP_ALT)
1829     {
1830       re_token_t alt_token = *token;
1831       new_idx = re_dfa_add_node (dfa, alt_token, 0);
1832       *token = fetch_token (regexp, syntax);
1833       if (token->type != OP_ALT && token->type != END_OF_RE
1834           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1835         {
1836           branch = parse_branch (regexp, preg, token, syntax, nest, err);
1837           if (BE (*err != REG_NOERROR && branch == NULL, 0))
1838             {
1839               free_bin_tree (tree);
1840               return NULL;
1841             }
1842         }
1843       else
1844 	branch = NULL;
1845       tree = create_tree (tree, branch, 0, new_idx);
1846       if (BE (new_idx == -1 || tree == NULL, 0))
1847         return *err = REG_ESPACE, NULL;
1848       dfa->has_plural_match = 1;
1849     }
1850   return tree;
1851 }
1852 
1853 /* This function build the following tree, from regular expression
1854    <exp1><exp2>:
1855         CAT
1856         / \
1857        /   \
1858    <exp1> <exp2>
1859 
1860    CAT means concatenation.  */
1861 
1862 static bin_tree_t *
parse_branch(regexp,preg,token,syntax,nest,err)1863 parse_branch (regexp, preg, token, syntax, nest, err)
1864      re_string_t *regexp;
1865      regex_t *preg;
1866      re_token_t *token;
1867      reg_syntax_t syntax;
1868      int nest;
1869      reg_errcode_t *err;
1870 {
1871   bin_tree_t *tree, *exp;
1872   tree = parse_expression (regexp, preg, token, syntax, nest, err);
1873   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1874     return NULL;
1875 
1876   while (token->type != OP_ALT && token->type != END_OF_RE
1877          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1878     {
1879       exp = parse_expression (regexp, preg, token, syntax, nest, err);
1880       if (BE (*err != REG_NOERROR && exp == NULL, 0))
1881         {
1882           free_bin_tree (tree);
1883           return NULL;
1884         }
1885       if (tree != NULL && exp != NULL)
1886         {
1887           tree = create_tree (tree, exp, CONCAT, 0);
1888           if (tree == NULL)
1889             return *err = REG_ESPACE, NULL;
1890         }
1891       else if (tree == NULL)
1892         tree = exp;
1893       /* Otherwise exp == NULL, we don't need to create new tree.  */
1894     }
1895   return tree;
1896 }
1897 
1898 /* This function build the following tree, from regular expression a*:
1899          *
1900          |
1901          a
1902 */
1903 
1904 static bin_tree_t *
parse_expression(regexp,preg,token,syntax,nest,err)1905 parse_expression (regexp, preg, token, syntax, nest, err)
1906      re_string_t *regexp;
1907      regex_t *preg;
1908      re_token_t *token;
1909      reg_syntax_t syntax;
1910      int nest;
1911      reg_errcode_t *err;
1912 {
1913   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1914   bin_tree_t *tree;
1915   int new_idx;
1916   switch (token->type)
1917     {
1918     case CHARACTER:
1919       new_idx = re_dfa_add_node (dfa, *token, 0);
1920       tree = create_tree (NULL, NULL, 0, new_idx);
1921       if (BE (new_idx == -1 || tree == NULL, 0))
1922         return *err = REG_ESPACE, NULL;
1923 #ifdef RE_ENABLE_I18N
1924       if (MB_CUR_MAX > 1)
1925 	{
1926 	  while (!re_string_eoi (regexp)
1927 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1928 	    {
1929               bin_tree_t *mbc_remain;
1930               *token = fetch_token (regexp, syntax);
1931               new_idx = re_dfa_add_node (dfa, *token, 0);
1932               mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1933               tree = create_tree (tree, mbc_remain, CONCAT, 0);
1934               if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1935                 return *err = REG_ESPACE, NULL;
1936             }
1937 	}
1938 #endif
1939       break;
1940     case OP_OPEN_SUBEXP:
1941       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1942       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1943         return NULL;
1944       break;
1945     case OP_OPEN_BRACKET:
1946       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1947       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1948         return NULL;
1949       break;
1950     case OP_BACK_REF:
1951       if (BE (preg->re_nsub < token->opr.idx
1952               || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1953         {
1954           *err = REG_ESUBREG;
1955           return NULL;
1956         }
1957       new_idx = re_dfa_add_node (dfa, *token, 0);
1958       tree = create_tree (NULL, NULL, 0, new_idx);
1959       if (BE (new_idx == -1 || tree == NULL, 0))
1960         return *err = REG_ESPACE, NULL;
1961       ++dfa->nbackref;
1962       dfa->has_mb_node = 1;
1963       break;
1964     case OP_DUP_ASTERISK:
1965     case OP_DUP_PLUS:
1966     case OP_DUP_QUESTION:
1967     case OP_OPEN_DUP_NUM:
1968       if (syntax & RE_CONTEXT_INVALID_OPS)
1969         return *err = REG_BADRPT, NULL;
1970       else if (syntax & RE_CONTEXT_INDEP_OPS)
1971         {
1972           *token = fetch_token (regexp, syntax);
1973           return parse_expression (regexp, preg, token, syntax, nest, err);
1974         }
1975       /* else fall through  */
1976     case OP_CLOSE_SUBEXP:
1977       if ((token->type == OP_CLOSE_SUBEXP) &&
1978           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
1979         return *err = REG_ERPAREN, NULL;
1980       /* else fall through  */
1981     case OP_CLOSE_DUP_NUM:
1982       /* We treat it as a normal character.  */
1983 
1984       /* Then we can these characters as normal characters.  */
1985       token->type = CHARACTER;
1986       new_idx = re_dfa_add_node (dfa, *token, 0);
1987       tree = create_tree (NULL, NULL, 0, new_idx);
1988       if (BE (new_idx == -1 || tree == NULL, 0))
1989         return *err = REG_ESPACE, NULL;
1990       break;
1991     case ANCHOR:
1992       if (dfa->word_char == NULL)
1993         {
1994           *err = init_word_char (dfa);
1995           if (BE (*err != REG_NOERROR, 0))
1996             return NULL;
1997         }
1998       if (token->opr.ctx_type == WORD_DELIM)
1999         {
2000           bin_tree_t *tree_first, *tree_last;
2001           int idx_first, idx_last;
2002           token->opr.ctx_type = WORD_FIRST;
2003           idx_first = re_dfa_add_node (dfa, *token, 0);
2004           tree_first = create_tree (NULL, NULL, 0, idx_first);
2005           token->opr.ctx_type = WORD_LAST;
2006           idx_last = re_dfa_add_node (dfa, *token, 0);
2007           tree_last = create_tree (NULL, NULL, 0, idx_last);
2008           token->type = OP_ALT;
2009           new_idx = re_dfa_add_node (dfa, *token, 0);
2010           tree = create_tree (tree_first, tree_last, 0, new_idx);
2011           if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2012                   || tree_first == NULL || tree_last == NULL
2013                   || tree == NULL, 0))
2014             return *err = REG_ESPACE, NULL;
2015         }
2016       else
2017         {
2018           new_idx = re_dfa_add_node (dfa, *token, 0);
2019           tree = create_tree (NULL, NULL, 0, new_idx);
2020           if (BE (new_idx == -1 || tree == NULL, 0))
2021             return *err = REG_ESPACE, NULL;
2022         }
2023       /* We must return here, since ANCHORs can't be followed
2024          by repetition operators.
2025          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2026              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2027       *token = fetch_token (regexp, syntax);
2028       return tree;
2029     case OP_PERIOD:
2030       new_idx = re_dfa_add_node (dfa, *token, 0);
2031       tree = create_tree (NULL, NULL, 0, new_idx);
2032       if (BE (new_idx == -1 || tree == NULL, 0))
2033         return *err = REG_ESPACE, NULL;
2034       if (MB_CUR_MAX > 1)
2035         dfa->has_mb_node = 1;
2036       break;
2037     case OP_WORD:
2038       tree = build_word_op (dfa, 0, err);
2039       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2040         return NULL;
2041       break;
2042     case OP_NOTWORD:
2043       tree = build_word_op (dfa, 1, err);
2044       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2045         return NULL;
2046       break;
2047     case OP_ALT:
2048     case END_OF_RE:
2049       return NULL;
2050     case BACK_SLASH:
2051       *err = REG_EESCAPE;
2052       return NULL;
2053     default:
2054       /* Must not happen?  */
2055 #ifdef DEBUG
2056       assert (0);
2057 #endif
2058       return NULL;
2059     }
2060   *token = fetch_token (regexp, syntax);
2061 
2062   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2063          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2064     {
2065       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2066       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2067         return NULL;
2068       dfa->has_plural_match = 1;
2069     }
2070 
2071   return tree;
2072 }
2073 
2074 /* This function build the following tree, from regular expression
2075    (<reg_exp>):
2076          SUBEXP
2077             |
2078         <reg_exp>
2079 */
2080 
2081 static bin_tree_t *
parse_sub_exp(regexp,preg,token,syntax,nest,err)2082 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2083      re_string_t *regexp;
2084      regex_t *preg;
2085      re_token_t *token;
2086      reg_syntax_t syntax;
2087      int nest;
2088      reg_errcode_t *err;
2089 {
2090   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2091   bin_tree_t *tree, *left_par, *right_par;
2092   size_t cur_nsub;
2093   int new_idx;
2094   cur_nsub = preg->re_nsub++;
2095   if (dfa->subexps_alloc < preg->re_nsub)
2096     {
2097       re_subexp_t *new_array;
2098       dfa->subexps_alloc *= 2;
2099       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2100       if (BE (new_array == NULL, 0))
2101 	{
2102 	  dfa->subexps_alloc /= 2;
2103 	  *err = REG_ESPACE;
2104 	  return NULL;
2105 	}
2106       dfa->subexps = new_array;
2107     }
2108   dfa->subexps[cur_nsub].start = dfa->nodes_len;
2109   dfa->subexps[cur_nsub].end = -1;
2110 
2111   new_idx = re_dfa_add_node (dfa, *token, 0);
2112   left_par = create_tree (NULL, NULL, 0, new_idx);
2113   if (BE (new_idx == -1 || left_par == NULL, 0))
2114     return *err = REG_ESPACE, NULL;
2115   dfa->nodes[new_idx].opr.idx = cur_nsub;
2116   *token = fetch_token (regexp, syntax);
2117 
2118   /* The subexpression may be a null string.  */
2119   if (token->type == OP_CLOSE_SUBEXP)
2120     tree = NULL;
2121   else
2122     {
2123       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2124       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2125         return NULL;
2126     }
2127   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2128     {
2129       free_bin_tree (tree);
2130       *err = REG_BADPAT;
2131       return NULL;
2132     }
2133   new_idx = re_dfa_add_node (dfa, *token, 0);
2134   dfa->subexps[cur_nsub].end = dfa->nodes_len;
2135   right_par = create_tree (NULL, NULL, 0, new_idx);
2136   tree = ((tree == NULL) ? right_par
2137           : create_tree (tree, right_par, CONCAT, 0));
2138   tree = create_tree (left_par, tree, CONCAT, 0);
2139   if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2140     return *err = REG_ESPACE, NULL;
2141   dfa->nodes[new_idx].opr.idx = cur_nsub;
2142 
2143   return tree;
2144 }
2145 
2146 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2147 
2148 static bin_tree_t *
parse_dup_op(dup_elem,regexp,dfa,token,syntax,err)2149 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2150      bin_tree_t *dup_elem;
2151      re_string_t *regexp;
2152      re_dfa_t *dfa;
2153      re_token_t *token;
2154      reg_syntax_t syntax;
2155      reg_errcode_t *err;
2156 {
2157   re_token_t dup_token;
2158   bin_tree_t *tree = dup_elem, *work_tree;
2159   int new_idx, start_idx = re_string_cur_idx (regexp);
2160   re_token_t start_token = *token;
2161   if (token->type == OP_OPEN_DUP_NUM)
2162     {
2163       int i;
2164       int end = 0;
2165       int start = fetch_number (regexp, token, syntax);
2166       bin_tree_t *elem;
2167       if (start == -1)
2168         {
2169           if (token->type == CHARACTER && token->opr.c == ',')
2170             start = 0; /* We treat "{,m}" as "{0,m}".  */
2171           else
2172             {
2173               *err = REG_BADBR; /* <re>{} is invalid.  */
2174               return NULL;
2175             }
2176         }
2177       if (BE (start != -2, 1))
2178         {
2179           /* We treat "{n}" as "{n,n}".  */
2180           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2181                  : ((token->type == CHARACTER && token->opr.c == ',')
2182                     ? fetch_number (regexp, token, syntax) : -2));
2183         }
2184       if (BE (start == -2 || end == -2, 0))
2185         {
2186           /* Invalid sequence.  */
2187           if (token->type == OP_CLOSE_DUP_NUM)
2188             goto parse_dup_op_invalid_interval;
2189           else
2190             goto parse_dup_op_ebrace;
2191         }
2192       if (BE (start == 0 && end == 0, 0))
2193         {
2194           /* We treat "<re>{0}" and "<re>{0,0}" as null string.  */
2195           *token = fetch_token (regexp, syntax);
2196           free_bin_tree (dup_elem);
2197           return NULL;
2198         }
2199 
2200       /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2201       elem = tree;
2202       for (i = 0; i < start; ++i)
2203         if (i != 0)
2204           {
2205             work_tree = duplicate_tree (elem, dfa);
2206             tree = create_tree (tree, work_tree, CONCAT, 0);
2207             if (BE (work_tree == NULL || tree == NULL, 0))
2208               goto parse_dup_op_espace;
2209           }
2210 
2211       if (end == -1)
2212         {
2213           /* We treat "<re>{0,}" as "<re>*".  */
2214           dup_token.type = OP_DUP_ASTERISK;
2215           if (start > 0)
2216             {
2217               elem = duplicate_tree (elem, dfa);
2218               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2219               work_tree = create_tree (elem, NULL, 0, new_idx);
2220               tree = create_tree (tree, work_tree, CONCAT, 0);
2221               if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2222                       || tree == NULL, 0))
2223                 goto parse_dup_op_espace;
2224             }
2225           else
2226             {
2227               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2228               tree = create_tree (elem, NULL, 0, new_idx);
2229               if (BE (new_idx == -1 || tree == NULL, 0))
2230                 goto parse_dup_op_espace;
2231             }
2232         }
2233       else if (end - start > 0)
2234         {
2235           /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?".  */
2236           dup_token.type = OP_DUP_QUESTION;
2237           if (start > 0)
2238             {
2239               elem = duplicate_tree (elem, dfa);
2240               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2241               elem = create_tree (elem, NULL, 0, new_idx);
2242               tree = create_tree (tree, elem, CONCAT, 0);
2243               if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2244                 goto parse_dup_op_espace;
2245             }
2246           else
2247             {
2248               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2249               tree = elem = create_tree (elem, NULL, 0, new_idx);
2250               if (BE (new_idx == -1 || tree == NULL, 0))
2251                 goto parse_dup_op_espace;
2252             }
2253           for (i = 1; i < end - start; ++i)
2254             {
2255               work_tree = duplicate_tree (elem, dfa);
2256               tree = create_tree (tree, work_tree, CONCAT, 0);
2257               if (BE (work_tree == NULL || tree == NULL, 0))
2258                 return *err = REG_ESPACE, NULL;
2259             }
2260         }
2261     }
2262   else
2263     {
2264       new_idx = re_dfa_add_node (dfa, *token, 0);
2265       tree = create_tree (tree, NULL, 0, new_idx);
2266       if (BE (new_idx == -1 || tree == NULL, 0))
2267         return *err = REG_ESPACE, NULL;
2268     }
2269   *token = fetch_token (regexp, syntax);
2270   return tree;
2271 
2272  parse_dup_op_espace:
2273   free_bin_tree (tree);
2274   *err = REG_ESPACE;
2275   return NULL;
2276 
2277  parse_dup_op_ebrace:
2278   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2279     {
2280       *err = REG_EBRACE;
2281       return NULL;
2282     }
2283   goto parse_dup_op_rollback;
2284  parse_dup_op_invalid_interval:
2285   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2286     {
2287       *err = REG_BADBR;
2288       return NULL;
2289     }
2290  parse_dup_op_rollback:
2291   re_string_set_index (regexp, start_idx);
2292   *token = start_token;
2293   token->type = CHARACTER;
2294   return dup_elem;
2295 }
2296 
2297 /* Size of the names for collating symbol/equivalence_class/character_class.
2298    I'm not sure, but maybe enough.  */
2299 #define BRACKET_NAME_BUF_SIZE 32
2300 
2301 #ifndef _LIBC
2302   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2303      Build the range expression which starts from START_ELEM, and ends
2304      at END_ELEM.  The result are written to MBCSET and SBCSET.
2305      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2306      mbcset->range_ends, is a pointer argument sinse we may
2307      update it.  */
2308 
2309 static reg_errcode_t
2310 # ifdef RE_ENABLE_I18N
build_range_exp(sbcset,mbcset,range_alloc,start_elem,end_elem)2311 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2312      re_charset_t *mbcset;
2313      int *range_alloc;
2314 # else /* not RE_ENABLE_I18N */
2315 build_range_exp (sbcset, start_elem, end_elem)
2316 # endif /* not RE_ENABLE_I18N */
2317      re_bitset_ptr_t sbcset;
2318      bracket_elem_t *start_elem, *end_elem;
2319 {
2320   unsigned int start_ch, end_ch;
2321   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2322   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2323           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2324           0))
2325     return REG_ERANGE;
2326 
2327   /* We can handle no multi character collating elements without libc
2328      support.  */
2329   if (BE ((start_elem->type == COLL_SYM
2330            && strlen ((char *) start_elem->opr.name) > 1)
2331           || (end_elem->type == COLL_SYM
2332               && strlen ((char *) end_elem->opr.name) > 1), 0))
2333     return REG_ECOLLATE;
2334 
2335 # ifdef RE_ENABLE_I18N
2336   {
2337     wchar_t wc, start_wc, end_wc;
2338     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2339 
2340     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2341                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2342                    : 0));
2343     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2344               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2345                  : 0));
2346     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2347                 ? __btowc (start_ch) : start_elem->opr.wch);
2348     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2349               ? __btowc (end_ch) : end_elem->opr.wch);
2350     cmp_buf[0] = start_wc;
2351     cmp_buf[4] = end_wc;
2352     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2353       return REG_ERANGE;
2354 
2355     /* Check the space of the arrays.  */
2356     if (*range_alloc == mbcset->nranges)
2357       {
2358         /* There are not enough space, need realloc.  */
2359         wchar_t *new_array_start, *new_array_end;
2360         int new_nranges;
2361 
2362         /* +1 in case of mbcset->nranges is 0.  */
2363         new_nranges = 2 * mbcset->nranges + 1;
2364         /* Use realloc since mbcset->range_starts and mbcset->range_ends
2365            are NULL if *range_alloc == 0.  */
2366         new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2367                                       new_nranges);
2368         new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2369                                     new_nranges);
2370 
2371         if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2372           return REG_ESPACE;
2373 
2374         mbcset->range_starts = new_array_start;
2375         mbcset->range_ends = new_array_end;
2376         *range_alloc = new_nranges;
2377       }
2378 
2379     mbcset->range_starts[mbcset->nranges] = start_wc;
2380     mbcset->range_ends[mbcset->nranges++] = end_wc;
2381 
2382     /* Build the table for single byte characters.  */
2383     for (wc = 0; wc <= SBC_MAX; ++wc)
2384       {
2385         cmp_buf[2] = wc;
2386         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2387             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2388           bitset_set (sbcset, wc);
2389       }
2390   }
2391 # else /* not RE_ENABLE_I18N */
2392   {
2393     unsigned int ch;
2394     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2395                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2396                    : 0));
2397     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2398               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2399                  : 0));
2400     if (start_ch > end_ch)
2401       return REG_ERANGE;
2402     /* Build the table for single byte characters.  */
2403     for (ch = 0; ch <= SBC_MAX; ++ch)
2404       if (start_ch <= ch  && ch <= end_ch)
2405         bitset_set (sbcset, ch);
2406   }
2407 # endif /* not RE_ENABLE_I18N */
2408   return REG_NOERROR;
2409 }
2410 #endif /* not _LIBC */
2411 
2412 #ifndef _LIBC
2413 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2414    Build the collating element which is represented by NAME.
2415    The result are written to MBCSET and SBCSET.
2416    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2417    pointer argument since we may update it.  */
2418 
2419 static reg_errcode_t
2420 # ifdef RE_ENABLE_I18N
build_collating_symbol(sbcset,mbcset,coll_syxmalloc,name)2421 build_collating_symbol (sbcset, mbcset, coll_syxmalloc, name)
2422      re_charset_t *mbcset;
2423      int *coll_syxmalloc;
2424 # else /* not RE_ENABLE_I18N */
2425 build_collating_symbol (sbcset, name)
2426 # endif /* not RE_ENABLE_I18N */
2427      re_bitset_ptr_t sbcset;
2428      const unsigned char *name;
2429 {
2430   size_t name_len = strlen ((const char *) name);
2431   if (BE (name_len != 1, 0))
2432     return REG_ECOLLATE;
2433   else
2434     {
2435       bitset_set (sbcset, name[0]);
2436       return REG_NOERROR;
2437     }
2438 }
2439 #endif /* not _LIBC */
2440 
2441 /* This function parse bracket expression like "[abc]", "[a-c]",
2442    "[[.a-a.]]" etc.  */
2443 
2444 static bin_tree_t *
parse_bracket_exp(regexp,dfa,token,syntax,err)2445 parse_bracket_exp (regexp, dfa, token, syntax, err)
2446      re_string_t *regexp;
2447      re_dfa_t *dfa;
2448      re_token_t *token;
2449      reg_syntax_t syntax;
2450      reg_errcode_t *err;
2451 {
2452 #ifdef _LIBC
2453   const unsigned char *collseqmb;
2454   const char *collseqwc;
2455   uint32_t nrules;
2456   int32_t table_size;
2457   const int32_t *symb_table;
2458   const unsigned char *extra;
2459 
2460   /* Local function for parse_bracket_exp used in _LIBC environement.
2461      Seek the collating symbol entry correspondings to NAME.
2462      Return the index of the symbol in the SYMB_TABLE.  */
2463 
2464   static inline int32_t
2465   seek_collating_symbol_entry (name, name_len)
2466          const unsigned char *name;
2467          size_t name_len;
2468     {
2469       int32_t hash = elem_hash ((const char *) name, name_len);
2470       int32_t elem = hash % table_size;
2471       int32_t second = hash % (table_size - 2);
2472       while (symb_table[2 * elem] != 0)
2473         {
2474           /* First compare the hashing value.  */
2475           if (symb_table[2 * elem] == hash
2476               /* Compare the length of the name.  */
2477               && name_len == extra[symb_table[2 * elem + 1]]
2478               /* Compare the name.  */
2479               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2480                          name_len) == 0)
2481             {
2482               /* Yep, this is the entry.  */
2483               break;
2484             }
2485 
2486           /* Next entry.  */
2487           elem += second;
2488         }
2489       return elem;
2490     }
2491 
2492   /* Local function for parse_bracket_exp used in _LIBC environement.
2493      Look up the collation sequence value of BR_ELEM.
2494      Return the value if succeeded, UINT_MAX otherwise.  */
2495 
2496   static inline unsigned int
2497   lookup_collation_sequence_value (br_elem)
2498          bracket_elem_t *br_elem;
2499     {
2500       if (br_elem->type == SB_CHAR)
2501         {
2502           /*
2503           if (MB_CUR_MAX == 1)
2504           */
2505           if (nrules == 0)
2506             return collseqmb[br_elem->opr.ch];
2507           else
2508             {
2509               wint_t wc = __btowc (br_elem->opr.ch);
2510               return collseq_table_lookup (collseqwc, wc);
2511             }
2512         }
2513       else if (br_elem->type == MB_CHAR)
2514         {
2515           return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2516         }
2517       else if (br_elem->type == COLL_SYM)
2518         {
2519           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2520           if (nrules != 0)
2521             {
2522               int32_t elem, idx;
2523               elem = seek_collating_symbol_entry (br_elem->opr.name,
2524                                                   sym_name_len);
2525               if (symb_table[2 * elem] != 0)
2526                 {
2527                   /* We found the entry.  */
2528                   idx = symb_table[2 * elem + 1];
2529                   /* Skip the name of collating element name.  */
2530                   idx += 1 + extra[idx];
2531                   /* Skip the byte sequence of the collating element.  */
2532                   idx += 1 + extra[idx];
2533                   /* Adjust for the alignment.  */
2534                   idx = (idx + 3) & ~3;
2535                   /* Skip the multibyte collation sequence value.  */
2536                   idx += sizeof (unsigned int);
2537                   /* Skip the wide char sequence of the collating element.  */
2538                   idx += sizeof (unsigned int) *
2539                     (1 + *(unsigned int *) (extra + idx));
2540                   /* Return the collation sequence value.  */
2541                   return *(unsigned int *) (extra + idx);
2542                 }
2543               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2544                 {
2545                   /* No valid character.  Match it as a single byte
2546                      character.  */
2547                   return collseqmb[br_elem->opr.name[0]];
2548                 }
2549             }
2550           else if (sym_name_len == 1)
2551             return collseqmb[br_elem->opr.name[0]];
2552         }
2553       return UINT_MAX;
2554     }
2555 
2556   /* Local function for parse_bracket_exp used in _LIBC environement.
2557      Build the range expression which starts from START_ELEM, and ends
2558      at END_ELEM.  The result are written to MBCSET and SBCSET.
2559      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2560      mbcset->range_ends, is a pointer argument sinse we may
2561      update it.  */
2562 
2563   static inline reg_errcode_t
2564 # ifdef RE_ENABLE_I18N
2565   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2566          re_charset_t *mbcset;
2567          int *range_alloc;
2568 # else /* not RE_ENABLE_I18N */
2569   build_range_exp (sbcset, start_elem, end_elem)
2570 # endif /* not RE_ENABLE_I18N */
2571          re_bitset_ptr_t sbcset;
2572          bracket_elem_t *start_elem, *end_elem;
2573     {
2574       unsigned int ch;
2575       uint32_t start_collseq;
2576       uint32_t end_collseq;
2577 
2578 # ifdef RE_ENABLE_I18N
2579       /* Check the space of the arrays.  */
2580       if (*range_alloc == mbcset->nranges)
2581         {
2582           /* There are not enough space, need realloc.  */
2583           uint32_t *new_array_start;
2584           uint32_t *new_array_end;
2585 	  int new_nranges;
2586 
2587           /* +1 in case of mbcset->nranges is 0.  */
2588           new_nranges = 2 * mbcset->nranges + 1;
2589 	  /* Use realloc since mbcset->range_starts and mbcset->range_ends
2590              are NULL if *range_alloc == 0.  */
2591           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2592                                         new_nranges);
2593           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2594                                       new_nranges);
2595 
2596           if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2597             return REG_ESPACE;
2598 
2599           mbcset->range_starts = new_array_start;
2600           mbcset->range_ends = new_array_end;
2601 	  *range_alloc = new_nranges;
2602         }
2603 # endif /* RE_ENABLE_I18N */
2604 
2605       /* Equivalence Classes and Character Classes can't be a range
2606          start/end.  */
2607       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2608               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2609               0))
2610         return REG_ERANGE;
2611 
2612       start_collseq = lookup_collation_sequence_value (start_elem);
2613       end_collseq = lookup_collation_sequence_value (end_elem);
2614       /* Check start/end collation sequence values.  */
2615       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2616         return REG_ECOLLATE;
2617       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2618         return REG_ERANGE;
2619 
2620 # ifdef RE_ENABLE_I18N
2621       /* Got valid collation sequence values, add them as a new entry.  */
2622       mbcset->range_starts[mbcset->nranges] = start_collseq;
2623       mbcset->range_ends[mbcset->nranges++] = end_collseq;
2624 # endif /* RE_ENABLE_I18N */
2625 
2626       /* Build the table for single byte characters.  */
2627       for (ch = 0; ch <= SBC_MAX; ch++)
2628         {
2629           uint32_t ch_collseq;
2630           /*
2631           if (MB_CUR_MAX == 1)
2632           */
2633           if (nrules == 0)
2634             ch_collseq = collseqmb[ch];
2635           else
2636             ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2637           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2638             bitset_set (sbcset, ch);
2639         }
2640       return REG_NOERROR;
2641     }
2642 
2643   /* Local function for parse_bracket_exp used in _LIBC environement.
2644      Build the collating element which is represented by NAME.
2645      The result are written to MBCSET and SBCSET.
2646      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2647      pointer argument sinse we may update it.  */
2648 
2649   static inline reg_errcode_t
2650 # ifdef RE_ENABLE_I18N
2651   build_collating_symbol (sbcset, mbcset, coll_syxmalloc, name)
2652          re_charset_t *mbcset;
2653          int *coll_syxmalloc;
2654 # else /* not RE_ENABLE_I18N */
2655   build_collating_symbol (sbcset, name)
2656 # endif /* not RE_ENABLE_I18N */
2657          re_bitset_ptr_t sbcset;
2658          const unsigned char *name;
2659     {
2660       int32_t elem, idx;
2661       size_t name_len = strlen ((const char *) name);
2662       if (nrules != 0)
2663         {
2664           elem = seek_collating_symbol_entry (name, name_len);
2665           if (symb_table[2 * elem] != 0)
2666             {
2667               /* We found the entry.  */
2668               idx = symb_table[2 * elem + 1];
2669               /* Skip the name of collating element name.  */
2670               idx += 1 + extra[idx];
2671             }
2672           else if (symb_table[2 * elem] == 0 && name_len == 1)
2673             {
2674               /* No valid character, treat it as a normal
2675                  character.  */
2676               bitset_set (sbcset, name[0]);
2677               return REG_NOERROR;
2678             }
2679           else
2680             return REG_ECOLLATE;
2681 
2682 # ifdef RE_ENABLE_I18N
2683           /* Got valid collation sequence, add it as a new entry.  */
2684           /* Check the space of the arrays.  */
2685           if (*coll_syxmalloc == mbcset->ncoll_syms)
2686             {
2687               /* Not enough, realloc it.  */
2688               /* +1 in case of mbcset->ncoll_syms is 0.  */
2689               *coll_syxmalloc = 2 * mbcset->ncoll_syms + 1;
2690               /* Use realloc since mbcset->coll_syms is NULL
2691                  if *alloc == 0.  */
2692               mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2693                                               *coll_syxmalloc);
2694               if (BE (mbcset->coll_syms == NULL, 0))
2695                 return REG_ESPACE;
2696             }
2697           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2698 # endif /* RE_ENABLE_I18N */
2699           return REG_NOERROR;
2700         }
2701       else
2702         {
2703           if (BE (name_len != 1, 0))
2704             return REG_ECOLLATE;
2705           else
2706             {
2707               bitset_set (sbcset, name[0]);
2708               return REG_NOERROR;
2709             }
2710         }
2711     }
2712 #endif
2713 
2714   re_token_t br_token;
2715   re_bitset_ptr_t sbcset;
2716 #ifdef RE_ENABLE_I18N
2717   re_charset_t *mbcset;
2718   int coll_syxmalloc = 0, range_alloc = 0, mbchar_alloc = 0;
2719   int equiv_class_alloc = 0, char_class_alloc = 0;
2720 #else /* not RE_ENABLE_I18N */
2721   int non_match = 0;
2722 #endif /* not RE_ENABLE_I18N */
2723   bin_tree_t *work_tree;
2724   int token_len, new_idx;
2725 #ifdef _LIBC
2726   collseqmb = (const unsigned char *)
2727     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2728   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2729   if (nrules)
2730     {
2731       /*
2732       if (MB_CUR_MAX > 1)
2733       */
2734         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2735       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2736       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2737                                                   _NL_COLLATE_SYMB_TABLEMB);
2738       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2739                                                    _NL_COLLATE_SYMB_EXTRAMB);
2740     }
2741 #endif
2742   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2743 #ifdef RE_ENABLE_I18N
2744   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2745 #endif /* RE_ENABLE_I18N */
2746 #ifdef RE_ENABLE_I18N
2747   if (BE (sbcset == NULL || mbcset == NULL, 0))
2748 #else
2749   if (BE (sbcset == NULL, 0))
2750 #endif /* RE_ENABLE_I18N */
2751     {
2752       *err = REG_ESPACE;
2753       return NULL;
2754     }
2755 
2756   token_len = peek_token_bracket (token, regexp, syntax);
2757   if (BE (token->type == END_OF_RE, 0))
2758     {
2759       *err = REG_BADPAT;
2760       goto parse_bracket_exp_free_return;
2761     }
2762   if (token->type == OP_NON_MATCH_LIST)
2763     {
2764 #ifdef RE_ENABLE_I18N
2765       int i;
2766       mbcset->non_match = 1;
2767 #else /* not RE_ENABLE_I18N */
2768       non_match = 1;
2769 #endif /* not RE_ENABLE_I18N */
2770       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2771         bitset_set (sbcset, '\0');
2772       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2773       token_len = peek_token_bracket (token, regexp, syntax);
2774       if (BE (token->type == END_OF_RE, 0))
2775         {
2776           *err = REG_BADPAT;
2777           goto parse_bracket_exp_free_return;
2778         }
2779 #ifdef RE_ENABLE_I18N
2780       if (MB_CUR_MAX > 1)
2781         for (i = 0; i < SBC_MAX; ++i)
2782           if (__btowc (i) == WEOF)
2783             bitset_set (sbcset, i);
2784 #endif /* RE_ENABLE_I18N */
2785     }
2786 
2787   /* We treat the first ']' as a normal character.  */
2788   if (token->type == OP_CLOSE_BRACKET)
2789     token->type = CHARACTER;
2790 
2791   while (1)
2792     {
2793       bracket_elem_t start_elem, end_elem;
2794       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2795       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2796       reg_errcode_t ret;
2797       int token_len2 = 0, is_range_exp = 0;
2798       re_token_t token2;
2799 
2800       start_elem.opr.name = start_name_buf;
2801       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2802                                    syntax);
2803       if (BE (ret != REG_NOERROR, 0))
2804         {
2805           *err = ret;
2806           goto parse_bracket_exp_free_return;
2807         }
2808 
2809       token_len = peek_token_bracket (token, regexp, syntax);
2810       if (BE (token->type == END_OF_RE, 0))
2811         {
2812           *err = REG_BADPAT;
2813           goto parse_bracket_exp_free_return;
2814         }
2815       if (token->type == OP_CHARSET_RANGE)
2816         {
2817           re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
2818           token_len2 = peek_token_bracket (&token2, regexp, syntax);
2819           if (BE (token->type == END_OF_RE, 0))
2820             {
2821               *err = REG_BADPAT;
2822               goto parse_bracket_exp_free_return;
2823             }
2824           if (token2.type == OP_CLOSE_BRACKET)
2825             {
2826               /* We treat the last '-' as a normal character.  */
2827               re_string_skip_bytes (regexp, -token_len);
2828               token->type = CHARACTER;
2829             }
2830           else
2831             is_range_exp = 1;
2832         }
2833 
2834       if (is_range_exp == 1)
2835         {
2836           end_elem.opr.name = end_name_buf;
2837           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2838                                        dfa, syntax);
2839           if (BE (ret != REG_NOERROR, 0))
2840             {
2841               *err = ret;
2842               goto parse_bracket_exp_free_return;
2843             }
2844 
2845           token_len = peek_token_bracket (token, regexp, syntax);
2846           if (BE (token->type == END_OF_RE, 0))
2847             {
2848               *err = REG_BADPAT;
2849               goto parse_bracket_exp_free_return;
2850             }
2851           *err = build_range_exp (sbcset,
2852 #ifdef RE_ENABLE_I18N
2853                                   mbcset, &range_alloc,
2854 #endif /* RE_ENABLE_I18N */
2855                                   &start_elem, &end_elem);
2856           if (BE (*err != REG_NOERROR, 0))
2857             goto parse_bracket_exp_free_return;
2858         }
2859       else
2860         {
2861           switch (start_elem.type)
2862             {
2863             case SB_CHAR:
2864               bitset_set (sbcset, start_elem.opr.ch);
2865               break;
2866 #ifdef RE_ENABLE_I18N
2867             case MB_CHAR:
2868               /* Check whether the array has enough space.  */
2869               if (mbchar_alloc == mbcset->nmbchars)
2870                 {
2871                   /* Not enough, realloc it.  */
2872                   /* +1 in case of mbcset->nmbchars is 0.  */
2873                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
2874                   /* Use realloc since array is NULL if *alloc == 0.  */
2875                   mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2876                                                 mbchar_alloc);
2877                   if (BE (mbcset->mbchars == NULL, 0))
2878                     goto parse_bracket_exp_espace;
2879                 }
2880               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2881               break;
2882 #endif /* RE_ENABLE_I18N */
2883             case EQUIV_CLASS:
2884               *err = build_equiv_class (sbcset,
2885 #ifdef RE_ENABLE_I18N
2886                                         mbcset, &equiv_class_alloc,
2887 #endif /* RE_ENABLE_I18N */
2888 					start_elem.opr.name);
2889               if (BE (*err != REG_NOERROR, 0))
2890                 goto parse_bracket_exp_free_return;
2891               break;
2892             case COLL_SYM:
2893               *err = build_collating_symbol (sbcset,
2894 #ifdef RE_ENABLE_I18N
2895                                              mbcset, &coll_syxmalloc,
2896 #endif /* RE_ENABLE_I18N */
2897 					     start_elem.opr.name);
2898               if (BE (*err != REG_NOERROR, 0))
2899                 goto parse_bracket_exp_free_return;
2900               break;
2901             case CHAR_CLASS:
2902               ret = build_charclass (sbcset,
2903 #ifdef RE_ENABLE_I18N
2904                                      mbcset, &char_class_alloc,
2905 #endif /* RE_ENABLE_I18N */
2906                                      start_elem.opr.name, syntax);
2907               if (BE (ret != REG_NOERROR, 0))
2908                goto parse_bracket_exp_espace;
2909               break;
2910             default:
2911               assert (0);
2912               break;
2913             }
2914         }
2915       if (token->type == OP_CLOSE_BRACKET)
2916         break;
2917     }
2918 
2919   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2920 
2921   /* If it is non-matching list.  */
2922 #ifdef RE_ENABLE_I18N
2923   if (mbcset->non_match)
2924 #else /* not RE_ENABLE_I18N */
2925   if (non_match)
2926 #endif /* not RE_ENABLE_I18N */
2927     bitset_not (sbcset);
2928 
2929   /* Build a tree for simple bracket.  */
2930   br_token.type = SIMPLE_BRACKET;
2931   br_token.opr.sbcset = sbcset;
2932   new_idx = re_dfa_add_node (dfa, br_token, 0);
2933   work_tree = create_tree (NULL, NULL, 0, new_idx);
2934   if (BE (new_idx == -1 || work_tree == NULL, 0))
2935     goto parse_bracket_exp_espace;
2936 
2937 #ifdef RE_ENABLE_I18N
2938   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2939       || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2940                                                 || mbcset->non_match)))
2941     {
2942       re_token_t alt_token;
2943       bin_tree_t *mbc_tree;
2944       /* Build a tree for complex bracket.  */
2945       br_token.type = COMPLEX_BRACKET;
2946       br_token.opr.mbcset = mbcset;
2947       dfa->has_mb_node = 1;
2948       new_idx = re_dfa_add_node (dfa, br_token, 0);
2949       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
2950       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
2951         goto parse_bracket_exp_espace;
2952       /* Then join them by ALT node.  */
2953       dfa->has_plural_match = 1;
2954       alt_token.type = OP_ALT;
2955       new_idx = re_dfa_add_node (dfa, alt_token, 0);
2956       work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
2957       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
2958         return work_tree;
2959     }
2960   else
2961     {
2962       free_charset (mbcset);
2963       return work_tree;
2964     }
2965 #else /* not RE_ENABLE_I18N */
2966   return work_tree;
2967 #endif /* not RE_ENABLE_I18N */
2968 
2969  parse_bracket_exp_espace:
2970   *err = REG_ESPACE;
2971  parse_bracket_exp_free_return:
2972   re_free (sbcset);
2973 #ifdef RE_ENABLE_I18N
2974   free_charset (mbcset);
2975 #endif /* RE_ENABLE_I18N */
2976   return NULL;
2977 }
2978 
2979 /* Parse an element in the bracket expression.  */
2980 
2981 static reg_errcode_t
parse_bracket_element(elem,regexp,token,token_len,dfa,syntax)2982 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
2983      bracket_elem_t *elem;
2984      re_string_t *regexp;
2985      re_token_t *token;
2986      int token_len;
2987      re_dfa_t *dfa;
2988      reg_syntax_t syntax;
2989 {
2990 #ifdef RE_ENABLE_I18N
2991   int cur_char_size;
2992   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
2993   if (cur_char_size > 1)
2994     {
2995       elem->type = MB_CHAR;
2996       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
2997       re_string_skip_bytes (regexp, cur_char_size);
2998       return REG_NOERROR;
2999     }
3000 #endif /* RE_ENABLE_I18N */
3001   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3002   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3003       || token->type == OP_OPEN_EQUIV_CLASS)
3004     return parse_bracket_symbol (elem, regexp, token);
3005   elem->type = SB_CHAR;
3006   elem->opr.ch = token->opr.c;
3007   return REG_NOERROR;
3008 }
3009 
3010 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3011    such as [:<character_class>:], [.<collating_element>.], and
3012    [=<equivalent_class>=].  */
3013 
3014 static reg_errcode_t
parse_bracket_symbol(elem,regexp,token)3015 parse_bracket_symbol (elem, regexp, token)
3016      bracket_elem_t *elem;
3017      re_string_t *regexp;
3018      re_token_t *token;
3019 {
3020   unsigned char ch, delim = token->opr.c;
3021   int i = 0;
3022   for (;; ++i)
3023     {
3024       if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3025         return REG_EBRACK;
3026       if (token->type == OP_OPEN_CHAR_CLASS)
3027         ch = re_string_fetch_byte_case (regexp);
3028       else
3029         ch = re_string_fetch_byte (regexp);
3030       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3031         break;
3032       elem->opr.name[i] = ch;
3033     }
3034   re_string_skip_bytes (regexp, 1);
3035   elem->opr.name[i] = '\0';
3036   switch (token->type)
3037     {
3038     case OP_OPEN_COLL_ELEM:
3039       elem->type = COLL_SYM;
3040       break;
3041     case OP_OPEN_EQUIV_CLASS:
3042       elem->type = EQUIV_CLASS;
3043       break;
3044     case OP_OPEN_CHAR_CLASS:
3045       elem->type = CHAR_CLASS;
3046       break;
3047     default:
3048       break;
3049     }
3050   return REG_NOERROR;
3051 }
3052 
3053   /* Helper function for parse_bracket_exp.
3054      Build the equivalence class which is represented by NAME.
3055      The result are written to MBCSET and SBCSET.
3056      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3057      is a pointer argument sinse we may update it.  */
3058 
3059 static reg_errcode_t
3060 #ifdef RE_ENABLE_I18N
build_equiv_class(sbcset,mbcset,equiv_class_alloc,name)3061 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3062      re_charset_t *mbcset;
3063      int *equiv_class_alloc;
3064 #else /* not RE_ENABLE_I18N */
3065 build_equiv_class (sbcset, name)
3066 #endif /* not RE_ENABLE_I18N */
3067      re_bitset_ptr_t sbcset;
3068      const unsigned char *name;
3069 {
3070 #if defined _LIBC && defined RE_ENABLE_I18N
3071   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3072   if (nrules != 0)
3073     {
3074       const int32_t *table, *indirect;
3075       const unsigned char *weights, *extra, *cp;
3076       unsigned char char_buf[2];
3077       int32_t idx1, idx2;
3078       unsigned int ch;
3079       size_t len;
3080       /* This #include defines a local function!  */
3081 # include <locale/weight.h>
3082       /* Calculate the index for equivalence class.  */
3083       cp = name;
3084       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3085       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3086 					       _NL_COLLATE_WEIGHTMB);
3087       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3088                                                    _NL_COLLATE_EXTRAMB);
3089       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3090                                                 _NL_COLLATE_INDIRECTMB);
3091       idx1 = findidx (&cp);
3092       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3093         /* This isn't a valid character.  */
3094         return REG_ECOLLATE;
3095 
3096       /* Build single byte matcing table for this equivalence class.  */
3097       char_buf[1] = (unsigned char) '\0';
3098       len = weights[idx1];
3099       for (ch = 0; ch < SBC_MAX; ++ch)
3100         {
3101           char_buf[0] = ch;
3102           cp = char_buf;
3103           idx2 = findidx (&cp);
3104 /*
3105           idx2 = table[ch];
3106 */
3107           if (idx2 == 0)
3108             /* This isn't a valid character.  */
3109             continue;
3110           if (len == weights[idx2])
3111             {
3112               int cnt = 0;
3113               while (cnt <= len &&
3114                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3115                 ++cnt;
3116 
3117               if (cnt > len)
3118                 bitset_set (sbcset, ch);
3119             }
3120         }
3121       /* Check whether the array has enough space.  */
3122       if (*equiv_class_alloc == mbcset->nequiv_classes)
3123         {
3124           /* Not enough, realloc it.  */
3125           /* +1 in case of mbcset->nequiv_classes is 0.  */
3126           *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3127           /* Use realloc since the array is NULL if *alloc == 0.  */
3128           mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3129                                               *equiv_class_alloc);
3130           if (BE (mbcset->equiv_classes == NULL, 0))
3131             return REG_ESPACE;
3132         }
3133       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3134     }
3135   else
3136 #endif /* _LIBC && RE_ENABLE_I18N */
3137     {
3138       if (BE (strlen ((const char *) name) != 1, 0))
3139         return REG_ECOLLATE;
3140       bitset_set (sbcset, *name);
3141     }
3142   return REG_NOERROR;
3143 }
3144 
3145   /* Helper function for parse_bracket_exp.
3146      Build the character class which is represented by NAME.
3147      The result are written to MBCSET and SBCSET.
3148      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3149      is a pointer argument sinse we may update it.  */
3150 
3151 static reg_errcode_t
3152 #ifdef RE_ENABLE_I18N
build_charclass(sbcset,mbcset,char_class_alloc,class_name,syntax)3153 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3154      re_charset_t *mbcset;
3155      int *char_class_alloc;
3156 #else /* not RE_ENABLE_I18N */
3157 build_charclass (sbcset, class_name, syntax)
3158 #endif /* not RE_ENABLE_I18N */
3159      re_bitset_ptr_t sbcset;
3160      const unsigned char *class_name;
3161      reg_syntax_t syntax;
3162 {
3163   int i;
3164   const char *name = (const char *) class_name;
3165 
3166   /* In case of REG_ICASE "upper" and "lower" match the both of
3167      upper and lower cases.  */
3168   if ((syntax & RE_ICASE)
3169       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3170     name = "alpha";
3171 
3172 #ifdef RE_ENABLE_I18N
3173   /* Check the space of the arrays.  */
3174   if (*char_class_alloc == mbcset->nchar_classes)
3175     {
3176       /* Not enough, realloc it.  */
3177       /* +1 in case of mbcset->nchar_classes is 0.  */
3178       *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3179       /* Use realloc since array is NULL if *alloc == 0.  */
3180       mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3181                                          *char_class_alloc);
3182       if (BE (mbcset->char_classes == NULL, 0))
3183         return REG_ESPACE;
3184     }
3185   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3186 #endif /* RE_ENABLE_I18N */
3187 
3188 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3189     for (i = 0; i < SBC_MAX; ++i)	\
3190       {					\
3191         if (ctype_func (i))		\
3192           bitset_set (sbcset, i);	\
3193       }
3194 
3195   if (strcmp (name, "alnum") == 0)
3196     BUILD_CHARCLASS_LOOP (isalnum)
3197   else if (strcmp (name, "cntrl") == 0)
3198     BUILD_CHARCLASS_LOOP (iscntrl)
3199   else if (strcmp (name, "lower") == 0)
3200     BUILD_CHARCLASS_LOOP (islower)
3201   else if (strcmp (name, "space") == 0)
3202     BUILD_CHARCLASS_LOOP (isspace)
3203   else if (strcmp (name, "alpha") == 0)
3204     BUILD_CHARCLASS_LOOP (isalpha)
3205   else if (strcmp (name, "digit") == 0)
3206     BUILD_CHARCLASS_LOOP (isdigit)
3207   else if (strcmp (name, "print") == 0)
3208     BUILD_CHARCLASS_LOOP (isprint)
3209   else if (strcmp (name, "upper") == 0)
3210     BUILD_CHARCLASS_LOOP (isupper)
3211   else if (strcmp (name, "blank") == 0)
3212     BUILD_CHARCLASS_LOOP (isblank)
3213   else if (strcmp (name, "graph") == 0)
3214     BUILD_CHARCLASS_LOOP (isgraph)
3215   else if (strcmp (name, "punct") == 0)
3216     BUILD_CHARCLASS_LOOP (ispunct)
3217   else if (strcmp (name, "xdigit") == 0)
3218     BUILD_CHARCLASS_LOOP (isxdigit)
3219   else
3220     return REG_ECTYPE;
3221 
3222   return REG_NOERROR;
3223 }
3224 
3225 static bin_tree_t *
build_word_op(dfa,not,err)3226 build_word_op (dfa, not, err)
3227      re_dfa_t *dfa;
3228      int not;
3229      reg_errcode_t *err;
3230 {
3231   re_bitset_ptr_t sbcset;
3232 #ifdef RE_ENABLE_I18N
3233   re_charset_t *mbcset;
3234   int alloc = 0;
3235 #else /* not RE_ENABLE_I18N */
3236   int non_match = 0;
3237 #endif /* not RE_ENABLE_I18N */
3238   reg_errcode_t ret;
3239   re_token_t br_token;
3240   bin_tree_t *tree;
3241   int new_idx;
3242 
3243   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3244 #ifdef RE_ENABLE_I18N
3245   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3246 #endif /* RE_ENABLE_I18N */
3247 
3248 #ifdef RE_ENABLE_I18N
3249   if (BE (sbcset == NULL || mbcset == NULL, 0))
3250 #else /* not RE_ENABLE_I18N */
3251   if (BE (sbcset == NULL, 0))
3252 #endif /* not RE_ENABLE_I18N */
3253     {
3254       *err = REG_ESPACE;
3255       return NULL;
3256     }
3257 
3258   if (not)
3259     {
3260 #ifdef RE_ENABLE_I18N
3261       int i;
3262       /*
3263       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3264         bitset_set(cset->sbcset, '\0');
3265       */
3266       mbcset->non_match = 1;
3267       if (MB_CUR_MAX > 1)
3268         for (i = 0; i < SBC_MAX; ++i)
3269           if (__btowc (i) == WEOF)
3270             bitset_set (sbcset, i);
3271 #else /* not RE_ENABLE_I18N */
3272       non_match = 1;
3273 #endif /* not RE_ENABLE_I18N */
3274     }
3275 
3276   /* We don't care the syntax in this case.  */
3277   ret = build_charclass (sbcset,
3278 #ifdef RE_ENABLE_I18N
3279                          mbcset, &alloc,
3280 #endif /* RE_ENABLE_I18N */
3281                          (const unsigned char *) "alpha", 0);
3282 
3283   if (BE (ret != REG_NOERROR, 0))
3284     {
3285       re_free (sbcset);
3286 #ifdef RE_ENABLE_I18N
3287       free_charset (mbcset);
3288 #endif /* RE_ENABLE_I18N */
3289       *err = REG_ESPACE;
3290       return NULL;
3291     }
3292   /* \w match '_' also.  */
3293   bitset_set (sbcset, '_');
3294 
3295   /* If it is non-matching list.  */
3296 #ifdef RE_ENABLE_I18N
3297   if (mbcset->non_match)
3298 #else /* not RE_ENABLE_I18N */
3299   if (non_match)
3300 #endif /* not RE_ENABLE_I18N */
3301     bitset_not (sbcset);
3302 
3303   /* Build a tree for simple bracket.  */
3304   br_token.type = SIMPLE_BRACKET;
3305   br_token.opr.sbcset = sbcset;
3306   new_idx = re_dfa_add_node (dfa, br_token, 0);
3307   tree = create_tree (NULL, NULL, 0, new_idx);
3308   if (BE (new_idx == -1 || tree == NULL, 0))
3309     goto build_word_op_espace;
3310 
3311 #ifdef RE_ENABLE_I18N
3312   if (MB_CUR_MAX > 1)
3313     {
3314       re_token_t alt_token;
3315       bin_tree_t *mbc_tree;
3316       /* Build a tree for complex bracket.  */
3317       br_token.type = COMPLEX_BRACKET;
3318       br_token.opr.mbcset = mbcset;
3319       dfa->has_mb_node = 1;
3320       new_idx = re_dfa_add_node (dfa, br_token, 0);
3321       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3322       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3323         goto build_word_op_espace;
3324       /* Then join them by ALT node.  */
3325       alt_token.type = OP_ALT;
3326       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3327       tree = create_tree (tree, mbc_tree, 0, new_idx);
3328       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3329         return tree;
3330     }
3331   else
3332     {
3333       free_charset (mbcset);
3334       return tree;
3335     }
3336 #else /* not RE_ENABLE_I18N */
3337   return tree;
3338 #endif /* not RE_ENABLE_I18N */
3339 
3340  build_word_op_espace:
3341   re_free (sbcset);
3342 #ifdef RE_ENABLE_I18N
3343   free_charset (mbcset);
3344 #endif /* RE_ENABLE_I18N */
3345   *err = REG_ESPACE;
3346   return NULL;
3347 }
3348 
3349 /* This is intended for the expressions like "a{1,3}".
3350    Fetch a number from `input', and return the number.
3351    Return -1, if the number field is empty like "{,1}".
3352    Return -2, If an error is occurred.  */
3353 
3354 static int
fetch_number(input,token,syntax)3355 fetch_number (input, token, syntax)
3356      re_string_t *input;
3357      re_token_t *token;
3358      reg_syntax_t syntax;
3359 {
3360   int num = -1;
3361   unsigned char c;
3362   while (1)
3363     {
3364       *token = fetch_token (input, syntax);
3365       c = token->opr.c;
3366       if (BE (token->type == END_OF_RE, 0))
3367         return -2;
3368       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3369         break;
3370       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3371              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3372       num = (num > RE_DUP_MAX) ? -2 : num;
3373     }
3374   return num;
3375 }
3376 
3377 #ifdef RE_ENABLE_I18N
3378 static void
free_charset(re_charset_t * cset)3379 free_charset (re_charset_t *cset)
3380 {
3381   re_free (cset->mbchars);
3382 # ifdef _LIBC
3383   re_free (cset->coll_syms);
3384   re_free (cset->equiv_classes);
3385   re_free (cset->range_starts);
3386   re_free (cset->range_ends);
3387 # endif
3388   re_free (cset->char_classes);
3389   re_free (cset);
3390 }
3391 #endif /* RE_ENABLE_I18N */
3392 
3393 /* Functions for binary tree operation.  */
3394 
3395 /* Create a node of tree.
3396    Note: This function automatically free left and right if malloc fails.  */
3397 
3398 static bin_tree_t *
create_tree(left,right,type,index)3399 create_tree (left, right, type, index)
3400      bin_tree_t *left;
3401      bin_tree_t *right;
3402      re_token_type_t type;
3403      int index;
3404 {
3405   bin_tree_t *tree;
3406   tree = re_malloc (bin_tree_t, 1);
3407   if (BE (tree == NULL, 0))
3408     {
3409       free_bin_tree (left);
3410       free_bin_tree (right);
3411       return NULL;
3412     }
3413   tree->parent = NULL;
3414   tree->left = left;
3415   tree->right = right;
3416   tree->type = type;
3417   tree->node_idx = index;
3418   tree->first = -1;
3419   tree->next = -1;
3420   re_node_set_init_empty (&tree->eclosure);
3421 
3422   if (left != NULL)
3423     left->parent = tree;
3424   if (right != NULL)
3425     right->parent = tree;
3426   return tree;
3427 }
3428 
3429 /* Free the sub tree pointed by TREE.  */
3430 
3431 static void
free_bin_tree(tree)3432 free_bin_tree (tree)
3433      bin_tree_t *tree;
3434 {
3435   if (tree == NULL)
3436     return;
3437   /*re_node_set_free (&tree->eclosure);*/
3438   free_bin_tree (tree->left);
3439   free_bin_tree (tree->right);
3440   re_free (tree);
3441 }
3442 
3443 /* Duplicate the node SRC, and return new node.  */
3444 
3445 static bin_tree_t *
duplicate_tree(src,dfa)3446 duplicate_tree (src, dfa)
3447      const bin_tree_t *src;
3448      re_dfa_t *dfa;
3449 {
3450   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3451   int new_node_idx;
3452   /* Since node indies must be according to Post-order of the tree,
3453      we must duplicate the left at first.  */
3454   if (src->left != NULL)
3455     {
3456       left = duplicate_tree (src->left, dfa);
3457       if (left == NULL)
3458         return NULL;
3459     }
3460 
3461   /* Secondaly, duplicate the right.  */
3462   if (src->right != NULL)
3463     {
3464       right = duplicate_tree (src->right, dfa);
3465       if (right == NULL)
3466         {
3467           free_bin_tree (left);
3468           return NULL;
3469         }
3470     }
3471 
3472   /* At last, duplicate itself.  */
3473   if (src->type == NON_TYPE)
3474     {
3475       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3476       dfa->nodes[new_node_idx].duplicated = 1;
3477       if (BE (new_node_idx == -1, 0))
3478         {
3479           free_bin_tree (left);
3480           free_bin_tree (right);
3481           return NULL;
3482         }
3483     }
3484   else
3485     new_node_idx = src->type;
3486 
3487   new_tree = create_tree (left, right, src->type, new_node_idx);
3488   if (BE (new_tree == NULL, 0))
3489     {
3490       free_bin_tree (left);
3491       free_bin_tree (right);
3492     }
3493   return new_tree;
3494 }
3495