1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const unsigned char *class_name,
99 reg_syntax_t syntax);
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
128
129 const char __re_error_msgid[] attribute_hidden =
130 {
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 };
182
183 const size_t __re_error_msgid_idx[] attribute_hidden =
184 {
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
202 };
203
204 /* Entry points for GNU code. */
205
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
209
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
212
213 const char *
re_compile_pattern(pattern,length,bufp)214 re_compile_pattern (pattern, length, bufp)
215 const char *pattern;
216 size_t length;
217 struct re_pattern_buffer *bufp;
218 {
219 reg_errcode_t ret;
220
221 /* And GNU code determines whether or not to get register information
222 by passing null for the REGS argument to re_match, etc., not by
223 setting no_sub, unless RE_NO_SUB is set. */
224 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
225
226 /* Match anchors at newline. */
227 bufp->newline_anchor = 1;
228
229 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230
231 if (!ret)
232 return NULL;
233 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
234 }
235 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)236 weak_alias (__re_compile_pattern, re_compile_pattern)
237 #endif
238
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
240 also be assigned to arbitrarily: each pattern buffer stores its own
241 syntax, so it can be changed between regex compilations. */
242 /* This has no initializer because initialized variables in Emacs
243 become read-only after dumping. */
244 reg_syntax_t re_syntax_options;
245
246
247 /* Specify the precise syntax of regexps for compilation. This provides
248 for compatibility for various utilities which historically have
249 different, incompatible syntaxes.
250
251 The argument SYNTAX is a bit mask comprised of the various bits
252 defined in regex.h. We return the old syntax. */
253
254 reg_syntax_t
255 re_set_syntax (syntax)
256 reg_syntax_t syntax;
257 {
258 reg_syntax_t ret = re_syntax_options;
259
260 re_syntax_options = syntax;
261 return ret;
262 }
263 #ifdef _LIBC
264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
266
267 int
268 re_compile_fastmap (bufp)
269 struct re_pattern_buffer *bufp;
270 {
271 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
272 char *fastmap = bufp->fastmap;
273
274 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
275 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
276 if (dfa->init_state != dfa->init_state_word)
277 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
278 if (dfa->init_state != dfa->init_state_nl)
279 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
280 if (dfa->init_state != dfa->init_state_begbuf)
281 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
282 bufp->fastmap_accurate = 1;
283 return 0;
284 }
285 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
287 #endif
288
289 static inline void
290 __attribute ((always_inline))
291 re_set_fastmap (char *fastmap, int icase, int ch)
292 {
293 fastmap[ch] = 1;
294 if (icase)
295 fastmap[tolower (ch)] = 1;
296 }
297
298 /* Helper function for re_compile_fastmap.
299 Compile fastmap for the initial_state INIT_STATE. */
300
301 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
303 char *fastmap)
304 {
305 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
306 int node_cnt;
307 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
308 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
309 {
310 int node = init_state->nodes.elems[node_cnt];
311 re_token_type_t type = dfa->nodes[node].type;
312
313 if (type == CHARACTER)
314 {
315 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
316 #ifdef RE_ENABLE_I18N
317 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
318 {
319 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
320 wchar_t wc;
321 mbstate_t state;
322
323 p = buf;
324 *p++ = dfa->nodes[node].opr.c;
325 while (++node < dfa->nodes_len
326 && dfa->nodes[node].type == CHARACTER
327 && dfa->nodes[node].mb_partial)
328 *p++ = dfa->nodes[node].opr.c;
329 memset (&state, '\0', sizeof (state));
330 if (mbrtowc (&wc, (const char *) buf, p - buf,
331 &state) == p - buf
332 && (__wcrtomb ((char *) buf, towlower (wc), &state)
333 != (size_t) -1))
334 re_set_fastmap (fastmap, 0, buf[0]);
335 }
336 #endif
337 }
338 else if (type == SIMPLE_BRACKET)
339 {
340 int i, ch;
341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 {
343 int j;
344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 if (w & ((bitset_word_t) 1 << j))
347 re_set_fastmap (fastmap, icase, ch);
348 }
349 }
350 #ifdef RE_ENABLE_I18N
351 else if (type == COMPLEX_BRACKET)
352 {
353 int i;
354 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
356 || cset->nranges || cset->nchar_classes)
357 {
358 # ifdef _LIBC
359 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
360 {
361 /* In this case we want to catch the bytes which are
362 the first byte of any collation elements.
363 e.g. In da_DK, we want to catch 'a' since "aa"
364 is a valid collation element, and don't catch
365 'b' since 'b' is the only collation element
366 which starts from 'b'. */
367 const int32_t *table = (const int32_t *)
368 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369 for (i = 0; i < SBC_MAX; ++i)
370 if (table[i] < 0)
371 re_set_fastmap (fastmap, icase, i);
372 }
373 # else
374 if (dfa->mb_cur_max > 1)
375 for (i = 0; i < SBC_MAX; ++i)
376 if (__btowc (i) == WEOF)
377 re_set_fastmap (fastmap, icase, i);
378 # endif /* not _LIBC */
379 }
380 for (i = 0; i < cset->nmbchars; ++i)
381 {
382 char buf[256];
383 mbstate_t state;
384 memset (&state, '\0', sizeof (state));
385 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
386 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
387 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
388 {
389 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
390 != (size_t) -1)
391 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
392 }
393 }
394 }
395 #endif /* RE_ENABLE_I18N */
396 else if (type == OP_PERIOD
397 #ifdef RE_ENABLE_I18N
398 || type == OP_UTF8_PERIOD
399 #endif /* RE_ENABLE_I18N */
400 || type == END_OF_RE)
401 {
402 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
403 if (type == END_OF_RE)
404 bufp->can_be_null = 1;
405 return;
406 }
407 }
408 }
409
410 /* Entry point for POSIX code. */
411 /* regcomp takes a regular expression as a string and compiles it.
412
413 PREG is a regex_t *. We do not expect any fields to be initialized,
414 since POSIX says we shouldn't. Thus, we set
415
416 `buffer' to the compiled pattern;
417 `used' to the length of the compiled pattern;
418 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
419 REG_EXTENDED bit in CFLAGS is set; otherwise, to
420 RE_SYNTAX_POSIX_BASIC;
421 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
422 `fastmap' to an allocated space for the fastmap;
423 `fastmap_accurate' to zero;
424 `re_nsub' to the number of subexpressions in PATTERN.
425
426 PATTERN is the address of the pattern string.
427
428 CFLAGS is a series of bits which affect compilation.
429
430 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
431 use POSIX basic syntax.
432
433 If REG_NEWLINE is set, then . and [^...] don't match newline.
434 Also, regexec will try a match beginning after every newline.
435
436 If REG_ICASE is set, then we considers upper- and lowercase
437 versions of letters to be equivalent when matching.
438
439 If REG_NOSUB is set, then when PREG is passed to regexec, that
440 routine will report only success or failure, and nothing about the
441 registers.
442
443 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
444 the return codes and their meanings.) */
445
446 int
regcomp(preg,pattern,cflags)447 regcomp (preg, pattern, cflags)
448 regex_t *__restrict preg;
449 const char *__restrict pattern;
450 int cflags;
451 {
452 reg_errcode_t ret;
453 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
454 : RE_SYNTAX_POSIX_BASIC);
455
456 preg->buffer = NULL;
457 preg->allocated = 0;
458 preg->used = 0;
459
460 /* Try to allocate space for the fastmap. */
461 preg->fastmap = re_malloc (char, SBC_MAX);
462 if (BE (preg->fastmap == NULL, 0))
463 return REG_ESPACE;
464
465 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
466
467 /* If REG_NEWLINE is set, newlines are treated differently. */
468 if (cflags & REG_NEWLINE)
469 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
470 syntax &= ~RE_DOT_NEWLINE;
471 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
472 /* It also changes the matching behavior. */
473 preg->newline_anchor = 1;
474 }
475 else
476 preg->newline_anchor = 0;
477 preg->no_sub = !!(cflags & REG_NOSUB);
478 preg->translate = NULL;
479
480 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
481
482 /* POSIX doesn't distinguish between an unmatched open-group and an
483 unmatched close-group: both are REG_EPAREN. */
484 if (ret == REG_ERPAREN)
485 ret = REG_EPAREN;
486
487 /* We have already checked preg->fastmap != NULL. */
488 if (BE (ret == REG_NOERROR, 1))
489 /* Compute the fastmap now, since regexec cannot modify the pattern
490 buffer. This function never fails in this implementation. */
491 (void) re_compile_fastmap (preg);
492 else
493 {
494 /* Some error occurred while compiling the expression. */
495 re_free (preg->fastmap);
496 preg->fastmap = NULL;
497 }
498
499 return (int) ret;
500 }
501 #ifdef _LIBC
weak_alias(__regcomp,regcomp)502 weak_alias (__regcomp, regcomp)
503 #endif
504
505 /* Returns a message corresponding to an error code, ERRCODE, returned
506 from either regcomp or regexec. We don't use PREG here. */
507
508 size_t regerror ( int errcode, const regex_t *__restrict preg,
509 char *__restrict errbuf, size_t errbuf_size )
510 {
511 const char *msg;
512 size_t msg_size;
513
514 if (BE (errcode < 0
515 || errcode >= (int) (sizeof (__re_error_msgid_idx)
516 / sizeof (__re_error_msgid_idx[0])), 0))
517 /* Only error codes returned by the rest of the code should be passed
518 to this routine. If we are given anything else, or if other regex
519 code generates an invalid error code, then the program has a bug.
520 Dump core so we can fix it. */
521 abort ();
522
523 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
524
525 msg_size = strlen (msg) + 1; /* Includes the null. */
526
527 if (BE (errbuf_size != 0, 1))
528 {
529 if (BE (msg_size > errbuf_size, 0))
530 {
531 #if defined HAVE_MEMPCPY || defined _LIBC
532 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
533 #else
534 memmove (errbuf, msg, errbuf_size - 1);
535 errbuf[errbuf_size - 1] = 0;
536 #endif
537 }
538 else
539 memmove (errbuf, msg, msg_size);
540 }
541
542 return msg_size;
543 }
544 #ifdef _LIBC
545 weak_alias (__regerror, regerror)
546 #endif
547
548
549 #ifdef RE_ENABLE_I18N
550 /* This static array is used for the map to single-byte characters when
551 UTF-8 is used. Otherwise we would allocate memory just to initialize
552 it the same all the time. UTF-8 is the preferred encoding so this is
553 a worthwhile optimization. */
554 static const bitset_t utf8_sb_map =
555 {
556 /* Set the first 128 bits. */
557 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
558 };
559 #endif
560
561
562 static void
free_dfa_content(re_dfa_t * dfa)563 free_dfa_content (re_dfa_t *dfa)
564 {
565 int i, j;
566
567 if (dfa->nodes)
568 for (i = 0; i < dfa->nodes_len; ++i)
569 free_token (dfa->nodes + i);
570 re_free (dfa->nexts);
571 for (i = 0; i < dfa->nodes_len; ++i)
572 {
573 if (dfa->eclosures != NULL)
574 re_node_set_free (dfa->eclosures + i);
575 if (dfa->inveclosures != NULL)
576 re_node_set_free (dfa->inveclosures + i);
577 if (dfa->edests != NULL)
578 re_node_set_free (dfa->edests + i);
579 }
580 re_free (dfa->edests);
581 re_free (dfa->eclosures);
582 re_free (dfa->inveclosures);
583 re_free (dfa->nodes);
584
585 if (dfa->state_table)
586 for (i = 0; i <= dfa->state_hash_mask; ++i)
587 {
588 struct re_state_table_entry *entry = dfa->state_table + i;
589 for (j = 0; j < entry->num; ++j)
590 {
591 re_dfastate_t *state = entry->array[j];
592 free_state (state);
593 }
594 re_free (entry->array);
595 }
596 re_free (dfa->state_table);
597 #ifdef RE_ENABLE_I18N
598 if (dfa->sb_char != utf8_sb_map)
599 re_free (dfa->sb_char);
600 #endif
601 re_free (dfa->subexp_map);
602 #ifdef DEBUG
603 re_free (dfa->re_str);
604 #endif
605
606 re_free (dfa);
607 }
608
609
610 /* Free dynamically allocated space used by PREG. */
611
612 void
regfree(preg)613 regfree (preg)
614 regex_t *preg;
615 {
616 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
617 if (BE (dfa != NULL, 1))
618 free_dfa_content (dfa);
619 preg->buffer = NULL;
620 preg->allocated = 0;
621
622 re_free (preg->fastmap);
623 preg->fastmap = NULL;
624
625 re_free (preg->translate);
626 preg->translate = NULL;
627 }
628 #ifdef _LIBC
629 weak_alias (__regfree, regfree)
630 #endif
631
632 /* Entry points compatible with 4.2 BSD regex library. We don't define
633 them unless specifically requested. */
634
635 #if defined _REGEX_RE_COMP || defined _LIBC
636
637 /* BSD has one and only one pattern buffer. */
638 static struct re_pattern_buffer re_comp_buf;
639
640 char *
641 # ifdef _LIBC
642 /* Make these definitions weak in libc, so POSIX programs can redefine
643 these names if they don't use our functions, and still use
644 regcomp/regexec above without link errors. */
645 weak_function
646 # endif
re_comp(s)647 re_comp (s)
648 const char *s;
649 {
650 reg_errcode_t ret;
651 char *fastmap;
652
653 if (!s)
654 {
655 if (!re_comp_buf.buffer)
656 return gettext ("No previous regular expression");
657 return 0;
658 }
659
660 if (re_comp_buf.buffer)
661 {
662 fastmap = re_comp_buf.fastmap;
663 re_comp_buf.fastmap = NULL;
664 __regfree (&re_comp_buf);
665 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
666 re_comp_buf.fastmap = fastmap;
667 }
668
669 if (re_comp_buf.fastmap == NULL)
670 {
671 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
672 if (re_comp_buf.fastmap == NULL)
673 return (char *) gettext (__re_error_msgid
674 + __re_error_msgid_idx[(int) REG_ESPACE]);
675 }
676
677 /* Since `re_exec' always passes NULL for the `regs' argument, we
678 don't need to initialize the pattern buffer fields which affect it. */
679
680 /* Match anchors at newlines. */
681 re_comp_buf.newline_anchor = 1;
682
683 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
684
685 if (!ret)
686 return NULL;
687
688 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
689 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
690 }
691
692 #ifdef _LIBC
libc_freeres_fn(free_mem)693 libc_freeres_fn (free_mem)
694 {
695 __regfree (&re_comp_buf);
696 }
697 #endif
698
699 #endif /* _REGEX_RE_COMP */
700
701 /* Internal entry point.
702 Compile the regular expression PATTERN, whose length is LENGTH.
703 SYNTAX indicate regular expression's syntax. */
704
705 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)706 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
707 reg_syntax_t syntax)
708 {
709 reg_errcode_t err = REG_NOERROR;
710 re_dfa_t *dfa;
711 re_string_t regexp;
712
713 /* Initialize the pattern buffer. */
714 preg->fastmap_accurate = 0;
715 preg->syntax = syntax;
716 preg->not_bol = preg->not_eol = 0;
717 preg->used = 0;
718 preg->re_nsub = 0;
719 preg->can_be_null = 0;
720 preg->regs_allocated = REGS_UNALLOCATED;
721
722 /* Initialize the dfa. */
723 dfa = (re_dfa_t *) preg->buffer;
724 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
725 {
726 /* If zero allocated, but buffer is non-null, try to realloc
727 enough space. This loses if buffer's address is bogus, but
728 that is the user's responsibility. If ->buffer is NULL this
729 is a simple allocation. */
730 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
731 if (dfa == NULL)
732 return REG_ESPACE;
733 preg->allocated = sizeof (re_dfa_t);
734 preg->buffer = (unsigned char *) dfa;
735 }
736 preg->used = sizeof (re_dfa_t);
737
738 err = init_dfa (dfa, length);
739 if (BE (err != REG_NOERROR, 0))
740 {
741 free_dfa_content (dfa);
742 preg->buffer = NULL;
743 preg->allocated = 0;
744 return err;
745 }
746 #ifdef DEBUG
747 /* Note: length+1 will not overflow since it is checked in init_dfa. */
748 dfa->re_str = re_malloc (char, length + 1);
749 strncpy (dfa->re_str, pattern, length + 1);
750 #endif
751
752 __libc_lock_init (dfa->lock);
753
754 err = re_string_construct (®exp, pattern, length, preg->translate,
755 syntax & RE_ICASE, dfa);
756 if (BE (err != REG_NOERROR, 0))
757 {
758 re_compile_internal_free_return:
759 free_workarea_compile (preg);
760 re_string_destruct (®exp);
761 free_dfa_content (dfa);
762 preg->buffer = NULL;
763 preg->allocated = 0;
764 return err;
765 }
766
767 /* Parse the regular expression, and build a structure tree. */
768 preg->re_nsub = 0;
769 dfa->str_tree = parse (®exp, preg, syntax, &err);
770 if (BE (dfa->str_tree == NULL, 0))
771 goto re_compile_internal_free_return;
772
773 /* Analyze the tree and create the nfa. */
774 err = analyze (preg);
775 if (BE (err != REG_NOERROR, 0))
776 goto re_compile_internal_free_return;
777
778 #ifdef RE_ENABLE_I18N
779 /* If possible, do searching in single byte encoding to speed things up. */
780 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
781 optimize_utf8 (dfa);
782 #endif
783
784 /* Then create the initial state of the dfa. */
785 err = create_initial_state (dfa);
786
787 /* Release work areas. */
788 free_workarea_compile (preg);
789 re_string_destruct (®exp);
790
791 if (BE (err != REG_NOERROR, 0))
792 {
793 free_dfa_content (dfa);
794 preg->buffer = NULL;
795 preg->allocated = 0;
796 }
797
798 return err;
799 }
800
801 /* Initialize DFA. We use the length of the regular expression PAT_LEN
802 as the initial length of some arrays. */
803
804 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)805 init_dfa (re_dfa_t *dfa, size_t pat_len)
806 {
807 unsigned int table_size;
808 #ifndef _LIBC
809 char *codeset_name;
810 #endif
811
812 memset (dfa, '\0', sizeof (re_dfa_t));
813
814 /* Force allocation of str_tree_storage the first time. */
815 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
816
817 /* Avoid overflows. */
818 if (pat_len == SIZE_MAX)
819 return REG_ESPACE;
820
821 dfa->nodes_alloc = pat_len + 1;
822 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
823
824 /* table_size = 2 ^ ceil(log pat_len) */
825 for (table_size = 1; ; table_size <<= 1)
826 if (table_size > pat_len)
827 break;
828
829 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
830 dfa->state_hash_mask = table_size - 1;
831
832 dfa->mb_cur_max = MB_CUR_MAX;
833 #ifdef _LIBC
834 if (dfa->mb_cur_max == 6
835 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
836 dfa->is_utf8 = 1;
837 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
838 != 0);
839 #else
840 # ifdef HAVE_LANGINFO_CODESET
841 codeset_name = nl_langinfo (CODESET);
842 # else
843 codeset_name = getenv ("LC_ALL");
844 if (codeset_name == NULL || codeset_name[0] == '\0')
845 codeset_name = getenv ("LC_CTYPE");
846 if (codeset_name == NULL || codeset_name[0] == '\0')
847 codeset_name = getenv ("LANG");
848 if (codeset_name == NULL)
849 codeset_name = "";
850 else if (strchr (codeset_name, '.') != NULL)
851 codeset_name = strchr (codeset_name, '.') + 1;
852 # endif
853
854 if (strcasecmp (codeset_name, "UTF-8") == 0
855 || strcasecmp (codeset_name, "UTF8") == 0)
856 dfa->is_utf8 = 1;
857
858 /* We check exhaustively in the loop below if this charset is a
859 superset of ASCII. */
860 dfa->map_notascii = 0;
861 #endif
862
863 #ifdef RE_ENABLE_I18N
864 if (dfa->mb_cur_max > 1)
865 {
866 if (dfa->is_utf8)
867 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
868 else
869 {
870 int i, j, ch;
871
872 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
873 if (BE (dfa->sb_char == NULL, 0))
874 return REG_ESPACE;
875
876 /* Set the bits corresponding to single byte chars. */
877 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
878 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
879 {
880 wint_t wch = __btowc (ch);
881 if (wch != WEOF)
882 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
883 # ifndef _LIBC
884 if (isascii (ch) && wch != ch)
885 dfa->map_notascii = 1;
886 # endif
887 }
888 }
889 }
890 #endif
891
892 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
893 return REG_ESPACE;
894 return REG_NOERROR;
895 }
896
897 /* Initialize WORD_CHAR table, which indicate which character is
898 "word". In this case "word" means that it is the word construction
899 character used by some operators like "\<", "\>", etc. */
900
901 static void
902 internal_function
init_word_char(re_dfa_t * dfa)903 init_word_char (re_dfa_t *dfa)
904 {
905 int i, j, ch;
906 dfa->word_ops_used = 1;
907 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
908 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
909 if (isalnum (ch) || ch == '_')
910 dfa->word_char[i] |= (bitset_word_t) 1 << j;
911 }
912
913 /* Free the work area which are only used while compiling. */
914
915 static void
free_workarea_compile(regex_t * preg)916 free_workarea_compile (regex_t *preg)
917 {
918 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
919 bin_tree_storage_t *storage, *next;
920 for (storage = dfa->str_tree_storage; storage; storage = next)
921 {
922 next = storage->next;
923 re_free (storage);
924 }
925 dfa->str_tree_storage = NULL;
926 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
927 dfa->str_tree = NULL;
928 re_free (dfa->org_indices);
929 dfa->org_indices = NULL;
930 }
931
932 /* Create initial states for all contexts. */
933
934 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)935 create_initial_state (re_dfa_t *dfa)
936 {
937 int first, i;
938 reg_errcode_t err;
939 re_node_set init_nodes;
940
941 /* Initial states have the epsilon closure of the node which is
942 the first node of the regular expression. */
943 first = dfa->str_tree->first->node_idx;
944 dfa->init_node = first;
945 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
946 if (BE (err != REG_NOERROR, 0))
947 return err;
948
949 /* The back-references which are in initial states can epsilon transit,
950 since in this case all of the subexpressions can be null.
951 Then we add epsilon closures of the nodes which are the next nodes of
952 the back-references. */
953 if (dfa->nbackref > 0)
954 for (i = 0; i < init_nodes.nelem; ++i)
955 {
956 int node_idx = init_nodes.elems[i];
957 re_token_type_t type = dfa->nodes[node_idx].type;
958
959 int clexp_idx;
960 if (type != OP_BACK_REF)
961 continue;
962 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
963 {
964 re_token_t *clexp_node;
965 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
966 if (clexp_node->type == OP_CLOSE_SUBEXP
967 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
968 break;
969 }
970 if (clexp_idx == init_nodes.nelem)
971 continue;
972
973 if (type == OP_BACK_REF)
974 {
975 int dest_idx = dfa->edests[node_idx].elems[0];
976 if (!re_node_set_contains (&init_nodes, dest_idx))
977 {
978 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
979 i = 0;
980 }
981 }
982 }
983
984 /* It must be the first time to invoke acquire_state. */
985 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
986 /* We don't check ERR here, since the initial state must not be NULL. */
987 if (BE (dfa->init_state == NULL, 0))
988 return err;
989 if (dfa->init_state->has_constraint)
990 {
991 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
992 CONTEXT_WORD);
993 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
994 CONTEXT_NEWLINE);
995 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
996 &init_nodes,
997 CONTEXT_NEWLINE
998 | CONTEXT_BEGBUF);
999 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1000 || dfa->init_state_begbuf == NULL, 0))
1001 return err;
1002 }
1003 else
1004 dfa->init_state_word = dfa->init_state_nl
1005 = dfa->init_state_begbuf = dfa->init_state;
1006
1007 re_node_set_free (&init_nodes);
1008 return REG_NOERROR;
1009 }
1010
1011 #ifdef RE_ENABLE_I18N
1012 /* If it is possible to do searching in single byte encoding instead of UTF-8
1013 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1014 DFA nodes where needed. */
1015
1016 static void
optimize_utf8(re_dfa_t * dfa)1017 optimize_utf8 (re_dfa_t *dfa)
1018 {
1019 int node, i, mb_chars = 0, has_period = 0;
1020
1021 for (node = 0; node < dfa->nodes_len; ++node)
1022 switch (dfa->nodes[node].type)
1023 {
1024 case CHARACTER:
1025 if (dfa->nodes[node].opr.c >= 0x80)
1026 mb_chars = 1;
1027 break;
1028 case ANCHOR:
1029 switch (dfa->nodes[node].opr.idx)
1030 {
1031 case LINE_FIRST:
1032 case LINE_LAST:
1033 case BUF_FIRST:
1034 case BUF_LAST:
1035 break;
1036 default:
1037 /* Word anchors etc. cannot be handled. */
1038 return;
1039 }
1040 break;
1041 case OP_PERIOD:
1042 has_period = 1;
1043 break;
1044 case OP_BACK_REF:
1045 case OP_ALT:
1046 case END_OF_RE:
1047 case OP_DUP_ASTERISK:
1048 case OP_OPEN_SUBEXP:
1049 case OP_CLOSE_SUBEXP:
1050 break;
1051 case COMPLEX_BRACKET:
1052 return;
1053 case SIMPLE_BRACKET:
1054 /* Just double check. The non-ASCII range starts at 0x80. */
1055 assert (0x80 % BITSET_WORD_BITS == 0);
1056 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1057 if (dfa->nodes[node].opr.sbcset[i])
1058 return;
1059 break;
1060 default:
1061 abort ();
1062 }
1063
1064 if (mb_chars || has_period)
1065 for (node = 0; node < dfa->nodes_len; ++node)
1066 {
1067 if (dfa->nodes[node].type == CHARACTER
1068 && dfa->nodes[node].opr.c >= 0x80)
1069 dfa->nodes[node].mb_partial = 0;
1070 else if (dfa->nodes[node].type == OP_PERIOD)
1071 dfa->nodes[node].type = OP_UTF8_PERIOD;
1072 }
1073
1074 /* The search can be in single byte locale. */
1075 dfa->mb_cur_max = 1;
1076 dfa->is_utf8 = 0;
1077 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1078 }
1079 #endif
1080
1081 /* Analyze the structure tree, and calculate "first", "next", "edest",
1082 "eclosure", and "inveclosure". */
1083
1084 static reg_errcode_t
analyze(regex_t * preg)1085 analyze (regex_t *preg)
1086 {
1087 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1088 reg_errcode_t ret;
1089
1090 /* Allocate arrays. */
1091 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1092 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1093 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1094 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1095 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1096 || dfa->eclosures == NULL, 0))
1097 return REG_ESPACE;
1098
1099 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1100 if (dfa->subexp_map != NULL)
1101 {
1102 int i;
1103 for (i = 0; i < preg->re_nsub; i++)
1104 dfa->subexp_map[i] = i;
1105 preorder (dfa->str_tree, optimize_subexps, dfa);
1106 for (i = 0; i < preg->re_nsub; i++)
1107 if (dfa->subexp_map[i] != i)
1108 break;
1109 if (i == preg->re_nsub)
1110 {
1111 free (dfa->subexp_map);
1112 dfa->subexp_map = NULL;
1113 }
1114 }
1115
1116 ret = postorder (dfa->str_tree, lower_subexps, preg);
1117 if (BE (ret != REG_NOERROR, 0))
1118 return ret;
1119 ret = postorder (dfa->str_tree, calc_first, dfa);
1120 if (BE (ret != REG_NOERROR, 0))
1121 return ret;
1122 preorder (dfa->str_tree, calc_next, dfa);
1123 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1124 if (BE (ret != REG_NOERROR, 0))
1125 return ret;
1126 ret = calc_eclosure (dfa);
1127 if (BE (ret != REG_NOERROR, 0))
1128 return ret;
1129
1130 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1131 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1132 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1133 || dfa->nbackref)
1134 {
1135 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1136 if (BE (dfa->inveclosures == NULL, 0))
1137 return REG_ESPACE;
1138 ret = calc_inveclosure (dfa);
1139 }
1140
1141 return ret;
1142 }
1143
1144 /* Our parse trees are very unbalanced, so we cannot use a stack to
1145 implement parse tree visits. Instead, we use parent pointers and
1146 some hairy code in these two functions. */
1147 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1148 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1149 void *extra)
1150 {
1151 bin_tree_t *node, *prev;
1152
1153 for (node = root; ; )
1154 {
1155 /* Descend down the tree, preferably to the left (or to the right
1156 if that's the only child). */
1157 while (node->left || node->right)
1158 if (node->left)
1159 node = node->left;
1160 else
1161 node = node->right;
1162
1163 do
1164 {
1165 reg_errcode_t err = fn (extra, node);
1166 if (BE (err != REG_NOERROR, 0))
1167 return err;
1168 if (node->parent == NULL)
1169 return REG_NOERROR;
1170 prev = node;
1171 node = node->parent;
1172 }
1173 /* Go up while we have a node that is reached from the right. */
1174 while (node->right == prev || node->right == NULL);
1175 node = node->right;
1176 }
1177 }
1178
1179 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1180 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1181 void *extra)
1182 {
1183 bin_tree_t *node;
1184
1185 for (node = root; ; )
1186 {
1187 reg_errcode_t err = fn (extra, node);
1188 if (BE (err != REG_NOERROR, 0))
1189 return err;
1190
1191 /* Go to the left node, or up and to the right. */
1192 if (node->left)
1193 node = node->left;
1194 else
1195 {
1196 bin_tree_t *prev = NULL;
1197 while (node->right == prev || node->right == NULL)
1198 {
1199 prev = node;
1200 node = node->parent;
1201 if (!node)
1202 return REG_NOERROR;
1203 }
1204 node = node->right;
1205 }
1206 }
1207 }
1208
1209 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1210 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1211 backreferences as well. Requires a preorder visit. */
1212 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1213 optimize_subexps (void *extra, bin_tree_t *node)
1214 {
1215 re_dfa_t *dfa = (re_dfa_t *) extra;
1216
1217 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1218 {
1219 int idx = node->token.opr.idx;
1220 node->token.opr.idx = dfa->subexp_map[idx];
1221 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1222 }
1223
1224 else if (node->token.type == SUBEXP
1225 && node->left && node->left->token.type == SUBEXP)
1226 {
1227 int other_idx = node->left->token.opr.idx;
1228
1229 node->left = node->left->left;
1230 if (node->left)
1231 node->left->parent = node;
1232
1233 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1234 if (other_idx < BITSET_WORD_BITS)
1235 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1236 }
1237
1238 return REG_NOERROR;
1239 }
1240
1241 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1242 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1243 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1244 lower_subexps (void *extra, bin_tree_t *node)
1245 {
1246 regex_t *preg = (regex_t *) extra;
1247 reg_errcode_t err = REG_NOERROR;
1248
1249 if (node->left && node->left->token.type == SUBEXP)
1250 {
1251 node->left = lower_subexp (&err, preg, node->left);
1252 if (node->left)
1253 node->left->parent = node;
1254 }
1255 if (node->right && node->right->token.type == SUBEXP)
1256 {
1257 node->right = lower_subexp (&err, preg, node->right);
1258 if (node->right)
1259 node->right->parent = node;
1260 }
1261
1262 return err;
1263 }
1264
1265 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1266 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1267 {
1268 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1269 bin_tree_t *body = node->left;
1270 bin_tree_t *op, *cls, *tree1, *tree;
1271
1272 if (preg->no_sub
1273 /* We do not optimize empty subexpressions, because otherwise we may
1274 have bad CONCAT nodes with NULL children. This is obviously not
1275 very common, so we do not lose much. An example that triggers
1276 this case is the sed "script" /\(\)/x. */
1277 && node->left != NULL
1278 && (node->token.opr.idx >= BITSET_WORD_BITS
1279 || !(dfa->used_bkref_map
1280 & ((bitset_word_t) 1 << node->token.opr.idx))))
1281 return node->left;
1282
1283 /* Convert the SUBEXP node to the concatenation of an
1284 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1285 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1286 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1287 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1288 tree = create_tree (dfa, op, tree1, CONCAT);
1289 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1290 {
1291 *err = REG_ESPACE;
1292 return NULL;
1293 }
1294
1295 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1296 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1297 return tree;
1298 }
1299
1300 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1301 nodes. Requires a postorder visit. */
1302 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1303 calc_first (void *extra, bin_tree_t *node)
1304 {
1305 re_dfa_t *dfa = (re_dfa_t *) extra;
1306 if (node->token.type == CONCAT)
1307 {
1308 node->first = node->left->first;
1309 node->node_idx = node->left->node_idx;
1310 }
1311 else
1312 {
1313 node->first = node;
1314 node->node_idx = re_dfa_add_node (dfa, node->token);
1315 if (BE (node->node_idx == -1, 0))
1316 return REG_ESPACE;
1317 }
1318 return REG_NOERROR;
1319 }
1320
1321 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1322 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1323 calc_next (void *extra, bin_tree_t *node)
1324 {
1325 switch (node->token.type)
1326 {
1327 case OP_DUP_ASTERISK:
1328 node->left->next = node;
1329 break;
1330 case CONCAT:
1331 node->left->next = node->right->first;
1332 node->right->next = node->next;
1333 break;
1334 default:
1335 if (node->left)
1336 node->left->next = node->next;
1337 if (node->right)
1338 node->right->next = node->next;
1339 break;
1340 }
1341 return REG_NOERROR;
1342 }
1343
1344 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1345 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1346 link_nfa_nodes (void *extra, bin_tree_t *node)
1347 {
1348 re_dfa_t *dfa = (re_dfa_t *) extra;
1349 int idx = node->node_idx;
1350 reg_errcode_t err = REG_NOERROR;
1351
1352 switch (node->token.type)
1353 {
1354 case CONCAT:
1355 break;
1356
1357 case END_OF_RE:
1358 assert (node->next == NULL);
1359 break;
1360
1361 case OP_DUP_ASTERISK:
1362 case OP_ALT:
1363 {
1364 int left, right;
1365 dfa->has_plural_match = 1;
1366 if (node->left != NULL)
1367 left = node->left->first->node_idx;
1368 else
1369 left = node->next->node_idx;
1370 if (node->right != NULL)
1371 right = node->right->first->node_idx;
1372 else
1373 right = node->next->node_idx;
1374 assert (left > -1);
1375 assert (right > -1);
1376 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1377 }
1378 break;
1379
1380 case ANCHOR:
1381 case OP_OPEN_SUBEXP:
1382 case OP_CLOSE_SUBEXP:
1383 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1384 break;
1385
1386 case OP_BACK_REF:
1387 dfa->nexts[idx] = node->next->node_idx;
1388 if (node->token.type == OP_BACK_REF)
1389 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1390 break;
1391
1392 default:
1393 assert (!IS_EPSILON_NODE (node->token.type));
1394 dfa->nexts[idx] = node->next->node_idx;
1395 break;
1396 }
1397
1398 return err;
1399 }
1400
1401 /* Duplicate the epsilon closure of the node ROOT_NODE.
1402 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1403 to their own constraint. */
1404
1405 static reg_errcode_t
1406 internal_function
duplicate_node_closure(re_dfa_t * dfa,int top_org_node,int top_clone_node,int root_node,unsigned int init_constraint)1407 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1408 int root_node, unsigned int init_constraint)
1409 {
1410 int org_node, clone_node, ret;
1411 unsigned int constraint = init_constraint;
1412 for (org_node = top_org_node, clone_node = top_clone_node;;)
1413 {
1414 int org_dest, clone_dest;
1415 if (dfa->nodes[org_node].type == OP_BACK_REF)
1416 {
1417 /* If the back reference epsilon-transit, its destination must
1418 also have the constraint. Then duplicate the epsilon closure
1419 of the destination of the back reference, and store it in
1420 edests of the back reference. */
1421 org_dest = dfa->nexts[org_node];
1422 re_node_set_empty (dfa->edests + clone_node);
1423 clone_dest = duplicate_node (dfa, org_dest, constraint);
1424 if (BE (clone_dest == -1, 0))
1425 return REG_ESPACE;
1426 dfa->nexts[clone_node] = dfa->nexts[org_node];
1427 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1428 if (BE (ret < 0, 0))
1429 return REG_ESPACE;
1430 }
1431 else if (dfa->edests[org_node].nelem == 0)
1432 {
1433 /* In case of the node can't epsilon-transit, don't duplicate the
1434 destination and store the original destination as the
1435 destination of the node. */
1436 dfa->nexts[clone_node] = dfa->nexts[org_node];
1437 break;
1438 }
1439 else if (dfa->edests[org_node].nelem == 1)
1440 {
1441 /* In case of the node can epsilon-transit, and it has only one
1442 destination. */
1443 org_dest = dfa->edests[org_node].elems[0];
1444 re_node_set_empty (dfa->edests + clone_node);
1445 if (dfa->nodes[org_node].type == ANCHOR)
1446 {
1447 /* In case of the node has another constraint, append it. */
1448 if (org_node == root_node && clone_node != org_node)
1449 {
1450 /* ...but if the node is root_node itself, it means the
1451 epsilon closure have a loop, then tie it to the
1452 destination of the root_node. */
1453 ret = re_node_set_insert (dfa->edests + clone_node,
1454 org_dest);
1455 if (BE (ret < 0, 0))
1456 return REG_ESPACE;
1457 break;
1458 }
1459 constraint |= dfa->nodes[org_node].opr.ctx_type;
1460 }
1461 clone_dest = duplicate_node (dfa, org_dest, constraint);
1462 if (BE (clone_dest == -1, 0))
1463 return REG_ESPACE;
1464 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1465 if (BE (ret < 0, 0))
1466 return REG_ESPACE;
1467 }
1468 else /* dfa->edests[org_node].nelem == 2 */
1469 {
1470 /* In case of the node can epsilon-transit, and it has two
1471 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1472 org_dest = dfa->edests[org_node].elems[0];
1473 re_node_set_empty (dfa->edests + clone_node);
1474 /* Search for a duplicated node which satisfies the constraint. */
1475 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1476 if (clone_dest == -1)
1477 {
1478 /* There are no such a duplicated node, create a new one. */
1479 reg_errcode_t err;
1480 clone_dest = duplicate_node (dfa, org_dest, constraint);
1481 if (BE (clone_dest == -1, 0))
1482 return REG_ESPACE;
1483 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1484 if (BE (ret < 0, 0))
1485 return REG_ESPACE;
1486 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1487 root_node, constraint);
1488 if (BE (err != REG_NOERROR, 0))
1489 return err;
1490 }
1491 else
1492 {
1493 /* There are a duplicated node which satisfy the constraint,
1494 use it to avoid infinite loop. */
1495 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1496 if (BE (ret < 0, 0))
1497 return REG_ESPACE;
1498 }
1499
1500 org_dest = dfa->edests[org_node].elems[1];
1501 clone_dest = duplicate_node (dfa, org_dest, constraint);
1502 if (BE (clone_dest == -1, 0))
1503 return REG_ESPACE;
1504 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1505 if (BE (ret < 0, 0))
1506 return REG_ESPACE;
1507 }
1508 org_node = org_dest;
1509 clone_node = clone_dest;
1510 }
1511 return REG_NOERROR;
1512 }
1513
1514 /* Search for a node which is duplicated from the node ORG_NODE, and
1515 satisfies the constraint CONSTRAINT. */
1516
1517 static int
search_duplicated_node(const re_dfa_t * dfa,int org_node,unsigned int constraint)1518 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1519 unsigned int constraint)
1520 {
1521 int idx;
1522 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1523 {
1524 if (org_node == dfa->org_indices[idx]
1525 && constraint == dfa->nodes[idx].constraint)
1526 return idx; /* Found. */
1527 }
1528 return -1; /* Not found. */
1529 }
1530
1531 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1532 Return the index of the new node, or -1 if insufficient storage is
1533 available. */
1534
1535 static int
duplicate_node(re_dfa_t * dfa,int org_idx,unsigned int constraint)1536 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1537 {
1538 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1539 if (BE (dup_idx != -1, 1))
1540 {
1541 dfa->nodes[dup_idx].constraint = constraint;
1542 if (dfa->nodes[org_idx].type == ANCHOR)
1543 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1544 dfa->nodes[dup_idx].duplicated = 1;
1545
1546 /* Store the index of the original node. */
1547 dfa->org_indices[dup_idx] = org_idx;
1548 }
1549 return dup_idx;
1550 }
1551
1552 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1553 calc_inveclosure (re_dfa_t *dfa)
1554 {
1555 int src, idx, ret;
1556 for (idx = 0; idx < dfa->nodes_len; ++idx)
1557 re_node_set_init_empty (dfa->inveclosures + idx);
1558
1559 for (src = 0; src < dfa->nodes_len; ++src)
1560 {
1561 int *elems = dfa->eclosures[src].elems;
1562 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1563 {
1564 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1565 if (BE (ret == -1, 0))
1566 return REG_ESPACE;
1567 }
1568 }
1569
1570 return REG_NOERROR;
1571 }
1572
1573 /* Calculate "eclosure" for all the node in DFA. */
1574
1575 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1576 calc_eclosure (re_dfa_t *dfa)
1577 {
1578 int node_idx, incomplete;
1579 #ifdef DEBUG
1580 assert (dfa->nodes_len > 0);
1581 #endif
1582 incomplete = 0;
1583 /* For each nodes, calculate epsilon closure. */
1584 for (node_idx = 0; ; ++node_idx)
1585 {
1586 reg_errcode_t err;
1587 re_node_set eclosure_elem;
1588 if (node_idx == dfa->nodes_len)
1589 {
1590 if (!incomplete)
1591 break;
1592 incomplete = 0;
1593 node_idx = 0;
1594 }
1595
1596 #ifdef DEBUG
1597 assert (dfa->eclosures[node_idx].nelem != -1);
1598 #endif
1599
1600 /* If we have already calculated, skip it. */
1601 if (dfa->eclosures[node_idx].nelem != 0)
1602 continue;
1603 /* Calculate epsilon closure of `node_idx'. */
1604 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1605 if (BE (err != REG_NOERROR, 0))
1606 return err;
1607
1608 if (dfa->eclosures[node_idx].nelem == 0)
1609 {
1610 incomplete = 1;
1611 re_node_set_free (&eclosure_elem);
1612 }
1613 }
1614 return REG_NOERROR;
1615 }
1616
1617 /* Calculate epsilon closure of NODE. */
1618
1619 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,int node,int root)1620 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1621 {
1622 reg_errcode_t err;
1623 unsigned int constraint;
1624 int i, incomplete;
1625 re_node_set eclosure;
1626 incomplete = 0;
1627 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1628 if (BE (err != REG_NOERROR, 0))
1629 return err;
1630
1631 /* This indicates that we are calculating this node now.
1632 We reference this value to avoid infinite loop. */
1633 dfa->eclosures[node].nelem = -1;
1634
1635 constraint = ((dfa->nodes[node].type == ANCHOR)
1636 ? dfa->nodes[node].opr.ctx_type : 0);
1637 /* If the current node has constraints, duplicate all nodes.
1638 Since they must inherit the constraints. */
1639 if (constraint
1640 && dfa->edests[node].nelem
1641 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1642 {
1643 err = duplicate_node_closure (dfa, node, node, node, constraint);
1644 if (BE (err != REG_NOERROR, 0))
1645 return err;
1646 }
1647
1648 /* Expand each epsilon destination nodes. */
1649 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1650 for (i = 0; i < dfa->edests[node].nelem; ++i)
1651 {
1652 re_node_set eclosure_elem;
1653 int edest = dfa->edests[node].elems[i];
1654 /* If calculating the epsilon closure of `edest' is in progress,
1655 return intermediate result. */
1656 if (dfa->eclosures[edest].nelem == -1)
1657 {
1658 incomplete = 1;
1659 continue;
1660 }
1661 /* If we haven't calculated the epsilon closure of `edest' yet,
1662 calculate now. Otherwise use calculated epsilon closure. */
1663 if (dfa->eclosures[edest].nelem == 0)
1664 {
1665 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1666 if (BE (err != REG_NOERROR, 0))
1667 return err;
1668 }
1669 else
1670 eclosure_elem = dfa->eclosures[edest];
1671 /* Merge the epsilon closure of `edest'. */
1672 re_node_set_merge (&eclosure, &eclosure_elem);
1673 /* If the epsilon closure of `edest' is incomplete,
1674 the epsilon closure of this node is also incomplete. */
1675 if (dfa->eclosures[edest].nelem == 0)
1676 {
1677 incomplete = 1;
1678 re_node_set_free (&eclosure_elem);
1679 }
1680 }
1681
1682 /* Epsilon closures include itself. */
1683 re_node_set_insert (&eclosure, node);
1684 if (incomplete && !root)
1685 dfa->eclosures[node].nelem = 0;
1686 else
1687 dfa->eclosures[node] = eclosure;
1688 *new_set = eclosure;
1689 return REG_NOERROR;
1690 }
1691
1692 /* Functions for token which are used in the parser. */
1693
1694 /* Fetch a token from INPUT.
1695 We must not use this function inside bracket expressions. */
1696
1697 static void
1698 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1699 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1700 {
1701 re_string_skip_bytes (input, peek_token (result, input, syntax));
1702 }
1703
1704 /* Peek a token from INPUT, and return the length of the token.
1705 We must not use this function inside bracket expressions. */
1706
1707 static int
1708 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1709 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1710 {
1711 unsigned char c;
1712
1713 if (re_string_eoi (input))
1714 {
1715 token->type = END_OF_RE;
1716 return 0;
1717 }
1718
1719 c = re_string_peek_byte (input, 0);
1720 token->opr.c = c;
1721
1722 token->word_char = 0;
1723 #ifdef RE_ENABLE_I18N
1724 token->mb_partial = 0;
1725 if (input->mb_cur_max > 1 &&
1726 !re_string_first_byte (input, re_string_cur_idx (input)))
1727 {
1728 token->type = CHARACTER;
1729 token->mb_partial = 1;
1730 return 1;
1731 }
1732 #endif
1733 if (c == '\\')
1734 {
1735 unsigned char c2;
1736 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1737 {
1738 token->type = BACK_SLASH;
1739 return 1;
1740 }
1741
1742 c2 = re_string_peek_byte_case (input, 1);
1743 token->opr.c = c2;
1744 token->type = CHARACTER;
1745 #ifdef RE_ENABLE_I18N
1746 if (input->mb_cur_max > 1)
1747 {
1748 wint_t wc = re_string_wchar_at (input,
1749 re_string_cur_idx (input) + 1);
1750 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1751 }
1752 else
1753 #endif
1754 token->word_char = IS_WORD_CHAR (c2) != 0;
1755
1756 switch (c2)
1757 {
1758 case '|':
1759 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1760 token->type = OP_ALT;
1761 break;
1762 case '1': case '2': case '3': case '4': case '5':
1763 case '6': case '7': case '8': case '9':
1764 if (!(syntax & RE_NO_BK_REFS))
1765 {
1766 token->type = OP_BACK_REF;
1767 token->opr.idx = c2 - '1';
1768 }
1769 break;
1770 case '<':
1771 if (!(syntax & RE_NO_GNU_OPS))
1772 {
1773 token->type = ANCHOR;
1774 token->opr.ctx_type = WORD_FIRST;
1775 }
1776 break;
1777 case '>':
1778 if (!(syntax & RE_NO_GNU_OPS))
1779 {
1780 token->type = ANCHOR;
1781 token->opr.ctx_type = WORD_LAST;
1782 }
1783 break;
1784 case 'b':
1785 if (!(syntax & RE_NO_GNU_OPS))
1786 {
1787 token->type = ANCHOR;
1788 token->opr.ctx_type = WORD_DELIM;
1789 }
1790 break;
1791 case 'B':
1792 if (!(syntax & RE_NO_GNU_OPS))
1793 {
1794 token->type = ANCHOR;
1795 token->opr.ctx_type = NOT_WORD_DELIM;
1796 }
1797 break;
1798 case 'w':
1799 if (!(syntax & RE_NO_GNU_OPS))
1800 token->type = OP_WORD;
1801 break;
1802 case 'W':
1803 if (!(syntax & RE_NO_GNU_OPS))
1804 token->type = OP_NOTWORD;
1805 break;
1806 case 's':
1807 if (!(syntax & RE_NO_GNU_OPS))
1808 token->type = OP_SPACE;
1809 break;
1810 case 'S':
1811 if (!(syntax & RE_NO_GNU_OPS))
1812 token->type = OP_NOTSPACE;
1813 break;
1814 case '`':
1815 if (!(syntax & RE_NO_GNU_OPS))
1816 {
1817 token->type = ANCHOR;
1818 token->opr.ctx_type = BUF_FIRST;
1819 }
1820 break;
1821 case '\'':
1822 if (!(syntax & RE_NO_GNU_OPS))
1823 {
1824 token->type = ANCHOR;
1825 token->opr.ctx_type = BUF_LAST;
1826 }
1827 break;
1828 case '(':
1829 if (!(syntax & RE_NO_BK_PARENS))
1830 token->type = OP_OPEN_SUBEXP;
1831 break;
1832 case ')':
1833 if (!(syntax & RE_NO_BK_PARENS))
1834 token->type = OP_CLOSE_SUBEXP;
1835 break;
1836 case '+':
1837 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1838 token->type = OP_DUP_PLUS;
1839 break;
1840 case '?':
1841 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1842 token->type = OP_DUP_QUESTION;
1843 break;
1844 case '{':
1845 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1846 token->type = OP_OPEN_DUP_NUM;
1847 break;
1848 case '}':
1849 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1850 token->type = OP_CLOSE_DUP_NUM;
1851 break;
1852 default:
1853 break;
1854 }
1855 return 2;
1856 }
1857
1858 token->type = CHARACTER;
1859 #ifdef RE_ENABLE_I18N
1860 if (input->mb_cur_max > 1)
1861 {
1862 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1863 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1864 }
1865 else
1866 #endif
1867 token->word_char = IS_WORD_CHAR (token->opr.c);
1868
1869 switch (c)
1870 {
1871 case '\n':
1872 if (syntax & RE_NEWLINE_ALT)
1873 token->type = OP_ALT;
1874 break;
1875 case '|':
1876 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1877 token->type = OP_ALT;
1878 break;
1879 case '*':
1880 token->type = OP_DUP_ASTERISK;
1881 break;
1882 case '+':
1883 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1884 token->type = OP_DUP_PLUS;
1885 break;
1886 case '?':
1887 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1888 token->type = OP_DUP_QUESTION;
1889 break;
1890 case '{':
1891 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1892 token->type = OP_OPEN_DUP_NUM;
1893 break;
1894 case '}':
1895 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1896 token->type = OP_CLOSE_DUP_NUM;
1897 break;
1898 case '(':
1899 if (syntax & RE_NO_BK_PARENS)
1900 token->type = OP_OPEN_SUBEXP;
1901 break;
1902 case ')':
1903 if (syntax & RE_NO_BK_PARENS)
1904 token->type = OP_CLOSE_SUBEXP;
1905 break;
1906 case '[':
1907 token->type = OP_OPEN_BRACKET;
1908 break;
1909 case '.':
1910 token->type = OP_PERIOD;
1911 break;
1912 case '^':
1913 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1914 re_string_cur_idx (input) != 0)
1915 {
1916 char prev = re_string_peek_byte (input, -1);
1917 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1918 break;
1919 }
1920 token->type = ANCHOR;
1921 token->opr.ctx_type = LINE_FIRST;
1922 break;
1923 case '$':
1924 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1925 re_string_cur_idx (input) + 1 != re_string_length (input))
1926 {
1927 re_token_t next;
1928 re_string_skip_bytes (input, 1);
1929 peek_token (&next, input, syntax);
1930 re_string_skip_bytes (input, -1);
1931 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1932 break;
1933 }
1934 token->type = ANCHOR;
1935 token->opr.ctx_type = LINE_LAST;
1936 break;
1937 default:
1938 break;
1939 }
1940 return 1;
1941 }
1942
1943 /* Peek a token from INPUT, and return the length of the token.
1944 We must not use this function out of bracket expressions. */
1945
1946 static int
1947 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1948 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1949 {
1950 unsigned char c;
1951 if (re_string_eoi (input))
1952 {
1953 token->type = END_OF_RE;
1954 return 0;
1955 }
1956 c = re_string_peek_byte (input, 0);
1957 token->opr.c = c;
1958
1959 #ifdef RE_ENABLE_I18N
1960 if (input->mb_cur_max > 1 &&
1961 !re_string_first_byte (input, re_string_cur_idx (input)))
1962 {
1963 token->type = CHARACTER;
1964 return 1;
1965 }
1966 #endif /* RE_ENABLE_I18N */
1967
1968 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1969 && re_string_cur_idx (input) + 1 < re_string_length (input))
1970 {
1971 /* In this case, '\' escape a character. */
1972 unsigned char c2;
1973 re_string_skip_bytes (input, 1);
1974 c2 = re_string_peek_byte (input, 0);
1975 token->opr.c = c2;
1976 token->type = CHARACTER;
1977 return 1;
1978 }
1979 if (c == '[') /* '[' is a special char in a bracket exps. */
1980 {
1981 unsigned char c2;
1982 int token_len;
1983 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1984 c2 = re_string_peek_byte (input, 1);
1985 else
1986 c2 = 0;
1987 token->opr.c = c2;
1988 token_len = 2;
1989 switch (c2)
1990 {
1991 case '.':
1992 token->type = OP_OPEN_COLL_ELEM;
1993 break;
1994 case '=':
1995 token->type = OP_OPEN_EQUIV_CLASS;
1996 break;
1997 case ':':
1998 if (syntax & RE_CHAR_CLASSES)
1999 {
2000 token->type = OP_OPEN_CHAR_CLASS;
2001 break;
2002 }
2003 /* else fall through. */
2004 default:
2005 token->type = CHARACTER;
2006 token->opr.c = c;
2007 token_len = 1;
2008 break;
2009 }
2010 return token_len;
2011 }
2012 switch (c)
2013 {
2014 case '-':
2015 token->type = OP_CHARSET_RANGE;
2016 break;
2017 case ']':
2018 token->type = OP_CLOSE_BRACKET;
2019 break;
2020 case '^':
2021 token->type = OP_NON_MATCH_LIST;
2022 break;
2023 default:
2024 token->type = CHARACTER;
2025 }
2026 return 1;
2027 }
2028
2029 /* Functions for parser. */
2030
2031 /* Entry point of the parser.
2032 Parse the regular expression REGEXP and return the structure tree.
2033 If an error is occured, ERR is set by error code, and return NULL.
2034 This function build the following tree, from regular expression <reg_exp>:
2035 CAT
2036 / \
2037 / \
2038 <reg_exp> EOR
2039
2040 CAT means concatenation.
2041 EOR means end of regular expression. */
2042
2043 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2044 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2045 reg_errcode_t *err)
2046 {
2047 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2048 bin_tree_t *tree, *eor, *root;
2049 re_token_t current_token;
2050 dfa->syntax = syntax;
2051 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2052 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2053 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2054 return NULL;
2055 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2056 if (tree != NULL)
2057 root = create_tree (dfa, tree, eor, CONCAT);
2058 else
2059 root = eor;
2060 if (BE (eor == NULL || root == NULL, 0))
2061 {
2062 *err = REG_ESPACE;
2063 return NULL;
2064 }
2065 return root;
2066 }
2067
2068 /* This function build the following tree, from regular expression
2069 <branch1>|<branch2>:
2070 ALT
2071 / \
2072 / \
2073 <branch1> <branch2>
2074
2075 ALT means alternative, which represents the operator `|'. */
2076
2077 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2078 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2079 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2080 {
2081 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2082 bin_tree_t *tree, *branch = NULL;
2083 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2084 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2085 return NULL;
2086
2087 while (token->type == OP_ALT)
2088 {
2089 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2090 if (token->type != OP_ALT && token->type != END_OF_RE
2091 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2092 {
2093 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2094 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2095 return NULL;
2096 }
2097 else
2098 branch = NULL;
2099 tree = create_tree (dfa, tree, branch, OP_ALT);
2100 if (BE (tree == NULL, 0))
2101 {
2102 *err = REG_ESPACE;
2103 return NULL;
2104 }
2105 }
2106 return tree;
2107 }
2108
2109 /* This function build the following tree, from regular expression
2110 <exp1><exp2>:
2111 CAT
2112 / \
2113 / \
2114 <exp1> <exp2>
2115
2116 CAT means concatenation. */
2117
2118 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2119 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2120 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2121 {
2122 bin_tree_t *tree, *exp;
2123 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2124 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2125 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2126 return NULL;
2127
2128 while (token->type != OP_ALT && token->type != END_OF_RE
2129 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2130 {
2131 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2132 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2133 {
2134 return NULL;
2135 }
2136 if (tree != NULL && exp != NULL)
2137 {
2138 tree = create_tree (dfa, tree, exp, CONCAT);
2139 if (tree == NULL)
2140 {
2141 *err = REG_ESPACE;
2142 return NULL;
2143 }
2144 }
2145 else if (tree == NULL)
2146 tree = exp;
2147 /* Otherwise exp == NULL, we don't need to create new tree. */
2148 }
2149 return tree;
2150 }
2151
2152 /* This function build the following tree, from regular expression a*:
2153 *
2154 |
2155 a
2156 */
2157
2158 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2159 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2160 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2161 {
2162 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2163 bin_tree_t *tree;
2164 switch (token->type)
2165 {
2166 case CHARACTER:
2167 tree = create_token_tree (dfa, NULL, NULL, token);
2168 if (BE (tree == NULL, 0))
2169 {
2170 *err = REG_ESPACE;
2171 return NULL;
2172 }
2173 #ifdef RE_ENABLE_I18N
2174 if (dfa->mb_cur_max > 1)
2175 {
2176 while (!re_string_eoi (regexp)
2177 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2178 {
2179 bin_tree_t *mbc_remain;
2180 fetch_token (token, regexp, syntax);
2181 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2182 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2183 if (BE (mbc_remain == NULL || tree == NULL, 0))
2184 {
2185 *err = REG_ESPACE;
2186 return NULL;
2187 }
2188 }
2189 }
2190 #endif
2191 break;
2192 case OP_OPEN_SUBEXP:
2193 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2194 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2195 return NULL;
2196 break;
2197 case OP_OPEN_BRACKET:
2198 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2199 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2200 return NULL;
2201 break;
2202 case OP_BACK_REF:
2203 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2204 {
2205 *err = REG_ESUBREG;
2206 return NULL;
2207 }
2208 dfa->used_bkref_map |= 1 << token->opr.idx;
2209 tree = create_token_tree (dfa, NULL, NULL, token);
2210 if (BE (tree == NULL, 0))
2211 {
2212 *err = REG_ESPACE;
2213 return NULL;
2214 }
2215 ++dfa->nbackref;
2216 dfa->has_mb_node = 1;
2217 break;
2218 case OP_OPEN_DUP_NUM:
2219 if (syntax & RE_CONTEXT_INVALID_DUP)
2220 {
2221 *err = REG_BADRPT;
2222 return NULL;
2223 }
2224 /* FALLTHROUGH */
2225 case OP_DUP_ASTERISK:
2226 case OP_DUP_PLUS:
2227 case OP_DUP_QUESTION:
2228 if (syntax & RE_CONTEXT_INVALID_OPS)
2229 {
2230 *err = REG_BADRPT;
2231 return NULL;
2232 }
2233 else if (syntax & RE_CONTEXT_INDEP_OPS)
2234 {
2235 fetch_token (token, regexp, syntax);
2236 return parse_expression (regexp, preg, token, syntax, nest, err);
2237 }
2238 /* else fall through */
2239 case OP_CLOSE_SUBEXP:
2240 if ((token->type == OP_CLOSE_SUBEXP) &&
2241 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2242 {
2243 *err = REG_ERPAREN;
2244 return NULL;
2245 }
2246 /* else fall through */
2247 case OP_CLOSE_DUP_NUM:
2248 /* We treat it as a normal character. */
2249
2250 /* Then we can these characters as normal characters. */
2251 token->type = CHARACTER;
2252 /* mb_partial and word_char bits should be initialized already
2253 by peek_token. */
2254 tree = create_token_tree (dfa, NULL, NULL, token);
2255 if (BE (tree == NULL, 0))
2256 {
2257 *err = REG_ESPACE;
2258 return NULL;
2259 }
2260 break;
2261 case ANCHOR:
2262 if ((token->opr.ctx_type
2263 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2264 && dfa->word_ops_used == 0)
2265 init_word_char (dfa);
2266 if (token->opr.ctx_type == WORD_DELIM
2267 || token->opr.ctx_type == NOT_WORD_DELIM)
2268 {
2269 bin_tree_t *tree_first, *tree_last;
2270 if (token->opr.ctx_type == WORD_DELIM)
2271 {
2272 token->opr.ctx_type = WORD_FIRST;
2273 tree_first = create_token_tree (dfa, NULL, NULL, token);
2274 token->opr.ctx_type = WORD_LAST;
2275 }
2276 else
2277 {
2278 token->opr.ctx_type = INSIDE_WORD;
2279 tree_first = create_token_tree (dfa, NULL, NULL, token);
2280 token->opr.ctx_type = INSIDE_NOTWORD;
2281 }
2282 tree_last = create_token_tree (dfa, NULL, NULL, token);
2283 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2284 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2285 {
2286 *err = REG_ESPACE;
2287 return NULL;
2288 }
2289 }
2290 else
2291 {
2292 tree = create_token_tree (dfa, NULL, NULL, token);
2293 if (BE (tree == NULL, 0))
2294 {
2295 *err = REG_ESPACE;
2296 return NULL;
2297 }
2298 }
2299 /* We must return here, since ANCHORs can't be followed
2300 by repetition operators.
2301 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2302 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2303 fetch_token (token, regexp, syntax);
2304 return tree;
2305 case OP_PERIOD:
2306 tree = create_token_tree (dfa, NULL, NULL, token);
2307 if (BE (tree == NULL, 0))
2308 {
2309 *err = REG_ESPACE;
2310 return NULL;
2311 }
2312 if (dfa->mb_cur_max > 1)
2313 dfa->has_mb_node = 1;
2314 break;
2315 case OP_WORD:
2316 case OP_NOTWORD:
2317 tree = build_charclass_op (dfa, regexp->trans,
2318 (const unsigned char *) "alnum",
2319 (const unsigned char *) "_",
2320 token->type == OP_NOTWORD, err);
2321 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2322 return NULL;
2323 break;
2324 case OP_SPACE:
2325 case OP_NOTSPACE:
2326 tree = build_charclass_op (dfa, regexp->trans,
2327 (const unsigned char *) "space",
2328 (const unsigned char *) "",
2329 token->type == OP_NOTSPACE, err);
2330 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2331 return NULL;
2332 break;
2333 case OP_ALT:
2334 case END_OF_RE:
2335 return NULL;
2336 case BACK_SLASH:
2337 *err = REG_EESCAPE;
2338 return NULL;
2339 default:
2340 /* Must not happen? */
2341 #ifdef DEBUG
2342 assert (0);
2343 #endif
2344 return NULL;
2345 }
2346 fetch_token (token, regexp, syntax);
2347
2348 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2349 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2350 {
2351 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2352 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2353 return NULL;
2354 /* In BRE consecutive duplications are not allowed. */
2355 if ((syntax & RE_CONTEXT_INVALID_DUP)
2356 && (token->type == OP_DUP_ASTERISK
2357 || token->type == OP_OPEN_DUP_NUM))
2358 {
2359 *err = REG_BADRPT;
2360 return NULL;
2361 }
2362 }
2363
2364 return tree;
2365 }
2366
2367 /* This function build the following tree, from regular expression
2368 (<reg_exp>):
2369 SUBEXP
2370 |
2371 <reg_exp>
2372 */
2373
2374 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2375 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2376 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2377 {
2378 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2379 bin_tree_t *tree;
2380 size_t cur_nsub;
2381 cur_nsub = preg->re_nsub++;
2382
2383 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2384
2385 /* The subexpression may be a null string. */
2386 if (token->type == OP_CLOSE_SUBEXP)
2387 tree = NULL;
2388 else
2389 {
2390 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2391 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2392 *err = REG_EPAREN;
2393 if (BE (*err != REG_NOERROR, 0))
2394 return NULL;
2395 }
2396
2397 if (cur_nsub <= '9' - '1')
2398 dfa->completed_bkref_map |= 1 << cur_nsub;
2399
2400 tree = create_tree (dfa, tree, NULL, SUBEXP);
2401 if (BE (tree == NULL, 0))
2402 {
2403 *err = REG_ESPACE;
2404 return NULL;
2405 }
2406 tree->token.opr.idx = cur_nsub;
2407 return tree;
2408 }
2409
2410 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2411
2412 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2413 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2414 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2415 {
2416 bin_tree_t *tree = NULL, *old_tree = NULL;
2417 int i, start, end, start_idx = re_string_cur_idx (regexp);
2418 re_token_t start_token = *token;
2419
2420 if (token->type == OP_OPEN_DUP_NUM)
2421 {
2422 end = 0;
2423 start = fetch_number (regexp, token, syntax);
2424 if (start == -1)
2425 {
2426 if (token->type == CHARACTER && token->opr.c == ',')
2427 start = 0; /* We treat "{,m}" as "{0,m}". */
2428 else
2429 {
2430 *err = REG_BADBR; /* <re>{} is invalid. */
2431 return NULL;
2432 }
2433 }
2434 if (BE (start != -2, 1))
2435 {
2436 /* We treat "{n}" as "{n,n}". */
2437 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2438 : ((token->type == CHARACTER && token->opr.c == ',')
2439 ? fetch_number (regexp, token, syntax) : -2));
2440 }
2441 if (BE (start == -2 || end == -2, 0))
2442 {
2443 /* Invalid sequence. */
2444 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2445 {
2446 if (token->type == END_OF_RE)
2447 *err = REG_EBRACE;
2448 else
2449 *err = REG_BADBR;
2450
2451 return NULL;
2452 }
2453
2454 /* If the syntax bit is set, rollback. */
2455 re_string_set_index (regexp, start_idx);
2456 *token = start_token;
2457 token->type = CHARACTER;
2458 /* mb_partial and word_char bits should be already initialized by
2459 peek_token. */
2460 return elem;
2461 }
2462
2463 if (BE (end != -1 && start > end, 0))
2464 {
2465 /* First number greater than second. */
2466 *err = REG_BADBR;
2467 return NULL;
2468 }
2469 }
2470 else
2471 {
2472 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2473 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2474 }
2475
2476 fetch_token (token, regexp, syntax);
2477
2478 if (BE (elem == NULL, 0))
2479 return NULL;
2480 if (BE (start == 0 && end == 0, 0))
2481 {
2482 postorder (elem, free_tree, NULL);
2483 return NULL;
2484 }
2485
2486 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2487 if (BE (start > 0, 0))
2488 {
2489 tree = elem;
2490 for (i = 2; i <= start; ++i)
2491 {
2492 elem = duplicate_tree (elem, dfa);
2493 tree = create_tree (dfa, tree, elem, CONCAT);
2494 if (BE (elem == NULL || tree == NULL, 0))
2495 goto parse_dup_op_espace;
2496 }
2497
2498 if (start == end)
2499 return tree;
2500
2501 /* Duplicate ELEM before it is marked optional. */
2502 elem = duplicate_tree (elem, dfa);
2503 old_tree = tree;
2504 }
2505 else
2506 old_tree = NULL;
2507
2508 if (elem->token.type == SUBEXP)
2509 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2510
2511 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2512 if (BE (tree == NULL, 0))
2513 goto parse_dup_op_espace;
2514
2515 /* This loop is actually executed only when end != -1,
2516 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2517 already created the start+1-th copy. */
2518 for (i = start + 2; i <= end; ++i)
2519 {
2520 elem = duplicate_tree (elem, dfa);
2521 tree = create_tree (dfa, tree, elem, CONCAT);
2522 if (BE (elem == NULL || tree == NULL, 0))
2523 goto parse_dup_op_espace;
2524
2525 tree = create_tree (dfa, tree, NULL, OP_ALT);
2526 if (BE (tree == NULL, 0))
2527 goto parse_dup_op_espace;
2528 }
2529
2530 if (old_tree)
2531 tree = create_tree (dfa, old_tree, tree, CONCAT);
2532
2533 return tree;
2534
2535 parse_dup_op_espace:
2536 *err = REG_ESPACE;
2537 return NULL;
2538 }
2539
2540 /* Size of the names for collating symbol/equivalence_class/character_class.
2541 I'm not sure, but maybe enough. */
2542 #define BRACKET_NAME_BUF_SIZE 32
2543
2544 #ifndef _LIBC
2545 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2546 Build the range expression which starts from START_ELEM, and ends
2547 at END_ELEM. The result are written to MBCSET and SBCSET.
2548 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2549 mbcset->range_ends, is a pointer argument sinse we may
2550 update it. */
2551
2552 static reg_errcode_t
2553 internal_function
2554 # ifdef RE_ENABLE_I18N
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,int * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2555 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2556 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2557 # else /* not RE_ENABLE_I18N */
2558 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2559 bracket_elem_t *end_elem)
2560 # endif /* not RE_ENABLE_I18N */
2561 {
2562 unsigned int start_ch, end_ch;
2563 /* Equivalence Classes and Character Classes can't be a range start/end. */
2564 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2565 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2566 0))
2567 return REG_ERANGE;
2568
2569 /* We can handle no multi character collating elements without libc
2570 support. */
2571 if (BE ((start_elem->type == COLL_SYM
2572 && strlen ((char *) start_elem->opr.name) > 1)
2573 || (end_elem->type == COLL_SYM
2574 && strlen ((char *) end_elem->opr.name) > 1), 0))
2575 return REG_ECOLLATE;
2576
2577 # ifdef RE_ENABLE_I18N
2578 {
2579 wchar_t wc;
2580 wint_t start_wc;
2581 wint_t end_wc;
2582 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2583
2584 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2585 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2586 : 0));
2587 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2588 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2589 : 0));
2590 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2591 ? __btowc (start_ch) : start_elem->opr.wch);
2592 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2593 ? __btowc (end_ch) : end_elem->opr.wch);
2594 if (start_wc == WEOF || end_wc == WEOF)
2595 return REG_ECOLLATE;
2596 cmp_buf[0] = start_wc;
2597 cmp_buf[4] = end_wc;
2598 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2599 return REG_ERANGE;
2600
2601 /* Got valid collation sequence values, add them as a new entry.
2602 However, for !_LIBC we have no collation elements: if the
2603 character set is single byte, the single byte character set
2604 that we build below suffices. parse_bracket_exp passes
2605 no MBCSET if dfa->mb_cur_max == 1. */
2606 if (mbcset)
2607 {
2608 /* Check the space of the arrays. */
2609 if (BE (*range_alloc == mbcset->nranges, 0))
2610 {
2611 /* There is not enough space, need realloc. */
2612 wchar_t *new_array_start, *new_array_end;
2613 int new_nranges;
2614
2615 /* +1 in case of mbcset->nranges is 0. */
2616 new_nranges = 2 * mbcset->nranges + 1;
2617 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2618 are NULL if *range_alloc == 0. */
2619 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2620 new_nranges);
2621 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2622 new_nranges);
2623
2624 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2625 return REG_ESPACE;
2626
2627 mbcset->range_starts = new_array_start;
2628 mbcset->range_ends = new_array_end;
2629 *range_alloc = new_nranges;
2630 }
2631
2632 mbcset->range_starts[mbcset->nranges] = start_wc;
2633 mbcset->range_ends[mbcset->nranges++] = end_wc;
2634 }
2635
2636 /* Build the table for single byte characters. */
2637 for (wc = 0; wc < SBC_MAX; ++wc)
2638 {
2639 cmp_buf[2] = wc;
2640 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2641 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2642 bitset_set (sbcset, wc);
2643 }
2644 }
2645 # else /* not RE_ENABLE_I18N */
2646 {
2647 unsigned int ch;
2648 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2649 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2650 : 0));
2651 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2652 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2653 : 0));
2654 if (start_ch > end_ch)
2655 return REG_ERANGE;
2656 /* Build the table for single byte characters. */
2657 for (ch = 0; ch < SBC_MAX; ++ch)
2658 if (start_ch <= ch && ch <= end_ch)
2659 bitset_set (sbcset, ch);
2660 }
2661 # endif /* not RE_ENABLE_I18N */
2662 return REG_NOERROR;
2663 }
2664 #endif /* not _LIBC */
2665
2666 #ifndef _LIBC
2667 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2668 Build the collating element which is represented by NAME.
2669 The result are written to MBCSET and SBCSET.
2670 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2671 pointer argument since we may update it. */
2672
2673 static reg_errcode_t
2674 internal_function
2675 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,int * coll_sym_alloc,const unsigned char * name)2676 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2677 int *coll_sym_alloc, const unsigned char *name)
2678 # else /* not RE_ENABLE_I18N */
2679 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2680 # endif /* not RE_ENABLE_I18N */
2681 {
2682 size_t name_len = strlen ((const char *) name);
2683 if (BE (name_len != 1, 0))
2684 return REG_ECOLLATE;
2685 else
2686 {
2687 bitset_set (sbcset, name[0]);
2688 return REG_NOERROR;
2689 }
2690 }
2691 #endif /* not _LIBC */
2692
2693 /* This function parse bracket expression like "[abc]", "[a-c]",
2694 "[[.a-a.]]" etc. */
2695
2696 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2697 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2698 reg_syntax_t syntax, reg_errcode_t *err)
2699 {
2700 #ifdef _LIBC
2701 const unsigned char *collseqmb;
2702 const char *collseqwc;
2703 uint32_t nrules;
2704 int32_t table_size;
2705 const int32_t *symb_table;
2706 const unsigned char *extra;
2707
2708 /* Local function for parse_bracket_exp used in _LIBC environement.
2709 Seek the collating symbol entry correspondings to NAME.
2710 Return the index of the symbol in the SYMB_TABLE. */
2711
2712 auto inline int32_t
2713 __attribute ((always_inline))
2714 seek_collating_symbol_entry (name, name_len)
2715 const unsigned char *name;
2716 size_t name_len;
2717 {
2718 int32_t hash = elem_hash ((const char *) name, name_len);
2719 int32_t elem = hash % table_size;
2720 if (symb_table[2 * elem] != 0)
2721 {
2722 int32_t second = hash % (table_size - 2) + 1;
2723
2724 do
2725 {
2726 /* First compare the hashing value. */
2727 if (symb_table[2 * elem] == hash
2728 /* Compare the length of the name. */
2729 && name_len == extra[symb_table[2 * elem + 1]]
2730 /* Compare the name. */
2731 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2732 name_len) == 0)
2733 {
2734 /* Yep, this is the entry. */
2735 break;
2736 }
2737
2738 /* Next entry. */
2739 elem += second;
2740 }
2741 while (symb_table[2 * elem] != 0);
2742 }
2743 return elem;
2744 }
2745
2746 /* Local function for parse_bracket_exp used in _LIBC environement.
2747 Look up the collation sequence value of BR_ELEM.
2748 Return the value if succeeded, UINT_MAX otherwise. */
2749
2750 auto inline unsigned int
2751 __attribute ((always_inline))
2752 lookup_collation_sequence_value (br_elem)
2753 bracket_elem_t *br_elem;
2754 {
2755 if (br_elem->type == SB_CHAR)
2756 {
2757 /*
2758 if (MB_CUR_MAX == 1)
2759 */
2760 if (nrules == 0)
2761 return collseqmb[br_elem->opr.ch];
2762 else
2763 {
2764 wint_t wc = __btowc (br_elem->opr.ch);
2765 return __collseq_table_lookup (collseqwc, wc);
2766 }
2767 }
2768 else if (br_elem->type == MB_CHAR)
2769 {
2770 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2771 }
2772 else if (br_elem->type == COLL_SYM)
2773 {
2774 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2775 if (nrules != 0)
2776 {
2777 int32_t elem, idx;
2778 elem = seek_collating_symbol_entry (br_elem->opr.name,
2779 sym_name_len);
2780 if (symb_table[2 * elem] != 0)
2781 {
2782 /* We found the entry. */
2783 idx = symb_table[2 * elem + 1];
2784 /* Skip the name of collating element name. */
2785 idx += 1 + extra[idx];
2786 /* Skip the byte sequence of the collating element. */
2787 idx += 1 + extra[idx];
2788 /* Adjust for the alignment. */
2789 idx = (idx + 3) & ~3;
2790 /* Skip the multibyte collation sequence value. */
2791 idx += sizeof (unsigned int);
2792 /* Skip the wide char sequence of the collating element. */
2793 idx += sizeof (unsigned int) *
2794 (1 + *(unsigned int *) (extra + idx));
2795 /* Return the collation sequence value. */
2796 return *(unsigned int *) (extra + idx);
2797 }
2798 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2799 {
2800 /* No valid character. Match it as a single byte
2801 character. */
2802 return collseqmb[br_elem->opr.name[0]];
2803 }
2804 }
2805 else if (sym_name_len == 1)
2806 return collseqmb[br_elem->opr.name[0]];
2807 }
2808 return UINT_MAX;
2809 }
2810
2811 /* Local function for parse_bracket_exp used in _LIBC environement.
2812 Build the range expression which starts from START_ELEM, and ends
2813 at END_ELEM. The result are written to MBCSET and SBCSET.
2814 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2815 mbcset->range_ends, is a pointer argument sinse we may
2816 update it. */
2817
2818 auto inline reg_errcode_t
2819 __attribute ((always_inline))
2820 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2821 re_charset_t *mbcset;
2822 int *range_alloc;
2823 bitset_t sbcset;
2824 bracket_elem_t *start_elem, *end_elem;
2825 {
2826 unsigned int ch;
2827 uint32_t start_collseq;
2828 uint32_t end_collseq;
2829
2830 /* Equivalence Classes and Character Classes can't be a range
2831 start/end. */
2832 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2833 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2834 0))
2835 return REG_ERANGE;
2836
2837 start_collseq = lookup_collation_sequence_value (start_elem);
2838 end_collseq = lookup_collation_sequence_value (end_elem);
2839 /* Check start/end collation sequence values. */
2840 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2841 return REG_ECOLLATE;
2842 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2843 return REG_ERANGE;
2844
2845 /* Got valid collation sequence values, add them as a new entry.
2846 However, if we have no collation elements, and the character set
2847 is single byte, the single byte character set that we
2848 build below suffices. */
2849 if (nrules > 0 || dfa->mb_cur_max > 1)
2850 {
2851 /* Check the space of the arrays. */
2852 if (BE (*range_alloc == mbcset->nranges, 0))
2853 {
2854 /* There is not enough space, need realloc. */
2855 uint32_t *new_array_start;
2856 uint32_t *new_array_end;
2857 int new_nranges;
2858
2859 /* +1 in case of mbcset->nranges is 0. */
2860 new_nranges = 2 * mbcset->nranges + 1;
2861 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2862 new_nranges);
2863 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2864 new_nranges);
2865
2866 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2867 return REG_ESPACE;
2868
2869 mbcset->range_starts = new_array_start;
2870 mbcset->range_ends = new_array_end;
2871 *range_alloc = new_nranges;
2872 }
2873
2874 mbcset->range_starts[mbcset->nranges] = start_collseq;
2875 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2876 }
2877
2878 /* Build the table for single byte characters. */
2879 for (ch = 0; ch < SBC_MAX; ch++)
2880 {
2881 uint32_t ch_collseq;
2882 /*
2883 if (MB_CUR_MAX == 1)
2884 */
2885 if (nrules == 0)
2886 ch_collseq = collseqmb[ch];
2887 else
2888 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2889 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2890 bitset_set (sbcset, ch);
2891 }
2892 return REG_NOERROR;
2893 }
2894
2895 /* Local function for parse_bracket_exp used in _LIBC environement.
2896 Build the collating element which is represented by NAME.
2897 The result are written to MBCSET and SBCSET.
2898 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2899 pointer argument sinse we may update it. */
2900
2901 auto inline reg_errcode_t
2902 __attribute ((always_inline))
2903 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2904 re_charset_t *mbcset;
2905 int *coll_sym_alloc;
2906 bitset_t sbcset;
2907 const unsigned char *name;
2908 {
2909 int32_t elem, idx;
2910 size_t name_len = strlen ((const char *) name);
2911 if (nrules != 0)
2912 {
2913 elem = seek_collating_symbol_entry (name, name_len);
2914 if (symb_table[2 * elem] != 0)
2915 {
2916 /* We found the entry. */
2917 idx = symb_table[2 * elem + 1];
2918 /* Skip the name of collating element name. */
2919 idx += 1 + extra[idx];
2920 }
2921 else if (symb_table[2 * elem] == 0 && name_len == 1)
2922 {
2923 /* No valid character, treat it as a normal
2924 character. */
2925 bitset_set (sbcset, name[0]);
2926 return REG_NOERROR;
2927 }
2928 else
2929 return REG_ECOLLATE;
2930
2931 /* Got valid collation sequence, add it as a new entry. */
2932 /* Check the space of the arrays. */
2933 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2934 {
2935 /* Not enough, realloc it. */
2936 /* +1 in case of mbcset->ncoll_syms is 0. */
2937 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2938 /* Use realloc since mbcset->coll_syms is NULL
2939 if *alloc == 0. */
2940 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2941 new_coll_sym_alloc);
2942 if (BE (new_coll_syms == NULL, 0))
2943 return REG_ESPACE;
2944 mbcset->coll_syms = new_coll_syms;
2945 *coll_sym_alloc = new_coll_sym_alloc;
2946 }
2947 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2948 return REG_NOERROR;
2949 }
2950 else
2951 {
2952 if (BE (name_len != 1, 0))
2953 return REG_ECOLLATE;
2954 else
2955 {
2956 bitset_set (sbcset, name[0]);
2957 return REG_NOERROR;
2958 }
2959 }
2960 }
2961 #endif
2962
2963 re_token_t br_token;
2964 re_bitset_ptr_t sbcset;
2965 #ifdef RE_ENABLE_I18N
2966 re_charset_t *mbcset;
2967 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2968 int equiv_class_alloc = 0, char_class_alloc = 0;
2969 #endif /* not RE_ENABLE_I18N */
2970 int non_match = 0;
2971 bin_tree_t *work_tree;
2972 int token_len;
2973 int first_round = 1;
2974 #ifdef _LIBC
2975 collseqmb = (const unsigned char *)
2976 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2977 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2978 if (nrules)
2979 {
2980 /*
2981 if (MB_CUR_MAX > 1)
2982 */
2983 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2984 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2985 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2986 _NL_COLLATE_SYMB_TABLEMB);
2987 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2988 _NL_COLLATE_SYMB_EXTRAMB);
2989 }
2990 #endif
2991 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
2992 #ifdef RE_ENABLE_I18N
2993 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2994 #endif /* RE_ENABLE_I18N */
2995 #ifdef RE_ENABLE_I18N
2996 if (BE (sbcset == NULL || mbcset == NULL, 0))
2997 #else
2998 if (BE (sbcset == NULL, 0))
2999 #endif /* RE_ENABLE_I18N */
3000 {
3001 *err = REG_ESPACE;
3002 return NULL;
3003 }
3004
3005 token_len = peek_token_bracket (token, regexp, syntax);
3006 if (BE (token->type == END_OF_RE, 0))
3007 {
3008 *err = REG_BADPAT;
3009 goto parse_bracket_exp_free_return;
3010 }
3011 if (token->type == OP_NON_MATCH_LIST)
3012 {
3013 #ifdef RE_ENABLE_I18N
3014 mbcset->non_match = 1;
3015 #endif /* not RE_ENABLE_I18N */
3016 non_match = 1;
3017 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3018 bitset_set (sbcset, '\0');
3019 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3020 token_len = peek_token_bracket (token, regexp, syntax);
3021 if (BE (token->type == END_OF_RE, 0))
3022 {
3023 *err = REG_BADPAT;
3024 goto parse_bracket_exp_free_return;
3025 }
3026 }
3027
3028 /* We treat the first ']' as a normal character. */
3029 if (token->type == OP_CLOSE_BRACKET)
3030 token->type = CHARACTER;
3031
3032 while (1)
3033 {
3034 bracket_elem_t start_elem, end_elem;
3035 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3036 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3037 reg_errcode_t ret;
3038 int token_len2 = 0, is_range_exp = 0;
3039 re_token_t token2;
3040
3041 start_elem.opr.name = start_name_buf;
3042 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3043 syntax, first_round);
3044 if (BE (ret != REG_NOERROR, 0))
3045 {
3046 *err = ret;
3047 goto parse_bracket_exp_free_return;
3048 }
3049 first_round = 0;
3050
3051 /* Get information about the next token. We need it in any case. */
3052 token_len = peek_token_bracket (token, regexp, syntax);
3053
3054 /* Do not check for ranges if we know they are not allowed. */
3055 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3056 {
3057 if (BE (token->type == END_OF_RE, 0))
3058 {
3059 *err = REG_EBRACK;
3060 goto parse_bracket_exp_free_return;
3061 }
3062 if (token->type == OP_CHARSET_RANGE)
3063 {
3064 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3065 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3066 if (BE (token2.type == END_OF_RE, 0))
3067 {
3068 *err = REG_EBRACK;
3069 goto parse_bracket_exp_free_return;
3070 }
3071 if (token2.type == OP_CLOSE_BRACKET)
3072 {
3073 /* We treat the last '-' as a normal character. */
3074 re_string_skip_bytes (regexp, -token_len);
3075 token->type = CHARACTER;
3076 }
3077 else
3078 is_range_exp = 1;
3079 }
3080 }
3081
3082 if (is_range_exp == 1)
3083 {
3084 end_elem.opr.name = end_name_buf;
3085 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3086 dfa, syntax, 1);
3087 if (BE (ret != REG_NOERROR, 0))
3088 {
3089 *err = ret;
3090 goto parse_bracket_exp_free_return;
3091 }
3092
3093 token_len = peek_token_bracket (token, regexp, syntax);
3094
3095 #ifdef _LIBC
3096 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3097 &start_elem, &end_elem);
3098 #else
3099 # ifdef RE_ENABLE_I18N
3100 *err = build_range_exp (sbcset,
3101 dfa->mb_cur_max > 1 ? mbcset : NULL,
3102 &range_alloc, &start_elem, &end_elem);
3103 # else
3104 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3105 # endif
3106 #endif /* RE_ENABLE_I18N */
3107 if (BE (*err != REG_NOERROR, 0))
3108 goto parse_bracket_exp_free_return;
3109 }
3110 else
3111 {
3112 switch (start_elem.type)
3113 {
3114 case SB_CHAR:
3115 bitset_set (sbcset, start_elem.opr.ch);
3116 break;
3117 #ifdef RE_ENABLE_I18N
3118 case MB_CHAR:
3119 /* Check whether the array has enough space. */
3120 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3121 {
3122 wchar_t *new_mbchars;
3123 /* Not enough, realloc it. */
3124 /* +1 in case of mbcset->nmbchars is 0. */
3125 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3126 /* Use realloc since array is NULL if *alloc == 0. */
3127 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3128 mbchar_alloc);
3129 if (BE (new_mbchars == NULL, 0))
3130 goto parse_bracket_exp_espace;
3131 mbcset->mbchars = new_mbchars;
3132 }
3133 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3134 break;
3135 #endif /* RE_ENABLE_I18N */
3136 case EQUIV_CLASS:
3137 *err = build_equiv_class (sbcset,
3138 #ifdef RE_ENABLE_I18N
3139 mbcset, &equiv_class_alloc,
3140 #endif /* RE_ENABLE_I18N */
3141 start_elem.opr.name);
3142 if (BE (*err != REG_NOERROR, 0))
3143 goto parse_bracket_exp_free_return;
3144 break;
3145 case COLL_SYM:
3146 *err = build_collating_symbol (sbcset,
3147 #ifdef RE_ENABLE_I18N
3148 mbcset, &coll_sym_alloc,
3149 #endif /* RE_ENABLE_I18N */
3150 start_elem.opr.name);
3151 if (BE (*err != REG_NOERROR, 0))
3152 goto parse_bracket_exp_free_return;
3153 break;
3154 case CHAR_CLASS:
3155 *err = build_charclass (regexp->trans, sbcset,
3156 #ifdef RE_ENABLE_I18N
3157 mbcset, &char_class_alloc,
3158 #endif /* RE_ENABLE_I18N */
3159 start_elem.opr.name, syntax);
3160 if (BE (*err != REG_NOERROR, 0))
3161 goto parse_bracket_exp_free_return;
3162 break;
3163 default:
3164 assert (0);
3165 break;
3166 }
3167 }
3168 if (BE (token->type == END_OF_RE, 0))
3169 {
3170 *err = REG_EBRACK;
3171 goto parse_bracket_exp_free_return;
3172 }
3173 if (token->type == OP_CLOSE_BRACKET)
3174 break;
3175 }
3176
3177 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3178
3179 /* If it is non-matching list. */
3180 if (non_match)
3181 bitset_not (sbcset);
3182
3183 #ifdef RE_ENABLE_I18N
3184 /* Ensure only single byte characters are set. */
3185 if (dfa->mb_cur_max > 1)
3186 bitset_mask (sbcset, dfa->sb_char);
3187
3188 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3189 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3190 || mbcset->non_match)))
3191 {
3192 bin_tree_t *mbc_tree;
3193 int sbc_idx;
3194 /* Build a tree for complex bracket. */
3195 dfa->has_mb_node = 1;
3196 br_token.type = COMPLEX_BRACKET;
3197 br_token.opr.mbcset = mbcset;
3198 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3199 if (BE (mbc_tree == NULL, 0))
3200 goto parse_bracket_exp_espace;
3201 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3202 if (sbcset[sbc_idx])
3203 break;
3204 /* If there are no bits set in sbcset, there is no point
3205 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3206 if (sbc_idx < BITSET_WORDS)
3207 {
3208 /* Build a tree for simple bracket. */
3209 br_token.type = SIMPLE_BRACKET;
3210 br_token.opr.sbcset = sbcset;
3211 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3212 if (BE (work_tree == NULL, 0))
3213 goto parse_bracket_exp_espace;
3214
3215 /* Then join them by ALT node. */
3216 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3217 if (BE (work_tree == NULL, 0))
3218 goto parse_bracket_exp_espace;
3219 }
3220 else
3221 {
3222 re_free (sbcset);
3223 work_tree = mbc_tree;
3224 }
3225 }
3226 else
3227 #endif /* not RE_ENABLE_I18N */
3228 {
3229 #ifdef RE_ENABLE_I18N
3230 free_charset (mbcset);
3231 #endif
3232 /* Build a tree for simple bracket. */
3233 br_token.type = SIMPLE_BRACKET;
3234 br_token.opr.sbcset = sbcset;
3235 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3236 if (BE (work_tree == NULL, 0))
3237 goto parse_bracket_exp_espace;
3238 }
3239 return work_tree;
3240
3241 parse_bracket_exp_espace:
3242 *err = REG_ESPACE;
3243 parse_bracket_exp_free_return:
3244 re_free (sbcset);
3245 #ifdef RE_ENABLE_I18N
3246 free_charset (mbcset);
3247 #endif /* RE_ENABLE_I18N */
3248 return NULL;
3249 }
3250
3251 /* Parse an element in the bracket expression. */
3252
3253 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,int accept_hyphen)3254 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3255 re_token_t *token, int token_len, re_dfa_t *dfa,
3256 reg_syntax_t syntax, int accept_hyphen)
3257 {
3258 #ifdef RE_ENABLE_I18N
3259 int cur_char_size;
3260 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3261 if (cur_char_size > 1)
3262 {
3263 elem->type = MB_CHAR;
3264 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3265 re_string_skip_bytes (regexp, cur_char_size);
3266 return REG_NOERROR;
3267 }
3268 #endif /* RE_ENABLE_I18N */
3269 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3270 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3271 || token->type == OP_OPEN_EQUIV_CLASS)
3272 return parse_bracket_symbol (elem, regexp, token);
3273 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3274 {
3275 /* A '-' must only appear as anything but a range indicator before
3276 the closing bracket. Everything else is an error. */
3277 re_token_t token2;
3278 (void) peek_token_bracket (&token2, regexp, syntax);
3279 if (token2.type != OP_CLOSE_BRACKET)
3280 /* The actual error value is not standardized since this whole
3281 case is undefined. But ERANGE makes good sense. */
3282 return REG_ERANGE;
3283 }
3284 elem->type = SB_CHAR;
3285 elem->opr.ch = token->opr.c;
3286 return REG_NOERROR;
3287 }
3288
3289 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3290 such as [:<character_class>:], [.<collating_element>.], and
3291 [=<equivalent_class>=]. */
3292
3293 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3294 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3295 re_token_t *token)
3296 {
3297 unsigned char ch, delim = token->opr.c;
3298 int i = 0;
3299 if (re_string_eoi(regexp))
3300 return REG_EBRACK;
3301 for (;; ++i)
3302 {
3303 if (i >= BRACKET_NAME_BUF_SIZE)
3304 return REG_EBRACK;
3305 if (token->type == OP_OPEN_CHAR_CLASS)
3306 ch = re_string_fetch_byte_case (regexp);
3307 else
3308 ch = re_string_fetch_byte (regexp);
3309 if (re_string_eoi(regexp))
3310 return REG_EBRACK;
3311 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3312 break;
3313 elem->opr.name[i] = ch;
3314 }
3315 re_string_skip_bytes (regexp, 1);
3316 elem->opr.name[i] = '\0';
3317 switch (token->type)
3318 {
3319 case OP_OPEN_COLL_ELEM:
3320 elem->type = COLL_SYM;
3321 break;
3322 case OP_OPEN_EQUIV_CLASS:
3323 elem->type = EQUIV_CLASS;
3324 break;
3325 case OP_OPEN_CHAR_CLASS:
3326 elem->type = CHAR_CLASS;
3327 break;
3328 default:
3329 break;
3330 }
3331 return REG_NOERROR;
3332 }
3333
3334 /* Helper function for parse_bracket_exp.
3335 Build the equivalence class which is represented by NAME.
3336 The result are written to MBCSET and SBCSET.
3337 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3338 is a pointer argument sinse we may update it. */
3339
3340 static reg_errcode_t
3341 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,int * equiv_class_alloc,const unsigned char * name)3342 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3343 int *equiv_class_alloc, const unsigned char *name)
3344 #else /* not RE_ENABLE_I18N */
3345 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3346 #endif /* not RE_ENABLE_I18N */
3347 {
3348 #ifdef _LIBC
3349 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3350 if (nrules != 0)
3351 {
3352 const int32_t *table, *indirect;
3353 const unsigned char *weights, *extra, *cp;
3354 unsigned char char_buf[2];
3355 int32_t idx1, idx2;
3356 unsigned int ch;
3357 size_t len;
3358 /* This #include defines a local function! */
3359 # include <locale/weight.h>
3360 /* Calculate the index for equivalence class. */
3361 cp = name;
3362 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3363 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3364 _NL_COLLATE_WEIGHTMB);
3365 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3366 _NL_COLLATE_EXTRAMB);
3367 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3368 _NL_COLLATE_INDIRECTMB);
3369 idx1 = findidx (&cp);
3370 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3371 /* This isn't a valid character. */
3372 return REG_ECOLLATE;
3373
3374 /* Build single byte matcing table for this equivalence class. */
3375 char_buf[1] = (unsigned char) '\0';
3376 len = weights[idx1];
3377 for (ch = 0; ch < SBC_MAX; ++ch)
3378 {
3379 char_buf[0] = ch;
3380 cp = char_buf;
3381 idx2 = findidx (&cp);
3382 /*
3383 idx2 = table[ch];
3384 */
3385 if (idx2 == 0)
3386 /* This isn't a valid character. */
3387 continue;
3388 if (len == weights[idx2])
3389 {
3390 int cnt = 0;
3391 while (cnt <= len &&
3392 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3393 ++cnt;
3394
3395 if (cnt > len)
3396 bitset_set (sbcset, ch);
3397 }
3398 }
3399 /* Check whether the array has enough space. */
3400 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3401 {
3402 /* Not enough, realloc it. */
3403 /* +1 in case of mbcset->nequiv_classes is 0. */
3404 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3405 /* Use realloc since the array is NULL if *alloc == 0. */
3406 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3407 int32_t,
3408 new_equiv_class_alloc);
3409 if (BE (new_equiv_classes == NULL, 0))
3410 return REG_ESPACE;
3411 mbcset->equiv_classes = new_equiv_classes;
3412 *equiv_class_alloc = new_equiv_class_alloc;
3413 }
3414 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3415 }
3416 else
3417 #endif /* _LIBC */
3418 {
3419 if (BE (strlen ((const char *) name) != 1, 0))
3420 return REG_ECOLLATE;
3421 bitset_set (sbcset, *name);
3422 }
3423 return REG_NOERROR;
3424 }
3425
3426 /* Helper function for parse_bracket_exp.
3427 Build the character class which is represented by NAME.
3428 The result are written to MBCSET and SBCSET.
3429 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3430 is a pointer argument sinse we may update it. */
3431
3432 static reg_errcode_t
3433 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,int * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3434 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3435 re_charset_t *mbcset, int *char_class_alloc,
3436 const unsigned char *class_name, reg_syntax_t syntax)
3437 #else /* not RE_ENABLE_I18N */
3438 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3439 const unsigned char *class_name, reg_syntax_t syntax)
3440 #endif /* not RE_ENABLE_I18N */
3441 {
3442 int i;
3443 const char *name = (const char *) class_name;
3444
3445 /* In case of REG_ICASE "upper" and "lower" match the both of
3446 upper and lower cases. */
3447 if ((syntax & RE_ICASE)
3448 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3449 name = "alpha";
3450
3451 #ifdef RE_ENABLE_I18N
3452 /* Check the space of the arrays. */
3453 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3454 {
3455 /* Not enough, realloc it. */
3456 /* +1 in case of mbcset->nchar_classes is 0. */
3457 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3458 /* Use realloc since array is NULL if *alloc == 0. */
3459 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3460 new_char_class_alloc);
3461 if (BE (new_char_classes == NULL, 0))
3462 return REG_ESPACE;
3463 mbcset->char_classes = new_char_classes;
3464 *char_class_alloc = new_char_class_alloc;
3465 }
3466 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3467 #endif /* RE_ENABLE_I18N */
3468
3469 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3470 do { \
3471 if (BE (trans != NULL, 0)) \
3472 { \
3473 for (i = 0; i < SBC_MAX; ++i) \
3474 if (ctype_func (i)) \
3475 bitset_set (sbcset, trans[i]); \
3476 } \
3477 else \
3478 { \
3479 for (i = 0; i < SBC_MAX; ++i) \
3480 if (ctype_func (i)) \
3481 bitset_set (sbcset, i); \
3482 } \
3483 } while (0)
3484
3485 if (strcmp (name, "alnum") == 0)
3486 BUILD_CHARCLASS_LOOP (isalnum);
3487 else if (strcmp (name, "cntrl") == 0)
3488 BUILD_CHARCLASS_LOOP (iscntrl);
3489 else if (strcmp (name, "lower") == 0)
3490 BUILD_CHARCLASS_LOOP (islower);
3491 else if (strcmp (name, "space") == 0)
3492 BUILD_CHARCLASS_LOOP (isspace);
3493 else if (strcmp (name, "alpha") == 0)
3494 BUILD_CHARCLASS_LOOP (isalpha);
3495 else if (strcmp (name, "digit") == 0)
3496 BUILD_CHARCLASS_LOOP (isdigit);
3497 else if (strcmp (name, "print") == 0)
3498 BUILD_CHARCLASS_LOOP (isprint);
3499 else if (strcmp (name, "upper") == 0)
3500 BUILD_CHARCLASS_LOOP (isupper);
3501 else if (strcmp (name, "blank") == 0)
3502 BUILD_CHARCLASS_LOOP (isblank);
3503 else if (strcmp (name, "graph") == 0)
3504 BUILD_CHARCLASS_LOOP (isgraph);
3505 else if (strcmp (name, "punct") == 0)
3506 BUILD_CHARCLASS_LOOP (ispunct);
3507 else if (strcmp (name, "xdigit") == 0)
3508 BUILD_CHARCLASS_LOOP (isxdigit);
3509 else
3510 return REG_ECTYPE;
3511
3512 return REG_NOERROR;
3513 }
3514
3515 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,int non_match,reg_errcode_t * err)3516 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3517 const unsigned char *class_name,
3518 const unsigned char *extra, int non_match,
3519 reg_errcode_t *err)
3520 {
3521 re_bitset_ptr_t sbcset;
3522 #ifdef RE_ENABLE_I18N
3523 re_charset_t *mbcset;
3524 int alloc = 0;
3525 #endif /* not RE_ENABLE_I18N */
3526 reg_errcode_t ret;
3527 re_token_t br_token;
3528 bin_tree_t *tree;
3529
3530 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3531 #ifdef RE_ENABLE_I18N
3532 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3533 #endif /* RE_ENABLE_I18N */
3534
3535 #ifdef RE_ENABLE_I18N
3536 if (BE (sbcset == NULL || mbcset == NULL, 0))
3537 #else /* not RE_ENABLE_I18N */
3538 if (BE (sbcset == NULL, 0))
3539 #endif /* not RE_ENABLE_I18N */
3540 {
3541 *err = REG_ESPACE;
3542 return NULL;
3543 }
3544
3545 if (non_match)
3546 {
3547 #ifdef RE_ENABLE_I18N
3548 /*
3549 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3550 bitset_set(cset->sbcset, '\0');
3551 */
3552 mbcset->non_match = 1;
3553 #endif /* not RE_ENABLE_I18N */
3554 }
3555
3556 /* We don't care the syntax in this case. */
3557 ret = build_charclass (trans, sbcset,
3558 #ifdef RE_ENABLE_I18N
3559 mbcset, &alloc,
3560 #endif /* RE_ENABLE_I18N */
3561 class_name, 0);
3562
3563 if (BE (ret != REG_NOERROR, 0))
3564 {
3565 re_free (sbcset);
3566 #ifdef RE_ENABLE_I18N
3567 free_charset (mbcset);
3568 #endif /* RE_ENABLE_I18N */
3569 *err = ret;
3570 return NULL;
3571 }
3572 /* \w match '_' also. */
3573 for (; *extra; extra++)
3574 bitset_set (sbcset, *extra);
3575
3576 /* If it is non-matching list. */
3577 if (non_match)
3578 bitset_not (sbcset);
3579
3580 #ifdef RE_ENABLE_I18N
3581 /* Ensure only single byte characters are set. */
3582 if (dfa->mb_cur_max > 1)
3583 bitset_mask (sbcset, dfa->sb_char);
3584 #endif
3585
3586 /* Build a tree for simple bracket. */
3587 br_token.type = SIMPLE_BRACKET;
3588 br_token.opr.sbcset = sbcset;
3589 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3590 if (BE (tree == NULL, 0))
3591 goto build_word_op_espace;
3592
3593 #ifdef RE_ENABLE_I18N
3594 if (dfa->mb_cur_max > 1)
3595 {
3596 bin_tree_t *mbc_tree;
3597 /* Build a tree for complex bracket. */
3598 br_token.type = COMPLEX_BRACKET;
3599 br_token.opr.mbcset = mbcset;
3600 dfa->has_mb_node = 1;
3601 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3602 if (BE (mbc_tree == NULL, 0))
3603 goto build_word_op_espace;
3604 /* Then join them by ALT node. */
3605 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3606 if (BE (mbc_tree != NULL, 1))
3607 return tree;
3608 }
3609 else
3610 {
3611 free_charset (mbcset);
3612 return tree;
3613 }
3614 #else /* not RE_ENABLE_I18N */
3615 return tree;
3616 #endif /* not RE_ENABLE_I18N */
3617
3618 build_word_op_espace:
3619 re_free (sbcset);
3620 #ifdef RE_ENABLE_I18N
3621 free_charset (mbcset);
3622 #endif /* RE_ENABLE_I18N */
3623 *err = REG_ESPACE;
3624 return NULL;
3625 }
3626
3627 /* This is intended for the expressions like "a{1,3}".
3628 Fetch a number from `input', and return the number.
3629 Return -1, if the number field is empty like "{,1}".
3630 Return -2, If an error is occured. */
3631
3632 static int
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3633 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3634 {
3635 int num = -1;
3636 unsigned char c;
3637 while (1)
3638 {
3639 fetch_token (token, input, syntax);
3640 c = token->opr.c;
3641 if (BE (token->type == END_OF_RE, 0))
3642 return -2;
3643 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3644 break;
3645 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3646 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3647 num = (num > RE_DUP_MAX) ? -2 : num;
3648 }
3649 return num;
3650 }
3651
3652 #ifdef RE_ENABLE_I18N
3653 static void
free_charset(re_charset_t * cset)3654 free_charset (re_charset_t *cset)
3655 {
3656 re_free (cset->mbchars);
3657 # ifdef _LIBC
3658 re_free (cset->coll_syms);
3659 re_free (cset->equiv_classes);
3660 re_free (cset->range_starts);
3661 re_free (cset->range_ends);
3662 # endif
3663 re_free (cset->char_classes);
3664 re_free (cset);
3665 }
3666 #endif /* RE_ENABLE_I18N */
3667
3668 /* Functions for binary tree operation. */
3669
3670 /* Create a tree node. */
3671
3672 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3673 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3674 re_token_type_t type)
3675 {
3676 re_token_t t;
3677 t.type = type;
3678 return create_token_tree (dfa, left, right, &t);
3679 }
3680
3681 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3682 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3683 const re_token_t *token)
3684 {
3685 bin_tree_t *tree;
3686 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3687 {
3688 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3689
3690 if (storage == NULL)
3691 return NULL;
3692 storage->next = dfa->str_tree_storage;
3693 dfa->str_tree_storage = storage;
3694 dfa->str_tree_storage_idx = 0;
3695 }
3696 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3697
3698 tree->parent = NULL;
3699 tree->left = left;
3700 tree->right = right;
3701 tree->token = *token;
3702 tree->token.duplicated = 0;
3703 tree->token.opt_subexp = 0;
3704 tree->first = NULL;
3705 tree->next = NULL;
3706 tree->node_idx = -1;
3707
3708 if (left != NULL)
3709 left->parent = tree;
3710 if (right != NULL)
3711 right->parent = tree;
3712 return tree;
3713 }
3714
3715 /* Mark the tree SRC as an optional subexpression.
3716 To be called from preorder or postorder. */
3717
3718 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3719 mark_opt_subexp (void *extra, bin_tree_t *node)
3720 {
3721 int idx = (int) (long) extra;
3722 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3723 node->token.opt_subexp = 1;
3724
3725 return REG_NOERROR;
3726 }
3727
3728 /* Free the allocated memory inside NODE. */
3729
3730 static void
free_token(re_token_t * node)3731 free_token (re_token_t *node)
3732 {
3733 #ifdef RE_ENABLE_I18N
3734 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3735 free_charset (node->opr.mbcset);
3736 else
3737 #endif /* RE_ENABLE_I18N */
3738 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3739 re_free (node->opr.sbcset);
3740 }
3741
3742 /* Worker function for tree walking. Free the allocated memory inside NODE
3743 and its children. */
3744
3745 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3746 free_tree (void *extra, bin_tree_t *node)
3747 {
3748 free_token (&node->token);
3749 return REG_NOERROR;
3750 }
3751
3752
3753 /* Duplicate the node SRC, and return new node. This is a preorder
3754 visit similar to the one implemented by the generic visitor, but
3755 we need more infrastructure to maintain two parallel trees --- so,
3756 it's easier to duplicate. */
3757
3758 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3759 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3760 {
3761 const bin_tree_t *node;
3762 bin_tree_t *dup_root;
3763 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3764
3765 for (node = root; ; )
3766 {
3767 /* Create a new tree and link it back to the current parent. */
3768 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3769 if (*p_new == NULL)
3770 return NULL;
3771 (*p_new)->parent = dup_node;
3772 (*p_new)->token.duplicated = 1;
3773 dup_node = *p_new;
3774
3775 /* Go to the left node, or up and to the right. */
3776 if (node->left)
3777 {
3778 node = node->left;
3779 p_new = &dup_node->left;
3780 }
3781 else
3782 {
3783 const bin_tree_t *prev = NULL;
3784 while (node->right == prev || node->right == NULL)
3785 {
3786 prev = node;
3787 node = node->parent;
3788 dup_node = dup_node->parent;
3789 if (!node)
3790 return dup_root;
3791 }
3792 node = node->right;
3793 p_new = &dup_node->right;
3794 }
3795 }
3796 }
3797