1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const unsigned char *class_name,
99 reg_syntax_t syntax);
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
128
129 const char __re_error_msgid[] attribute_hidden =
130 {
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 };
182
183 const size_t __re_error_msgid_idx[] attribute_hidden =
184 {
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
202 };
203
204 /* Entry points for GNU code. */
205
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
209
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
212
213 const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)214 re_compile_pattern (const char *pattern, size_t length,
215 struct re_pattern_buffer *bufp)
216 {
217 reg_errcode_t ret;
218
219 /* And GNU code determines whether or not to get register information
220 by passing null for the REGS argument to re_match, etc., not by
221 setting no_sub, unless RE_NO_SUB is set. */
222 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
223
224 /* Match anchors at newline. */
225 bufp->newline_anchor = 1;
226
227 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
228
229 if (!ret)
230 return NULL;
231 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
232 }
233 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)234 weak_alias (__re_compile_pattern, re_compile_pattern)
235 #endif
236
237 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
238 also be assigned to arbitrarily: each pattern buffer stores its own
239 syntax, so it can be changed between regex compilations. */
240 /* This has no initializer because initialized variables in Emacs
241 become read-only after dumping. */
242 reg_syntax_t re_syntax_options;
243
244
245 /* Specify the precise syntax of regexps for compilation. This provides
246 for compatibility for various utilities which historically have
247 different, incompatible syntaxes.
248
249 The argument SYNTAX is a bit mask comprised of the various bits
250 defined in regex.h. We return the old syntax. */
251
252 reg_syntax_t
253 re_set_syntax (reg_syntax_t syntax)
254 {
255 reg_syntax_t ret = re_syntax_options;
256
257 re_syntax_options = syntax;
258 return ret;
259 }
260 #ifdef _LIBC
weak_alias(__re_set_syntax,re_set_syntax)261 weak_alias (__re_set_syntax, re_set_syntax)
262 #endif
263
264 int
265 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 {
267 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
268 char *fastmap = bufp->fastmap;
269
270 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
271 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
272 if (dfa->init_state != dfa->init_state_word)
273 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
274 if (dfa->init_state != dfa->init_state_nl)
275 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
276 if (dfa->init_state != dfa->init_state_begbuf)
277 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
278 bufp->fastmap_accurate = 1;
279 return 0;
280 }
281 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)282 weak_alias (__re_compile_fastmap, re_compile_fastmap)
283 #endif
284
285 static inline void
286 __attribute ((always_inline))
287 re_set_fastmap (char *fastmap, int icase, int ch)
288 {
289 fastmap[ch] = 1;
290 if (icase)
291 fastmap[tolower (ch)] = 1;
292 }
293
294 /* Helper function for re_compile_fastmap.
295 Compile fastmap for the initial_state INIT_STATE. */
296
297 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)298 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
299 char *fastmap)
300 {
301 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
302 int node_cnt;
303 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
304 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
305 {
306 int node = init_state->nodes.elems[node_cnt];
307 re_token_type_t type = dfa->nodes[node].type;
308
309 if (type == CHARACTER)
310 {
311 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
312 #ifdef RE_ENABLE_I18N
313 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
314 {
315 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
316 wchar_t wc;
317 mbstate_t state;
318
319 p = buf;
320 *p++ = dfa->nodes[node].opr.c;
321 while (++node < dfa->nodes_len
322 && dfa->nodes[node].type == CHARACTER
323 && dfa->nodes[node].mb_partial)
324 *p++ = dfa->nodes[node].opr.c;
325 memset (&state, '\0', sizeof (state));
326 if (mbrtowc (&wc, (const char *) buf, p - buf,
327 &state) == p - buf
328 && (__wcrtomb ((char *) buf, towlower (wc), &state)
329 != (size_t) -1))
330 re_set_fastmap (fastmap, 0, buf[0]);
331 }
332 #endif
333 }
334 else if (type == SIMPLE_BRACKET)
335 {
336 int i, ch;
337 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
338 {
339 int j;
340 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
341 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
342 if (w & ((bitset_word_t) 1 << j))
343 re_set_fastmap (fastmap, icase, ch);
344 }
345 }
346 #ifdef RE_ENABLE_I18N
347 else if (type == COMPLEX_BRACKET)
348 {
349 int i;
350 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
351 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
352 || cset->nranges || cset->nchar_classes)
353 {
354 # ifdef _LIBC
355 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
356 {
357 /* In this case we want to catch the bytes which are
358 the first byte of any collation elements.
359 e.g. In da_DK, we want to catch 'a' since "aa"
360 is a valid collation element, and don't catch
361 'b' since 'b' is the only collation element
362 which starts from 'b'. */
363 const int32_t *table = (const int32_t *)
364 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
365 for (i = 0; i < SBC_MAX; ++i)
366 if (table[i] < 0)
367 re_set_fastmap (fastmap, icase, i);
368 }
369 # else
370 if (dfa->mb_cur_max > 1)
371 for (i = 0; i < SBC_MAX; ++i)
372 if (__btowc (i) == WEOF)
373 re_set_fastmap (fastmap, icase, i);
374 # endif /* not _LIBC */
375 }
376 for (i = 0; i < cset->nmbchars; ++i)
377 {
378 char buf[256];
379 mbstate_t state;
380 memset (&state, '\0', sizeof (state));
381 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
382 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
383 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
384 {
385 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
386 != (size_t) -1)
387 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
388 }
389 }
390 }
391 #endif /* RE_ENABLE_I18N */
392 else if (type == OP_PERIOD
393 #ifdef RE_ENABLE_I18N
394 || type == OP_UTF8_PERIOD
395 #endif /* RE_ENABLE_I18N */
396 || type == END_OF_RE)
397 {
398 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
399 if (type == END_OF_RE)
400 bufp->can_be_null = 1;
401 return;
402 }
403 }
404 }
405
406 /* Entry point for POSIX code. */
407 /* regcomp takes a regular expression as a string and compiles it.
408
409 PREG is a regex_t *. We do not expect any fields to be initialized,
410 since POSIX says we shouldn't. Thus, we set
411
412 `buffer' to the compiled pattern;
413 `used' to the length of the compiled pattern;
414 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
415 REG_EXTENDED bit in CFLAGS is set; otherwise, to
416 RE_SYNTAX_POSIX_BASIC;
417 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
418 `fastmap' to an allocated space for the fastmap;
419 `fastmap_accurate' to zero;
420 `re_nsub' to the number of subexpressions in PATTERN.
421
422 PATTERN is the address of the pattern string.
423
424 CFLAGS is a series of bits which affect compilation.
425
426 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
427 use POSIX basic syntax.
428
429 If REG_NEWLINE is set, then . and [^...] don't match newline.
430 Also, regexec will try a match beginning after every newline.
431
432 If REG_ICASE is set, then we considers upper- and lowercase
433 versions of letters to be equivalent when matching.
434
435 If REG_NOSUB is set, then when PREG is passed to regexec, that
436 routine will report only success or failure, and nothing about the
437 registers.
438
439 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
440 the return codes and their meanings.) */
441
442 int
regcomp(regex_t * __restrict preg,const char * __restrict pattern,int cflags)443 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
444 {
445 reg_errcode_t ret;
446 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
447 : RE_SYNTAX_POSIX_BASIC);
448
449 preg->buffer = NULL;
450 preg->allocated = 0;
451 preg->used = 0;
452
453 /* Try to allocate space for the fastmap. */
454 preg->fastmap = re_malloc (char, SBC_MAX);
455 if (BE (preg->fastmap == NULL, 0))
456 return REG_ESPACE;
457
458 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
459
460 /* If REG_NEWLINE is set, newlines are treated differently. */
461 if (cflags & REG_NEWLINE)
462 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
463 syntax &= ~RE_DOT_NEWLINE;
464 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
465 /* It also changes the matching behavior. */
466 preg->newline_anchor = 1;
467 }
468 else
469 preg->newline_anchor = 0;
470 preg->no_sub = !!(cflags & REG_NOSUB);
471 preg->translate = NULL;
472
473 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
474
475 /* POSIX doesn't distinguish between an unmatched open-group and an
476 unmatched close-group: both are REG_EPAREN. */
477 if (ret == REG_ERPAREN)
478 ret = REG_EPAREN;
479
480 /* We have already checked preg->fastmap != NULL. */
481 if (BE (ret == REG_NOERROR, 1))
482 /* Compute the fastmap now, since regexec cannot modify the pattern
483 buffer. This function never fails in this implementation. */
484 (void) re_compile_fastmap (preg);
485 else
486 {
487 /* Some error occurred while compiling the expression. */
488 re_free (preg->fastmap);
489 preg->fastmap = NULL;
490 }
491
492 return (int) ret;
493 }
494 #ifdef _LIBC
weak_alias(__regcomp,regcomp)495 weak_alias (__regcomp, regcomp)
496 #endif
497
498 /* Returns a message corresponding to an error code, ERRCODE, returned
499 from either regcomp or regexec. We don't use PREG here. */
500
501 size_t
502 regerror (int errcode, const regex_t *__restrict preg,
503 char *__restrict errbuf, size_t errbuf_size)
504 {
505 const char *msg;
506 size_t msg_size;
507
508 if (BE (errcode < 0
509 || errcode >= (int) (sizeof (__re_error_msgid_idx)
510 / sizeof (__re_error_msgid_idx[0])), 0))
511 /* Only error codes returned by the rest of the code should be passed
512 to this routine. If we are given anything else, or if other regex
513 code generates an invalid error code, then the program has a bug.
514 Dump core so we can fix it. */
515 abort ();
516
517 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
518
519 msg_size = strlen (msg) + 1; /* Includes the null. */
520
521 if (BE (errbuf_size != 0, 1))
522 {
523 if (BE (msg_size > errbuf_size, 0))
524 {
525 #if defined HAVE_MEMPCPY || defined _LIBC
526 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
527 #else
528 memcpy (errbuf, msg, errbuf_size - 1);
529 errbuf[errbuf_size - 1] = 0;
530 #endif
531 }
532 else
533 memcpy (errbuf, msg, msg_size);
534 }
535
536 return msg_size;
537 }
538 #ifdef _LIBC
539 weak_alias (__regerror, regerror)
540 #endif
541
542
543 #ifdef RE_ENABLE_I18N
544 /* This static array is used for the map to single-byte characters when
545 UTF-8 is used. Otherwise we would allocate memory just to initialize
546 it the same all the time. UTF-8 is the preferred encoding so this is
547 a worthwhile optimization. */
548 static const bitset_t utf8_sb_map =
549 {
550 /* Set the first 128 bits. */
551 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
552 };
553 #endif
554
555
556 static void
free_dfa_content(re_dfa_t * dfa)557 free_dfa_content (re_dfa_t *dfa)
558 {
559 int i, j;
560
561 if (dfa->nodes)
562 for (i = 0; i < dfa->nodes_len; ++i)
563 free_token (dfa->nodes + i);
564 re_free (dfa->nexts);
565 for (i = 0; i < dfa->nodes_len; ++i)
566 {
567 if (dfa->eclosures != NULL)
568 re_node_set_free (dfa->eclosures + i);
569 if (dfa->inveclosures != NULL)
570 re_node_set_free (dfa->inveclosures + i);
571 if (dfa->edests != NULL)
572 re_node_set_free (dfa->edests + i);
573 }
574 re_free (dfa->edests);
575 re_free (dfa->eclosures);
576 re_free (dfa->inveclosures);
577 re_free (dfa->nodes);
578
579 if (dfa->state_table)
580 for (i = 0; i <= dfa->state_hash_mask; ++i)
581 {
582 struct re_state_table_entry *entry = dfa->state_table + i;
583 for (j = 0; j < entry->num; ++j)
584 {
585 re_dfastate_t *state = entry->array[j];
586 free_state (state);
587 }
588 re_free (entry->array);
589 }
590 re_free (dfa->state_table);
591 #ifdef RE_ENABLE_I18N
592 if (dfa->sb_char != utf8_sb_map)
593 re_free (dfa->sb_char);
594 #endif
595 re_free (dfa->subexp_map);
596 #ifdef DEBUG
597 re_free (dfa->re_str);
598 #endif
599
600 re_free (dfa);
601 }
602
603
604 /* Free dynamically allocated space used by PREG. */
605
606 void
regfree(regex_t * preg)607 regfree (regex_t *preg)
608 {
609 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
610 if (BE (dfa != NULL, 1))
611 free_dfa_content (dfa);
612 preg->buffer = NULL;
613 preg->allocated = 0;
614
615 re_free (preg->fastmap);
616 preg->fastmap = NULL;
617
618 re_free (preg->translate);
619 preg->translate = NULL;
620 }
621 #ifdef _LIBC
622 weak_alias (__regfree, regfree)
623 #endif
624
625 /* Entry points compatible with 4.2 BSD regex library. We don't define
626 them unless specifically requested. */
627
628 #if defined _REGEX_RE_COMP || defined _LIBC
629
630 /* BSD has one and only one pattern buffer. */
631 static struct re_pattern_buffer re_comp_buf;
632
633 char *
634 # ifdef _LIBC
635 /* Make these definitions weak in libc, so POSIX programs can redefine
636 these names if they don't use our functions, and still use
637 regcomp/regexec above without link errors. */
638 weak_function
639 # endif
re_comp(s)640 re_comp (s)
641 const char *s;
642 {
643 reg_errcode_t ret;
644 char *fastmap;
645
646 if (!s)
647 {
648 if (!re_comp_buf.buffer)
649 return gettext ("No previous regular expression");
650 return 0;
651 }
652
653 if (re_comp_buf.buffer)
654 {
655 fastmap = re_comp_buf.fastmap;
656 re_comp_buf.fastmap = NULL;
657 __regfree (&re_comp_buf);
658 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
659 re_comp_buf.fastmap = fastmap;
660 }
661
662 if (re_comp_buf.fastmap == NULL)
663 {
664 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
665 if (re_comp_buf.fastmap == NULL)
666 return (char *) gettext (__re_error_msgid
667 + __re_error_msgid_idx[(int) REG_ESPACE]);
668 }
669
670 /* Since `re_exec' always passes NULL for the `regs' argument, we
671 don't need to initialize the pattern buffer fields which affect it. */
672
673 /* Match anchors at newlines. */
674 re_comp_buf.newline_anchor = 1;
675
676 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
677
678 if (!ret)
679 return NULL;
680
681 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
682 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
683 }
684
685 #ifdef _LIBC
libc_freeres_fn(free_mem)686 libc_freeres_fn (free_mem)
687 {
688 __regfree (&re_comp_buf);
689 }
690 #endif
691
692 #endif /* _REGEX_RE_COMP */
693
694 /* Internal entry point.
695 Compile the regular expression PATTERN, whose length is LENGTH.
696 SYNTAX indicate regular expression's syntax. */
697
698 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)699 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
700 reg_syntax_t syntax)
701 {
702 reg_errcode_t err = REG_NOERROR;
703 re_dfa_t *dfa;
704 re_string_t regexp;
705
706 /* Initialize the pattern buffer. */
707 preg->fastmap_accurate = 0;
708 preg->syntax = syntax;
709 preg->not_bol = preg->not_eol = 0;
710 preg->used = 0;
711 preg->re_nsub = 0;
712 preg->can_be_null = 0;
713 preg->regs_allocated = REGS_UNALLOCATED;
714
715 /* Initialize the dfa. */
716 dfa = (re_dfa_t *) preg->buffer;
717 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
718 {
719 /* If zero allocated, but buffer is non-null, try to realloc
720 enough space. This loses if buffer's address is bogus, but
721 that is the user's responsibility. If ->buffer is NULL this
722 is a simple allocation. */
723 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
724 if (dfa == NULL)
725 return REG_ESPACE;
726 preg->allocated = sizeof (re_dfa_t);
727 preg->buffer = (unsigned char *) dfa;
728 }
729 preg->used = sizeof (re_dfa_t);
730
731 err = init_dfa (dfa, length);
732 if (BE (err != REG_NOERROR, 0))
733 {
734 free_dfa_content (dfa);
735 preg->buffer = NULL;
736 preg->allocated = 0;
737 return err;
738 }
739 #ifdef DEBUG
740 /* Note: length+1 will not overflow since it is checked in init_dfa. */
741 dfa->re_str = re_malloc (char, length + 1);
742 strncpy (dfa->re_str, pattern, length + 1);
743 #endif
744
745 __libc_lock_init (dfa->lock);
746
747 err = re_string_construct (®exp, pattern, length, preg->translate,
748 syntax & RE_ICASE, dfa);
749 if (BE (err != REG_NOERROR, 0))
750 {
751 re_compile_internal_free_return:
752 free_workarea_compile (preg);
753 re_string_destruct (®exp);
754 free_dfa_content (dfa);
755 preg->buffer = NULL;
756 preg->allocated = 0;
757 return err;
758 }
759
760 /* Parse the regular expression, and build a structure tree. */
761 preg->re_nsub = 0;
762 dfa->str_tree = parse (®exp, preg, syntax, &err);
763 if (BE (dfa->str_tree == NULL, 0))
764 goto re_compile_internal_free_return;
765
766 /* Analyze the tree and create the nfa. */
767 err = analyze (preg);
768 if (BE (err != REG_NOERROR, 0))
769 goto re_compile_internal_free_return;
770
771 #ifdef RE_ENABLE_I18N
772 /* If possible, do searching in single byte encoding to speed things up. */
773 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
774 optimize_utf8 (dfa);
775 #endif
776
777 /* Then create the initial state of the dfa. */
778 err = create_initial_state (dfa);
779
780 /* Release work areas. */
781 free_workarea_compile (preg);
782 re_string_destruct (®exp);
783
784 if (BE (err != REG_NOERROR, 0))
785 {
786 free_dfa_content (dfa);
787 preg->buffer = NULL;
788 preg->allocated = 0;
789 }
790
791 return err;
792 }
793
794 /* Initialize DFA. We use the length of the regular expression PAT_LEN
795 as the initial length of some arrays. */
796
797 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)798 init_dfa (re_dfa_t *dfa, size_t pat_len)
799 {
800 unsigned int table_size;
801 #ifndef _LIBC
802 const char *codeset_name;
803 #endif
804
805 memset (dfa, '\0', sizeof (re_dfa_t));
806
807 /* Force allocation of str_tree_storage the first time. */
808 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
809
810 /* Avoid overflows. */
811 if (pat_len == SIZE_MAX)
812 return REG_ESPACE;
813
814 dfa->nodes_alloc = pat_len + 1;
815 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
816
817 /* table_size = 2 ^ ceil(log pat_len) */
818 for (table_size = 1; ; table_size <<= 1)
819 if (table_size > pat_len)
820 break;
821
822 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
823 dfa->state_hash_mask = table_size - 1;
824
825 dfa->mb_cur_max = MB_CUR_MAX;
826 #ifdef _LIBC
827 if (dfa->mb_cur_max == 6
828 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
829 dfa->is_utf8 = 1;
830 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
831 != 0);
832 #else
833 # ifdef HAVE_LANGINFO_CODESET
834 codeset_name = nl_langinfo (CODESET);
835 # else
836 codeset_name = getenv ("LC_ALL");
837 if (codeset_name == NULL || codeset_name[0] == '\0')
838 codeset_name = getenv ("LC_CTYPE");
839 if (codeset_name == NULL || codeset_name[0] == '\0')
840 codeset_name = getenv ("LANG");
841 if (codeset_name == NULL)
842 codeset_name = "";
843 else if (strchr (codeset_name, '.') != NULL)
844 codeset_name = strchr (codeset_name, '.') + 1;
845 # endif
846
847 if (strcasecmp (codeset_name, "UTF-8") == 0
848 || strcasecmp (codeset_name, "UTF8") == 0)
849 dfa->is_utf8 = 1;
850
851 /* We check exhaustively in the loop below if this charset is a
852 superset of ASCII. */
853 dfa->map_notascii = 0;
854 #endif
855
856 #ifdef RE_ENABLE_I18N
857 if (dfa->mb_cur_max > 1)
858 {
859 if (dfa->is_utf8)
860 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
861 else
862 {
863 int i, j, ch;
864
865 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
866 if (BE (dfa->sb_char == NULL, 0))
867 return REG_ESPACE;
868
869 /* Set the bits corresponding to single byte chars. */
870 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
871 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
872 {
873 wint_t wch = __btowc (ch);
874 if (wch != WEOF)
875 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
876 # ifndef _LIBC
877 if (isascii (ch) && wch != ch)
878 dfa->map_notascii = 1;
879 # endif
880 }
881 }
882 }
883 #endif
884
885 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
886 return REG_ESPACE;
887 return REG_NOERROR;
888 }
889
890 /* Initialize WORD_CHAR table, which indicate which character is
891 "word". In this case "word" means that it is the word construction
892 character used by some operators like "\<", "\>", etc. */
893
894 static void
895 internal_function
init_word_char(re_dfa_t * dfa)896 init_word_char (re_dfa_t *dfa)
897 {
898 int i, j, ch;
899 dfa->word_ops_used = 1;
900 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
901 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
902 if (isalnum (ch) || ch == '_')
903 dfa->word_char[i] |= (bitset_word_t) 1 << j;
904 }
905
906 /* Free the work area which are only used while compiling. */
907
908 static void
free_workarea_compile(regex_t * preg)909 free_workarea_compile (regex_t *preg)
910 {
911 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
912 bin_tree_storage_t *storage, *next;
913 for (storage = dfa->str_tree_storage; storage; storage = next)
914 {
915 next = storage->next;
916 re_free (storage);
917 }
918 dfa->str_tree_storage = NULL;
919 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
920 dfa->str_tree = NULL;
921 re_free (dfa->org_indices);
922 dfa->org_indices = NULL;
923 }
924
925 /* Create initial states for all contexts. */
926
927 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)928 create_initial_state (re_dfa_t *dfa)
929 {
930 int first, i;
931 reg_errcode_t err;
932 re_node_set init_nodes;
933
934 /* Initial states have the epsilon closure of the node which is
935 the first node of the regular expression. */
936 first = dfa->str_tree->first->node_idx;
937 dfa->init_node = first;
938 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
939 if (BE (err != REG_NOERROR, 0))
940 return err;
941
942 /* The back-references which are in initial states can epsilon transit,
943 since in this case all of the subexpressions can be null.
944 Then we add epsilon closures of the nodes which are the next nodes of
945 the back-references. */
946 if (dfa->nbackref > 0)
947 for (i = 0; i < init_nodes.nelem; ++i)
948 {
949 int node_idx = init_nodes.elems[i];
950 re_token_type_t type = dfa->nodes[node_idx].type;
951
952 int clexp_idx;
953 if (type != OP_BACK_REF)
954 continue;
955 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
956 {
957 re_token_t *clexp_node;
958 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
959 if (clexp_node->type == OP_CLOSE_SUBEXP
960 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
961 break;
962 }
963 if (clexp_idx == init_nodes.nelem)
964 continue;
965
966 if (type == OP_BACK_REF)
967 {
968 int dest_idx = dfa->edests[node_idx].elems[0];
969 if (!re_node_set_contains (&init_nodes, dest_idx))
970 {
971 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
972 i = 0;
973 }
974 }
975 }
976
977 /* It must be the first time to invoke acquire_state. */
978 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
979 /* We don't check ERR here, since the initial state must not be NULL. */
980 if (BE (dfa->init_state == NULL, 0))
981 return err;
982 if (dfa->init_state->has_constraint)
983 {
984 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
985 CONTEXT_WORD);
986 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
987 CONTEXT_NEWLINE);
988 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
989 &init_nodes,
990 CONTEXT_NEWLINE
991 | CONTEXT_BEGBUF);
992 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
993 || dfa->init_state_begbuf == NULL, 0))
994 return err;
995 }
996 else
997 dfa->init_state_word = dfa->init_state_nl
998 = dfa->init_state_begbuf = dfa->init_state;
999
1000 re_node_set_free (&init_nodes);
1001 return REG_NOERROR;
1002 }
1003
1004 #ifdef RE_ENABLE_I18N
1005 /* If it is possible to do searching in single byte encoding instead of UTF-8
1006 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1007 DFA nodes where needed. */
1008
1009 static void
optimize_utf8(re_dfa_t * dfa)1010 optimize_utf8 (re_dfa_t *dfa)
1011 {
1012 int node, i, mb_chars = 0, has_period = 0;
1013
1014 for (node = 0; node < dfa->nodes_len; ++node)
1015 switch (dfa->nodes[node].type)
1016 {
1017 case CHARACTER:
1018 if (dfa->nodes[node].opr.c >= 0x80)
1019 mb_chars = 1;
1020 break;
1021 case ANCHOR:
1022 switch (dfa->nodes[node].opr.idx)
1023 {
1024 case LINE_FIRST:
1025 case LINE_LAST:
1026 case BUF_FIRST:
1027 case BUF_LAST:
1028 break;
1029 default:
1030 /* Word anchors etc. cannot be handled. */
1031 return;
1032 }
1033 break;
1034 case OP_PERIOD:
1035 has_period = 1;
1036 break;
1037 case OP_BACK_REF:
1038 case OP_ALT:
1039 case END_OF_RE:
1040 case OP_DUP_ASTERISK:
1041 case OP_OPEN_SUBEXP:
1042 case OP_CLOSE_SUBEXP:
1043 break;
1044 case COMPLEX_BRACKET:
1045 return;
1046 case SIMPLE_BRACKET:
1047 /* Just double check. The non-ASCII range starts at 0x80. */
1048 assert (0x80 % BITSET_WORD_BITS == 0);
1049 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1050 if (dfa->nodes[node].opr.sbcset[i])
1051 return;
1052 break;
1053 default:
1054 abort ();
1055 }
1056
1057 if (mb_chars || has_period)
1058 for (node = 0; node < dfa->nodes_len; ++node)
1059 {
1060 if (dfa->nodes[node].type == CHARACTER
1061 && dfa->nodes[node].opr.c >= 0x80)
1062 dfa->nodes[node].mb_partial = 0;
1063 else if (dfa->nodes[node].type == OP_PERIOD)
1064 dfa->nodes[node].type = OP_UTF8_PERIOD;
1065 }
1066
1067 /* The search can be in single byte locale. */
1068 dfa->mb_cur_max = 1;
1069 dfa->is_utf8 = 0;
1070 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1071 }
1072 #endif
1073
1074 /* Analyze the structure tree, and calculate "first", "next", "edest",
1075 "eclosure", and "inveclosure". */
1076
1077 static reg_errcode_t
analyze(regex_t * preg)1078 analyze (regex_t *preg)
1079 {
1080 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1081 reg_errcode_t ret;
1082
1083 /* Allocate arrays. */
1084 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1085 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1086 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1087 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1088 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1089 || dfa->eclosures == NULL, 0))
1090 return REG_ESPACE;
1091
1092 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1093 if (dfa->subexp_map != NULL)
1094 {
1095 int i;
1096 for (i = 0; i < preg->re_nsub; i++)
1097 dfa->subexp_map[i] = i;
1098 preorder (dfa->str_tree, optimize_subexps, dfa);
1099 for (i = 0; i < preg->re_nsub; i++)
1100 if (dfa->subexp_map[i] != i)
1101 break;
1102 if (i == preg->re_nsub)
1103 {
1104 free (dfa->subexp_map);
1105 dfa->subexp_map = NULL;
1106 }
1107 }
1108
1109 ret = postorder (dfa->str_tree, lower_subexps, preg);
1110 if (BE (ret != REG_NOERROR, 0))
1111 return ret;
1112 ret = postorder (dfa->str_tree, calc_first, dfa);
1113 if (BE (ret != REG_NOERROR, 0))
1114 return ret;
1115 preorder (dfa->str_tree, calc_next, dfa);
1116 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1117 if (BE (ret != REG_NOERROR, 0))
1118 return ret;
1119 ret = calc_eclosure (dfa);
1120 if (BE (ret != REG_NOERROR, 0))
1121 return ret;
1122
1123 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1124 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1125 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1126 || dfa->nbackref)
1127 {
1128 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1129 if (BE (dfa->inveclosures == NULL, 0))
1130 return REG_ESPACE;
1131 ret = calc_inveclosure (dfa);
1132 }
1133
1134 return ret;
1135 }
1136
1137 /* Our parse trees are very unbalanced, so we cannot use a stack to
1138 implement parse tree visits. Instead, we use parent pointers and
1139 some hairy code in these two functions. */
1140 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1141 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1142 void *extra)
1143 {
1144 bin_tree_t *node, *prev;
1145
1146 for (node = root; ; )
1147 {
1148 /* Descend down the tree, preferably to the left (or to the right
1149 if that's the only child). */
1150 while (node->left || node->right)
1151 if (node->left)
1152 node = node->left;
1153 else
1154 node = node->right;
1155
1156 do
1157 {
1158 reg_errcode_t err = fn (extra, node);
1159 if (BE (err != REG_NOERROR, 0))
1160 return err;
1161 if (node->parent == NULL)
1162 return REG_NOERROR;
1163 prev = node;
1164 node = node->parent;
1165 }
1166 /* Go up while we have a node that is reached from the right. */
1167 while (node->right == prev || node->right == NULL);
1168 node = node->right;
1169 }
1170 }
1171
1172 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1173 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1174 void *extra)
1175 {
1176 bin_tree_t *node;
1177
1178 for (node = root; ; )
1179 {
1180 reg_errcode_t err = fn (extra, node);
1181 if (BE (err != REG_NOERROR, 0))
1182 return err;
1183
1184 /* Go to the left node, or up and to the right. */
1185 if (node->left)
1186 node = node->left;
1187 else
1188 {
1189 bin_tree_t *prev = NULL;
1190 while (node->right == prev || node->right == NULL)
1191 {
1192 prev = node;
1193 node = node->parent;
1194 if (!node)
1195 return REG_NOERROR;
1196 }
1197 node = node->right;
1198 }
1199 }
1200 }
1201
1202 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1203 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1204 backreferences as well. Requires a preorder visit. */
1205 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1206 optimize_subexps (void *extra, bin_tree_t *node)
1207 {
1208 re_dfa_t *dfa = (re_dfa_t *) extra;
1209
1210 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1211 {
1212 int idx = node->token.opr.idx;
1213 node->token.opr.idx = dfa->subexp_map[idx];
1214 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1215 }
1216
1217 else if (node->token.type == SUBEXP
1218 && node->left && node->left->token.type == SUBEXP)
1219 {
1220 int other_idx = node->left->token.opr.idx;
1221
1222 node->left = node->left->left;
1223 if (node->left)
1224 node->left->parent = node;
1225
1226 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1227 if (other_idx < BITSET_WORD_BITS)
1228 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1229 }
1230
1231 return REG_NOERROR;
1232 }
1233
1234 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1235 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1236 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1237 lower_subexps (void *extra, bin_tree_t *node)
1238 {
1239 regex_t *preg = (regex_t *) extra;
1240 reg_errcode_t err = REG_NOERROR;
1241
1242 if (node->left && node->left->token.type == SUBEXP)
1243 {
1244 node->left = lower_subexp (&err, preg, node->left);
1245 if (node->left)
1246 node->left->parent = node;
1247 }
1248 if (node->right && node->right->token.type == SUBEXP)
1249 {
1250 node->right = lower_subexp (&err, preg, node->right);
1251 if (node->right)
1252 node->right->parent = node;
1253 }
1254
1255 return err;
1256 }
1257
1258 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1259 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1260 {
1261 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1262 bin_tree_t *body = node->left;
1263 bin_tree_t *op, *cls, *tree1, *tree;
1264
1265 if (preg->no_sub
1266 /* We do not optimize empty subexpressions, because otherwise we may
1267 have bad CONCAT nodes with NULL children. This is obviously not
1268 very common, so we do not lose much. An example that triggers
1269 this case is the sed "script" /\(\)/x. */
1270 && node->left != NULL
1271 && (node->token.opr.idx >= BITSET_WORD_BITS
1272 || !(dfa->used_bkref_map
1273 & ((bitset_word_t) 1 << node->token.opr.idx))))
1274 return node->left;
1275
1276 /* Convert the SUBEXP node to the concatenation of an
1277 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1278 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1279 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1280 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1281 tree = create_tree (dfa, op, tree1, CONCAT);
1282 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1283 {
1284 *err = REG_ESPACE;
1285 return NULL;
1286 }
1287
1288 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1289 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1290 return tree;
1291 }
1292
1293 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1294 nodes. Requires a postorder visit. */
1295 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1296 calc_first (void *extra, bin_tree_t *node)
1297 {
1298 re_dfa_t *dfa = (re_dfa_t *) extra;
1299 if (node->token.type == CONCAT)
1300 {
1301 node->first = node->left->first;
1302 node->node_idx = node->left->node_idx;
1303 }
1304 else
1305 {
1306 node->first = node;
1307 node->node_idx = re_dfa_add_node (dfa, node->token);
1308 if (BE (node->node_idx == -1, 0))
1309 return REG_ESPACE;
1310 }
1311 return REG_NOERROR;
1312 }
1313
1314 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1315 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1316 calc_next (void *extra, bin_tree_t *node)
1317 {
1318 switch (node->token.type)
1319 {
1320 case OP_DUP_ASTERISK:
1321 node->left->next = node;
1322 break;
1323 case CONCAT:
1324 node->left->next = node->right->first;
1325 node->right->next = node->next;
1326 break;
1327 default:
1328 if (node->left)
1329 node->left->next = node->next;
1330 if (node->right)
1331 node->right->next = node->next;
1332 break;
1333 }
1334 return REG_NOERROR;
1335 }
1336
1337 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1338 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1339 link_nfa_nodes (void *extra, bin_tree_t *node)
1340 {
1341 re_dfa_t *dfa = (re_dfa_t *) extra;
1342 int idx = node->node_idx;
1343 reg_errcode_t err = REG_NOERROR;
1344
1345 switch (node->token.type)
1346 {
1347 case CONCAT:
1348 break;
1349
1350 case END_OF_RE:
1351 assert (node->next == NULL);
1352 break;
1353
1354 case OP_DUP_ASTERISK:
1355 case OP_ALT:
1356 {
1357 int left, right;
1358 dfa->has_plural_match = 1;
1359 if (node->left != NULL)
1360 left = node->left->first->node_idx;
1361 else
1362 left = node->next->node_idx;
1363 if (node->right != NULL)
1364 right = node->right->first->node_idx;
1365 else
1366 right = node->next->node_idx;
1367 assert (left > -1);
1368 assert (right > -1);
1369 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1370 }
1371 break;
1372
1373 case ANCHOR:
1374 case OP_OPEN_SUBEXP:
1375 case OP_CLOSE_SUBEXP:
1376 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1377 break;
1378
1379 case OP_BACK_REF:
1380 dfa->nexts[idx] = node->next->node_idx;
1381 if (node->token.type == OP_BACK_REF)
1382 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1383 break;
1384
1385 default:
1386 assert (!IS_EPSILON_NODE (node->token.type));
1387 dfa->nexts[idx] = node->next->node_idx;
1388 break;
1389 }
1390
1391 return err;
1392 }
1393
1394 /* Duplicate the epsilon closure of the node ROOT_NODE.
1395 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1396 to their own constraint. */
1397
1398 static reg_errcode_t
1399 internal_function
duplicate_node_closure(re_dfa_t * dfa,int top_org_node,int top_clone_node,int root_node,unsigned int init_constraint)1400 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1401 int root_node, unsigned int init_constraint)
1402 {
1403 int org_node, clone_node, ret;
1404 unsigned int constraint = init_constraint;
1405 for (org_node = top_org_node, clone_node = top_clone_node;;)
1406 {
1407 int org_dest, clone_dest;
1408 if (dfa->nodes[org_node].type == OP_BACK_REF)
1409 {
1410 /* If the back reference epsilon-transit, its destination must
1411 also have the constraint. Then duplicate the epsilon closure
1412 of the destination of the back reference, and store it in
1413 edests of the back reference. */
1414 org_dest = dfa->nexts[org_node];
1415 re_node_set_empty (dfa->edests + clone_node);
1416 clone_dest = duplicate_node (dfa, org_dest, constraint);
1417 if (BE (clone_dest == -1, 0))
1418 return REG_ESPACE;
1419 dfa->nexts[clone_node] = dfa->nexts[org_node];
1420 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1421 if (BE (ret < 0, 0))
1422 return REG_ESPACE;
1423 }
1424 else if (dfa->edests[org_node].nelem == 0)
1425 {
1426 /* In case of the node can't epsilon-transit, don't duplicate the
1427 destination and store the original destination as the
1428 destination of the node. */
1429 dfa->nexts[clone_node] = dfa->nexts[org_node];
1430 break;
1431 }
1432 else if (dfa->edests[org_node].nelem == 1)
1433 {
1434 /* In case of the node can epsilon-transit, and it has only one
1435 destination. */
1436 org_dest = dfa->edests[org_node].elems[0];
1437 re_node_set_empty (dfa->edests + clone_node);
1438 if (dfa->nodes[org_node].type == ANCHOR)
1439 {
1440 /* In case of the node has another constraint, append it. */
1441 if (org_node == root_node && clone_node != org_node)
1442 {
1443 /* ...but if the node is root_node itself, it means the
1444 epsilon closure have a loop, then tie it to the
1445 destination of the root_node. */
1446 ret = re_node_set_insert (dfa->edests + clone_node,
1447 org_dest);
1448 if (BE (ret < 0, 0))
1449 return REG_ESPACE;
1450 break;
1451 }
1452 constraint |= dfa->nodes[org_node].opr.ctx_type;
1453 }
1454 clone_dest = duplicate_node (dfa, org_dest, constraint);
1455 if (BE (clone_dest == -1, 0))
1456 return REG_ESPACE;
1457 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1458 if (BE (ret < 0, 0))
1459 return REG_ESPACE;
1460 }
1461 else /* dfa->edests[org_node].nelem == 2 */
1462 {
1463 /* In case of the node can epsilon-transit, and it has two
1464 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1465 org_dest = dfa->edests[org_node].elems[0];
1466 re_node_set_empty (dfa->edests + clone_node);
1467 /* Search for a duplicated node which satisfies the constraint. */
1468 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1469 if (clone_dest == -1)
1470 {
1471 /* There are no such a duplicated node, create a new one. */
1472 reg_errcode_t err;
1473 clone_dest = duplicate_node (dfa, org_dest, constraint);
1474 if (BE (clone_dest == -1, 0))
1475 return REG_ESPACE;
1476 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1477 if (BE (ret < 0, 0))
1478 return REG_ESPACE;
1479 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1480 root_node, constraint);
1481 if (BE (err != REG_NOERROR, 0))
1482 return err;
1483 }
1484 else
1485 {
1486 /* There are a duplicated node which satisfy the constraint,
1487 use it to avoid infinite loop. */
1488 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489 if (BE (ret < 0, 0))
1490 return REG_ESPACE;
1491 }
1492
1493 org_dest = dfa->edests[org_node].elems[1];
1494 clone_dest = duplicate_node (dfa, org_dest, constraint);
1495 if (BE (clone_dest == -1, 0))
1496 return REG_ESPACE;
1497 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1498 if (BE (ret < 0, 0))
1499 return REG_ESPACE;
1500 }
1501 org_node = org_dest;
1502 clone_node = clone_dest;
1503 }
1504 return REG_NOERROR;
1505 }
1506
1507 /* Search for a node which is duplicated from the node ORG_NODE, and
1508 satisfies the constraint CONSTRAINT. */
1509
1510 static int
search_duplicated_node(const re_dfa_t * dfa,int org_node,unsigned int constraint)1511 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1512 unsigned int constraint)
1513 {
1514 int idx;
1515 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1516 {
1517 if (org_node == dfa->org_indices[idx]
1518 && constraint == dfa->nodes[idx].constraint)
1519 return idx; /* Found. */
1520 }
1521 return -1; /* Not found. */
1522 }
1523
1524 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1525 Return the index of the new node, or -1 if insufficient storage is
1526 available. */
1527
1528 static int
duplicate_node(re_dfa_t * dfa,int org_idx,unsigned int constraint)1529 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1530 {
1531 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1532 if (BE (dup_idx != -1, 1))
1533 {
1534 dfa->nodes[dup_idx].constraint = constraint;
1535 if (dfa->nodes[org_idx].type == ANCHOR)
1536 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1537 dfa->nodes[dup_idx].duplicated = 1;
1538
1539 /* Store the index of the original node. */
1540 dfa->org_indices[dup_idx] = org_idx;
1541 }
1542 return dup_idx;
1543 }
1544
1545 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1546 calc_inveclosure (re_dfa_t *dfa)
1547 {
1548 int src, idx, ret;
1549 for (idx = 0; idx < dfa->nodes_len; ++idx)
1550 re_node_set_init_empty (dfa->inveclosures + idx);
1551
1552 for (src = 0; src < dfa->nodes_len; ++src)
1553 {
1554 int *elems = dfa->eclosures[src].elems;
1555 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1556 {
1557 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1558 if (BE (ret == -1, 0))
1559 return REG_ESPACE;
1560 }
1561 }
1562
1563 return REG_NOERROR;
1564 }
1565
1566 /* Calculate "eclosure" for all the node in DFA. */
1567
1568 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1569 calc_eclosure (re_dfa_t *dfa)
1570 {
1571 int node_idx, incomplete;
1572 #ifdef DEBUG
1573 assert (dfa->nodes_len > 0);
1574 #endif
1575 incomplete = 0;
1576 /* For each nodes, calculate epsilon closure. */
1577 for (node_idx = 0; ; ++node_idx)
1578 {
1579 reg_errcode_t err;
1580 re_node_set eclosure_elem;
1581 if (node_idx == dfa->nodes_len)
1582 {
1583 if (!incomplete)
1584 break;
1585 incomplete = 0;
1586 node_idx = 0;
1587 }
1588
1589 #ifdef DEBUG
1590 assert (dfa->eclosures[node_idx].nelem != -1);
1591 #endif
1592
1593 /* If we have already calculated, skip it. */
1594 if (dfa->eclosures[node_idx].nelem != 0)
1595 continue;
1596 /* Calculate epsilon closure of `node_idx'. */
1597 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1598 if (BE (err != REG_NOERROR, 0))
1599 return err;
1600
1601 if (dfa->eclosures[node_idx].nelem == 0)
1602 {
1603 incomplete = 1;
1604 re_node_set_free (&eclosure_elem);
1605 }
1606 }
1607 return REG_NOERROR;
1608 }
1609
1610 /* Calculate epsilon closure of NODE. */
1611
1612 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,int node,int root)1613 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1614 {
1615 reg_errcode_t err;
1616 unsigned int constraint;
1617 int i, incomplete;
1618 re_node_set eclosure;
1619 incomplete = 0;
1620 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1621 if (BE (err != REG_NOERROR, 0))
1622 return err;
1623
1624 /* This indicates that we are calculating this node now.
1625 We reference this value to avoid infinite loop. */
1626 dfa->eclosures[node].nelem = -1;
1627
1628 constraint = ((dfa->nodes[node].type == ANCHOR)
1629 ? dfa->nodes[node].opr.ctx_type : 0);
1630 /* If the current node has constraints, duplicate all nodes.
1631 Since they must inherit the constraints. */
1632 if (constraint
1633 && dfa->edests[node].nelem
1634 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1635 {
1636 err = duplicate_node_closure (dfa, node, node, node, constraint);
1637 if (BE (err != REG_NOERROR, 0))
1638 return err;
1639 }
1640
1641 /* Expand each epsilon destination nodes. */
1642 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1643 for (i = 0; i < dfa->edests[node].nelem; ++i)
1644 {
1645 re_node_set eclosure_elem;
1646 int edest = dfa->edests[node].elems[i];
1647 /* If calculating the epsilon closure of `edest' is in progress,
1648 return intermediate result. */
1649 if (dfa->eclosures[edest].nelem == -1)
1650 {
1651 incomplete = 1;
1652 continue;
1653 }
1654 /* If we haven't calculated the epsilon closure of `edest' yet,
1655 calculate now. Otherwise use calculated epsilon closure. */
1656 if (dfa->eclosures[edest].nelem == 0)
1657 {
1658 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1659 if (BE (err != REG_NOERROR, 0))
1660 return err;
1661 }
1662 else
1663 eclosure_elem = dfa->eclosures[edest];
1664 /* Merge the epsilon closure of `edest'. */
1665 re_node_set_merge (&eclosure, &eclosure_elem);
1666 /* If the epsilon closure of `edest' is incomplete,
1667 the epsilon closure of this node is also incomplete. */
1668 if (dfa->eclosures[edest].nelem == 0)
1669 {
1670 incomplete = 1;
1671 re_node_set_free (&eclosure_elem);
1672 }
1673 }
1674
1675 /* Epsilon closures include itself. */
1676 re_node_set_insert (&eclosure, node);
1677 if (incomplete && !root)
1678 dfa->eclosures[node].nelem = 0;
1679 else
1680 dfa->eclosures[node] = eclosure;
1681 *new_set = eclosure;
1682 return REG_NOERROR;
1683 }
1684
1685 /* Functions for token which are used in the parser. */
1686
1687 /* Fetch a token from INPUT.
1688 We must not use this function inside bracket expressions. */
1689
1690 static void
1691 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1692 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1693 {
1694 re_string_skip_bytes (input, peek_token (result, input, syntax));
1695 }
1696
1697 /* Peek a token from INPUT, and return the length of the token.
1698 We must not use this function inside bracket expressions. */
1699
1700 static int
1701 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1702 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1703 {
1704 unsigned char c;
1705
1706 if (re_string_eoi (input))
1707 {
1708 token->type = END_OF_RE;
1709 return 0;
1710 }
1711
1712 c = re_string_peek_byte (input, 0);
1713 token->opr.c = c;
1714
1715 token->word_char = 0;
1716 #ifdef RE_ENABLE_I18N
1717 token->mb_partial = 0;
1718 if (input->mb_cur_max > 1 &&
1719 !re_string_first_byte (input, re_string_cur_idx (input)))
1720 {
1721 token->type = CHARACTER;
1722 token->mb_partial = 1;
1723 return 1;
1724 }
1725 #endif
1726 if (c == '\\')
1727 {
1728 unsigned char c2;
1729 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1730 {
1731 token->type = BACK_SLASH;
1732 return 1;
1733 }
1734
1735 c2 = re_string_peek_byte_case (input, 1);
1736 token->opr.c = c2;
1737 token->type = CHARACTER;
1738 #ifdef RE_ENABLE_I18N
1739 if (input->mb_cur_max > 1)
1740 {
1741 wint_t wc = re_string_wchar_at (input,
1742 re_string_cur_idx (input) + 1);
1743 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1744 }
1745 else
1746 #endif
1747 token->word_char = IS_WORD_CHAR (c2) != 0;
1748
1749 switch (c2)
1750 {
1751 case '|':
1752 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1753 token->type = OP_ALT;
1754 break;
1755 case '1': case '2': case '3': case '4': case '5':
1756 case '6': case '7': case '8': case '9':
1757 if (!(syntax & RE_NO_BK_REFS))
1758 {
1759 token->type = OP_BACK_REF;
1760 token->opr.idx = c2 - '1';
1761 }
1762 break;
1763 case '<':
1764 if (!(syntax & RE_NO_GNU_OPS))
1765 {
1766 token->type = ANCHOR;
1767 token->opr.ctx_type = WORD_FIRST;
1768 }
1769 break;
1770 case '>':
1771 if (!(syntax & RE_NO_GNU_OPS))
1772 {
1773 token->type = ANCHOR;
1774 token->opr.ctx_type = WORD_LAST;
1775 }
1776 break;
1777 case 'b':
1778 if (!(syntax & RE_NO_GNU_OPS))
1779 {
1780 token->type = ANCHOR;
1781 token->opr.ctx_type = WORD_DELIM;
1782 }
1783 break;
1784 case 'B':
1785 if (!(syntax & RE_NO_GNU_OPS))
1786 {
1787 token->type = ANCHOR;
1788 token->opr.ctx_type = NOT_WORD_DELIM;
1789 }
1790 break;
1791 case 'w':
1792 if (!(syntax & RE_NO_GNU_OPS))
1793 token->type = OP_WORD;
1794 break;
1795 case 'W':
1796 if (!(syntax & RE_NO_GNU_OPS))
1797 token->type = OP_NOTWORD;
1798 break;
1799 case 's':
1800 if (!(syntax & RE_NO_GNU_OPS))
1801 token->type = OP_SPACE;
1802 break;
1803 case 'S':
1804 if (!(syntax & RE_NO_GNU_OPS))
1805 token->type = OP_NOTSPACE;
1806 break;
1807 case '`':
1808 if (!(syntax & RE_NO_GNU_OPS))
1809 {
1810 token->type = ANCHOR;
1811 token->opr.ctx_type = BUF_FIRST;
1812 }
1813 break;
1814 case '\'':
1815 if (!(syntax & RE_NO_GNU_OPS))
1816 {
1817 token->type = ANCHOR;
1818 token->opr.ctx_type = BUF_LAST;
1819 }
1820 break;
1821 case '(':
1822 if (!(syntax & RE_NO_BK_PARENS))
1823 token->type = OP_OPEN_SUBEXP;
1824 break;
1825 case ')':
1826 if (!(syntax & RE_NO_BK_PARENS))
1827 token->type = OP_CLOSE_SUBEXP;
1828 break;
1829 case '+':
1830 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1831 token->type = OP_DUP_PLUS;
1832 break;
1833 case '?':
1834 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1835 token->type = OP_DUP_QUESTION;
1836 break;
1837 case '{':
1838 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1839 token->type = OP_OPEN_DUP_NUM;
1840 break;
1841 case '}':
1842 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1843 token->type = OP_CLOSE_DUP_NUM;
1844 break;
1845 default:
1846 break;
1847 }
1848 return 2;
1849 }
1850
1851 token->type = CHARACTER;
1852 #ifdef RE_ENABLE_I18N
1853 if (input->mb_cur_max > 1)
1854 {
1855 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1856 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1857 }
1858 else
1859 #endif
1860 token->word_char = IS_WORD_CHAR (token->opr.c);
1861
1862 switch (c)
1863 {
1864 case '\n':
1865 if (syntax & RE_NEWLINE_ALT)
1866 token->type = OP_ALT;
1867 break;
1868 case '|':
1869 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1870 token->type = OP_ALT;
1871 break;
1872 case '*':
1873 token->type = OP_DUP_ASTERISK;
1874 break;
1875 case '+':
1876 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1877 token->type = OP_DUP_PLUS;
1878 break;
1879 case '?':
1880 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1881 token->type = OP_DUP_QUESTION;
1882 break;
1883 case '{':
1884 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1885 token->type = OP_OPEN_DUP_NUM;
1886 break;
1887 case '}':
1888 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1889 token->type = OP_CLOSE_DUP_NUM;
1890 break;
1891 case '(':
1892 if (syntax & RE_NO_BK_PARENS)
1893 token->type = OP_OPEN_SUBEXP;
1894 break;
1895 case ')':
1896 if (syntax & RE_NO_BK_PARENS)
1897 token->type = OP_CLOSE_SUBEXP;
1898 break;
1899 case '[':
1900 token->type = OP_OPEN_BRACKET;
1901 break;
1902 case '.':
1903 token->type = OP_PERIOD;
1904 break;
1905 case '^':
1906 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1907 re_string_cur_idx (input) != 0)
1908 {
1909 char prev = re_string_peek_byte (input, -1);
1910 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1911 break;
1912 }
1913 token->type = ANCHOR;
1914 token->opr.ctx_type = LINE_FIRST;
1915 break;
1916 case '$':
1917 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1918 re_string_cur_idx (input) + 1 != re_string_length (input))
1919 {
1920 re_token_t next;
1921 re_string_skip_bytes (input, 1);
1922 peek_token (&next, input, syntax);
1923 re_string_skip_bytes (input, -1);
1924 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1925 break;
1926 }
1927 token->type = ANCHOR;
1928 token->opr.ctx_type = LINE_LAST;
1929 break;
1930 default:
1931 break;
1932 }
1933 return 1;
1934 }
1935
1936 /* Peek a token from INPUT, and return the length of the token.
1937 We must not use this function out of bracket expressions. */
1938
1939 static int
1940 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1941 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1942 {
1943 unsigned char c;
1944 if (re_string_eoi (input))
1945 {
1946 token->type = END_OF_RE;
1947 return 0;
1948 }
1949 c = re_string_peek_byte (input, 0);
1950 token->opr.c = c;
1951
1952 #ifdef RE_ENABLE_I18N
1953 if (input->mb_cur_max > 1 &&
1954 !re_string_first_byte (input, re_string_cur_idx (input)))
1955 {
1956 token->type = CHARACTER;
1957 return 1;
1958 }
1959 #endif /* RE_ENABLE_I18N */
1960
1961 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1962 && re_string_cur_idx (input) + 1 < re_string_length (input))
1963 {
1964 /* In this case, '\' escape a character. */
1965 unsigned char c2;
1966 re_string_skip_bytes (input, 1);
1967 c2 = re_string_peek_byte (input, 0);
1968 token->opr.c = c2;
1969 token->type = CHARACTER;
1970 return 1;
1971 }
1972 if (c == '[') /* '[' is a special char in a bracket exps. */
1973 {
1974 unsigned char c2;
1975 int token_len;
1976 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1977 c2 = re_string_peek_byte (input, 1);
1978 else
1979 c2 = 0;
1980 token->opr.c = c2;
1981 token_len = 2;
1982 switch (c2)
1983 {
1984 case '.':
1985 token->type = OP_OPEN_COLL_ELEM;
1986 break;
1987 case '=':
1988 token->type = OP_OPEN_EQUIV_CLASS;
1989 break;
1990 case ':':
1991 if (syntax & RE_CHAR_CLASSES)
1992 {
1993 token->type = OP_OPEN_CHAR_CLASS;
1994 break;
1995 }
1996 /* else fall through. */
1997 default:
1998 token->type = CHARACTER;
1999 token->opr.c = c;
2000 token_len = 1;
2001 break;
2002 }
2003 return token_len;
2004 }
2005 switch (c)
2006 {
2007 case '-':
2008 token->type = OP_CHARSET_RANGE;
2009 break;
2010 case ']':
2011 token->type = OP_CLOSE_BRACKET;
2012 break;
2013 case '^':
2014 token->type = OP_NON_MATCH_LIST;
2015 break;
2016 default:
2017 token->type = CHARACTER;
2018 }
2019 return 1;
2020 }
2021
2022 /* Functions for parser. */
2023
2024 /* Entry point of the parser.
2025 Parse the regular expression REGEXP and return the structure tree.
2026 If an error is occured, ERR is set by error code, and return NULL.
2027 This function build the following tree, from regular expression <reg_exp>:
2028 CAT
2029 / \
2030 / \
2031 <reg_exp> EOR
2032
2033 CAT means concatenation.
2034 EOR means end of regular expression. */
2035
2036 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2037 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2038 reg_errcode_t *err)
2039 {
2040 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2041 bin_tree_t *tree, *eor, *root;
2042 re_token_t current_token;
2043 dfa->syntax = syntax;
2044 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2045 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2046 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2047 return NULL;
2048 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2049 if (tree != NULL)
2050 root = create_tree (dfa, tree, eor, CONCAT);
2051 else
2052 root = eor;
2053 if (BE (eor == NULL || root == NULL, 0))
2054 {
2055 *err = REG_ESPACE;
2056 return NULL;
2057 }
2058 return root;
2059 }
2060
2061 /* This function build the following tree, from regular expression
2062 <branch1>|<branch2>:
2063 ALT
2064 / \
2065 / \
2066 <branch1> <branch2>
2067
2068 ALT means alternative, which represents the operator `|'. */
2069
2070 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2071 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2072 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2073 {
2074 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2075 bin_tree_t *tree, *branch = NULL;
2076 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2077 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2078 return NULL;
2079
2080 while (token->type == OP_ALT)
2081 {
2082 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2083 if (token->type != OP_ALT && token->type != END_OF_RE
2084 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2085 {
2086 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2087 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2088 return NULL;
2089 }
2090 else
2091 branch = NULL;
2092 tree = create_tree (dfa, tree, branch, OP_ALT);
2093 if (BE (tree == NULL, 0))
2094 {
2095 *err = REG_ESPACE;
2096 return NULL;
2097 }
2098 }
2099 return tree;
2100 }
2101
2102 /* This function build the following tree, from regular expression
2103 <exp1><exp2>:
2104 CAT
2105 / \
2106 / \
2107 <exp1> <exp2>
2108
2109 CAT means concatenation. */
2110
2111 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2112 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2113 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2114 {
2115 bin_tree_t *tree, *exp;
2116 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2117 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2118 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2119 return NULL;
2120
2121 while (token->type != OP_ALT && token->type != END_OF_RE
2122 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2123 {
2124 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2125 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2126 {
2127 return NULL;
2128 }
2129 if (tree != NULL && exp != NULL)
2130 {
2131 tree = create_tree (dfa, tree, exp, CONCAT);
2132 if (tree == NULL)
2133 {
2134 *err = REG_ESPACE;
2135 return NULL;
2136 }
2137 }
2138 else if (tree == NULL)
2139 tree = exp;
2140 /* Otherwise exp == NULL, we don't need to create new tree. */
2141 }
2142 return tree;
2143 }
2144
2145 /* This function build the following tree, from regular expression a*:
2146 *
2147 |
2148 a
2149 */
2150
2151 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2152 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2153 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2154 {
2155 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2156 bin_tree_t *tree;
2157 switch (token->type)
2158 {
2159 case CHARACTER:
2160 tree = create_token_tree (dfa, NULL, NULL, token);
2161 if (BE (tree == NULL, 0))
2162 {
2163 *err = REG_ESPACE;
2164 return NULL;
2165 }
2166 #ifdef RE_ENABLE_I18N
2167 if (dfa->mb_cur_max > 1)
2168 {
2169 while (!re_string_eoi (regexp)
2170 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2171 {
2172 bin_tree_t *mbc_remain;
2173 fetch_token (token, regexp, syntax);
2174 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2175 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2176 if (BE (mbc_remain == NULL || tree == NULL, 0))
2177 {
2178 *err = REG_ESPACE;
2179 return NULL;
2180 }
2181 }
2182 }
2183 #endif
2184 break;
2185 case OP_OPEN_SUBEXP:
2186 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2187 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2188 return NULL;
2189 break;
2190 case OP_OPEN_BRACKET:
2191 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2192 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193 return NULL;
2194 break;
2195 case OP_BACK_REF:
2196 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2197 {
2198 *err = REG_ESUBREG;
2199 return NULL;
2200 }
2201 dfa->used_bkref_map |= 1 << token->opr.idx;
2202 tree = create_token_tree (dfa, NULL, NULL, token);
2203 if (BE (tree == NULL, 0))
2204 {
2205 *err = REG_ESPACE;
2206 return NULL;
2207 }
2208 ++dfa->nbackref;
2209 dfa->has_mb_node = 1;
2210 break;
2211 case OP_OPEN_DUP_NUM:
2212 if (syntax & RE_CONTEXT_INVALID_DUP)
2213 {
2214 *err = REG_BADRPT;
2215 return NULL;
2216 }
2217 /* FALLTHROUGH */
2218 case OP_DUP_ASTERISK:
2219 case OP_DUP_PLUS:
2220 case OP_DUP_QUESTION:
2221 if (syntax & RE_CONTEXT_INVALID_OPS)
2222 {
2223 *err = REG_BADRPT;
2224 return NULL;
2225 }
2226 else if (syntax & RE_CONTEXT_INDEP_OPS)
2227 {
2228 fetch_token (token, regexp, syntax);
2229 return parse_expression (regexp, preg, token, syntax, nest, err);
2230 }
2231 /* else fall through */
2232 case OP_CLOSE_SUBEXP:
2233 if ((token->type == OP_CLOSE_SUBEXP) &&
2234 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2235 {
2236 *err = REG_ERPAREN;
2237 return NULL;
2238 }
2239 /* else fall through */
2240 case OP_CLOSE_DUP_NUM:
2241 /* We treat it as a normal character. */
2242
2243 /* Then we can these characters as normal characters. */
2244 token->type = CHARACTER;
2245 /* mb_partial and word_char bits should be initialized already
2246 by peek_token. */
2247 tree = create_token_tree (dfa, NULL, NULL, token);
2248 if (BE (tree == NULL, 0))
2249 {
2250 *err = REG_ESPACE;
2251 return NULL;
2252 }
2253 break;
2254 case ANCHOR:
2255 if ((token->opr.ctx_type
2256 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2257 && dfa->word_ops_used == 0)
2258 init_word_char (dfa);
2259 if (token->opr.ctx_type == WORD_DELIM
2260 || token->opr.ctx_type == NOT_WORD_DELIM)
2261 {
2262 bin_tree_t *tree_first, *tree_last;
2263 if (token->opr.ctx_type == WORD_DELIM)
2264 {
2265 token->opr.ctx_type = WORD_FIRST;
2266 tree_first = create_token_tree (dfa, NULL, NULL, token);
2267 token->opr.ctx_type = WORD_LAST;
2268 }
2269 else
2270 {
2271 token->opr.ctx_type = INSIDE_WORD;
2272 tree_first = create_token_tree (dfa, NULL, NULL, token);
2273 token->opr.ctx_type = INSIDE_NOTWORD;
2274 }
2275 tree_last = create_token_tree (dfa, NULL, NULL, token);
2276 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2277 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2278 {
2279 *err = REG_ESPACE;
2280 return NULL;
2281 }
2282 }
2283 else
2284 {
2285 tree = create_token_tree (dfa, NULL, NULL, token);
2286 if (BE (tree == NULL, 0))
2287 {
2288 *err = REG_ESPACE;
2289 return NULL;
2290 }
2291 }
2292 /* We must return here, since ANCHORs can't be followed
2293 by repetition operators.
2294 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2295 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2296 fetch_token (token, regexp, syntax);
2297 return tree;
2298 case OP_PERIOD:
2299 tree = create_token_tree (dfa, NULL, NULL, token);
2300 if (BE (tree == NULL, 0))
2301 {
2302 *err = REG_ESPACE;
2303 return NULL;
2304 }
2305 if (dfa->mb_cur_max > 1)
2306 dfa->has_mb_node = 1;
2307 break;
2308 case OP_WORD:
2309 case OP_NOTWORD:
2310 tree = build_charclass_op (dfa, regexp->trans,
2311 (const unsigned char *) "alnum",
2312 (const unsigned char *) "_",
2313 token->type == OP_NOTWORD, err);
2314 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2315 return NULL;
2316 break;
2317 case OP_SPACE:
2318 case OP_NOTSPACE:
2319 tree = build_charclass_op (dfa, regexp->trans,
2320 (const unsigned char *) "space",
2321 (const unsigned char *) "",
2322 token->type == OP_NOTSPACE, err);
2323 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2324 return NULL;
2325 break;
2326 case OP_ALT:
2327 case END_OF_RE:
2328 return NULL;
2329 case BACK_SLASH:
2330 *err = REG_EESCAPE;
2331 return NULL;
2332 default:
2333 /* Must not happen? */
2334 #ifdef DEBUG
2335 assert (0);
2336 #endif
2337 return NULL;
2338 }
2339 fetch_token (token, regexp, syntax);
2340
2341 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2342 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2343 {
2344 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2345 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2346 return NULL;
2347 /* In BRE consecutive duplications are not allowed. */
2348 if ((syntax & RE_CONTEXT_INVALID_DUP)
2349 && (token->type == OP_DUP_ASTERISK
2350 || token->type == OP_OPEN_DUP_NUM))
2351 {
2352 *err = REG_BADRPT;
2353 return NULL;
2354 }
2355 }
2356
2357 return tree;
2358 }
2359
2360 /* This function build the following tree, from regular expression
2361 (<reg_exp>):
2362 SUBEXP
2363 |
2364 <reg_exp>
2365 */
2366
2367 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,int nest,reg_errcode_t * err)2368 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2369 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2370 {
2371 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2372 bin_tree_t *tree;
2373 size_t cur_nsub;
2374 cur_nsub = preg->re_nsub++;
2375
2376 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2377
2378 /* The subexpression may be a null string. */
2379 if (token->type == OP_CLOSE_SUBEXP)
2380 tree = NULL;
2381 else
2382 {
2383 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2384 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2385 *err = REG_EPAREN;
2386 if (BE (*err != REG_NOERROR, 0))
2387 return NULL;
2388 }
2389
2390 if (cur_nsub <= '9' - '1')
2391 dfa->completed_bkref_map |= 1 << cur_nsub;
2392
2393 tree = create_tree (dfa, tree, NULL, SUBEXP);
2394 if (BE (tree == NULL, 0))
2395 {
2396 *err = REG_ESPACE;
2397 return NULL;
2398 }
2399 tree->token.opr.idx = cur_nsub;
2400 return tree;
2401 }
2402
2403 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2404
2405 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2406 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2407 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2408 {
2409 bin_tree_t *tree = NULL, *old_tree = NULL;
2410 int i, start, end, start_idx = re_string_cur_idx (regexp);
2411 re_token_t start_token = *token;
2412
2413 if (token->type == OP_OPEN_DUP_NUM)
2414 {
2415 end = 0;
2416 start = fetch_number (regexp, token, syntax);
2417 if (start == -1)
2418 {
2419 if (token->type == CHARACTER && token->opr.c == ',')
2420 start = 0; /* We treat "{,m}" as "{0,m}". */
2421 else
2422 {
2423 *err = REG_BADBR; /* <re>{} is invalid. */
2424 return NULL;
2425 }
2426 }
2427 if (BE (start != -2, 1))
2428 {
2429 /* We treat "{n}" as "{n,n}". */
2430 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2431 : ((token->type == CHARACTER && token->opr.c == ',')
2432 ? fetch_number (regexp, token, syntax) : -2));
2433 }
2434 if (BE (start == -2 || end == -2, 0))
2435 {
2436 /* Invalid sequence. */
2437 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2438 {
2439 if (token->type == END_OF_RE)
2440 *err = REG_EBRACE;
2441 else
2442 *err = REG_BADBR;
2443
2444 return NULL;
2445 }
2446
2447 /* If the syntax bit is set, rollback. */
2448 re_string_set_index (regexp, start_idx);
2449 *token = start_token;
2450 token->type = CHARACTER;
2451 /* mb_partial and word_char bits should be already initialized by
2452 peek_token. */
2453 return elem;
2454 }
2455
2456 if (BE (end != -1 && start > end, 0))
2457 {
2458 /* First number greater than second. */
2459 *err = REG_BADBR;
2460 return NULL;
2461 }
2462 }
2463 else
2464 {
2465 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2466 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2467 }
2468
2469 fetch_token (token, regexp, syntax);
2470
2471 if (BE (elem == NULL, 0))
2472 return NULL;
2473 if (BE (start == 0 && end == 0, 0))
2474 {
2475 postorder (elem, free_tree, NULL);
2476 return NULL;
2477 }
2478
2479 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2480 if (BE (start > 0, 0))
2481 {
2482 tree = elem;
2483 for (i = 2; i <= start; ++i)
2484 {
2485 elem = duplicate_tree (elem, dfa);
2486 tree = create_tree (dfa, tree, elem, CONCAT);
2487 if (BE (elem == NULL || tree == NULL, 0))
2488 goto parse_dup_op_espace;
2489 }
2490
2491 if (start == end)
2492 return tree;
2493
2494 /* Duplicate ELEM before it is marked optional. */
2495 elem = duplicate_tree (elem, dfa);
2496 old_tree = tree;
2497 }
2498 else
2499 old_tree = NULL;
2500
2501 if (elem->token.type == SUBEXP)
2502 #ifdef _WIN64
2503 postorder (elem, mark_opt_subexp, (void *) (__int64) elem->token.opr.idx);
2504 #else
2505 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2506 #endif
2507
2508 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2509 if (BE (tree == NULL, 0))
2510 goto parse_dup_op_espace;
2511
2512 /* This loop is actually executed only when end != -1,
2513 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2514 already created the start+1-th copy. */
2515 for (i = start + 2; i <= end; ++i)
2516 {
2517 elem = duplicate_tree (elem, dfa);
2518 tree = create_tree (dfa, tree, elem, CONCAT);
2519 if (BE (elem == NULL || tree == NULL, 0))
2520 goto parse_dup_op_espace;
2521
2522 tree = create_tree (dfa, tree, NULL, OP_ALT);
2523 if (BE (tree == NULL, 0))
2524 goto parse_dup_op_espace;
2525 }
2526
2527 if (old_tree)
2528 tree = create_tree (dfa, old_tree, tree, CONCAT);
2529
2530 return tree;
2531
2532 parse_dup_op_espace:
2533 *err = REG_ESPACE;
2534 return NULL;
2535 }
2536
2537 /* Size of the names for collating symbol/equivalence_class/character_class.
2538 I'm not sure, but maybe enough. */
2539 #define BRACKET_NAME_BUF_SIZE 32
2540
2541 #ifndef _LIBC
2542 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2543 Build the range expression which starts from START_ELEM, and ends
2544 at END_ELEM. The result are written to MBCSET and SBCSET.
2545 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2546 mbcset->range_ends, is a pointer argument sinse we may
2547 update it. */
2548
2549 static reg_errcode_t
2550 internal_function
2551 # ifdef RE_ENABLE_I18N
build_range_exp(bitset_t sbcset,re_charset_t * mbcset,int * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2552 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2553 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2554 # else /* not RE_ENABLE_I18N */
2555 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2556 bracket_elem_t *end_elem)
2557 # endif /* not RE_ENABLE_I18N */
2558 {
2559 unsigned int start_ch, end_ch;
2560 /* Equivalence Classes and Character Classes can't be a range start/end. */
2561 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2562 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2563 0))
2564 return REG_ERANGE;
2565
2566 /* We can handle no multi character collating elements without libc
2567 support. */
2568 if (BE ((start_elem->type == COLL_SYM
2569 && strlen ((char *) start_elem->opr.name) > 1)
2570 || (end_elem->type == COLL_SYM
2571 && strlen ((char *) end_elem->opr.name) > 1), 0))
2572 return REG_ECOLLATE;
2573
2574 # ifdef RE_ENABLE_I18N
2575 {
2576 wchar_t wc;
2577 wint_t start_wc;
2578 wint_t end_wc;
2579 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2580
2581 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2582 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2583 : 0));
2584 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2585 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2586 : 0));
2587 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2588 ? __btowc (start_ch) : start_elem->opr.wch);
2589 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2590 ? __btowc (end_ch) : end_elem->opr.wch);
2591 if (start_wc == WEOF || end_wc == WEOF)
2592 return REG_ECOLLATE;
2593 cmp_buf[0] = start_wc;
2594 cmp_buf[4] = end_wc;
2595 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2596 return REG_ERANGE;
2597
2598 /* Got valid collation sequence values, add them as a new entry.
2599 However, for !_LIBC we have no collation elements: if the
2600 character set is single byte, the single byte character set
2601 that we build below suffices. parse_bracket_exp passes
2602 no MBCSET if dfa->mb_cur_max == 1. */
2603 if (mbcset)
2604 {
2605 /* Check the space of the arrays. */
2606 if (BE (*range_alloc == mbcset->nranges, 0))
2607 {
2608 /* There is not enough space, need realloc. */
2609 wchar_t *new_array_start, *new_array_end;
2610 int new_nranges;
2611
2612 /* +1 in case of mbcset->nranges is 0. */
2613 new_nranges = 2 * mbcset->nranges + 1;
2614 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2615 are NULL if *range_alloc == 0. */
2616 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2617 new_nranges);
2618 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2619 new_nranges);
2620
2621 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2622 return REG_ESPACE;
2623
2624 mbcset->range_starts = new_array_start;
2625 mbcset->range_ends = new_array_end;
2626 *range_alloc = new_nranges;
2627 }
2628
2629 mbcset->range_starts[mbcset->nranges] = start_wc;
2630 mbcset->range_ends[mbcset->nranges++] = end_wc;
2631 }
2632
2633 /* Build the table for single byte characters. */
2634 for (wc = 0; wc < SBC_MAX; ++wc)
2635 {
2636 cmp_buf[2] = wc;
2637 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2638 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2639 bitset_set (sbcset, wc);
2640 }
2641 }
2642 # else /* not RE_ENABLE_I18N */
2643 {
2644 unsigned int ch;
2645 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2646 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2647 : 0));
2648 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2649 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2650 : 0));
2651 if (start_ch > end_ch)
2652 return REG_ERANGE;
2653 /* Build the table for single byte characters. */
2654 for (ch = 0; ch < SBC_MAX; ++ch)
2655 if (start_ch <= ch && ch <= end_ch)
2656 bitset_set (sbcset, ch);
2657 }
2658 # endif /* not RE_ENABLE_I18N */
2659 return REG_NOERROR;
2660 }
2661 #endif /* not _LIBC */
2662
2663 #ifndef _LIBC
2664 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2665 Build the collating element which is represented by NAME.
2666 The result are written to MBCSET and SBCSET.
2667 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2668 pointer argument since we may update it. */
2669
2670 static reg_errcode_t
2671 internal_function
2672 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,int * coll_sym_alloc,const unsigned char * name)2673 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2674 int *coll_sym_alloc, const unsigned char *name)
2675 # else /* not RE_ENABLE_I18N */
2676 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2677 # endif /* not RE_ENABLE_I18N */
2678 {
2679 size_t name_len = strlen ((const char *) name);
2680 if (BE (name_len != 1, 0))
2681 return REG_ECOLLATE;
2682 else
2683 {
2684 bitset_set (sbcset, name[0]);
2685 return REG_NOERROR;
2686 }
2687 }
2688 #endif /* not _LIBC */
2689
2690 /* This function parse bracket expression like "[abc]", "[a-c]",
2691 "[[.a-a.]]" etc. */
2692
2693 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2694 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2695 reg_syntax_t syntax, reg_errcode_t *err)
2696 {
2697 #ifdef _LIBC
2698 const unsigned char *collseqmb;
2699 const char *collseqwc;
2700 uint32_t nrules;
2701 int32_t table_size;
2702 const int32_t *symb_table;
2703 const unsigned char *extra;
2704
2705 /* Local function for parse_bracket_exp used in _LIBC environement.
2706 Seek the collating symbol entry correspondings to NAME.
2707 Return the index of the symbol in the SYMB_TABLE. */
2708
2709 auto inline int32_t
2710 __attribute ((always_inline))
2711 seek_collating_symbol_entry (name, name_len)
2712 const unsigned char *name;
2713 size_t name_len;
2714 {
2715 int32_t hash = elem_hash ((const char *) name, name_len);
2716 int32_t elem = hash % table_size;
2717 if (symb_table[2 * elem] != 0)
2718 {
2719 int32_t second = hash % (table_size - 2) + 1;
2720
2721 do
2722 {
2723 /* First compare the hashing value. */
2724 if (symb_table[2 * elem] == hash
2725 /* Compare the length of the name. */
2726 && name_len == extra[symb_table[2 * elem + 1]]
2727 /* Compare the name. */
2728 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2729 name_len) == 0)
2730 {
2731 /* Yep, this is the entry. */
2732 break;
2733 }
2734
2735 /* Next entry. */
2736 elem += second;
2737 }
2738 while (symb_table[2 * elem] != 0);
2739 }
2740 return elem;
2741 }
2742
2743 /* Local function for parse_bracket_exp used in _LIBC environement.
2744 Look up the collation sequence value of BR_ELEM.
2745 Return the value if succeeded, UINT_MAX otherwise. */
2746
2747 auto inline unsigned int
2748 __attribute ((always_inline))
2749 lookup_collation_sequence_value (br_elem)
2750 bracket_elem_t *br_elem;
2751 {
2752 if (br_elem->type == SB_CHAR)
2753 {
2754 /*
2755 if (MB_CUR_MAX == 1)
2756 */
2757 if (nrules == 0)
2758 return collseqmb[br_elem->opr.ch];
2759 else
2760 {
2761 wint_t wc = __btowc (br_elem->opr.ch);
2762 return __collseq_table_lookup (collseqwc, wc);
2763 }
2764 }
2765 else if (br_elem->type == MB_CHAR)
2766 {
2767 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2768 }
2769 else if (br_elem->type == COLL_SYM)
2770 {
2771 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2772 if (nrules != 0)
2773 {
2774 int32_t elem, idx;
2775 elem = seek_collating_symbol_entry (br_elem->opr.name,
2776 sym_name_len);
2777 if (symb_table[2 * elem] != 0)
2778 {
2779 /* We found the entry. */
2780 idx = symb_table[2 * elem + 1];
2781 /* Skip the name of collating element name. */
2782 idx += 1 + extra[idx];
2783 /* Skip the byte sequence of the collating element. */
2784 idx += 1 + extra[idx];
2785 /* Adjust for the alignment. */
2786 idx = (idx + 3) & ~3;
2787 /* Skip the multibyte collation sequence value. */
2788 idx += sizeof (unsigned int);
2789 /* Skip the wide char sequence of the collating element. */
2790 idx += sizeof (unsigned int) *
2791 (1 + *(unsigned int *) (extra + idx));
2792 /* Return the collation sequence value. */
2793 return *(unsigned int *) (extra + idx);
2794 }
2795 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2796 {
2797 /* No valid character. Match it as a single byte
2798 character. */
2799 return collseqmb[br_elem->opr.name[0]];
2800 }
2801 }
2802 else if (sym_name_len == 1)
2803 return collseqmb[br_elem->opr.name[0]];
2804 }
2805 return UINT_MAX;
2806 }
2807
2808 /* Local function for parse_bracket_exp used in _LIBC environement.
2809 Build the range expression which starts from START_ELEM, and ends
2810 at END_ELEM. The result are written to MBCSET and SBCSET.
2811 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2812 mbcset->range_ends, is a pointer argument sinse we may
2813 update it. */
2814
2815 auto inline reg_errcode_t
2816 __attribute ((always_inline))
2817 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2818 re_charset_t *mbcset;
2819 int *range_alloc;
2820 bitset_t sbcset;
2821 bracket_elem_t *start_elem, *end_elem;
2822 {
2823 unsigned int ch;
2824 uint32_t start_collseq;
2825 uint32_t end_collseq;
2826
2827 /* Equivalence Classes and Character Classes can't be a range
2828 start/end. */
2829 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2830 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2831 0))
2832 return REG_ERANGE;
2833
2834 start_collseq = lookup_collation_sequence_value (start_elem);
2835 end_collseq = lookup_collation_sequence_value (end_elem);
2836 /* Check start/end collation sequence values. */
2837 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2838 return REG_ECOLLATE;
2839 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2840 return REG_ERANGE;
2841
2842 /* Got valid collation sequence values, add them as a new entry.
2843 However, if we have no collation elements, and the character set
2844 is single byte, the single byte character set that we
2845 build below suffices. */
2846 if (nrules > 0 || dfa->mb_cur_max > 1)
2847 {
2848 /* Check the space of the arrays. */
2849 if (BE (*range_alloc == mbcset->nranges, 0))
2850 {
2851 /* There is not enough space, need realloc. */
2852 uint32_t *new_array_start;
2853 uint32_t *new_array_end;
2854 int new_nranges;
2855
2856 /* +1 in case of mbcset->nranges is 0. */
2857 new_nranges = 2 * mbcset->nranges + 1;
2858 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2859 new_nranges);
2860 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2861 new_nranges);
2862
2863 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2864 return REG_ESPACE;
2865
2866 mbcset->range_starts = new_array_start;
2867 mbcset->range_ends = new_array_end;
2868 *range_alloc = new_nranges;
2869 }
2870
2871 mbcset->range_starts[mbcset->nranges] = start_collseq;
2872 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2873 }
2874
2875 /* Build the table for single byte characters. */
2876 for (ch = 0; ch < SBC_MAX; ch++)
2877 {
2878 uint32_t ch_collseq;
2879 /*
2880 if (MB_CUR_MAX == 1)
2881 */
2882 if (nrules == 0)
2883 ch_collseq = collseqmb[ch];
2884 else
2885 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2886 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2887 bitset_set (sbcset, ch);
2888 }
2889 return REG_NOERROR;
2890 }
2891
2892 /* Local function for parse_bracket_exp used in _LIBC environement.
2893 Build the collating element which is represented by NAME.
2894 The result are written to MBCSET and SBCSET.
2895 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2896 pointer argument sinse we may update it. */
2897
2898 auto inline reg_errcode_t
2899 __attribute ((always_inline))
2900 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2901 re_charset_t *mbcset;
2902 int *coll_sym_alloc;
2903 bitset_t sbcset;
2904 const unsigned char *name;
2905 {
2906 int32_t elem, idx;
2907 size_t name_len = strlen ((const char *) name);
2908 if (nrules != 0)
2909 {
2910 elem = seek_collating_symbol_entry (name, name_len);
2911 if (symb_table[2 * elem] != 0)
2912 {
2913 /* We found the entry. */
2914 idx = symb_table[2 * elem + 1];
2915 /* Skip the name of collating element name. */
2916 idx += 1 + extra[idx];
2917 }
2918 else if (symb_table[2 * elem] == 0 && name_len == 1)
2919 {
2920 /* No valid character, treat it as a normal
2921 character. */
2922 bitset_set (sbcset, name[0]);
2923 return REG_NOERROR;
2924 }
2925 else
2926 return REG_ECOLLATE;
2927
2928 /* Got valid collation sequence, add it as a new entry. */
2929 /* Check the space of the arrays. */
2930 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2931 {
2932 /* Not enough, realloc it. */
2933 /* +1 in case of mbcset->ncoll_syms is 0. */
2934 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2935 /* Use realloc since mbcset->coll_syms is NULL
2936 if *alloc == 0. */
2937 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2938 new_coll_sym_alloc);
2939 if (BE (new_coll_syms == NULL, 0))
2940 return REG_ESPACE;
2941 mbcset->coll_syms = new_coll_syms;
2942 *coll_sym_alloc = new_coll_sym_alloc;
2943 }
2944 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2945 return REG_NOERROR;
2946 }
2947 else
2948 {
2949 if (BE (name_len != 1, 0))
2950 return REG_ECOLLATE;
2951 else
2952 {
2953 bitset_set (sbcset, name[0]);
2954 return REG_NOERROR;
2955 }
2956 }
2957 }
2958 #endif
2959
2960 re_token_t br_token;
2961 re_bitset_ptr_t sbcset;
2962 #ifdef RE_ENABLE_I18N
2963 re_charset_t *mbcset;
2964 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2965 int equiv_class_alloc = 0, char_class_alloc = 0;
2966 #endif /* not RE_ENABLE_I18N */
2967 int non_match = 0;
2968 bin_tree_t *work_tree;
2969 int token_len;
2970 int first_round = 1;
2971 #ifdef _LIBC
2972 collseqmb = (const unsigned char *)
2973 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2974 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2975 if (nrules)
2976 {
2977 /*
2978 if (MB_CUR_MAX > 1)
2979 */
2980 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2981 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2982 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2983 _NL_COLLATE_SYMB_TABLEMB);
2984 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2985 _NL_COLLATE_SYMB_EXTRAMB);
2986 }
2987 #endif
2988 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
2989 #ifdef RE_ENABLE_I18N
2990 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2991 #endif /* RE_ENABLE_I18N */
2992 #ifdef RE_ENABLE_I18N
2993 if (BE (sbcset == NULL || mbcset == NULL, 0))
2994 #else
2995 if (BE (sbcset == NULL, 0))
2996 #endif /* RE_ENABLE_I18N */
2997 {
2998 *err = REG_ESPACE;
2999 return NULL;
3000 }
3001
3002 token_len = peek_token_bracket (token, regexp, syntax);
3003 if (BE (token->type == END_OF_RE, 0))
3004 {
3005 *err = REG_BADPAT;
3006 goto parse_bracket_exp_free_return;
3007 }
3008 if (token->type == OP_NON_MATCH_LIST)
3009 {
3010 #ifdef RE_ENABLE_I18N
3011 mbcset->non_match = 1;
3012 #endif /* not RE_ENABLE_I18N */
3013 non_match = 1;
3014 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3015 bitset_set (sbcset, '\0');
3016 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3017 token_len = peek_token_bracket (token, regexp, syntax);
3018 if (BE (token->type == END_OF_RE, 0))
3019 {
3020 *err = REG_BADPAT;
3021 goto parse_bracket_exp_free_return;
3022 }
3023 }
3024
3025 /* We treat the first ']' as a normal character. */
3026 if (token->type == OP_CLOSE_BRACKET)
3027 token->type = CHARACTER;
3028
3029 while (1)
3030 {
3031 bracket_elem_t start_elem, end_elem;
3032 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3033 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3034 reg_errcode_t ret;
3035 int token_len2 = 0, is_range_exp = 0;
3036 re_token_t token2;
3037
3038 start_elem.opr.name = start_name_buf;
3039 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3040 syntax, first_round);
3041 if (BE (ret != REG_NOERROR, 0))
3042 {
3043 *err = ret;
3044 goto parse_bracket_exp_free_return;
3045 }
3046 first_round = 0;
3047
3048 /* Get information about the next token. We need it in any case. */
3049 token_len = peek_token_bracket (token, regexp, syntax);
3050
3051 /* Do not check for ranges if we know they are not allowed. */
3052 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3053 {
3054 if (BE (token->type == END_OF_RE, 0))
3055 {
3056 *err = REG_EBRACK;
3057 goto parse_bracket_exp_free_return;
3058 }
3059 if (token->type == OP_CHARSET_RANGE)
3060 {
3061 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3062 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3063 if (BE (token2.type == END_OF_RE, 0))
3064 {
3065 *err = REG_EBRACK;
3066 goto parse_bracket_exp_free_return;
3067 }
3068 if (token2.type == OP_CLOSE_BRACKET)
3069 {
3070 /* We treat the last '-' as a normal character. */
3071 re_string_skip_bytes (regexp, -token_len);
3072 token->type = CHARACTER;
3073 }
3074 else
3075 is_range_exp = 1;
3076 }
3077 }
3078
3079 if (is_range_exp == 1)
3080 {
3081 end_elem.opr.name = end_name_buf;
3082 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3083 dfa, syntax, 1);
3084 if (BE (ret != REG_NOERROR, 0))
3085 {
3086 *err = ret;
3087 goto parse_bracket_exp_free_return;
3088 }
3089
3090 token_len = peek_token_bracket (token, regexp, syntax);
3091
3092 #ifdef _LIBC
3093 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3094 &start_elem, &end_elem);
3095 #else
3096 # ifdef RE_ENABLE_I18N
3097 *err = build_range_exp (sbcset,
3098 dfa->mb_cur_max > 1 ? mbcset : NULL,
3099 &range_alloc, &start_elem, &end_elem);
3100 # else
3101 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3102 # endif
3103 #endif /* RE_ENABLE_I18N */
3104 if (BE (*err != REG_NOERROR, 0))
3105 goto parse_bracket_exp_free_return;
3106 }
3107 else
3108 {
3109 switch (start_elem.type)
3110 {
3111 case SB_CHAR:
3112 bitset_set (sbcset, start_elem.opr.ch);
3113 break;
3114 #ifdef RE_ENABLE_I18N
3115 case MB_CHAR:
3116 /* Check whether the array has enough space. */
3117 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3118 {
3119 wchar_t *new_mbchars;
3120 /* Not enough, realloc it. */
3121 /* +1 in case of mbcset->nmbchars is 0. */
3122 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3123 /* Use realloc since array is NULL if *alloc == 0. */
3124 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3125 mbchar_alloc);
3126 if (BE (new_mbchars == NULL, 0))
3127 goto parse_bracket_exp_espace;
3128 mbcset->mbchars = new_mbchars;
3129 }
3130 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3131 break;
3132 #endif /* RE_ENABLE_I18N */
3133 case EQUIV_CLASS:
3134 *err = build_equiv_class (sbcset,
3135 #ifdef RE_ENABLE_I18N
3136 mbcset, &equiv_class_alloc,
3137 #endif /* RE_ENABLE_I18N */
3138 start_elem.opr.name);
3139 if (BE (*err != REG_NOERROR, 0))
3140 goto parse_bracket_exp_free_return;
3141 break;
3142 case COLL_SYM:
3143 *err = build_collating_symbol (sbcset,
3144 #ifdef RE_ENABLE_I18N
3145 mbcset, &coll_sym_alloc,
3146 #endif /* RE_ENABLE_I18N */
3147 start_elem.opr.name);
3148 if (BE (*err != REG_NOERROR, 0))
3149 goto parse_bracket_exp_free_return;
3150 break;
3151 case CHAR_CLASS:
3152 *err = build_charclass (regexp->trans, sbcset,
3153 #ifdef RE_ENABLE_I18N
3154 mbcset, &char_class_alloc,
3155 #endif /* RE_ENABLE_I18N */
3156 start_elem.opr.name, syntax);
3157 if (BE (*err != REG_NOERROR, 0))
3158 goto parse_bracket_exp_free_return;
3159 break;
3160 default:
3161 assert (0);
3162 break;
3163 }
3164 }
3165 if (BE (token->type == END_OF_RE, 0))
3166 {
3167 *err = REG_EBRACK;
3168 goto parse_bracket_exp_free_return;
3169 }
3170 if (token->type == OP_CLOSE_BRACKET)
3171 break;
3172 }
3173
3174 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3175
3176 /* If it is non-matching list. */
3177 if (non_match)
3178 bitset_not (sbcset);
3179
3180 #ifdef RE_ENABLE_I18N
3181 /* Ensure only single byte characters are set. */
3182 if (dfa->mb_cur_max > 1)
3183 bitset_mask (sbcset, dfa->sb_char);
3184
3185 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3186 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3187 || mbcset->non_match)))
3188 {
3189 bin_tree_t *mbc_tree;
3190 int sbc_idx;
3191 /* Build a tree for complex bracket. */
3192 dfa->has_mb_node = 1;
3193 br_token.type = COMPLEX_BRACKET;
3194 br_token.opr.mbcset = mbcset;
3195 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3196 if (BE (mbc_tree == NULL, 0))
3197 goto parse_bracket_exp_espace;
3198 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3199 if (sbcset[sbc_idx])
3200 break;
3201 /* If there are no bits set in sbcset, there is no point
3202 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3203 if (sbc_idx < BITSET_WORDS)
3204 {
3205 /* Build a tree for simple bracket. */
3206 br_token.type = SIMPLE_BRACKET;
3207 br_token.opr.sbcset = sbcset;
3208 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3209 if (BE (work_tree == NULL, 0))
3210 goto parse_bracket_exp_espace;
3211
3212 /* Then join them by ALT node. */
3213 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3214 if (BE (work_tree == NULL, 0))
3215 goto parse_bracket_exp_espace;
3216 }
3217 else
3218 {
3219 re_free (sbcset);
3220 work_tree = mbc_tree;
3221 }
3222 }
3223 else
3224 #endif /* not RE_ENABLE_I18N */
3225 {
3226 #ifdef RE_ENABLE_I18N
3227 free_charset (mbcset);
3228 #endif
3229 /* Build a tree for simple bracket. */
3230 br_token.type = SIMPLE_BRACKET;
3231 br_token.opr.sbcset = sbcset;
3232 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3233 if (BE (work_tree == NULL, 0))
3234 goto parse_bracket_exp_espace;
3235 }
3236 return work_tree;
3237
3238 parse_bracket_exp_espace:
3239 *err = REG_ESPACE;
3240 parse_bracket_exp_free_return:
3241 re_free (sbcset);
3242 #ifdef RE_ENABLE_I18N
3243 free_charset (mbcset);
3244 #endif /* RE_ENABLE_I18N */
3245 return NULL;
3246 }
3247
3248 /* Parse an element in the bracket expression. */
3249
3250 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,int accept_hyphen)3251 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3252 re_token_t *token, int token_len, re_dfa_t *dfa,
3253 reg_syntax_t syntax, int accept_hyphen)
3254 {
3255 #ifdef RE_ENABLE_I18N
3256 int cur_char_size;
3257 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3258 if (cur_char_size > 1)
3259 {
3260 elem->type = MB_CHAR;
3261 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3262 re_string_skip_bytes (regexp, cur_char_size);
3263 return REG_NOERROR;
3264 }
3265 #endif /* RE_ENABLE_I18N */
3266 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3267 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3268 || token->type == OP_OPEN_EQUIV_CLASS)
3269 return parse_bracket_symbol (elem, regexp, token);
3270 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3271 {
3272 /* A '-' must only appear as anything but a range indicator before
3273 the closing bracket. Everything else is an error. */
3274 re_token_t token2;
3275 (void) peek_token_bracket (&token2, regexp, syntax);
3276 if (token2.type != OP_CLOSE_BRACKET)
3277 /* The actual error value is not standardized since this whole
3278 case is undefined. But ERANGE makes good sense. */
3279 return REG_ERANGE;
3280 }
3281 elem->type = SB_CHAR;
3282 elem->opr.ch = token->opr.c;
3283 return REG_NOERROR;
3284 }
3285
3286 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3287 such as [:<character_class>:], [.<collating_element>.], and
3288 [=<equivalent_class>=]. */
3289
3290 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3291 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3292 re_token_t *token)
3293 {
3294 unsigned char ch, delim = token->opr.c;
3295 int i = 0;
3296 if (re_string_eoi(regexp))
3297 return REG_EBRACK;
3298 for (;; ++i)
3299 {
3300 if (i >= BRACKET_NAME_BUF_SIZE)
3301 return REG_EBRACK;
3302 if (token->type == OP_OPEN_CHAR_CLASS)
3303 ch = re_string_fetch_byte_case (regexp);
3304 else
3305 ch = re_string_fetch_byte (regexp);
3306 if (re_string_eoi(regexp))
3307 return REG_EBRACK;
3308 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3309 break;
3310 elem->opr.name[i] = ch;
3311 }
3312 re_string_skip_bytes (regexp, 1);
3313 elem->opr.name[i] = '\0';
3314 switch (token->type)
3315 {
3316 case OP_OPEN_COLL_ELEM:
3317 elem->type = COLL_SYM;
3318 break;
3319 case OP_OPEN_EQUIV_CLASS:
3320 elem->type = EQUIV_CLASS;
3321 break;
3322 case OP_OPEN_CHAR_CLASS:
3323 elem->type = CHAR_CLASS;
3324 break;
3325 default:
3326 break;
3327 }
3328 return REG_NOERROR;
3329 }
3330
3331 /* Helper function for parse_bracket_exp.
3332 Build the equivalence class which is represented by NAME.
3333 The result are written to MBCSET and SBCSET.
3334 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3335 is a pointer argument sinse we may update it. */
3336
3337 static reg_errcode_t
3338 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,int * equiv_class_alloc,const unsigned char * name)3339 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3340 int *equiv_class_alloc, const unsigned char *name)
3341 #else /* not RE_ENABLE_I18N */
3342 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3343 #endif /* not RE_ENABLE_I18N */
3344 {
3345 #ifdef _LIBC
3346 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3347 if (nrules != 0)
3348 {
3349 const int32_t *table, *indirect;
3350 const unsigned char *weights, *extra, *cp;
3351 unsigned char char_buf[2];
3352 int32_t idx1, idx2;
3353 unsigned int ch;
3354 size_t len;
3355 /* This #include defines a local function! */
3356 # include <locale/weight.h>
3357 /* Calculate the index for equivalence class. */
3358 cp = name;
3359 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3360 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3361 _NL_COLLATE_WEIGHTMB);
3362 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3363 _NL_COLLATE_EXTRAMB);
3364 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3365 _NL_COLLATE_INDIRECTMB);
3366 idx1 = findidx (&cp);
3367 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3368 /* This isn't a valid character. */
3369 return REG_ECOLLATE;
3370
3371 /* Build single byte matcing table for this equivalence class. */
3372 char_buf[1] = (unsigned char) '\0';
3373 len = weights[idx1];
3374 for (ch = 0; ch < SBC_MAX; ++ch)
3375 {
3376 char_buf[0] = ch;
3377 cp = char_buf;
3378 idx2 = findidx (&cp);
3379 /*
3380 idx2 = table[ch];
3381 */
3382 if (idx2 == 0)
3383 /* This isn't a valid character. */
3384 continue;
3385 if (len == weights[idx2])
3386 {
3387 int cnt = 0;
3388 while (cnt <= len &&
3389 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3390 ++cnt;
3391
3392 if (cnt > len)
3393 bitset_set (sbcset, ch);
3394 }
3395 }
3396 /* Check whether the array has enough space. */
3397 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3398 {
3399 /* Not enough, realloc it. */
3400 /* +1 in case of mbcset->nequiv_classes is 0. */
3401 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3402 /* Use realloc since the array is NULL if *alloc == 0. */
3403 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3404 int32_t,
3405 new_equiv_class_alloc);
3406 if (BE (new_equiv_classes == NULL, 0))
3407 return REG_ESPACE;
3408 mbcset->equiv_classes = new_equiv_classes;
3409 *equiv_class_alloc = new_equiv_class_alloc;
3410 }
3411 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3412 }
3413 else
3414 #endif /* _LIBC */
3415 {
3416 if (BE (strlen ((const char *) name) != 1, 0))
3417 return REG_ECOLLATE;
3418 bitset_set (sbcset, *name);
3419 }
3420 return REG_NOERROR;
3421 }
3422
3423 /* Helper function for parse_bracket_exp.
3424 Build the character class which is represented by NAME.
3425 The result are written to MBCSET and SBCSET.
3426 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3427 is a pointer argument sinse we may update it. */
3428
3429 static reg_errcode_t
3430 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,int * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3431 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3432 re_charset_t *mbcset, int *char_class_alloc,
3433 const unsigned char *class_name, reg_syntax_t syntax)
3434 #else /* not RE_ENABLE_I18N */
3435 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3436 const unsigned char *class_name, reg_syntax_t syntax)
3437 #endif /* not RE_ENABLE_I18N */
3438 {
3439 int i;
3440 const char *name = (const char *) class_name;
3441
3442 /* In case of REG_ICASE "upper" and "lower" match the both of
3443 upper and lower cases. */
3444 if ((syntax & RE_ICASE)
3445 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3446 name = "alpha";
3447
3448 #ifdef RE_ENABLE_I18N
3449 /* Check the space of the arrays. */
3450 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3451 {
3452 /* Not enough, realloc it. */
3453 /* +1 in case of mbcset->nchar_classes is 0. */
3454 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3455 /* Use realloc since array is NULL if *alloc == 0. */
3456 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3457 new_char_class_alloc);
3458 if (BE (new_char_classes == NULL, 0))
3459 return REG_ESPACE;
3460 mbcset->char_classes = new_char_classes;
3461 *char_class_alloc = new_char_class_alloc;
3462 }
3463 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3464 #endif /* RE_ENABLE_I18N */
3465
3466 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3467 do { \
3468 if (BE (trans != NULL, 0)) \
3469 { \
3470 for (i = 0; i < SBC_MAX; ++i) \
3471 if (ctype_func (i)) \
3472 bitset_set (sbcset, trans[i]); \
3473 } \
3474 else \
3475 { \
3476 for (i = 0; i < SBC_MAX; ++i) \
3477 if (ctype_func (i)) \
3478 bitset_set (sbcset, i); \
3479 } \
3480 } while (0)
3481
3482 if (strcmp (name, "alnum") == 0)
3483 BUILD_CHARCLASS_LOOP (isalnum);
3484 else if (strcmp (name, "cntrl") == 0)
3485 BUILD_CHARCLASS_LOOP (iscntrl);
3486 else if (strcmp (name, "lower") == 0)
3487 BUILD_CHARCLASS_LOOP (islower);
3488 else if (strcmp (name, "space") == 0)
3489 BUILD_CHARCLASS_LOOP (isspace);
3490 else if (strcmp (name, "alpha") == 0)
3491 BUILD_CHARCLASS_LOOP (isalpha);
3492 else if (strcmp (name, "digit") == 0)
3493 BUILD_CHARCLASS_LOOP (isdigit);
3494 else if (strcmp (name, "print") == 0)
3495 BUILD_CHARCLASS_LOOP (isprint);
3496 else if (strcmp (name, "upper") == 0)
3497 BUILD_CHARCLASS_LOOP (isupper);
3498 else if (strcmp (name, "blank") == 0)
3499 BUILD_CHARCLASS_LOOP (isblank);
3500 else if (strcmp (name, "graph") == 0)
3501 BUILD_CHARCLASS_LOOP (isgraph);
3502 else if (strcmp (name, "punct") == 0)
3503 BUILD_CHARCLASS_LOOP (ispunct);
3504 else if (strcmp (name, "xdigit") == 0)
3505 BUILD_CHARCLASS_LOOP (isxdigit);
3506 else
3507 return REG_ECTYPE;
3508
3509 return REG_NOERROR;
3510 }
3511
3512 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,int non_match,reg_errcode_t * err)3513 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3514 const unsigned char *class_name,
3515 const unsigned char *extra, int non_match,
3516 reg_errcode_t *err)
3517 {
3518 re_bitset_ptr_t sbcset;
3519 #ifdef RE_ENABLE_I18N
3520 re_charset_t *mbcset;
3521 int alloc = 0;
3522 #endif /* not RE_ENABLE_I18N */
3523 reg_errcode_t ret;
3524 re_token_t br_token;
3525 bin_tree_t *tree;
3526
3527 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3528 #ifdef RE_ENABLE_I18N
3529 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3530 #endif /* RE_ENABLE_I18N */
3531
3532 #ifdef RE_ENABLE_I18N
3533 if (BE (sbcset == NULL || mbcset == NULL, 0))
3534 #else /* not RE_ENABLE_I18N */
3535 if (BE (sbcset == NULL, 0))
3536 #endif /* not RE_ENABLE_I18N */
3537 {
3538 *err = REG_ESPACE;
3539 return NULL;
3540 }
3541
3542 if (non_match)
3543 {
3544 #ifdef RE_ENABLE_I18N
3545 /*
3546 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3547 bitset_set(cset->sbcset, '\0');
3548 */
3549 mbcset->non_match = 1;
3550 #endif /* not RE_ENABLE_I18N */
3551 }
3552
3553 /* We don't care the syntax in this case. */
3554 ret = build_charclass (trans, sbcset,
3555 #ifdef RE_ENABLE_I18N
3556 mbcset, &alloc,
3557 #endif /* RE_ENABLE_I18N */
3558 class_name, 0);
3559
3560 if (BE (ret != REG_NOERROR, 0))
3561 {
3562 re_free (sbcset);
3563 #ifdef RE_ENABLE_I18N
3564 free_charset (mbcset);
3565 #endif /* RE_ENABLE_I18N */
3566 *err = ret;
3567 return NULL;
3568 }
3569 /* \w match '_' also. */
3570 for (; *extra; extra++)
3571 bitset_set (sbcset, *extra);
3572
3573 /* If it is non-matching list. */
3574 if (non_match)
3575 bitset_not (sbcset);
3576
3577 #ifdef RE_ENABLE_I18N
3578 /* Ensure only single byte characters are set. */
3579 if (dfa->mb_cur_max > 1)
3580 bitset_mask (sbcset, dfa->sb_char);
3581 #endif
3582
3583 /* Build a tree for simple bracket. */
3584 br_token.type = SIMPLE_BRACKET;
3585 br_token.opr.sbcset = sbcset;
3586 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3587 if (BE (tree == NULL, 0))
3588 goto build_word_op_espace;
3589
3590 #ifdef RE_ENABLE_I18N
3591 if (dfa->mb_cur_max > 1)
3592 {
3593 bin_tree_t *mbc_tree;
3594 /* Build a tree for complex bracket. */
3595 br_token.type = COMPLEX_BRACKET;
3596 br_token.opr.mbcset = mbcset;
3597 dfa->has_mb_node = 1;
3598 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3599 if (BE (mbc_tree == NULL, 0))
3600 goto build_word_op_espace;
3601 /* Then join them by ALT node. */
3602 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3603 if (BE (mbc_tree != NULL, 1))
3604 return tree;
3605 }
3606 else
3607 {
3608 free_charset (mbcset);
3609 return tree;
3610 }
3611 #else /* not RE_ENABLE_I18N */
3612 return tree;
3613 #endif /* not RE_ENABLE_I18N */
3614
3615 build_word_op_espace:
3616 re_free (sbcset);
3617 #ifdef RE_ENABLE_I18N
3618 free_charset (mbcset);
3619 #endif /* RE_ENABLE_I18N */
3620 *err = REG_ESPACE;
3621 return NULL;
3622 }
3623
3624 /* This is intended for the expressions like "a{1,3}".
3625 Fetch a number from `input', and return the number.
3626 Return -1, if the number field is empty like "{,1}".
3627 Return -2, If an error is occured. */
3628
3629 static int
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3630 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3631 {
3632 int num = -1;
3633 unsigned char c;
3634 while (1)
3635 {
3636 fetch_token (token, input, syntax);
3637 c = token->opr.c;
3638 if (BE (token->type == END_OF_RE, 0))
3639 return -2;
3640 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3641 break;
3642 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3643 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3644 num = (num > RE_DUP_MAX) ? -2 : num;
3645 }
3646 return num;
3647 }
3648
3649 #ifdef RE_ENABLE_I18N
3650 static void
free_charset(re_charset_t * cset)3651 free_charset (re_charset_t *cset)
3652 {
3653 re_free (cset->mbchars);
3654 # ifdef _LIBC
3655 re_free (cset->coll_syms);
3656 re_free (cset->equiv_classes);
3657 re_free (cset->range_starts);
3658 re_free (cset->range_ends);
3659 # endif
3660 re_free (cset->char_classes);
3661 re_free (cset);
3662 }
3663 #endif /* RE_ENABLE_I18N */
3664
3665 /* Functions for binary tree operation. */
3666
3667 /* Create a tree node. */
3668
3669 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3670 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3671 re_token_type_t type)
3672 {
3673 re_token_t t;
3674 t.type = type;
3675 return create_token_tree (dfa, left, right, &t);
3676 }
3677
3678 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3679 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3680 const re_token_t *token)
3681 {
3682 bin_tree_t *tree;
3683 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3684 {
3685 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3686
3687 if (storage == NULL)
3688 return NULL;
3689 storage->next = dfa->str_tree_storage;
3690 dfa->str_tree_storage = storage;
3691 dfa->str_tree_storage_idx = 0;
3692 }
3693 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3694
3695 tree->parent = NULL;
3696 tree->left = left;
3697 tree->right = right;
3698 tree->token = *token;
3699 tree->token.duplicated = 0;
3700 tree->token.opt_subexp = 0;
3701 tree->first = NULL;
3702 tree->next = NULL;
3703 tree->node_idx = -1;
3704
3705 if (left != NULL)
3706 left->parent = tree;
3707 if (right != NULL)
3708 right->parent = tree;
3709 return tree;
3710 }
3711
3712 /* Mark the tree SRC as an optional subexpression.
3713 To be called from preorder or postorder. */
3714
3715 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3716 mark_opt_subexp (void *extra, bin_tree_t *node)
3717 {
3718 #ifdef _WIN64
3719 int idx = (int) (__int64) extra;
3720 #else
3721 int idx = (int) (long) extra;
3722 #endif
3723 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3724 node->token.opt_subexp = 1;
3725
3726 return REG_NOERROR;
3727 }
3728
3729 /* Free the allocated memory inside NODE. */
3730
3731 static void
free_token(re_token_t * node)3732 free_token (re_token_t *node)
3733 {
3734 #ifdef RE_ENABLE_I18N
3735 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3736 free_charset (node->opr.mbcset);
3737 else
3738 #endif /* RE_ENABLE_I18N */
3739 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3740 re_free (node->opr.sbcset);
3741 }
3742
3743 /* Worker function for tree walking. Free the allocated memory inside NODE
3744 and its children. */
3745
3746 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3747 free_tree (void *extra, bin_tree_t *node)
3748 {
3749 free_token (&node->token);
3750 return REG_NOERROR;
3751 }
3752
3753
3754 /* Duplicate the node SRC, and return new node. This is a preorder
3755 visit similar to the one implemented by the generic visitor, but
3756 we need more infrastructure to maintain two parallel trees --- so,
3757 it's easier to duplicate. */
3758
3759 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3760 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3761 {
3762 const bin_tree_t *node;
3763 bin_tree_t *dup_root;
3764 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3765
3766 for (node = root; ; )
3767 {
3768 /* Create a new tree and link it back to the current parent. */
3769 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3770 if (*p_new == NULL)
3771 return NULL;
3772 (*p_new)->parent = dup_node;
3773 (*p_new)->token.duplicated = 1;
3774 dup_node = *p_new;
3775
3776 /* Go to the left node, or up and to the right. */
3777 if (node->left)
3778 {
3779 node = node->left;
3780 p_new = &dup_node->left;
3781 }
3782 else
3783 {
3784 const bin_tree_t *prev = NULL;
3785 while (node->right == prev || node->right == NULL)
3786 {
3787 prev = node;
3788 node = node->parent;
3789 dup_node = dup_node->parent;
3790 if (!node)
3791 return dup_root;
3792 }
3793 node = node->right;
3794 p_new = &dup_node->right;
3795 }
3796 }
3797 }
3798