1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, see <http://www.gnu.org/licenses/>. */
18
19 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
20 size_t length, reg_syntax_t syntax);
21 static void re_compile_fastmap_iter (regex_t *bufp,
22 const re_dfastate_t *init_state,
23 char *fastmap);
24 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
25 #ifdef RE_ENABLE_I18N
26 static void free_charset (re_charset_t *cset);
27 #endif /* RE_ENABLE_I18N */
28 static void free_workarea_compile (regex_t *preg);
29 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
30 #ifdef RE_ENABLE_I18N
31 static void optimize_utf8 (re_dfa_t *dfa);
32 #endif
33 static reg_errcode_t analyze (regex_t *preg);
34 static reg_errcode_t preorder (bin_tree_t *root,
35 reg_errcode_t (fn (void *, bin_tree_t *)),
36 void *extra);
37 static reg_errcode_t postorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
39 void *extra);
40 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
41 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
42 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
43 bin_tree_t *node);
44 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
45 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
46 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
47 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
48 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
49 unsigned int constraint);
50 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
51 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
52 Idx node, bool root);
53 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
54 static Idx fetch_number (re_string_t *input, re_token_t *token,
55 reg_syntax_t syntax);
56 static int peek_token (re_token_t *token, re_string_t *input,
57 reg_syntax_t syntax) internal_function;
58 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
59 reg_syntax_t syntax, reg_errcode_t *err);
60 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
61 re_token_t *token, reg_syntax_t syntax,
62 Idx nest, reg_errcode_t *err);
63 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
64 re_token_t *token, reg_syntax_t syntax,
65 Idx nest, reg_errcode_t *err);
66 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
67 re_token_t *token, reg_syntax_t syntax,
68 Idx nest, reg_errcode_t *err);
69 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 Idx nest, reg_errcode_t *err);
72 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
73 re_dfa_t *dfa, re_token_t *token,
74 reg_syntax_t syntax, reg_errcode_t *err);
75 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
76 re_token_t *token, reg_syntax_t syntax,
77 reg_errcode_t *err);
78 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
79 re_string_t *regexp,
80 re_token_t *token, int token_len,
81 re_dfa_t *dfa,
82 reg_syntax_t syntax,
83 bool accept_hyphen);
84 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
85 re_string_t *regexp,
86 re_token_t *token);
87 #ifdef RE_ENABLE_I18N
88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
89 re_charset_t *mbcset,
90 Idx *equiv_class_alloc,
91 const unsigned char *name);
92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
93 bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *char_class_alloc,
96 const unsigned char *class_name,
97 reg_syntax_t syntax);
98 #else /* not RE_ENABLE_I18N */
99 static reg_errcode_t build_equiv_class (bitset_t sbcset,
100 const unsigned char *name);
101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
102 bitset_t sbcset,
103 const unsigned char *class_name,
104 reg_syntax_t syntax);
105 #endif /* not RE_ENABLE_I18N */
106 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
107 RE_TRANSLATE_TYPE trans,
108 const unsigned char *class_name,
109 const unsigned char *extra,
110 bool non_match, reg_errcode_t *err);
111 static bin_tree_t *create_tree (re_dfa_t *dfa,
112 bin_tree_t *left, bin_tree_t *right,
113 re_token_type_t type);
114 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
115 bin_tree_t *left, bin_tree_t *right,
116 const re_token_t *token);
117 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
118 static void free_token (re_token_t *node);
119 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
120 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
121
122 /* This table gives an error message for each of the error codes listed
123 in regex.h. Obviously the order here has to be same as there.
124 POSIX doesn't require that we do anything for REG_NOERROR,
125 but why not be nice? */
126
127 static const char __re_error_msgid[] =
128 {
129 #define REG_NOERROR_IDX 0
130 gettext_noop ("Success") /* REG_NOERROR */
131 "\0"
132 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
133 gettext_noop ("No match") /* REG_NOMATCH */
134 "\0"
135 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
136 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
137 "\0"
138 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
139 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
140 "\0"
141 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
142 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
143 "\0"
144 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
145 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
146 "\0"
147 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
148 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
149 "\0"
150 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
151 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
152 "\0"
153 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
154 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
155 "\0"
156 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
157 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
158 "\0"
159 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
160 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
161 "\0"
162 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
163 gettext_noop ("Invalid range end") /* REG_ERANGE */
164 "\0"
165 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
166 gettext_noop ("Memory exhausted") /* REG_ESPACE */
167 "\0"
168 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
169 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
170 "\0"
171 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
172 gettext_noop ("Premature end of regular expression") /* REG_EEND */
173 "\0"
174 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
175 gettext_noop ("Regular expression too big") /* REG_ESIZE */
176 "\0"
177 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
178 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
179 };
180
181 static const size_t __re_error_msgid_idx[] =
182 {
183 REG_NOERROR_IDX,
184 REG_NOMATCH_IDX,
185 REG_BADPAT_IDX,
186 REG_ECOLLATE_IDX,
187 REG_ECTYPE_IDX,
188 REG_EESCAPE_IDX,
189 REG_ESUBREG_IDX,
190 REG_EBRACK_IDX,
191 REG_EPAREN_IDX,
192 REG_EBRACE_IDX,
193 REG_BADBR_IDX,
194 REG_ERANGE_IDX,
195 REG_ESPACE_IDX,
196 REG_BADRPT_IDX,
197 REG_EEND_IDX,
198 REG_ESIZE_IDX,
199 REG_ERPAREN_IDX
200 };
201
202 /* Entry points for GNU code. */
203
204 /* re_compile_pattern is the GNU regular expression compiler: it
205 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
206 Returns 0 if the pattern was valid, otherwise an error string.
207
208 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
209 are set in BUFP on entry. */
210
211 #ifdef _LIBC
212 const char *
re_compile_pattern(pattern,length,bufp)213 re_compile_pattern (pattern, length, bufp)
214 const char *pattern;
215 size_t length;
216 struct re_pattern_buffer *bufp;
217 #else /* size_t might promote */
218 const char *
219 re_compile_pattern (const char *pattern, size_t length,
220 struct re_pattern_buffer *bufp)
221 #endif
222 {
223 reg_errcode_t ret;
224
225 /* And GNU code determines whether or not to get register information
226 by passing null for the REGS argument to re_match, etc., not by
227 setting no_sub, unless RE_NO_SUB is set. */
228 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
229
230 /* Match anchors at newline. */
231 bufp->newline_anchor = 1;
232
233 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
234
235 if (!ret)
236 return NULL;
237 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
238 }
239 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)240 weak_alias (__re_compile_pattern, re_compile_pattern)
241 #endif
242
243 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
244 also be assigned to arbitrarily: each pattern buffer stores its own
245 syntax, so it can be changed between regex compilations. */
246 /* This has no initializer because initialized variables in Emacs
247 become read-only after dumping. */
248 reg_syntax_t re_syntax_options;
249
250
251 /* Specify the precise syntax of regexps for compilation. This provides
252 for compatibility for various utilities which historically have
253 different, incompatible syntaxes.
254
255 The argument SYNTAX is a bit mask comprised of the various bits
256 defined in regex.h. We return the old syntax. */
257
258 reg_syntax_t
259 re_set_syntax (syntax)
260 reg_syntax_t syntax;
261 {
262 reg_syntax_t ret = re_syntax_options;
263
264 re_syntax_options = syntax;
265 return ret;
266 }
267 #ifdef _LIBC
268 weak_alias (__re_set_syntax, re_set_syntax)
269 #endif
270
271 int
272 re_compile_fastmap (bufp)
273 struct re_pattern_buffer *bufp;
274 {
275 re_dfa_t *dfa = bufp->buffer;
276 char *fastmap = bufp->fastmap;
277
278 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
279 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
280 if (dfa->init_state != dfa->init_state_word)
281 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
282 if (dfa->init_state != dfa->init_state_nl)
283 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
284 if (dfa->init_state != dfa->init_state_begbuf)
285 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
286 bufp->fastmap_accurate = 1;
287 return 0;
288 }
289 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)290 weak_alias (__re_compile_fastmap, re_compile_fastmap)
291 #endif
292
293 static inline void
294 __attribute ((always_inline))
295 re_set_fastmap (char *fastmap, bool icase, int ch)
296 {
297 fastmap[ch] = 1;
298 if (icase)
299 fastmap[tolower (ch)] = 1;
300 }
301
302 /* Helper function for re_compile_fastmap.
303 Compile fastmap for the initial_state INIT_STATE. */
304
305 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)306 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
307 char *fastmap)
308 {
309 re_dfa_t *dfa = bufp->buffer;
310 Idx node_cnt;
311 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
312 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
313 {
314 Idx node = init_state->nodes.elems[node_cnt];
315 re_token_type_t type = dfa->nodes[node].type;
316
317 if (type == CHARACTER)
318 {
319 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
320 #ifdef RE_ENABLE_I18N
321 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
322 {
323 unsigned char buf[MB_LEN_MAX];
324 unsigned char *p;
325 wchar_t wc;
326 mbstate_t state;
327
328 p = buf;
329 *p++ = dfa->nodes[node].opr.c;
330 while (++node < dfa->nodes_len
331 && dfa->nodes[node].type == CHARACTER
332 && dfa->nodes[node].mb_partial)
333 *p++ = dfa->nodes[node].opr.c;
334 memset (&state, '\0', sizeof (state));
335 if (__mbrtowc (&wc, (const char *) buf, p - buf,
336 &state) == p - buf
337 && (__wcrtomb ((char *) buf, towlower (wc), &state)
338 != (size_t) -1))
339 re_set_fastmap (fastmap, false, buf[0]);
340 }
341 #endif
342 }
343 else if (type == SIMPLE_BRACKET)
344 {
345 int i, ch;
346 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
347 {
348 int j;
349 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
350 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
351 if (w & ((bitset_word_t) 1 << j))
352 re_set_fastmap (fastmap, icase, ch);
353 }
354 }
355 #ifdef RE_ENABLE_I18N
356 else if (type == COMPLEX_BRACKET)
357 {
358 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
359 Idx i;
360
361 # ifdef _LIBC
362 /* See if we have to try all bytes which start multiple collation
363 elements.
364 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
365 collation element, and don't catch 'b' since 'b' is
366 the only collation element which starts from 'b' (and
367 it is caught by SIMPLE_BRACKET). */
368 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
369 && (cset->ncoll_syms || cset->nranges))
370 {
371 const int32_t *table = (const int32_t *)
372 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
373 for (i = 0; i < SBC_MAX; ++i)
374 if (table[i] < 0)
375 re_set_fastmap (fastmap, icase, i);
376 }
377 # endif /* _LIBC */
378
379 /* See if we have to start the match at all multibyte characters,
380 i.e. where we would not find an invalid sequence. This only
381 applies to multibyte character sets; for single byte character
382 sets, the SIMPLE_BRACKET again suffices. */
383 if (dfa->mb_cur_max > 1
384 && (cset->nchar_classes || cset->non_match || cset->nranges
385 # ifdef _LIBC
386 || cset->nequiv_classes
387 # endif /* _LIBC */
388 ))
389 {
390 unsigned char c = 0;
391 do
392 {
393 mbstate_t mbs;
394 memset (&mbs, 0, sizeof (mbs));
395 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
396 re_set_fastmap (fastmap, false, (int) c);
397 }
398 while (++c != 0);
399 }
400
401 else
402 {
403 /* ... Else catch all bytes which can start the mbchars. */
404 for (i = 0; i < cset->nmbchars; ++i)
405 {
406 char buf[256];
407 mbstate_t state;
408 memset (&state, '\0', sizeof (state));
409 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
410 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
411 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
412 {
413 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
414 != (size_t) -1)
415 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
416 }
417 }
418 }
419 }
420 #endif /* RE_ENABLE_I18N */
421 else if (type == OP_PERIOD
422 #ifdef RE_ENABLE_I18N
423 || type == OP_UTF8_PERIOD
424 #endif /* RE_ENABLE_I18N */
425 || type == END_OF_RE)
426 {
427 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
428 if (type == END_OF_RE)
429 bufp->can_be_null = 1;
430 return;
431 }
432 }
433 }
434
435 /* Entry point for POSIX code. */
436 /* regcomp takes a regular expression as a string and compiles it.
437
438 PREG is a regex_t *. We do not expect any fields to be initialized,
439 since POSIX says we shouldn't. Thus, we set
440
441 'buffer' to the compiled pattern;
442 'used' to the length of the compiled pattern;
443 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
444 REG_EXTENDED bit in CFLAGS is set; otherwise, to
445 RE_SYNTAX_POSIX_BASIC;
446 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
447 'fastmap' to an allocated space for the fastmap;
448 'fastmap_accurate' to zero;
449 're_nsub' to the number of subexpressions in PATTERN.
450
451 PATTERN is the address of the pattern string.
452
453 CFLAGS is a series of bits which affect compilation.
454
455 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
456 use POSIX basic syntax.
457
458 If REG_NEWLINE is set, then . and [^...] don't match newline.
459 Also, regexec will try a match beginning after every newline.
460
461 If REG_ICASE is set, then we considers upper- and lowercase
462 versions of letters to be equivalent when matching.
463
464 If REG_NOSUB is set, then when PREG is passed to regexec, that
465 routine will report only success or failure, and nothing about the
466 registers.
467
468 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
469 the return codes and their meanings.) */
470
471 int
regcomp(preg,pattern,cflags)472 regcomp (preg, pattern, cflags)
473 regex_t *_Restrict_ preg;
474 const char *_Restrict_ pattern;
475 int cflags;
476 {
477 reg_errcode_t ret;
478 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
479 : RE_SYNTAX_POSIX_BASIC);
480
481 preg->buffer = NULL;
482 preg->allocated = 0;
483 preg->used = 0;
484
485 /* Try to allocate space for the fastmap. */
486 preg->fastmap = re_malloc (char, SBC_MAX);
487 if (BE (preg->fastmap == NULL, 0))
488 return REG_ESPACE;
489
490 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
491
492 /* If REG_NEWLINE is set, newlines are treated differently. */
493 if (cflags & REG_NEWLINE)
494 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
495 syntax &= ~RE_DOT_NEWLINE;
496 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
497 /* It also changes the matching behavior. */
498 preg->newline_anchor = 1;
499 }
500 else
501 preg->newline_anchor = 0;
502 preg->no_sub = !!(cflags & REG_NOSUB);
503 preg->translate = NULL;
504
505 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
506
507 /* POSIX doesn't distinguish between an unmatched open-group and an
508 unmatched close-group: both are REG_EPAREN. */
509 if (ret == REG_ERPAREN)
510 ret = REG_EPAREN;
511
512 /* We have already checked preg->fastmap != NULL. */
513 if (BE (ret == REG_NOERROR, 1))
514 /* Compute the fastmap now, since regexec cannot modify the pattern
515 buffer. This function never fails in this implementation. */
516 (void) re_compile_fastmap (preg);
517 else
518 {
519 /* Some error occurred while compiling the expression. */
520 re_free (preg->fastmap);
521 preg->fastmap = NULL;
522 }
523
524 return (int) ret;
525 }
526 #ifdef _LIBC
527 weak_alias (__regcomp, regcomp)
528 #endif
529
530 /* Returns a message corresponding to an error code, ERRCODE, returned
531 from either regcomp or regexec. We don't use PREG here. */
532
533 #ifdef _LIBC
534 size_t
535 regerror (errcode, preg, errbuf, errbuf_size)
536 int errcode;
537 const regex_t *_Restrict_ preg;
538 char *_Restrict_ errbuf;
539 size_t errbuf_size;
540 #else /* size_t might promote */
541 size_t
542 regerror (int errcode, const regex_t *_Restrict_ preg,
543 char *_Restrict_ errbuf, size_t errbuf_size)
544 #endif
545 {
546 const char *msg;
547 size_t msg_size;
548
549 if (BE (errcode < 0
550 || errcode >= (int) (sizeof (__re_error_msgid_idx)
551 / sizeof (__re_error_msgid_idx[0])), 0))
552 /* Only error codes returned by the rest of the code should be passed
553 to this routine. If we are given anything else, or if other regex
554 code generates an invalid error code, then the program has a bug.
555 Dump core so we can fix it. */
556 abort ();
557
558 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
559
560 msg_size = strlen (msg) + 1; /* Includes the null. */
561
562 if (BE (errbuf_size != 0, 1))
563 {
564 size_t cpy_size = msg_size;
565 if (BE (msg_size > errbuf_size, 0))
566 {
567 cpy_size = errbuf_size - 1;
568 errbuf[cpy_size] = '\0';
569 }
570 memcpy (errbuf, msg, cpy_size);
571 }
572
573 return msg_size;
574 }
575 #ifdef _LIBC
576 weak_alias (__regerror, regerror)
577 #endif
578
579
580 #ifdef RE_ENABLE_I18N
581 /* This static array is used for the map to single-byte characters when
582 UTF-8 is used. Otherwise we would allocate memory just to initialize
583 it the same all the time. UTF-8 is the preferred encoding so this is
584 a worthwhile optimization. */
585 static const bitset_t utf8_sb_map =
586 {
587 /* Set the first 128 bits. */
588 # ifdef __GNUC__
589 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
590 # else
591 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
592 # error "bitset_word_t is narrower than 32 bits"
593 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
594 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
595 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
596 BITSET_WORD_MAX, BITSET_WORD_MAX,
597 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
598 BITSET_WORD_MAX,
599 # endif
600 (BITSET_WORD_MAX
601 >> (SBC_MAX % BITSET_WORD_BITS == 0
602 ? 0
603 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
604 # endif
605 };
606 #endif
607
608
609 static void
free_dfa_content(re_dfa_t * dfa)610 free_dfa_content (re_dfa_t *dfa)
611 {
612 Idx i, j;
613
614 if (dfa->nodes)
615 for (i = 0; i < dfa->nodes_len; ++i)
616 free_token (dfa->nodes + i);
617 re_free (dfa->nexts);
618 for (i = 0; i < dfa->nodes_len; ++i)
619 {
620 if (dfa->eclosures != NULL)
621 re_node_set_free (dfa->eclosures + i);
622 if (dfa->inveclosures != NULL)
623 re_node_set_free (dfa->inveclosures + i);
624 if (dfa->edests != NULL)
625 re_node_set_free (dfa->edests + i);
626 }
627 re_free (dfa->edests);
628 re_free (dfa->eclosures);
629 re_free (dfa->inveclosures);
630 re_free (dfa->nodes);
631
632 if (dfa->state_table)
633 for (i = 0; i <= dfa->state_hash_mask; ++i)
634 {
635 struct re_state_table_entry *entry = dfa->state_table + i;
636 for (j = 0; j < entry->num; ++j)
637 {
638 re_dfastate_t *state = entry->array[j];
639 free_state (state);
640 }
641 re_free (entry->array);
642 }
643 re_free (dfa->state_table);
644 #ifdef RE_ENABLE_I18N
645 if (dfa->sb_char != utf8_sb_map)
646 re_free (dfa->sb_char);
647 #endif
648 re_free (dfa->subexp_map);
649 #ifdef DEBUG
650 re_free (dfa->re_str);
651 #endif
652
653 re_free (dfa);
654 }
655
656
657 /* Free dynamically allocated space used by PREG. */
658
659 void
regfree(preg)660 regfree (preg)
661 regex_t *preg;
662 {
663 re_dfa_t *dfa = preg->buffer;
664 if (BE (dfa != NULL, 1))
665 free_dfa_content (dfa);
666 preg->buffer = NULL;
667 preg->allocated = 0;
668
669 re_free (preg->fastmap);
670 preg->fastmap = NULL;
671
672 re_free (preg->translate);
673 preg->translate = NULL;
674 }
675 #ifdef _LIBC
676 weak_alias (__regfree, regfree)
677 #endif
678
679 /* Entry points compatible with 4.2 BSD regex library. We don't define
680 them unless specifically requested. */
681
682 #if defined _REGEX_RE_COMP || defined _LIBC
683
684 /* BSD has one and only one pattern buffer. */
685 static struct re_pattern_buffer re_comp_buf;
686
687 char *
688 # ifdef _LIBC
689 /* Make these definitions weak in libc, so POSIX programs can redefine
690 these names if they don't use our functions, and still use
691 regcomp/regexec above without link errors. */
692 weak_function
693 # endif
re_comp(s)694 re_comp (s)
695 const char *s;
696 {
697 reg_errcode_t ret;
698 char *fastmap;
699
700 if (!s)
701 {
702 if (!re_comp_buf.buffer)
703 return gettext ("No previous regular expression");
704 return 0;
705 }
706
707 if (re_comp_buf.buffer)
708 {
709 fastmap = re_comp_buf.fastmap;
710 re_comp_buf.fastmap = NULL;
711 __regfree (&re_comp_buf);
712 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
713 re_comp_buf.fastmap = fastmap;
714 }
715
716 if (re_comp_buf.fastmap == NULL)
717 {
718 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
719 if (re_comp_buf.fastmap == NULL)
720 return (char *) gettext (__re_error_msgid
721 + __re_error_msgid_idx[(int) REG_ESPACE]);
722 }
723
724 /* Since 're_exec' always passes NULL for the 'regs' argument, we
725 don't need to initialize the pattern buffer fields which affect it. */
726
727 /* Match anchors at newlines. */
728 re_comp_buf.newline_anchor = 1;
729
730 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
731
732 if (!ret)
733 return NULL;
734
735 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
736 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
737 }
738
739 #ifdef _LIBC
libc_freeres_fn(free_mem)740 libc_freeres_fn (free_mem)
741 {
742 __regfree (&re_comp_buf);
743 }
744 #endif
745
746 #endif /* _REGEX_RE_COMP */
747
748 /* Internal entry point.
749 Compile the regular expression PATTERN, whose length is LENGTH.
750 SYNTAX indicate regular expression's syntax. */
751
752 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,size_t length,reg_syntax_t syntax)753 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
754 reg_syntax_t syntax)
755 {
756 reg_errcode_t err = REG_NOERROR;
757 re_dfa_t *dfa;
758 re_string_t regexp;
759
760 /* Initialize the pattern buffer. */
761 preg->fastmap_accurate = 0;
762 preg->syntax = syntax;
763 preg->not_bol = preg->not_eol = 0;
764 preg->used = 0;
765 preg->re_nsub = 0;
766 preg->can_be_null = 0;
767 preg->regs_allocated = REGS_UNALLOCATED;
768
769 /* Initialize the dfa. */
770 dfa = preg->buffer;
771 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
772 {
773 /* If zero allocated, but buffer is non-null, try to realloc
774 enough space. This loses if buffer's address is bogus, but
775 that is the user's responsibility. If ->buffer is NULL this
776 is a simple allocation. */
777 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
778 if (dfa == NULL)
779 return REG_ESPACE;
780 preg->allocated = sizeof (re_dfa_t);
781 preg->buffer = dfa;
782 }
783 preg->used = sizeof (re_dfa_t);
784
785 err = init_dfa (dfa, length);
786 if (BE (err != REG_NOERROR, 0))
787 {
788 free_dfa_content (dfa);
789 preg->buffer = NULL;
790 preg->allocated = 0;
791 return err;
792 }
793 #ifdef DEBUG
794 /* Note: length+1 will not overflow since it is checked in init_dfa. */
795 dfa->re_str = re_malloc (char, length + 1);
796 strncpy (dfa->re_str, pattern, length + 1);
797 #endif
798
799 __libc_lock_init (dfa->lock);
800
801 err = re_string_construct (®exp, pattern, length, preg->translate,
802 (syntax & RE_ICASE) != 0, dfa);
803 if (BE (err != REG_NOERROR, 0))
804 {
805 re_compile_internal_free_return:
806 free_workarea_compile (preg);
807 re_string_destruct (®exp);
808 free_dfa_content (dfa);
809 preg->buffer = NULL;
810 preg->allocated = 0;
811 return err;
812 }
813
814 /* Parse the regular expression, and build a structure tree. */
815 preg->re_nsub = 0;
816 dfa->str_tree = parse (®exp, preg, syntax, &err);
817 if (BE (dfa->str_tree == NULL, 0))
818 goto re_compile_internal_free_return;
819
820 /* Analyze the tree and create the nfa. */
821 err = analyze (preg);
822 if (BE (err != REG_NOERROR, 0))
823 goto re_compile_internal_free_return;
824
825 #ifdef RE_ENABLE_I18N
826 /* If possible, do searching in single byte encoding to speed things up. */
827 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
828 optimize_utf8 (dfa);
829 #endif
830
831 /* Then create the initial state of the dfa. */
832 err = create_initial_state (dfa);
833
834 /* Release work areas. */
835 free_workarea_compile (preg);
836 re_string_destruct (®exp);
837
838 if (BE (err != REG_NOERROR, 0))
839 {
840 free_dfa_content (dfa);
841 preg->buffer = NULL;
842 preg->allocated = 0;
843 }
844
845 return err;
846 }
847
848 /* Initialize DFA. We use the length of the regular expression PAT_LEN
849 as the initial length of some arrays. */
850
851 static reg_errcode_t
init_dfa(re_dfa_t * dfa,size_t pat_len)852 init_dfa (re_dfa_t *dfa, size_t pat_len)
853 {
854 __re_size_t table_size;
855 #ifndef _LIBC
856 const char *codeset_name;
857 #endif
858 #ifdef RE_ENABLE_I18N
859 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
860 #else
861 size_t max_i18n_object_size = 0;
862 #endif
863 size_t max_object_size =
864 MAX (sizeof (struct re_state_table_entry),
865 MAX (sizeof (re_token_t),
866 MAX (sizeof (re_node_set),
867 MAX (sizeof (regmatch_t),
868 max_i18n_object_size))));
869
870 memset (dfa, '\0', sizeof (re_dfa_t));
871
872 /* Force allocation of str_tree_storage the first time. */
873 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
874
875 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
876 calculation below, and for similar doubling calculations
877 elsewhere. And it's <= rather than <, because some of the
878 doubling calculations add 1 afterwards. */
879 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
880 return REG_ESPACE;
881
882 dfa->nodes_alloc = pat_len + 1;
883 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
884
885 /* table_size = 2 ^ ceil(log pat_len) */
886 for (table_size = 1; ; table_size <<= 1)
887 if (table_size > pat_len)
888 break;
889
890 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
891 dfa->state_hash_mask = table_size - 1;
892
893 dfa->mb_cur_max = MB_CUR_MAX;
894 #ifdef _LIBC
895 if (dfa->mb_cur_max == 6
896 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
897 dfa->is_utf8 = 1;
898 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
899 != 0);
900 #else
901 codeset_name = nl_langinfo (CODESET);
902 if (strcasecmp (codeset_name, "UTF-8") == 0
903 || strcasecmp (codeset_name, "UTF8") == 0)
904 dfa->is_utf8 = 1;
905
906 /* We check exhaustively in the loop below if this charset is a
907 superset of ASCII. */
908 dfa->map_notascii = 0;
909 #endif
910
911 #ifdef RE_ENABLE_I18N
912 if (dfa->mb_cur_max > 1)
913 {
914 if (dfa->is_utf8)
915 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
916 else
917 {
918 int i, j, ch;
919
920 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
921 if (BE (dfa->sb_char == NULL, 0))
922 return REG_ESPACE;
923
924 /* Set the bits corresponding to single byte chars. */
925 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
926 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
927 {
928 wint_t wch = __btowc (ch);
929 if (wch != WEOF)
930 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
931 # ifndef _LIBC
932 if (isascii (ch) && wch != ch)
933 dfa->map_notascii = 1;
934 # endif
935 }
936 }
937 }
938 #endif
939
940 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
941 return REG_ESPACE;
942 return REG_NOERROR;
943 }
944
945 /* Initialize WORD_CHAR table, which indicate which character is
946 "word". In this case "word" means that it is the word construction
947 character used by some operators like "\<", "\>", etc. */
948
949 static void
950 internal_function
init_word_char(re_dfa_t * dfa)951 init_word_char (re_dfa_t *dfa)
952 {
953 dfa->word_ops_used = 1;
954 int i = 0;
955 int j;
956 int ch = 0;
957 if (BE (dfa->map_notascii == 0, 1))
958 {
959 if (BITSET_WORD_BITS == 64)
960 {
961 dfa->word_char[0] = UINT64_C (0x03ff000000000000);
962 dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
963 i = 2;
964 }
965 else if (BITSET_WORD_BITS == 32)
966 {
967 dfa->word_char[0] = UINT32_C (0x00000000);
968 dfa->word_char[1] = UINT32_C (0x03ff0000);
969 dfa->word_char[2] = UINT32_C (0x87fffffe);
970 dfa->word_char[3] = UINT32_C (0x07fffffe);
971 i = 4;
972 }
973 else
974 goto general_case;
975 ch = 128;
976
977 if (BE (dfa->is_utf8, 1))
978 {
979 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
980 return;
981 }
982 }
983
984 general_case:
985 for (; i < BITSET_WORDS; ++i)
986 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
987 if (isalnum (ch) || ch == '_')
988 dfa->word_char[i] |= (bitset_word_t) 1 << j;
989 }
990
991 /* Free the work area which are only used while compiling. */
992
993 static void
free_workarea_compile(regex_t * preg)994 free_workarea_compile (regex_t *preg)
995 {
996 re_dfa_t *dfa = preg->buffer;
997 bin_tree_storage_t *storage, *next;
998 for (storage = dfa->str_tree_storage; storage; storage = next)
999 {
1000 next = storage->next;
1001 re_free (storage);
1002 }
1003 dfa->str_tree_storage = NULL;
1004 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1005 dfa->str_tree = NULL;
1006 re_free (dfa->org_indices);
1007 dfa->org_indices = NULL;
1008 }
1009
1010 /* Create initial states for all contexts. */
1011
1012 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)1013 create_initial_state (re_dfa_t *dfa)
1014 {
1015 Idx first, i;
1016 reg_errcode_t err;
1017 re_node_set init_nodes;
1018
1019 /* Initial states have the epsilon closure of the node which is
1020 the first node of the regular expression. */
1021 first = dfa->str_tree->first->node_idx;
1022 dfa->init_node = first;
1023 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1024 if (BE (err != REG_NOERROR, 0))
1025 return err;
1026
1027 /* The back-references which are in initial states can epsilon transit,
1028 since in this case all of the subexpressions can be null.
1029 Then we add epsilon closures of the nodes which are the next nodes of
1030 the back-references. */
1031 if (dfa->nbackref > 0)
1032 for (i = 0; i < init_nodes.nelem; ++i)
1033 {
1034 Idx node_idx = init_nodes.elems[i];
1035 re_token_type_t type = dfa->nodes[node_idx].type;
1036
1037 Idx clexp_idx;
1038 if (type != OP_BACK_REF)
1039 continue;
1040 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1041 {
1042 re_token_t *clexp_node;
1043 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1044 if (clexp_node->type == OP_CLOSE_SUBEXP
1045 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1046 break;
1047 }
1048 if (clexp_idx == init_nodes.nelem)
1049 continue;
1050
1051 if (type == OP_BACK_REF)
1052 {
1053 Idx dest_idx = dfa->edests[node_idx].elems[0];
1054 if (!re_node_set_contains (&init_nodes, dest_idx))
1055 {
1056 reg_errcode_t merge_err
1057 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1058 if (merge_err != REG_NOERROR)
1059 return merge_err;
1060 i = 0;
1061 }
1062 }
1063 }
1064
1065 /* It must be the first time to invoke acquire_state. */
1066 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1067 /* We don't check ERR here, since the initial state must not be NULL. */
1068 if (BE (dfa->init_state == NULL, 0))
1069 return err;
1070 if (dfa->init_state->has_constraint)
1071 {
1072 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1073 CONTEXT_WORD);
1074 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1075 CONTEXT_NEWLINE);
1076 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1077 &init_nodes,
1078 CONTEXT_NEWLINE
1079 | CONTEXT_BEGBUF);
1080 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1081 || dfa->init_state_begbuf == NULL, 0))
1082 return err;
1083 }
1084 else
1085 dfa->init_state_word = dfa->init_state_nl
1086 = dfa->init_state_begbuf = dfa->init_state;
1087
1088 re_node_set_free (&init_nodes);
1089 return REG_NOERROR;
1090 }
1091
1092 #ifdef RE_ENABLE_I18N
1093 /* If it is possible to do searching in single byte encoding instead of UTF-8
1094 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1095 DFA nodes where needed. */
1096
1097 static void
optimize_utf8(re_dfa_t * dfa)1098 optimize_utf8 (re_dfa_t *dfa)
1099 {
1100 Idx node;
1101 int i;
1102 bool mb_chars = false;
1103 bool has_period = false;
1104
1105 for (node = 0; node < dfa->nodes_len; ++node)
1106 switch (dfa->nodes[node].type)
1107 {
1108 case CHARACTER:
1109 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1110 mb_chars = true;
1111 break;
1112 case ANCHOR:
1113 switch (dfa->nodes[node].opr.ctx_type)
1114 {
1115 case LINE_FIRST:
1116 case LINE_LAST:
1117 case BUF_FIRST:
1118 case BUF_LAST:
1119 break;
1120 default:
1121 /* Word anchors etc. cannot be handled. It's okay to test
1122 opr.ctx_type since constraints (for all DFA nodes) are
1123 created by ORing one or more opr.ctx_type values. */
1124 return;
1125 }
1126 break;
1127 case OP_PERIOD:
1128 has_period = true;
1129 break;
1130 case OP_BACK_REF:
1131 case OP_ALT:
1132 case END_OF_RE:
1133 case OP_DUP_ASTERISK:
1134 case OP_OPEN_SUBEXP:
1135 case OP_CLOSE_SUBEXP:
1136 break;
1137 case COMPLEX_BRACKET:
1138 return;
1139 case SIMPLE_BRACKET:
1140 /* Just double check. */
1141 {
1142 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1143 ? 0
1144 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1145 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1146 {
1147 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1148 return;
1149 rshift = 0;
1150 }
1151 }
1152 break;
1153 default:
1154 abort ();
1155 }
1156
1157 if (mb_chars || has_period)
1158 for (node = 0; node < dfa->nodes_len; ++node)
1159 {
1160 if (dfa->nodes[node].type == CHARACTER
1161 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1162 dfa->nodes[node].mb_partial = 0;
1163 else if (dfa->nodes[node].type == OP_PERIOD)
1164 dfa->nodes[node].type = OP_UTF8_PERIOD;
1165 }
1166
1167 /* The search can be in single byte locale. */
1168 dfa->mb_cur_max = 1;
1169 dfa->is_utf8 = 0;
1170 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1171 }
1172 #endif
1173
1174 /* Analyze the structure tree, and calculate "first", "next", "edest",
1175 "eclosure", and "inveclosure". */
1176
1177 static reg_errcode_t
analyze(regex_t * preg)1178 analyze (regex_t *preg)
1179 {
1180 re_dfa_t *dfa = preg->buffer;
1181 reg_errcode_t ret;
1182
1183 /* Allocate arrays. */
1184 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1185 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1186 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1187 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1188 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1189 || dfa->eclosures == NULL, 0))
1190 return REG_ESPACE;
1191
1192 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1193 if (dfa->subexp_map != NULL)
1194 {
1195 Idx i;
1196 for (i = 0; i < preg->re_nsub; i++)
1197 dfa->subexp_map[i] = i;
1198 preorder (dfa->str_tree, optimize_subexps, dfa);
1199 for (i = 0; i < preg->re_nsub; i++)
1200 if (dfa->subexp_map[i] != i)
1201 break;
1202 if (i == preg->re_nsub)
1203 {
1204 free (dfa->subexp_map);
1205 dfa->subexp_map = NULL;
1206 }
1207 }
1208
1209 ret = postorder (dfa->str_tree, lower_subexps, preg);
1210 if (BE (ret != REG_NOERROR, 0))
1211 return ret;
1212 ret = postorder (dfa->str_tree, calc_first, dfa);
1213 if (BE (ret != REG_NOERROR, 0))
1214 return ret;
1215 preorder (dfa->str_tree, calc_next, dfa);
1216 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1217 if (BE (ret != REG_NOERROR, 0))
1218 return ret;
1219 ret = calc_eclosure (dfa);
1220 if (BE (ret != REG_NOERROR, 0))
1221 return ret;
1222
1223 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1224 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1225 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1226 || dfa->nbackref)
1227 {
1228 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1229 if (BE (dfa->inveclosures == NULL, 0))
1230 return REG_ESPACE;
1231 ret = calc_inveclosure (dfa);
1232 }
1233
1234 return ret;
1235 }
1236
1237 /* Our parse trees are very unbalanced, so we cannot use a stack to
1238 implement parse tree visits. Instead, we use parent pointers and
1239 some hairy code in these two functions. */
1240 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1241 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1242 void *extra)
1243 {
1244 bin_tree_t *node, *prev;
1245
1246 for (node = root; ; )
1247 {
1248 /* Descend down the tree, preferably to the left (or to the right
1249 if that's the only child). */
1250 while (node->left || node->right)
1251 if (node->left)
1252 node = node->left;
1253 else
1254 node = node->right;
1255
1256 do
1257 {
1258 reg_errcode_t err = fn (extra, node);
1259 if (BE (err != REG_NOERROR, 0))
1260 return err;
1261 if (node->parent == NULL)
1262 return REG_NOERROR;
1263 prev = node;
1264 node = node->parent;
1265 }
1266 /* Go up while we have a node that is reached from the right. */
1267 while (node->right == prev || node->right == NULL);
1268 node = node->right;
1269 }
1270 }
1271
1272 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1273 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1274 void *extra)
1275 {
1276 bin_tree_t *node;
1277
1278 for (node = root; ; )
1279 {
1280 reg_errcode_t err = fn (extra, node);
1281 if (BE (err != REG_NOERROR, 0))
1282 return err;
1283
1284 /* Go to the left node, or up and to the right. */
1285 if (node->left)
1286 node = node->left;
1287 else
1288 {
1289 bin_tree_t *prev = NULL;
1290 while (node->right == prev || node->right == NULL)
1291 {
1292 prev = node;
1293 node = node->parent;
1294 if (!node)
1295 return REG_NOERROR;
1296 }
1297 node = node->right;
1298 }
1299 }
1300 }
1301
1302 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1303 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1304 backreferences as well. Requires a preorder visit. */
1305 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1306 optimize_subexps (void *extra, bin_tree_t *node)
1307 {
1308 re_dfa_t *dfa = (re_dfa_t *) extra;
1309
1310 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1311 {
1312 int idx = node->token.opr.idx;
1313 node->token.opr.idx = dfa->subexp_map[idx];
1314 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1315 }
1316
1317 else if (node->token.type == SUBEXP
1318 && node->left && node->left->token.type == SUBEXP)
1319 {
1320 Idx other_idx = node->left->token.opr.idx;
1321
1322 node->left = node->left->left;
1323 if (node->left)
1324 node->left->parent = node;
1325
1326 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1327 if (other_idx < BITSET_WORD_BITS)
1328 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1329 }
1330
1331 return REG_NOERROR;
1332 }
1333
1334 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1335 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1336 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1337 lower_subexps (void *extra, bin_tree_t *node)
1338 {
1339 regex_t *preg = (regex_t *) extra;
1340 reg_errcode_t err = REG_NOERROR;
1341
1342 if (node->left && node->left->token.type == SUBEXP)
1343 {
1344 node->left = lower_subexp (&err, preg, node->left);
1345 if (node->left)
1346 node->left->parent = node;
1347 }
1348 if (node->right && node->right->token.type == SUBEXP)
1349 {
1350 node->right = lower_subexp (&err, preg, node->right);
1351 if (node->right)
1352 node->right->parent = node;
1353 }
1354
1355 return err;
1356 }
1357
1358 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1359 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1360 {
1361 re_dfa_t *dfa = preg->buffer;
1362 bin_tree_t *body = node->left;
1363 bin_tree_t *op, *cls, *tree1, *tree;
1364
1365 if (preg->no_sub
1366 /* We do not optimize empty subexpressions, because otherwise we may
1367 have bad CONCAT nodes with NULL children. This is obviously not
1368 very common, so we do not lose much. An example that triggers
1369 this case is the sed "script" /\(\)/x. */
1370 && node->left != NULL
1371 && (node->token.opr.idx >= BITSET_WORD_BITS
1372 || !(dfa->used_bkref_map
1373 & ((bitset_word_t) 1 << node->token.opr.idx))))
1374 return node->left;
1375
1376 /* Convert the SUBEXP node to the concatenation of an
1377 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1378 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1379 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1380 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1381 tree = create_tree (dfa, op, tree1, CONCAT);
1382 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1383 {
1384 *err = REG_ESPACE;
1385 return NULL;
1386 }
1387
1388 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1389 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1390 return tree;
1391 }
1392
1393 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1394 nodes. Requires a postorder visit. */
1395 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1396 calc_first (void *extra, bin_tree_t *node)
1397 {
1398 re_dfa_t *dfa = (re_dfa_t *) extra;
1399 if (node->token.type == CONCAT)
1400 {
1401 node->first = node->left->first;
1402 node->node_idx = node->left->node_idx;
1403 }
1404 else
1405 {
1406 node->first = node;
1407 node->node_idx = re_dfa_add_node (dfa, node->token);
1408 if (BE (node->node_idx == REG_MISSING, 0))
1409 return REG_ESPACE;
1410 if (node->token.type == ANCHOR)
1411 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1412 }
1413 return REG_NOERROR;
1414 }
1415
1416 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1417 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1418 calc_next (void *extra, bin_tree_t *node)
1419 {
1420 switch (node->token.type)
1421 {
1422 case OP_DUP_ASTERISK:
1423 node->left->next = node;
1424 break;
1425 case CONCAT:
1426 node->left->next = node->right->first;
1427 node->right->next = node->next;
1428 break;
1429 default:
1430 if (node->left)
1431 node->left->next = node->next;
1432 if (node->right)
1433 node->right->next = node->next;
1434 break;
1435 }
1436 return REG_NOERROR;
1437 }
1438
1439 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1440 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1441 link_nfa_nodes (void *extra, bin_tree_t *node)
1442 {
1443 re_dfa_t *dfa = (re_dfa_t *) extra;
1444 Idx idx = node->node_idx;
1445 reg_errcode_t err = REG_NOERROR;
1446
1447 switch (node->token.type)
1448 {
1449 case CONCAT:
1450 break;
1451
1452 case END_OF_RE:
1453 assert (node->next == NULL);
1454 break;
1455
1456 case OP_DUP_ASTERISK:
1457 case OP_ALT:
1458 {
1459 Idx left, right;
1460 dfa->has_plural_match = 1;
1461 if (node->left != NULL)
1462 left = node->left->first->node_idx;
1463 else
1464 left = node->next->node_idx;
1465 if (node->right != NULL)
1466 right = node->right->first->node_idx;
1467 else
1468 right = node->next->node_idx;
1469 assert (REG_VALID_INDEX (left));
1470 assert (REG_VALID_INDEX (right));
1471 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1472 }
1473 break;
1474
1475 case ANCHOR:
1476 case OP_OPEN_SUBEXP:
1477 case OP_CLOSE_SUBEXP:
1478 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1479 break;
1480
1481 case OP_BACK_REF:
1482 dfa->nexts[idx] = node->next->node_idx;
1483 if (node->token.type == OP_BACK_REF)
1484 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1485 break;
1486
1487 default:
1488 assert (!IS_EPSILON_NODE (node->token.type));
1489 dfa->nexts[idx] = node->next->node_idx;
1490 break;
1491 }
1492
1493 return err;
1494 }
1495
1496 /* Duplicate the epsilon closure of the node ROOT_NODE.
1497 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1498 to their own constraint. */
1499
1500 static reg_errcode_t
1501 internal_function
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1502 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1503 Idx root_node, unsigned int init_constraint)
1504 {
1505 Idx org_node, clone_node;
1506 bool ok;
1507 unsigned int constraint = init_constraint;
1508 for (org_node = top_org_node, clone_node = top_clone_node;;)
1509 {
1510 Idx org_dest, clone_dest;
1511 if (dfa->nodes[org_node].type == OP_BACK_REF)
1512 {
1513 /* If the back reference epsilon-transit, its destination must
1514 also have the constraint. Then duplicate the epsilon closure
1515 of the destination of the back reference, and store it in
1516 edests of the back reference. */
1517 org_dest = dfa->nexts[org_node];
1518 re_node_set_empty (dfa->edests + clone_node);
1519 clone_dest = duplicate_node (dfa, org_dest, constraint);
1520 if (BE (clone_dest == REG_MISSING, 0))
1521 return REG_ESPACE;
1522 dfa->nexts[clone_node] = dfa->nexts[org_node];
1523 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1524 if (BE (! ok, 0))
1525 return REG_ESPACE;
1526 }
1527 else if (dfa->edests[org_node].nelem == 0)
1528 {
1529 /* In case of the node can't epsilon-transit, don't duplicate the
1530 destination and store the original destination as the
1531 destination of the node. */
1532 dfa->nexts[clone_node] = dfa->nexts[org_node];
1533 break;
1534 }
1535 else if (dfa->edests[org_node].nelem == 1)
1536 {
1537 /* In case of the node can epsilon-transit, and it has only one
1538 destination. */
1539 org_dest = dfa->edests[org_node].elems[0];
1540 re_node_set_empty (dfa->edests + clone_node);
1541 /* If the node is root_node itself, it means the epsilon closure
1542 has a loop. Then tie it to the destination of the root_node. */
1543 if (org_node == root_node && clone_node != org_node)
1544 {
1545 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1546 if (BE (! ok, 0))
1547 return REG_ESPACE;
1548 break;
1549 }
1550 /* In case the node has another constraint, append it. */
1551 constraint |= dfa->nodes[org_node].constraint;
1552 clone_dest = duplicate_node (dfa, org_dest, constraint);
1553 if (BE (clone_dest == REG_MISSING, 0))
1554 return REG_ESPACE;
1555 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556 if (BE (! ok, 0))
1557 return REG_ESPACE;
1558 }
1559 else /* dfa->edests[org_node].nelem == 2 */
1560 {
1561 /* In case of the node can epsilon-transit, and it has two
1562 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1563 org_dest = dfa->edests[org_node].elems[0];
1564 re_node_set_empty (dfa->edests + clone_node);
1565 /* Search for a duplicated node which satisfies the constraint. */
1566 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1567 if (clone_dest == REG_MISSING)
1568 {
1569 /* There is no such duplicated node, create a new one. */
1570 reg_errcode_t err;
1571 clone_dest = duplicate_node (dfa, org_dest, constraint);
1572 if (BE (clone_dest == REG_MISSING, 0))
1573 return REG_ESPACE;
1574 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1575 if (BE (! ok, 0))
1576 return REG_ESPACE;
1577 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1578 root_node, constraint);
1579 if (BE (err != REG_NOERROR, 0))
1580 return err;
1581 }
1582 else
1583 {
1584 /* There is a duplicated node which satisfies the constraint,
1585 use it to avoid infinite loop. */
1586 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1587 if (BE (! ok, 0))
1588 return REG_ESPACE;
1589 }
1590
1591 org_dest = dfa->edests[org_node].elems[1];
1592 clone_dest = duplicate_node (dfa, org_dest, constraint);
1593 if (BE (clone_dest == REG_MISSING, 0))
1594 return REG_ESPACE;
1595 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1596 if (BE (! ok, 0))
1597 return REG_ESPACE;
1598 }
1599 org_node = org_dest;
1600 clone_node = clone_dest;
1601 }
1602 return REG_NOERROR;
1603 }
1604
1605 /* Search for a node which is duplicated from the node ORG_NODE, and
1606 satisfies the constraint CONSTRAINT. */
1607
1608 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1609 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1610 unsigned int constraint)
1611 {
1612 Idx idx;
1613 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1614 {
1615 if (org_node == dfa->org_indices[idx]
1616 && constraint == dfa->nodes[idx].constraint)
1617 return idx; /* Found. */
1618 }
1619 return REG_MISSING; /* Not found. */
1620 }
1621
1622 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1623 Return the index of the new node, or REG_MISSING if insufficient storage is
1624 available. */
1625
1626 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1627 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1628 {
1629 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1630 if (BE (dup_idx != REG_MISSING, 1))
1631 {
1632 dfa->nodes[dup_idx].constraint = constraint;
1633 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1634 dfa->nodes[dup_idx].duplicated = 1;
1635
1636 /* Store the index of the original node. */
1637 dfa->org_indices[dup_idx] = org_idx;
1638 }
1639 return dup_idx;
1640 }
1641
1642 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1643 calc_inveclosure (re_dfa_t *dfa)
1644 {
1645 Idx src, idx;
1646 bool ok;
1647 for (idx = 0; idx < dfa->nodes_len; ++idx)
1648 re_node_set_init_empty (dfa->inveclosures + idx);
1649
1650 for (src = 0; src < dfa->nodes_len; ++src)
1651 {
1652 Idx *elems = dfa->eclosures[src].elems;
1653 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1654 {
1655 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1656 if (BE (! ok, 0))
1657 return REG_ESPACE;
1658 }
1659 }
1660
1661 return REG_NOERROR;
1662 }
1663
1664 /* Calculate "eclosure" for all the node in DFA. */
1665
1666 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1667 calc_eclosure (re_dfa_t *dfa)
1668 {
1669 Idx node_idx;
1670 bool incomplete;
1671 #ifdef DEBUG
1672 assert (dfa->nodes_len > 0);
1673 #endif
1674 incomplete = false;
1675 /* For each nodes, calculate epsilon closure. */
1676 for (node_idx = 0; ; ++node_idx)
1677 {
1678 reg_errcode_t err;
1679 re_node_set eclosure_elem;
1680 if (node_idx == dfa->nodes_len)
1681 {
1682 if (!incomplete)
1683 break;
1684 incomplete = false;
1685 node_idx = 0;
1686 }
1687
1688 #ifdef DEBUG
1689 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1690 #endif
1691
1692 /* If we have already calculated, skip it. */
1693 if (dfa->eclosures[node_idx].nelem != 0)
1694 continue;
1695 /* Calculate epsilon closure of 'node_idx'. */
1696 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1697 if (BE (err != REG_NOERROR, 0))
1698 return err;
1699
1700 if (dfa->eclosures[node_idx].nelem == 0)
1701 {
1702 incomplete = true;
1703 re_node_set_free (&eclosure_elem);
1704 }
1705 }
1706 return REG_NOERROR;
1707 }
1708
1709 /* Calculate epsilon closure of NODE. */
1710
1711 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1712 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1713 {
1714 reg_errcode_t err;
1715 Idx i;
1716 re_node_set eclosure;
1717 bool ok;
1718 bool incomplete = false;
1719 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1720 if (BE (err != REG_NOERROR, 0))
1721 return err;
1722
1723 /* This indicates that we are calculating this node now.
1724 We reference this value to avoid infinite loop. */
1725 dfa->eclosures[node].nelem = REG_MISSING;
1726
1727 /* If the current node has constraints, duplicate all nodes
1728 since they must inherit the constraints. */
1729 if (dfa->nodes[node].constraint
1730 && dfa->edests[node].nelem
1731 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1732 {
1733 err = duplicate_node_closure (dfa, node, node, node,
1734 dfa->nodes[node].constraint);
1735 if (BE (err != REG_NOERROR, 0))
1736 return err;
1737 }
1738
1739 /* Expand each epsilon destination nodes. */
1740 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1741 for (i = 0; i < dfa->edests[node].nelem; ++i)
1742 {
1743 re_node_set eclosure_elem;
1744 Idx edest = dfa->edests[node].elems[i];
1745 /* If calculating the epsilon closure of 'edest' is in progress,
1746 return intermediate result. */
1747 if (dfa->eclosures[edest].nelem == REG_MISSING)
1748 {
1749 incomplete = true;
1750 continue;
1751 }
1752 /* If we haven't calculated the epsilon closure of 'edest' yet,
1753 calculate now. Otherwise use calculated epsilon closure. */
1754 if (dfa->eclosures[edest].nelem == 0)
1755 {
1756 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1757 if (BE (err != REG_NOERROR, 0))
1758 return err;
1759 }
1760 else
1761 eclosure_elem = dfa->eclosures[edest];
1762 /* Merge the epsilon closure of 'edest'. */
1763 err = re_node_set_merge (&eclosure, &eclosure_elem);
1764 if (BE (err != REG_NOERROR, 0))
1765 return err;
1766 /* If the epsilon closure of 'edest' is incomplete,
1767 the epsilon closure of this node is also incomplete. */
1768 if (dfa->eclosures[edest].nelem == 0)
1769 {
1770 incomplete = true;
1771 re_node_set_free (&eclosure_elem);
1772 }
1773 }
1774
1775 /* An epsilon closure includes itself. */
1776 ok = re_node_set_insert (&eclosure, node);
1777 if (BE (! ok, 0))
1778 return REG_ESPACE;
1779 if (incomplete && !root)
1780 dfa->eclosures[node].nelem = 0;
1781 else
1782 dfa->eclosures[node] = eclosure;
1783 *new_set = eclosure;
1784 return REG_NOERROR;
1785 }
1786
1787 /* Functions for token which are used in the parser. */
1788
1789 /* Fetch a token from INPUT.
1790 We must not use this function inside bracket expressions. */
1791
1792 static void
1793 internal_function
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1794 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1795 {
1796 re_string_skip_bytes (input, peek_token (result, input, syntax));
1797 }
1798
1799 /* Peek a token from INPUT, and return the length of the token.
1800 We must not use this function inside bracket expressions. */
1801
1802 static int
1803 internal_function
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1804 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1805 {
1806 unsigned char c;
1807
1808 if (re_string_eoi (input))
1809 {
1810 token->type = END_OF_RE;
1811 return 0;
1812 }
1813
1814 c = re_string_peek_byte (input, 0);
1815 token->opr.c = c;
1816
1817 token->word_char = 0;
1818 #ifdef RE_ENABLE_I18N
1819 token->mb_partial = 0;
1820 if (input->mb_cur_max > 1 &&
1821 !re_string_first_byte (input, re_string_cur_idx (input)))
1822 {
1823 token->type = CHARACTER;
1824 token->mb_partial = 1;
1825 return 1;
1826 }
1827 #endif
1828 if (c == '\\')
1829 {
1830 unsigned char c2;
1831 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1832 {
1833 token->type = BACK_SLASH;
1834 return 1;
1835 }
1836
1837 c2 = re_string_peek_byte_case (input, 1);
1838 token->opr.c = c2;
1839 token->type = CHARACTER;
1840 #ifdef RE_ENABLE_I18N
1841 if (input->mb_cur_max > 1)
1842 {
1843 wint_t wc = re_string_wchar_at (input,
1844 re_string_cur_idx (input) + 1);
1845 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1846 }
1847 else
1848 #endif
1849 token->word_char = IS_WORD_CHAR (c2) != 0;
1850
1851 switch (c2)
1852 {
1853 case '|':
1854 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1855 token->type = OP_ALT;
1856 break;
1857 case '1': case '2': case '3': case '4': case '5':
1858 case '6': case '7': case '8': case '9':
1859 if (!(syntax & RE_NO_BK_REFS))
1860 {
1861 token->type = OP_BACK_REF;
1862 token->opr.idx = c2 - '1';
1863 }
1864 break;
1865 case '<':
1866 if (!(syntax & RE_NO_GNU_OPS))
1867 {
1868 token->type = ANCHOR;
1869 token->opr.ctx_type = WORD_FIRST;
1870 }
1871 break;
1872 case '>':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 {
1875 token->type = ANCHOR;
1876 token->opr.ctx_type = WORD_LAST;
1877 }
1878 break;
1879 case 'b':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 {
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = WORD_DELIM;
1884 }
1885 break;
1886 case 'B':
1887 if (!(syntax & RE_NO_GNU_OPS))
1888 {
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = NOT_WORD_DELIM;
1891 }
1892 break;
1893 case 'w':
1894 if (!(syntax & RE_NO_GNU_OPS))
1895 token->type = OP_WORD;
1896 break;
1897 case 'W':
1898 if (!(syntax & RE_NO_GNU_OPS))
1899 token->type = OP_NOTWORD;
1900 break;
1901 case 's':
1902 if (!(syntax & RE_NO_GNU_OPS))
1903 token->type = OP_SPACE;
1904 break;
1905 case 'S':
1906 if (!(syntax & RE_NO_GNU_OPS))
1907 token->type = OP_NOTSPACE;
1908 break;
1909 case '`':
1910 if (!(syntax & RE_NO_GNU_OPS))
1911 {
1912 token->type = ANCHOR;
1913 token->opr.ctx_type = BUF_FIRST;
1914 }
1915 break;
1916 case '\'':
1917 if (!(syntax & RE_NO_GNU_OPS))
1918 {
1919 token->type = ANCHOR;
1920 token->opr.ctx_type = BUF_LAST;
1921 }
1922 break;
1923 case '(':
1924 if (!(syntax & RE_NO_BK_PARENS))
1925 token->type = OP_OPEN_SUBEXP;
1926 break;
1927 case ')':
1928 if (!(syntax & RE_NO_BK_PARENS))
1929 token->type = OP_CLOSE_SUBEXP;
1930 break;
1931 case '+':
1932 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1933 token->type = OP_DUP_PLUS;
1934 break;
1935 case '?':
1936 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1937 token->type = OP_DUP_QUESTION;
1938 break;
1939 case '{':
1940 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1941 token->type = OP_OPEN_DUP_NUM;
1942 break;
1943 case '}':
1944 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1945 token->type = OP_CLOSE_DUP_NUM;
1946 break;
1947 default:
1948 break;
1949 }
1950 return 2;
1951 }
1952
1953 token->type = CHARACTER;
1954 #ifdef RE_ENABLE_I18N
1955 if (input->mb_cur_max > 1)
1956 {
1957 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1958 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1959 }
1960 else
1961 #endif
1962 token->word_char = IS_WORD_CHAR (token->opr.c);
1963
1964 switch (c)
1965 {
1966 case '\n':
1967 if (syntax & RE_NEWLINE_ALT)
1968 token->type = OP_ALT;
1969 break;
1970 case '|':
1971 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1972 token->type = OP_ALT;
1973 break;
1974 case '*':
1975 token->type = OP_DUP_ASTERISK;
1976 break;
1977 case '+':
1978 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1979 token->type = OP_DUP_PLUS;
1980 break;
1981 case '?':
1982 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1983 token->type = OP_DUP_QUESTION;
1984 break;
1985 case '{':
1986 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1987 token->type = OP_OPEN_DUP_NUM;
1988 break;
1989 case '}':
1990 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1991 token->type = OP_CLOSE_DUP_NUM;
1992 break;
1993 case '(':
1994 if (syntax & RE_NO_BK_PARENS)
1995 token->type = OP_OPEN_SUBEXP;
1996 break;
1997 case ')':
1998 if (syntax & RE_NO_BK_PARENS)
1999 token->type = OP_CLOSE_SUBEXP;
2000 break;
2001 case '[':
2002 token->type = OP_OPEN_BRACKET;
2003 break;
2004 case '.':
2005 token->type = OP_PERIOD;
2006 break;
2007 case '^':
2008 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2009 re_string_cur_idx (input) != 0)
2010 {
2011 char prev = re_string_peek_byte (input, -1);
2012 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2013 break;
2014 }
2015 token->type = ANCHOR;
2016 token->opr.ctx_type = LINE_FIRST;
2017 break;
2018 case '$':
2019 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2020 re_string_cur_idx (input) + 1 != re_string_length (input))
2021 {
2022 re_token_t next;
2023 re_string_skip_bytes (input, 1);
2024 peek_token (&next, input, syntax);
2025 re_string_skip_bytes (input, -1);
2026 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2027 break;
2028 }
2029 token->type = ANCHOR;
2030 token->opr.ctx_type = LINE_LAST;
2031 break;
2032 default:
2033 break;
2034 }
2035 return 1;
2036 }
2037
2038 /* Peek a token from INPUT, and return the length of the token.
2039 We must not use this function out of bracket expressions. */
2040
2041 static int
2042 internal_function
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)2043 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2044 {
2045 unsigned char c;
2046 if (re_string_eoi (input))
2047 {
2048 token->type = END_OF_RE;
2049 return 0;
2050 }
2051 c = re_string_peek_byte (input, 0);
2052 token->opr.c = c;
2053
2054 #ifdef RE_ENABLE_I18N
2055 if (input->mb_cur_max > 1 &&
2056 !re_string_first_byte (input, re_string_cur_idx (input)))
2057 {
2058 token->type = CHARACTER;
2059 return 1;
2060 }
2061 #endif /* RE_ENABLE_I18N */
2062
2063 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2064 && re_string_cur_idx (input) + 1 < re_string_length (input))
2065 {
2066 /* In this case, '\' escape a character. */
2067 unsigned char c2;
2068 re_string_skip_bytes (input, 1);
2069 c2 = re_string_peek_byte (input, 0);
2070 token->opr.c = c2;
2071 token->type = CHARACTER;
2072 return 1;
2073 }
2074 if (c == '[') /* '[' is a special char in a bracket exps. */
2075 {
2076 unsigned char c2;
2077 int token_len;
2078 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2079 c2 = re_string_peek_byte (input, 1);
2080 else
2081 c2 = 0;
2082 token->opr.c = c2;
2083 token_len = 2;
2084 switch (c2)
2085 {
2086 case '.':
2087 token->type = OP_OPEN_COLL_ELEM;
2088 break;
2089 case '=':
2090 token->type = OP_OPEN_EQUIV_CLASS;
2091 break;
2092 case ':':
2093 if (syntax & RE_CHAR_CLASSES)
2094 {
2095 token->type = OP_OPEN_CHAR_CLASS;
2096 break;
2097 }
2098 /* else fall through. */
2099 default:
2100 token->type = CHARACTER;
2101 token->opr.c = c;
2102 token_len = 1;
2103 break;
2104 }
2105 return token_len;
2106 }
2107 switch (c)
2108 {
2109 case '-':
2110 token->type = OP_CHARSET_RANGE;
2111 break;
2112 case ']':
2113 token->type = OP_CLOSE_BRACKET;
2114 break;
2115 case '^':
2116 token->type = OP_NON_MATCH_LIST;
2117 break;
2118 default:
2119 token->type = CHARACTER;
2120 }
2121 return 1;
2122 }
2123
2124 /* Functions for parser. */
2125
2126 /* Entry point of the parser.
2127 Parse the regular expression REGEXP and return the structure tree.
2128 If an error occurs, ERR is set by error code, and return NULL.
2129 This function build the following tree, from regular expression <reg_exp>:
2130 CAT
2131 / \
2132 / \
2133 <reg_exp> EOR
2134
2135 CAT means concatenation.
2136 EOR means end of regular expression. */
2137
2138 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2139 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2140 reg_errcode_t *err)
2141 {
2142 re_dfa_t *dfa = preg->buffer;
2143 bin_tree_t *tree, *eor, *root;
2144 re_token_t current_token;
2145 dfa->syntax = syntax;
2146 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2147 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2148 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2149 return NULL;
2150 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2151 if (tree != NULL)
2152 root = create_tree (dfa, tree, eor, CONCAT);
2153 else
2154 root = eor;
2155 if (BE (eor == NULL || root == NULL, 0))
2156 {
2157 *err = REG_ESPACE;
2158 return NULL;
2159 }
2160 return root;
2161 }
2162
2163 /* This function build the following tree, from regular expression
2164 <branch1>|<branch2>:
2165 ALT
2166 / \
2167 / \
2168 <branch1> <branch2>
2169
2170 ALT means alternative, which represents the operator '|'. */
2171
2172 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2173 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2174 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2175 {
2176 re_dfa_t *dfa = preg->buffer;
2177 bin_tree_t *tree, *branch = NULL;
2178 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2179 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2180 return NULL;
2181
2182 while (token->type == OP_ALT)
2183 {
2184 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2185 if (token->type != OP_ALT && token->type != END_OF_RE
2186 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2187 {
2188 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2189 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2190 return NULL;
2191 }
2192 else
2193 branch = NULL;
2194 tree = create_tree (dfa, tree, branch, OP_ALT);
2195 if (BE (tree == NULL, 0))
2196 {
2197 *err = REG_ESPACE;
2198 return NULL;
2199 }
2200 }
2201 return tree;
2202 }
2203
2204 /* This function build the following tree, from regular expression
2205 <exp1><exp2>:
2206 CAT
2207 / \
2208 / \
2209 <exp1> <exp2>
2210
2211 CAT means concatenation. */
2212
2213 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2214 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2215 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2216 {
2217 bin_tree_t *tree, *expr;
2218 re_dfa_t *dfa = preg->buffer;
2219 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2220 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2221 return NULL;
2222
2223 while (token->type != OP_ALT && token->type != END_OF_RE
2224 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2225 {
2226 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2227 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2228 {
2229 if (tree != NULL)
2230 postorder (tree, free_tree, NULL);
2231 return NULL;
2232 }
2233 if (tree != NULL && expr != NULL)
2234 {
2235 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2236 if (newtree == NULL)
2237 {
2238 postorder (expr, free_tree, NULL);
2239 postorder (tree, free_tree, NULL);
2240 *err = REG_ESPACE;
2241 return NULL;
2242 }
2243 tree = newtree;
2244 }
2245 else if (tree == NULL)
2246 tree = expr;
2247 /* Otherwise expr == NULL, we don't need to create new tree. */
2248 }
2249 return tree;
2250 }
2251
2252 /* This function build the following tree, from regular expression a*:
2253 *
2254 |
2255 a
2256 */
2257
2258 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2259 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2260 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2261 {
2262 re_dfa_t *dfa = preg->buffer;
2263 bin_tree_t *tree;
2264 switch (token->type)
2265 {
2266 case CHARACTER:
2267 tree = create_token_tree (dfa, NULL, NULL, token);
2268 if (BE (tree == NULL, 0))
2269 {
2270 *err = REG_ESPACE;
2271 return NULL;
2272 }
2273 #ifdef RE_ENABLE_I18N
2274 if (dfa->mb_cur_max > 1)
2275 {
2276 while (!re_string_eoi (regexp)
2277 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2278 {
2279 bin_tree_t *mbc_remain;
2280 fetch_token (token, regexp, syntax);
2281 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2282 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2283 if (BE (mbc_remain == NULL || tree == NULL, 0))
2284 {
2285 *err = REG_ESPACE;
2286 return NULL;
2287 }
2288 }
2289 }
2290 #endif
2291 break;
2292 case OP_OPEN_SUBEXP:
2293 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2294 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2295 return NULL;
2296 break;
2297 case OP_OPEN_BRACKET:
2298 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2299 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2300 return NULL;
2301 break;
2302 case OP_BACK_REF:
2303 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2304 {
2305 *err = REG_ESUBREG;
2306 return NULL;
2307 }
2308 dfa->used_bkref_map |= 1 << token->opr.idx;
2309 tree = create_token_tree (dfa, NULL, NULL, token);
2310 if (BE (tree == NULL, 0))
2311 {
2312 *err = REG_ESPACE;
2313 return NULL;
2314 }
2315 ++dfa->nbackref;
2316 dfa->has_mb_node = 1;
2317 break;
2318 case OP_OPEN_DUP_NUM:
2319 if (syntax & RE_CONTEXT_INVALID_DUP)
2320 {
2321 *err = REG_BADRPT;
2322 return NULL;
2323 }
2324 /* FALLTHROUGH */
2325 case OP_DUP_ASTERISK:
2326 case OP_DUP_PLUS:
2327 case OP_DUP_QUESTION:
2328 if (syntax & RE_CONTEXT_INVALID_OPS)
2329 {
2330 *err = REG_BADRPT;
2331 return NULL;
2332 }
2333 else if (syntax & RE_CONTEXT_INDEP_OPS)
2334 {
2335 fetch_token (token, regexp, syntax);
2336 return parse_expression (regexp, preg, token, syntax, nest, err);
2337 }
2338 /* else fall through */
2339 case OP_CLOSE_SUBEXP:
2340 if ((token->type == OP_CLOSE_SUBEXP) &&
2341 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2342 {
2343 *err = REG_ERPAREN;
2344 return NULL;
2345 }
2346 /* else fall through */
2347 case OP_CLOSE_DUP_NUM:
2348 /* We treat it as a normal character. */
2349
2350 /* Then we can these characters as normal characters. */
2351 token->type = CHARACTER;
2352 /* mb_partial and word_char bits should be initialized already
2353 by peek_token. */
2354 tree = create_token_tree (dfa, NULL, NULL, token);
2355 if (BE (tree == NULL, 0))
2356 {
2357 *err = REG_ESPACE;
2358 return NULL;
2359 }
2360 break;
2361 case ANCHOR:
2362 if ((token->opr.ctx_type
2363 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2364 && dfa->word_ops_used == 0)
2365 init_word_char (dfa);
2366 if (token->opr.ctx_type == WORD_DELIM
2367 || token->opr.ctx_type == NOT_WORD_DELIM)
2368 {
2369 bin_tree_t *tree_first, *tree_last;
2370 if (token->opr.ctx_type == WORD_DELIM)
2371 {
2372 token->opr.ctx_type = WORD_FIRST;
2373 tree_first = create_token_tree (dfa, NULL, NULL, token);
2374 token->opr.ctx_type = WORD_LAST;
2375 }
2376 else
2377 {
2378 token->opr.ctx_type = INSIDE_WORD;
2379 tree_first = create_token_tree (dfa, NULL, NULL, token);
2380 token->opr.ctx_type = INSIDE_NOTWORD;
2381 }
2382 tree_last = create_token_tree (dfa, NULL, NULL, token);
2383 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2384 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2385 {
2386 *err = REG_ESPACE;
2387 return NULL;
2388 }
2389 }
2390 else
2391 {
2392 tree = create_token_tree (dfa, NULL, NULL, token);
2393 if (BE (tree == NULL, 0))
2394 {
2395 *err = REG_ESPACE;
2396 return NULL;
2397 }
2398 }
2399 /* We must return here, since ANCHORs can't be followed
2400 by repetition operators.
2401 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2402 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2403 fetch_token (token, regexp, syntax);
2404 return tree;
2405 case OP_PERIOD:
2406 tree = create_token_tree (dfa, NULL, NULL, token);
2407 if (BE (tree == NULL, 0))
2408 {
2409 *err = REG_ESPACE;
2410 return NULL;
2411 }
2412 if (dfa->mb_cur_max > 1)
2413 dfa->has_mb_node = 1;
2414 break;
2415 case OP_WORD:
2416 case OP_NOTWORD:
2417 tree = build_charclass_op (dfa, regexp->trans,
2418 (const unsigned char *) "alnum",
2419 (const unsigned char *) "_",
2420 token->type == OP_NOTWORD, err);
2421 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2422 return NULL;
2423 break;
2424 case OP_SPACE:
2425 case OP_NOTSPACE:
2426 tree = build_charclass_op (dfa, regexp->trans,
2427 (const unsigned char *) "space",
2428 (const unsigned char *) "",
2429 token->type == OP_NOTSPACE, err);
2430 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431 return NULL;
2432 break;
2433 case OP_ALT:
2434 case END_OF_RE:
2435 return NULL;
2436 case BACK_SLASH:
2437 *err = REG_EESCAPE;
2438 return NULL;
2439 default:
2440 /* Must not happen? */
2441 #ifdef DEBUG
2442 assert (0);
2443 #endif
2444 return NULL;
2445 }
2446 fetch_token (token, regexp, syntax);
2447
2448 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2449 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2450 {
2451 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2452 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2453 return NULL;
2454 /* In BRE consecutive duplications are not allowed. */
2455 if ((syntax & RE_CONTEXT_INVALID_DUP)
2456 && (token->type == OP_DUP_ASTERISK
2457 || token->type == OP_OPEN_DUP_NUM))
2458 {
2459 *err = REG_BADRPT;
2460 return NULL;
2461 }
2462 }
2463
2464 return tree;
2465 }
2466
2467 /* This function build the following tree, from regular expression
2468 (<reg_exp>):
2469 SUBEXP
2470 |
2471 <reg_exp>
2472 */
2473
2474 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2475 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2476 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2477 {
2478 re_dfa_t *dfa = preg->buffer;
2479 bin_tree_t *tree;
2480 size_t cur_nsub;
2481 cur_nsub = preg->re_nsub++;
2482
2483 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2484
2485 /* The subexpression may be a null string. */
2486 if (token->type == OP_CLOSE_SUBEXP)
2487 tree = NULL;
2488 else
2489 {
2490 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2491 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2492 {
2493 if (tree != NULL)
2494 postorder (tree, free_tree, NULL);
2495 *err = REG_EPAREN;
2496 }
2497 if (BE (*err != REG_NOERROR, 0))
2498 return NULL;
2499 }
2500
2501 if (cur_nsub <= '9' - '1')
2502 dfa->completed_bkref_map |= 1 << cur_nsub;
2503
2504 tree = create_tree (dfa, tree, NULL, SUBEXP);
2505 if (BE (tree == NULL, 0))
2506 {
2507 *err = REG_ESPACE;
2508 return NULL;
2509 }
2510 tree->token.opr.idx = cur_nsub;
2511 return tree;
2512 }
2513
2514 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2515
2516 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2517 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2518 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2519 {
2520 bin_tree_t *tree = NULL, *old_tree = NULL;
2521 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2522 re_token_t start_token = *token;
2523
2524 if (token->type == OP_OPEN_DUP_NUM)
2525 {
2526 end = 0;
2527 start = fetch_number (regexp, token, syntax);
2528 if (start == REG_MISSING)
2529 {
2530 if (token->type == CHARACTER && token->opr.c == ',')
2531 start = 0; /* We treat "{,m}" as "{0,m}". */
2532 else
2533 {
2534 *err = REG_BADBR; /* <re>{} is invalid. */
2535 return NULL;
2536 }
2537 }
2538 if (BE (start != REG_ERROR, 1))
2539 {
2540 /* We treat "{n}" as "{n,n}". */
2541 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2542 : ((token->type == CHARACTER && token->opr.c == ',')
2543 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2544 }
2545 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2546 {
2547 /* Invalid sequence. */
2548 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2549 {
2550 if (token->type == END_OF_RE)
2551 *err = REG_EBRACE;
2552 else
2553 *err = REG_BADBR;
2554
2555 return NULL;
2556 }
2557
2558 /* If the syntax bit is set, rollback. */
2559 re_string_set_index (regexp, start_idx);
2560 *token = start_token;
2561 token->type = CHARACTER;
2562 /* mb_partial and word_char bits should be already initialized by
2563 peek_token. */
2564 return elem;
2565 }
2566
2567 if (BE ((end != REG_MISSING && start > end)
2568 || token->type != OP_CLOSE_DUP_NUM, 0))
2569 {
2570 /* First number greater than second. */
2571 *err = REG_BADBR;
2572 return NULL;
2573 }
2574
2575 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2576 {
2577 *err = REG_ESIZE;
2578 return NULL;
2579 }
2580 }
2581 else
2582 {
2583 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2584 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2585 }
2586
2587 fetch_token (token, regexp, syntax);
2588
2589 if (BE (elem == NULL, 0))
2590 return NULL;
2591 if (BE (start == 0 && end == 0, 0))
2592 {
2593 postorder (elem, free_tree, NULL);
2594 return NULL;
2595 }
2596
2597 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2598 if (BE (start > 0, 0))
2599 {
2600 tree = elem;
2601 for (i = 2; i <= start; ++i)
2602 {
2603 elem = duplicate_tree (elem, dfa);
2604 tree = create_tree (dfa, tree, elem, CONCAT);
2605 if (BE (elem == NULL || tree == NULL, 0))
2606 goto parse_dup_op_espace;
2607 }
2608
2609 if (start == end)
2610 return tree;
2611
2612 /* Duplicate ELEM before it is marked optional. */
2613 elem = duplicate_tree (elem, dfa);
2614 old_tree = tree;
2615 }
2616 else
2617 old_tree = NULL;
2618
2619 if (elem->token.type == SUBEXP)
2620 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2621
2622 tree = create_tree (dfa, elem, NULL,
2623 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2624 if (BE (tree == NULL, 0))
2625 goto parse_dup_op_espace;
2626
2627 /* From gnulib's "intprops.h":
2628 True if the arithmetic type T is signed. */
2629 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2630
2631 /* This loop is actually executed only when end != REG_MISSING,
2632 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2633 already created the start+1-th copy. */
2634 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2635 for (i = start + 2; i <= end; ++i)
2636 {
2637 elem = duplicate_tree (elem, dfa);
2638 tree = create_tree (dfa, tree, elem, CONCAT);
2639 if (BE (elem == NULL || tree == NULL, 0))
2640 goto parse_dup_op_espace;
2641
2642 tree = create_tree (dfa, tree, NULL, OP_ALT);
2643 if (BE (tree == NULL, 0))
2644 goto parse_dup_op_espace;
2645 }
2646
2647 if (old_tree)
2648 tree = create_tree (dfa, old_tree, tree, CONCAT);
2649
2650 return tree;
2651
2652 parse_dup_op_espace:
2653 *err = REG_ESPACE;
2654 return NULL;
2655 }
2656
2657 /* Size of the names for collating symbol/equivalence_class/character_class.
2658 I'm not sure, but maybe enough. */
2659 #define BRACKET_NAME_BUF_SIZE 32
2660
2661 #ifndef _LIBC
2662 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2663 Build the range expression which starts from START_ELEM, and ends
2664 at END_ELEM. The result are written to MBCSET and SBCSET.
2665 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2666 mbcset->range_ends, is a pointer argument since we may
2667 update it. */
2668
2669 static reg_errcode_t
2670 internal_function
2671 # ifdef RE_ENABLE_I18N
build_range_exp(const reg_syntax_t syntax,bitset_t sbcset,re_charset_t * mbcset,Idx * range_alloc,const bracket_elem_t * start_elem,const bracket_elem_t * end_elem)2672 build_range_exp (const reg_syntax_t syntax,
2673 bitset_t sbcset,
2674 re_charset_t *mbcset,
2675 Idx *range_alloc,
2676 const bracket_elem_t *start_elem,
2677 const bracket_elem_t *end_elem)
2678 # else /* not RE_ENABLE_I18N */
2679 build_range_exp (const reg_syntax_t syntax,
2680 bitset_t sbcset,
2681 const bracket_elem_t *start_elem,
2682 const bracket_elem_t *end_elem)
2683 # endif /* not RE_ENABLE_I18N */
2684 {
2685 unsigned int start_ch, end_ch;
2686 /* Equivalence Classes and Character Classes can't be a range start/end. */
2687 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2688 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2689 0))
2690 return REG_ERANGE;
2691
2692 /* We can handle no multi character collating elements without libc
2693 support. */
2694 if (BE ((start_elem->type == COLL_SYM
2695 && strlen ((char *) start_elem->opr.name) > 1)
2696 || (end_elem->type == COLL_SYM
2697 && strlen ((char *) end_elem->opr.name) > 1), 0))
2698 return REG_ECOLLATE;
2699
2700 # ifdef RE_ENABLE_I18N
2701 {
2702 wchar_t wc;
2703 wint_t start_wc;
2704 wint_t end_wc;
2705 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2706
2707 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2708 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2709 : 0));
2710 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2711 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2712 : 0));
2713 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2714 ? __btowc (start_ch) : start_elem->opr.wch);
2715 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2716 ? __btowc (end_ch) : end_elem->opr.wch);
2717 if (start_wc == WEOF || end_wc == WEOF)
2718 return REG_ECOLLATE;
2719 cmp_buf[0] = start_wc;
2720 cmp_buf[4] = end_wc;
2721
2722 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2723 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2724 return REG_ERANGE;
2725
2726 /* Got valid collation sequence values, add them as a new entry.
2727 However, for !_LIBC we have no collation elements: if the
2728 character set is single byte, the single byte character set
2729 that we build below suffices. parse_bracket_exp passes
2730 no MBCSET if dfa->mb_cur_max == 1. */
2731 if (mbcset)
2732 {
2733 /* Check the space of the arrays. */
2734 if (BE (*range_alloc == mbcset->nranges, 0))
2735 {
2736 /* There is not enough space, need realloc. */
2737 wchar_t *new_array_start, *new_array_end;
2738 Idx new_nranges;
2739
2740 /* +1 in case of mbcset->nranges is 0. */
2741 new_nranges = 2 * mbcset->nranges + 1;
2742 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2743 are NULL if *range_alloc == 0. */
2744 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2745 new_nranges);
2746 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2747 new_nranges);
2748
2749 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2750 return REG_ESPACE;
2751
2752 mbcset->range_starts = new_array_start;
2753 mbcset->range_ends = new_array_end;
2754 *range_alloc = new_nranges;
2755 }
2756
2757 mbcset->range_starts[mbcset->nranges] = start_wc;
2758 mbcset->range_ends[mbcset->nranges++] = end_wc;
2759 }
2760
2761 /* Build the table for single byte characters. */
2762 for (wc = 0; wc < SBC_MAX; ++wc)
2763 {
2764 cmp_buf[2] = wc;
2765 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2766 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2767 bitset_set (sbcset, wc);
2768 }
2769 }
2770 # else /* not RE_ENABLE_I18N */
2771 {
2772 unsigned int ch;
2773 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2774 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2775 : 0));
2776 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2777 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2778 : 0));
2779 if (start_ch > end_ch)
2780 return REG_ERANGE;
2781 /* Build the table for single byte characters. */
2782 for (ch = 0; ch < SBC_MAX; ++ch)
2783 if (start_ch <= ch && ch <= end_ch)
2784 bitset_set (sbcset, ch);
2785 }
2786 # endif /* not RE_ENABLE_I18N */
2787 return REG_NOERROR;
2788 }
2789 #endif /* not _LIBC */
2790
2791 #ifndef _LIBC
2792 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2793 Build the collating element which is represented by NAME.
2794 The result are written to MBCSET and SBCSET.
2795 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2796 pointer argument since we may update it. */
2797
2798 static reg_errcode_t
2799 internal_function
2800 # ifdef RE_ENABLE_I18N
build_collating_symbol(bitset_t sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2801 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2802 Idx *coll_sym_alloc, const unsigned char *name)
2803 # else /* not RE_ENABLE_I18N */
2804 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2805 # endif /* not RE_ENABLE_I18N */
2806 {
2807 size_t name_len = strlen ((const char *) name);
2808 if (BE (name_len != 1, 0))
2809 return REG_ECOLLATE;
2810 else
2811 {
2812 bitset_set (sbcset, name[0]);
2813 return REG_NOERROR;
2814 }
2815 }
2816 #endif /* not _LIBC */
2817
2818 /* This function parse bracket expression like "[abc]", "[a-c]",
2819 "[[.a-a.]]" etc. */
2820
2821 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2822 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2823 reg_syntax_t syntax, reg_errcode_t *err)
2824 {
2825 #ifdef _LIBC
2826 const unsigned char *collseqmb;
2827 const char *collseqwc;
2828 uint32_t nrules;
2829 int32_t table_size;
2830 const int32_t *symb_table;
2831 const unsigned char *extra;
2832
2833 /* Local function for parse_bracket_exp used in _LIBC environment.
2834 Seek the collating symbol entry corresponding to NAME.
2835 Return the index of the symbol in the SYMB_TABLE. */
2836
2837 auto inline int32_t
2838 __attribute ((always_inline))
2839 seek_collating_symbol_entry (name, name_len)
2840 const unsigned char *name;
2841 size_t name_len;
2842 {
2843 int32_t hash = elem_hash ((const char *) name, name_len);
2844 int32_t elem = hash % table_size;
2845 if (symb_table[2 * elem] != 0)
2846 {
2847 int32_t second = hash % (table_size - 2) + 1;
2848
2849 do
2850 {
2851 /* First compare the hashing value. */
2852 if (symb_table[2 * elem] == hash
2853 /* Compare the length of the name. */
2854 && name_len == extra[symb_table[2 * elem + 1]]
2855 /* Compare the name. */
2856 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2857 name_len) == 0)
2858 {
2859 /* Yep, this is the entry. */
2860 break;
2861 }
2862
2863 /* Next entry. */
2864 elem += second;
2865 }
2866 while (symb_table[2 * elem] != 0);
2867 }
2868 return elem;
2869 }
2870
2871 /* Local function for parse_bracket_exp used in _LIBC environment.
2872 Look up the collation sequence value of BR_ELEM.
2873 Return the value if succeeded, UINT_MAX otherwise. */
2874
2875 auto inline unsigned int
2876 __attribute ((always_inline))
2877 lookup_collation_sequence_value (br_elem)
2878 bracket_elem_t *br_elem;
2879 {
2880 if (br_elem->type == SB_CHAR)
2881 {
2882 /*
2883 if (MB_CUR_MAX == 1)
2884 */
2885 if (nrules == 0)
2886 return collseqmb[br_elem->opr.ch];
2887 else
2888 {
2889 wint_t wc = __btowc (br_elem->opr.ch);
2890 return __collseq_table_lookup (collseqwc, wc);
2891 }
2892 }
2893 else if (br_elem->type == MB_CHAR)
2894 {
2895 if (nrules != 0)
2896 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2897 }
2898 else if (br_elem->type == COLL_SYM)
2899 {
2900 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2901 if (nrules != 0)
2902 {
2903 int32_t elem, idx;
2904 elem = seek_collating_symbol_entry (br_elem->opr.name,
2905 sym_name_len);
2906 if (symb_table[2 * elem] != 0)
2907 {
2908 /* We found the entry. */
2909 idx = symb_table[2 * elem + 1];
2910 /* Skip the name of collating element name. */
2911 idx += 1 + extra[idx];
2912 /* Skip the byte sequence of the collating element. */
2913 idx += 1 + extra[idx];
2914 /* Adjust for the alignment. */
2915 idx = (idx + 3) & ~3;
2916 /* Skip the multibyte collation sequence value. */
2917 idx += sizeof (unsigned int);
2918 /* Skip the wide char sequence of the collating element. */
2919 idx += sizeof (unsigned int) *
2920 (1 + *(unsigned int *) (extra + idx));
2921 /* Return the collation sequence value. */
2922 return *(unsigned int *) (extra + idx);
2923 }
2924 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2925 {
2926 /* No valid character. Match it as a single byte
2927 character. */
2928 return collseqmb[br_elem->opr.name[0]];
2929 }
2930 }
2931 else if (sym_name_len == 1)
2932 return collseqmb[br_elem->opr.name[0]];
2933 }
2934 return UINT_MAX;
2935 }
2936
2937 /* Local function for parse_bracket_exp used in _LIBC environment.
2938 Build the range expression which starts from START_ELEM, and ends
2939 at END_ELEM. The result are written to MBCSET and SBCSET.
2940 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2941 mbcset->range_ends, is a pointer argument since we may
2942 update it. */
2943
2944 auto inline reg_errcode_t
2945 __attribute ((always_inline))
2946 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2947 re_charset_t *mbcset;
2948 Idx *range_alloc;
2949 bitset_t sbcset;
2950 bracket_elem_t *start_elem, *end_elem;
2951 {
2952 unsigned int ch;
2953 uint32_t start_collseq;
2954 uint32_t end_collseq;
2955
2956 /* Equivalence Classes and Character Classes can't be a range
2957 start/end. */
2958 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2959 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2960 0))
2961 return REG_ERANGE;
2962
2963 start_collseq = lookup_collation_sequence_value (start_elem);
2964 end_collseq = lookup_collation_sequence_value (end_elem);
2965 /* Check start/end collation sequence values. */
2966 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2967 return REG_ECOLLATE;
2968 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2969 return REG_ERANGE;
2970
2971 /* Got valid collation sequence values, add them as a new entry.
2972 However, if we have no collation elements, and the character set
2973 is single byte, the single byte character set that we
2974 build below suffices. */
2975 if (nrules > 0 || dfa->mb_cur_max > 1)
2976 {
2977 /* Check the space of the arrays. */
2978 if (BE (*range_alloc == mbcset->nranges, 0))
2979 {
2980 /* There is not enough space, need realloc. */
2981 uint32_t *new_array_start;
2982 uint32_t *new_array_end;
2983 Idx new_nranges;
2984
2985 /* +1 in case of mbcset->nranges is 0. */
2986 new_nranges = 2 * mbcset->nranges + 1;
2987 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2988 new_nranges);
2989 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2990 new_nranges);
2991
2992 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2993 return REG_ESPACE;
2994
2995 mbcset->range_starts = new_array_start;
2996 mbcset->range_ends = new_array_end;
2997 *range_alloc = new_nranges;
2998 }
2999
3000 mbcset->range_starts[mbcset->nranges] = start_collseq;
3001 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3002 }
3003
3004 /* Build the table for single byte characters. */
3005 for (ch = 0; ch < SBC_MAX; ch++)
3006 {
3007 uint32_t ch_collseq;
3008 /*
3009 if (MB_CUR_MAX == 1)
3010 */
3011 if (nrules == 0)
3012 ch_collseq = collseqmb[ch];
3013 else
3014 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3015 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3016 bitset_set (sbcset, ch);
3017 }
3018 return REG_NOERROR;
3019 }
3020
3021 /* Local function for parse_bracket_exp used in _LIBC environment.
3022 Build the collating element which is represented by NAME.
3023 The result are written to MBCSET and SBCSET.
3024 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3025 pointer argument since we may update it. */
3026
3027 auto inline reg_errcode_t
3028 __attribute ((always_inline))
3029 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3030 re_charset_t *mbcset;
3031 Idx *coll_sym_alloc;
3032 bitset_t sbcset;
3033 const unsigned char *name;
3034 {
3035 int32_t elem, idx;
3036 size_t name_len = strlen ((const char *) name);
3037 if (nrules != 0)
3038 {
3039 elem = seek_collating_symbol_entry (name, name_len);
3040 if (symb_table[2 * elem] != 0)
3041 {
3042 /* We found the entry. */
3043 idx = symb_table[2 * elem + 1];
3044 /* Skip the name of collating element name. */
3045 idx += 1 + extra[idx];
3046 }
3047 else if (symb_table[2 * elem] == 0 && name_len == 1)
3048 {
3049 /* No valid character, treat it as a normal
3050 character. */
3051 bitset_set (sbcset, name[0]);
3052 return REG_NOERROR;
3053 }
3054 else
3055 return REG_ECOLLATE;
3056
3057 /* Got valid collation sequence, add it as a new entry. */
3058 /* Check the space of the arrays. */
3059 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3060 {
3061 /* Not enough, realloc it. */
3062 /* +1 in case of mbcset->ncoll_syms is 0. */
3063 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3064 /* Use realloc since mbcset->coll_syms is NULL
3065 if *alloc == 0. */
3066 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3067 new_coll_sym_alloc);
3068 if (BE (new_coll_syms == NULL, 0))
3069 return REG_ESPACE;
3070 mbcset->coll_syms = new_coll_syms;
3071 *coll_sym_alloc = new_coll_sym_alloc;
3072 }
3073 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3074 return REG_NOERROR;
3075 }
3076 else
3077 {
3078 if (BE (name_len != 1, 0))
3079 return REG_ECOLLATE;
3080 else
3081 {
3082 bitset_set (sbcset, name[0]);
3083 return REG_NOERROR;
3084 }
3085 }
3086 }
3087 #endif
3088
3089 re_token_t br_token;
3090 re_bitset_ptr_t sbcset;
3091 #ifdef RE_ENABLE_I18N
3092 re_charset_t *mbcset;
3093 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3094 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3095 #endif /* not RE_ENABLE_I18N */
3096 bool non_match = false;
3097 bin_tree_t *work_tree;
3098 int token_len;
3099 bool first_round = true;
3100 #ifdef _LIBC
3101 collseqmb = (const unsigned char *)
3102 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3103 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3104 if (nrules)
3105 {
3106 /*
3107 if (MB_CUR_MAX > 1)
3108 */
3109 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3110 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3111 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3112 _NL_COLLATE_SYMB_TABLEMB);
3113 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3114 _NL_COLLATE_SYMB_EXTRAMB);
3115 }
3116 #endif
3117 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3118 #ifdef RE_ENABLE_I18N
3119 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3120 #endif /* RE_ENABLE_I18N */
3121 #ifdef RE_ENABLE_I18N
3122 if (BE (sbcset == NULL || mbcset == NULL, 0))
3123 #else
3124 if (BE (sbcset == NULL, 0))
3125 #endif /* RE_ENABLE_I18N */
3126 {
3127 re_free (sbcset);
3128 #ifdef RE_ENABLE_I18N
3129 re_free (mbcset);
3130 #endif
3131 *err = REG_ESPACE;
3132 return NULL;
3133 }
3134
3135 token_len = peek_token_bracket (token, regexp, syntax);
3136 if (BE (token->type == END_OF_RE, 0))
3137 {
3138 *err = REG_BADPAT;
3139 goto parse_bracket_exp_free_return;
3140 }
3141 if (token->type == OP_NON_MATCH_LIST)
3142 {
3143 #ifdef RE_ENABLE_I18N
3144 mbcset->non_match = 1;
3145 #endif /* not RE_ENABLE_I18N */
3146 non_match = true;
3147 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3148 bitset_set (sbcset, '\n');
3149 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3150 token_len = peek_token_bracket (token, regexp, syntax);
3151 if (BE (token->type == END_OF_RE, 0))
3152 {
3153 *err = REG_BADPAT;
3154 goto parse_bracket_exp_free_return;
3155 }
3156 }
3157
3158 /* We treat the first ']' as a normal character. */
3159 if (token->type == OP_CLOSE_BRACKET)
3160 token->type = CHARACTER;
3161
3162 while (1)
3163 {
3164 bracket_elem_t start_elem, end_elem;
3165 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3166 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3167 reg_errcode_t ret;
3168 int token_len2 = 0;
3169 bool is_range_exp = false;
3170 re_token_t token2;
3171
3172 start_elem.opr.name = start_name_buf;
3173 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3174 syntax, first_round);
3175 if (BE (ret != REG_NOERROR, 0))
3176 {
3177 *err = ret;
3178 goto parse_bracket_exp_free_return;
3179 }
3180 first_round = false;
3181
3182 /* Get information about the next token. We need it in any case. */
3183 token_len = peek_token_bracket (token, regexp, syntax);
3184
3185 /* Do not check for ranges if we know they are not allowed. */
3186 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3187 {
3188 if (BE (token->type == END_OF_RE, 0))
3189 {
3190 *err = REG_EBRACK;
3191 goto parse_bracket_exp_free_return;
3192 }
3193 if (token->type == OP_CHARSET_RANGE)
3194 {
3195 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3196 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3197 if (BE (token2.type == END_OF_RE, 0))
3198 {
3199 *err = REG_EBRACK;
3200 goto parse_bracket_exp_free_return;
3201 }
3202 if (token2.type == OP_CLOSE_BRACKET)
3203 {
3204 /* We treat the last '-' as a normal character. */
3205 re_string_skip_bytes (regexp, -token_len);
3206 token->type = CHARACTER;
3207 }
3208 else
3209 is_range_exp = true;
3210 }
3211 }
3212
3213 if (is_range_exp == true)
3214 {
3215 end_elem.opr.name = end_name_buf;
3216 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3217 dfa, syntax, true);
3218 if (BE (ret != REG_NOERROR, 0))
3219 {
3220 *err = ret;
3221 goto parse_bracket_exp_free_return;
3222 }
3223
3224 token_len = peek_token_bracket (token, regexp, syntax);
3225
3226 #ifdef _LIBC
3227 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3228 &start_elem, &end_elem);
3229 #else
3230 # ifdef RE_ENABLE_I18N
3231 *err = build_range_exp (syntax, sbcset,
3232 dfa->mb_cur_max > 1 ? mbcset : NULL,
3233 &range_alloc, &start_elem, &end_elem);
3234 # else
3235 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3236 # endif
3237 #endif /* RE_ENABLE_I18N */
3238 if (BE (*err != REG_NOERROR, 0))
3239 goto parse_bracket_exp_free_return;
3240 }
3241 else
3242 {
3243 switch (start_elem.type)
3244 {
3245 case SB_CHAR:
3246 bitset_set (sbcset, start_elem.opr.ch);
3247 break;
3248 #ifdef RE_ENABLE_I18N
3249 case MB_CHAR:
3250 /* Check whether the array has enough space. */
3251 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3252 {
3253 wchar_t *new_mbchars;
3254 /* Not enough, realloc it. */
3255 /* +1 in case of mbcset->nmbchars is 0. */
3256 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3257 /* Use realloc since array is NULL if *alloc == 0. */
3258 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3259 mbchar_alloc);
3260 if (BE (new_mbchars == NULL, 0))
3261 goto parse_bracket_exp_espace;
3262 mbcset->mbchars = new_mbchars;
3263 }
3264 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3265 break;
3266 #endif /* RE_ENABLE_I18N */
3267 case EQUIV_CLASS:
3268 *err = build_equiv_class (sbcset,
3269 #ifdef RE_ENABLE_I18N
3270 mbcset, &equiv_class_alloc,
3271 #endif /* RE_ENABLE_I18N */
3272 start_elem.opr.name);
3273 if (BE (*err != REG_NOERROR, 0))
3274 goto parse_bracket_exp_free_return;
3275 break;
3276 case COLL_SYM:
3277 *err = build_collating_symbol (sbcset,
3278 #ifdef RE_ENABLE_I18N
3279 mbcset, &coll_sym_alloc,
3280 #endif /* RE_ENABLE_I18N */
3281 start_elem.opr.name);
3282 if (BE (*err != REG_NOERROR, 0))
3283 goto parse_bracket_exp_free_return;
3284 break;
3285 case CHAR_CLASS:
3286 *err = build_charclass (regexp->trans, sbcset,
3287 #ifdef RE_ENABLE_I18N
3288 mbcset, &char_class_alloc,
3289 #endif /* RE_ENABLE_I18N */
3290 start_elem.opr.name, syntax);
3291 if (BE (*err != REG_NOERROR, 0))
3292 goto parse_bracket_exp_free_return;
3293 break;
3294 default:
3295 assert (0);
3296 break;
3297 }
3298 }
3299 if (BE (token->type == END_OF_RE, 0))
3300 {
3301 *err = REG_EBRACK;
3302 goto parse_bracket_exp_free_return;
3303 }
3304 if (token->type == OP_CLOSE_BRACKET)
3305 break;
3306 }
3307
3308 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3309
3310 /* If it is non-matching list. */
3311 if (non_match)
3312 bitset_not (sbcset);
3313
3314 #ifdef RE_ENABLE_I18N
3315 /* Ensure only single byte characters are set. */
3316 if (dfa->mb_cur_max > 1)
3317 bitset_mask (sbcset, dfa->sb_char);
3318
3319 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3320 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3321 || mbcset->non_match)))
3322 {
3323 bin_tree_t *mbc_tree;
3324 int sbc_idx;
3325 /* Build a tree for complex bracket. */
3326 dfa->has_mb_node = 1;
3327 br_token.type = COMPLEX_BRACKET;
3328 br_token.opr.mbcset = mbcset;
3329 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3330 if (BE (mbc_tree == NULL, 0))
3331 goto parse_bracket_exp_espace;
3332 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3333 if (sbcset[sbc_idx])
3334 break;
3335 /* If there are no bits set in sbcset, there is no point
3336 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3337 if (sbc_idx < BITSET_WORDS)
3338 {
3339 /* Build a tree for simple bracket. */
3340 br_token.type = SIMPLE_BRACKET;
3341 br_token.opr.sbcset = sbcset;
3342 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3343 if (BE (work_tree == NULL, 0))
3344 goto parse_bracket_exp_espace;
3345
3346 /* Then join them by ALT node. */
3347 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3348 if (BE (work_tree == NULL, 0))
3349 goto parse_bracket_exp_espace;
3350 }
3351 else
3352 {
3353 re_free (sbcset);
3354 work_tree = mbc_tree;
3355 }
3356 }
3357 else
3358 #endif /* not RE_ENABLE_I18N */
3359 {
3360 #ifdef RE_ENABLE_I18N
3361 free_charset (mbcset);
3362 #endif
3363 /* Build a tree for simple bracket. */
3364 br_token.type = SIMPLE_BRACKET;
3365 br_token.opr.sbcset = sbcset;
3366 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3367 if (BE (work_tree == NULL, 0))
3368 goto parse_bracket_exp_espace;
3369 }
3370 return work_tree;
3371
3372 parse_bracket_exp_espace:
3373 *err = REG_ESPACE;
3374 parse_bracket_exp_free_return:
3375 re_free (sbcset);
3376 #ifdef RE_ENABLE_I18N
3377 free_charset (mbcset);
3378 #endif /* RE_ENABLE_I18N */
3379 return NULL;
3380 }
3381
3382 /* Parse an element in the bracket expression. */
3383
3384 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3385 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3386 re_token_t *token, int token_len, re_dfa_t *dfa,
3387 reg_syntax_t syntax, bool accept_hyphen)
3388 {
3389 #ifdef RE_ENABLE_I18N
3390 int cur_char_size;
3391 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3392 if (cur_char_size > 1)
3393 {
3394 elem->type = MB_CHAR;
3395 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3396 re_string_skip_bytes (regexp, cur_char_size);
3397 return REG_NOERROR;
3398 }
3399 #endif /* RE_ENABLE_I18N */
3400 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3401 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3402 || token->type == OP_OPEN_EQUIV_CLASS)
3403 return parse_bracket_symbol (elem, regexp, token);
3404 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3405 {
3406 /* A '-' must only appear as anything but a range indicator before
3407 the closing bracket. Everything else is an error. */
3408 re_token_t token2;
3409 (void) peek_token_bracket (&token2, regexp, syntax);
3410 if (token2.type != OP_CLOSE_BRACKET)
3411 /* The actual error value is not standardized since this whole
3412 case is undefined. But ERANGE makes good sense. */
3413 return REG_ERANGE;
3414 }
3415 elem->type = SB_CHAR;
3416 elem->opr.ch = token->opr.c;
3417 return REG_NOERROR;
3418 }
3419
3420 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3421 such as [:<character_class>:], [.<collating_element>.], and
3422 [=<equivalent_class>=]. */
3423
3424 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3425 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3426 re_token_t *token)
3427 {
3428 unsigned char ch, delim = token->opr.c;
3429 int i = 0;
3430 if (re_string_eoi(regexp))
3431 return REG_EBRACK;
3432 for (;; ++i)
3433 {
3434 if (i >= BRACKET_NAME_BUF_SIZE)
3435 return REG_EBRACK;
3436 if (token->type == OP_OPEN_CHAR_CLASS)
3437 ch = re_string_fetch_byte_case (regexp);
3438 else
3439 ch = re_string_fetch_byte (regexp);
3440 if (re_string_eoi(regexp))
3441 return REG_EBRACK;
3442 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3443 break;
3444 elem->opr.name[i] = ch;
3445 }
3446 re_string_skip_bytes (regexp, 1);
3447 elem->opr.name[i] = '\0';
3448 switch (token->type)
3449 {
3450 case OP_OPEN_COLL_ELEM:
3451 elem->type = COLL_SYM;
3452 break;
3453 case OP_OPEN_EQUIV_CLASS:
3454 elem->type = EQUIV_CLASS;
3455 break;
3456 case OP_OPEN_CHAR_CLASS:
3457 elem->type = CHAR_CLASS;
3458 break;
3459 default:
3460 break;
3461 }
3462 return REG_NOERROR;
3463 }
3464
3465 /* Helper function for parse_bracket_exp.
3466 Build the equivalence class which is represented by NAME.
3467 The result are written to MBCSET and SBCSET.
3468 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3469 is a pointer argument since we may update it. */
3470
3471 static reg_errcode_t
3472 #ifdef RE_ENABLE_I18N
build_equiv_class(bitset_t sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3473 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3474 Idx *equiv_class_alloc, const unsigned char *name)
3475 #else /* not RE_ENABLE_I18N */
3476 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3477 #endif /* not RE_ENABLE_I18N */
3478 {
3479 #ifdef _LIBC
3480 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3481 if (nrules != 0)
3482 {
3483 const int32_t *table, *indirect;
3484 const unsigned char *weights, *extra, *cp;
3485 unsigned char char_buf[2];
3486 int32_t idx1, idx2;
3487 unsigned int ch;
3488 size_t len;
3489 /* This #include defines a local function! */
3490 # include <locale/weight.h>
3491 /* Calculate the index for equivalence class. */
3492 cp = name;
3493 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3494 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3495 _NL_COLLATE_WEIGHTMB);
3496 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3497 _NL_COLLATE_EXTRAMB);
3498 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3499 _NL_COLLATE_INDIRECTMB);
3500 idx1 = findidx (&cp, -1);
3501 if (BE (idx1 == 0 || *cp != '\0', 0))
3502 /* This isn't a valid character. */
3503 return REG_ECOLLATE;
3504
3505 /* Build single byte matching table for this equivalence class. */
3506 len = weights[idx1 & 0xffffff];
3507 for (ch = 0; ch < SBC_MAX; ++ch)
3508 {
3509 char_buf[0] = ch;
3510 cp = char_buf;
3511 idx2 = findidx (&cp, 1);
3512 /*
3513 idx2 = table[ch];
3514 */
3515 if (idx2 == 0)
3516 /* This isn't a valid character. */
3517 continue;
3518 /* Compare only if the length matches and the collation rule
3519 index is the same. */
3520 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3521 {
3522 int cnt = 0;
3523
3524 while (cnt <= len &&
3525 weights[(idx1 & 0xffffff) + 1 + cnt]
3526 == weights[(idx2 & 0xffffff) + 1 + cnt])
3527 ++cnt;
3528
3529 if (cnt > len)
3530 bitset_set (sbcset, ch);
3531 }
3532 }
3533 /* Check whether the array has enough space. */
3534 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3535 {
3536 /* Not enough, realloc it. */
3537 /* +1 in case of mbcset->nequiv_classes is 0. */
3538 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3539 /* Use realloc since the array is NULL if *alloc == 0. */
3540 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3541 int32_t,
3542 new_equiv_class_alloc);
3543 if (BE (new_equiv_classes == NULL, 0))
3544 return REG_ESPACE;
3545 mbcset->equiv_classes = new_equiv_classes;
3546 *equiv_class_alloc = new_equiv_class_alloc;
3547 }
3548 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3549 }
3550 else
3551 #endif /* _LIBC */
3552 {
3553 if (BE (strlen ((const char *) name) != 1, 0))
3554 return REG_ECOLLATE;
3555 bitset_set (sbcset, *name);
3556 }
3557 return REG_NOERROR;
3558 }
3559
3560 /* Helper function for parse_bracket_exp.
3561 Build the character class which is represented by NAME.
3562 The result are written to MBCSET and SBCSET.
3563 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3564 is a pointer argument since we may update it. */
3565
3566 static reg_errcode_t
3567 #ifdef RE_ENABLE_I18N
build_charclass(RE_TRANSLATE_TYPE trans,bitset_t sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3568 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3569 re_charset_t *mbcset, Idx *char_class_alloc,
3570 const unsigned char *class_name, reg_syntax_t syntax)
3571 #else /* not RE_ENABLE_I18N */
3572 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3573 const unsigned char *class_name, reg_syntax_t syntax)
3574 #endif /* not RE_ENABLE_I18N */
3575 {
3576 int i;
3577 const char *name = (const char *) class_name;
3578
3579 /* In case of REG_ICASE "upper" and "lower" match the both of
3580 upper and lower cases. */
3581 if ((syntax & RE_ICASE)
3582 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3583 name = "alpha";
3584
3585 #ifdef RE_ENABLE_I18N
3586 /* Check the space of the arrays. */
3587 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3588 {
3589 /* Not enough, realloc it. */
3590 /* +1 in case of mbcset->nchar_classes is 0. */
3591 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3592 /* Use realloc since array is NULL if *alloc == 0. */
3593 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3594 new_char_class_alloc);
3595 if (BE (new_char_classes == NULL, 0))
3596 return REG_ESPACE;
3597 mbcset->char_classes = new_char_classes;
3598 *char_class_alloc = new_char_class_alloc;
3599 }
3600 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3601 #endif /* RE_ENABLE_I18N */
3602
3603 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3604 do { \
3605 if (BE (trans != NULL, 0)) \
3606 { \
3607 for (i = 0; i < SBC_MAX; ++i) \
3608 if (ctype_func (i)) \
3609 bitset_set (sbcset, trans[i]); \
3610 } \
3611 else \
3612 { \
3613 for (i = 0; i < SBC_MAX; ++i) \
3614 if (ctype_func (i)) \
3615 bitset_set (sbcset, i); \
3616 } \
3617 } while (0)
3618
3619 if (strcmp (name, "alnum") == 0)
3620 BUILD_CHARCLASS_LOOP (isalnum);
3621 else if (strcmp (name, "cntrl") == 0)
3622 BUILD_CHARCLASS_LOOP (iscntrl);
3623 else if (strcmp (name, "lower") == 0)
3624 BUILD_CHARCLASS_LOOP (islower);
3625 else if (strcmp (name, "space") == 0)
3626 BUILD_CHARCLASS_LOOP (isspace);
3627 else if (strcmp (name, "alpha") == 0)
3628 BUILD_CHARCLASS_LOOP (isalpha);
3629 else if (strcmp (name, "digit") == 0)
3630 BUILD_CHARCLASS_LOOP (isdigit);
3631 else if (strcmp (name, "print") == 0)
3632 BUILD_CHARCLASS_LOOP (isprint);
3633 else if (strcmp (name, "upper") == 0)
3634 BUILD_CHARCLASS_LOOP (isupper);
3635 else if (strcmp (name, "blank") == 0)
3636 BUILD_CHARCLASS_LOOP (isblank);
3637 else if (strcmp (name, "graph") == 0)
3638 BUILD_CHARCLASS_LOOP (isgraph);
3639 else if (strcmp (name, "punct") == 0)
3640 BUILD_CHARCLASS_LOOP (ispunct);
3641 else if (strcmp (name, "xdigit") == 0)
3642 BUILD_CHARCLASS_LOOP (isxdigit);
3643 else
3644 return REG_ECTYPE;
3645
3646 return REG_NOERROR;
3647 }
3648
3649 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,RE_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,bool non_match,reg_errcode_t * err)3650 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3651 const unsigned char *class_name,
3652 const unsigned char *extra, bool non_match,
3653 reg_errcode_t *err)
3654 {
3655 re_bitset_ptr_t sbcset;
3656 #ifdef RE_ENABLE_I18N
3657 re_charset_t *mbcset;
3658 Idx alloc = 0;
3659 #endif /* not RE_ENABLE_I18N */
3660 reg_errcode_t ret;
3661 re_token_t br_token;
3662 bin_tree_t *tree;
3663
3664 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3665 #ifdef RE_ENABLE_I18N
3666 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3667 #endif /* RE_ENABLE_I18N */
3668
3669 #ifdef RE_ENABLE_I18N
3670 if (BE (sbcset == NULL || mbcset == NULL, 0))
3671 #else /* not RE_ENABLE_I18N */
3672 if (BE (sbcset == NULL, 0))
3673 #endif /* not RE_ENABLE_I18N */
3674 {
3675 *err = REG_ESPACE;
3676 return NULL;
3677 }
3678
3679 if (non_match)
3680 {
3681 #ifdef RE_ENABLE_I18N
3682 mbcset->non_match = 1;
3683 #endif /* not RE_ENABLE_I18N */
3684 }
3685
3686 /* We don't care the syntax in this case. */
3687 ret = build_charclass (trans, sbcset,
3688 #ifdef RE_ENABLE_I18N
3689 mbcset, &alloc,
3690 #endif /* RE_ENABLE_I18N */
3691 class_name, 0);
3692
3693 if (BE (ret != REG_NOERROR, 0))
3694 {
3695 re_free (sbcset);
3696 #ifdef RE_ENABLE_I18N
3697 free_charset (mbcset);
3698 #endif /* RE_ENABLE_I18N */
3699 *err = ret;
3700 return NULL;
3701 }
3702 /* \w match '_' also. */
3703 for (; *extra; extra++)
3704 bitset_set (sbcset, *extra);
3705
3706 /* If it is non-matching list. */
3707 if (non_match)
3708 bitset_not (sbcset);
3709
3710 #ifdef RE_ENABLE_I18N
3711 /* Ensure only single byte characters are set. */
3712 if (dfa->mb_cur_max > 1)
3713 bitset_mask (sbcset, dfa->sb_char);
3714 #endif
3715
3716 /* Build a tree for simple bracket. */
3717 br_token.type = SIMPLE_BRACKET;
3718 br_token.opr.sbcset = sbcset;
3719 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3720 if (BE (tree == NULL, 0))
3721 goto build_word_op_espace;
3722
3723 #ifdef RE_ENABLE_I18N
3724 if (dfa->mb_cur_max > 1)
3725 {
3726 bin_tree_t *mbc_tree;
3727 /* Build a tree for complex bracket. */
3728 br_token.type = COMPLEX_BRACKET;
3729 br_token.opr.mbcset = mbcset;
3730 dfa->has_mb_node = 1;
3731 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3732 if (BE (mbc_tree == NULL, 0))
3733 goto build_word_op_espace;
3734 /* Then join them by ALT node. */
3735 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3736 if (BE (mbc_tree != NULL, 1))
3737 return tree;
3738 }
3739 else
3740 {
3741 free_charset (mbcset);
3742 return tree;
3743 }
3744 #else /* not RE_ENABLE_I18N */
3745 return tree;
3746 #endif /* not RE_ENABLE_I18N */
3747
3748 build_word_op_espace:
3749 re_free (sbcset);
3750 #ifdef RE_ENABLE_I18N
3751 free_charset (mbcset);
3752 #endif /* RE_ENABLE_I18N */
3753 *err = REG_ESPACE;
3754 return NULL;
3755 }
3756
3757 /* This is intended for the expressions like "a{1,3}".
3758 Fetch a number from 'input', and return the number.
3759 Return REG_MISSING if the number field is empty like "{,1}".
3760 Return RE_DUP_MAX + 1 if the number field is too large.
3761 Return REG_ERROR if an error occurred. */
3762
3763 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3764 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3765 {
3766 Idx num = REG_MISSING;
3767 unsigned char c;
3768 while (1)
3769 {
3770 fetch_token (token, input, syntax);
3771 c = token->opr.c;
3772 if (BE (token->type == END_OF_RE, 0))
3773 return REG_ERROR;
3774 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3775 break;
3776 num = ((token->type != CHARACTER || c < '0' || '9' < c
3777 || num == REG_ERROR)
3778 ? REG_ERROR
3779 : num == REG_MISSING
3780 ? c - '0'
3781 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3782 }
3783 return num;
3784 }
3785
3786 #ifdef RE_ENABLE_I18N
3787 static void
free_charset(re_charset_t * cset)3788 free_charset (re_charset_t *cset)
3789 {
3790 re_free (cset->mbchars);
3791 # ifdef _LIBC
3792 re_free (cset->coll_syms);
3793 re_free (cset->equiv_classes);
3794 re_free (cset->range_starts);
3795 re_free (cset->range_ends);
3796 # endif
3797 re_free (cset->char_classes);
3798 re_free (cset);
3799 }
3800 #endif /* RE_ENABLE_I18N */
3801
3802 /* Functions for binary tree operation. */
3803
3804 /* Create a tree node. */
3805
3806 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3807 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3808 re_token_type_t type)
3809 {
3810 re_token_t t;
3811 t.type = type;
3812 return create_token_tree (dfa, left, right, &t);
3813 }
3814
3815 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3816 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3817 const re_token_t *token)
3818 {
3819 bin_tree_t *tree;
3820 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3821 {
3822 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3823
3824 if (storage == NULL)
3825 return NULL;
3826 storage->next = dfa->str_tree_storage;
3827 dfa->str_tree_storage = storage;
3828 dfa->str_tree_storage_idx = 0;
3829 }
3830 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3831
3832 tree->parent = NULL;
3833 tree->left = left;
3834 tree->right = right;
3835 tree->token = *token;
3836 tree->token.duplicated = 0;
3837 tree->token.opt_subexp = 0;
3838 tree->first = NULL;
3839 tree->next = NULL;
3840 tree->node_idx = REG_MISSING;
3841
3842 if (left != NULL)
3843 left->parent = tree;
3844 if (right != NULL)
3845 right->parent = tree;
3846 return tree;
3847 }
3848
3849 /* Mark the tree SRC as an optional subexpression.
3850 To be called from preorder or postorder. */
3851
3852 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3853 mark_opt_subexp (void *extra, bin_tree_t *node)
3854 {
3855 Idx idx = (Idx) (long) extra;
3856 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3857 node->token.opt_subexp = 1;
3858
3859 return REG_NOERROR;
3860 }
3861
3862 /* Free the allocated memory inside NODE. */
3863
3864 static void
free_token(re_token_t * node)3865 free_token (re_token_t *node)
3866 {
3867 #ifdef RE_ENABLE_I18N
3868 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3869 free_charset (node->opr.mbcset);
3870 else
3871 #endif /* RE_ENABLE_I18N */
3872 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3873 re_free (node->opr.sbcset);
3874 }
3875
3876 /* Worker function for tree walking. Free the allocated memory inside NODE
3877 and its children. */
3878
3879 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3880 free_tree (void *extra, bin_tree_t *node)
3881 {
3882 free_token (&node->token);
3883 return REG_NOERROR;
3884 }
3885
3886
3887 /* Duplicate the node SRC, and return new node. This is a preorder
3888 visit similar to the one implemented by the generic visitor, but
3889 we need more infrastructure to maintain two parallel trees --- so,
3890 it's easier to duplicate. */
3891
3892 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3893 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3894 {
3895 const bin_tree_t *node;
3896 bin_tree_t *dup_root;
3897 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3898
3899 for (node = root; ; )
3900 {
3901 /* Create a new tree and link it back to the current parent. */
3902 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3903 if (*p_new == NULL)
3904 return NULL;
3905 (*p_new)->parent = dup_node;
3906 (*p_new)->token.duplicated = 1;
3907 dup_node = *p_new;
3908
3909 /* Go to the left node, or up and to the right. */
3910 if (node->left)
3911 {
3912 node = node->left;
3913 p_new = &dup_node->left;
3914 }
3915 else
3916 {
3917 const bin_tree_t *prev = NULL;
3918 while (node->right == prev || node->right == NULL)
3919 {
3920 prev = node;
3921 node = node->parent;
3922 dup_node = dup_node->parent;
3923 if (!node)
3924 return dup_root;
3925 }
3926 node = node->right;
3927 p_new = &dup_node->right;
3928 }
3929 }
3930 }
3931