1 /* -*- buffer-read-only: t -*- vi: set ro: */ 2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3 /* Extended regular expression matching and search library. 4 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free 5 Software Foundation, Inc. 6 This file is part of the GNU C Library. 7 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 8 9 This program is free software; you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation; either version 3, or (at your option) 12 any later version. 13 14 This program is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License along 20 with this program; if not, write to the Free Software Foundation, 21 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 22 23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 24 size_t length, reg_syntax_t syntax); 25 static void re_compile_fastmap_iter (regex_t *bufp, 26 const re_dfastate_t *init_state, 27 char *fastmap); 28 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); 29 #ifdef RE_ENABLE_I18N 30 static void free_charset (re_charset_t *cset); 31 #endif /* RE_ENABLE_I18N */ 32 static void free_workarea_compile (regex_t *preg); 33 static reg_errcode_t create_initial_state (re_dfa_t *dfa); 34 #ifdef RE_ENABLE_I18N 35 static void optimize_utf8 (re_dfa_t *dfa); 36 #endif 37 static reg_errcode_t analyze (regex_t *preg); 38 static reg_errcode_t preorder (bin_tree_t *root, 39 reg_errcode_t (fn (void *, bin_tree_t *)), 40 void *extra); 41 static reg_errcode_t postorder (bin_tree_t *root, 42 reg_errcode_t (fn (void *, bin_tree_t *)), 43 void *extra); 44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 47 bin_tree_t *node); 48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 51 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint); 52 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 53 unsigned int constraint); 54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 56 Idx node, bool root); 57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 58 static Idx fetch_number (re_string_t *input, re_token_t *token, 59 reg_syntax_t syntax); 60 static int peek_token (re_token_t *token, re_string_t *input, 61 reg_syntax_t syntax) internal_function; 62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 63 reg_syntax_t syntax, reg_errcode_t *err); 64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 65 re_token_t *token, reg_syntax_t syntax, 66 Idx nest, reg_errcode_t *err); 67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 68 re_token_t *token, reg_syntax_t syntax, 69 Idx nest, reg_errcode_t *err); 70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 71 re_token_t *token, reg_syntax_t syntax, 72 Idx nest, reg_errcode_t *err); 73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 74 re_token_t *token, reg_syntax_t syntax, 75 Idx nest, reg_errcode_t *err); 76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 77 re_dfa_t *dfa, re_token_t *token, 78 reg_syntax_t syntax, reg_errcode_t *err); 79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 80 re_token_t *token, reg_syntax_t syntax, 81 reg_errcode_t *err); 82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 83 re_string_t *regexp, 84 re_token_t *token, int token_len, 85 re_dfa_t *dfa, 86 reg_syntax_t syntax, 87 bool accept_hyphen); 88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 89 re_string_t *regexp, 90 re_token_t *token); 91 #ifdef RE_ENABLE_I18N 92 static reg_errcode_t build_equiv_class (bitset_t sbcset, 93 re_charset_t *mbcset, 94 Idx *equiv_class_alloc, 95 const unsigned char *name); 96 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 97 bitset_t sbcset, 98 re_charset_t *mbcset, 99 Idx *char_class_alloc, 100 const unsigned char *class_name, 101 reg_syntax_t syntax); 102 #else /* not RE_ENABLE_I18N */ 103 static reg_errcode_t build_equiv_class (bitset_t sbcset, 104 const unsigned char *name); 105 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 106 bitset_t sbcset, 107 const unsigned char *class_name, 108 reg_syntax_t syntax); 109 #endif /* not RE_ENABLE_I18N */ 110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 111 RE_TRANSLATE_TYPE trans, 112 const unsigned char *class_name, 113 const unsigned char *extra, 114 bool non_match, reg_errcode_t *err); 115 static bin_tree_t *create_tree (re_dfa_t *dfa, 116 bin_tree_t *left, bin_tree_t *right, 117 re_token_type_t type); 118 static bin_tree_t *create_token_tree (re_dfa_t *dfa, 119 bin_tree_t *left, bin_tree_t *right, 120 const re_token_t *token); 121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 122 static void free_token (re_token_t *node); 123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 125 126 /* This table gives an error message for each of the error codes listed 127 in regex.h. Obviously the order here has to be same as there. 128 POSIX doesn't require that we do anything for REG_NOERROR, 129 but why not be nice? */ 130 131 static const char __re_error_msgid[] = 132 { 133 #define REG_NOERROR_IDX 0 134 gettext_noop ("Success") /* REG_NOERROR */ 135 "\0" 136 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 137 gettext_noop ("No match") /* REG_NOMATCH */ 138 "\0" 139 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 140 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 141 "\0" 142 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 143 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 144 "\0" 145 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 146 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 147 "\0" 148 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 149 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 150 "\0" 151 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 152 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 153 "\0" 154 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 155 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 156 "\0" 157 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 158 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 159 "\0" 160 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 161 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 162 "\0" 163 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 164 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 165 "\0" 166 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 167 gettext_noop ("Invalid range end") /* REG_ERANGE */ 168 "\0" 169 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 170 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 171 "\0" 172 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 173 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 174 "\0" 175 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 176 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 177 "\0" 178 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 179 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 180 "\0" 181 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 182 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 183 }; 184 185 static const size_t __re_error_msgid_idx[] = 186 { 187 REG_NOERROR_IDX, 188 REG_NOMATCH_IDX, 189 REG_BADPAT_IDX, 190 REG_ECOLLATE_IDX, 191 REG_ECTYPE_IDX, 192 REG_EESCAPE_IDX, 193 REG_ESUBREG_IDX, 194 REG_EBRACK_IDX, 195 REG_EPAREN_IDX, 196 REG_EBRACE_IDX, 197 REG_BADBR_IDX, 198 REG_ERANGE_IDX, 199 REG_ESPACE_IDX, 200 REG_BADRPT_IDX, 201 REG_EEND_IDX, 202 REG_ESIZE_IDX, 203 REG_ERPAREN_IDX 204 }; 205 206 /* Entry points for GNU code. */ 207 208 /* re_compile_pattern is the GNU regular expression compiler: it 209 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 210 Returns 0 if the pattern was valid, otherwise an error string. 211 212 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 213 are set in BUFP on entry. */ 214 215 #ifdef _LIBC 216 const char * 217 re_compile_pattern (pattern, length, bufp) 218 const char *pattern; 219 size_t length; 220 struct re_pattern_buffer *bufp; 221 #else /* size_t might promote */ 222 const char * 223 re_compile_pattern (const char *pattern, size_t length, 224 struct re_pattern_buffer *bufp) 225 #endif 226 { 227 reg_errcode_t ret; 228 229 /* And GNU code determines whether or not to get register information 230 by passing null for the REGS argument to re_match, etc., not by 231 setting no_sub, unless RE_NO_SUB is set. */ 232 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 233 234 /* Match anchors at newline. */ 235 bufp->newline_anchor = 1; 236 237 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 238 239 if (!ret) 240 return NULL; 241 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 242 } 243 #ifdef _LIBC 244 weak_alias (__re_compile_pattern, re_compile_pattern) 245 #endif 246 247 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 248 also be assigned to arbitrarily: each pattern buffer stores its own 249 syntax, so it can be changed between regex compilations. */ 250 /* This has no initializer because initialized variables in Emacs 251 become read-only after dumping. */ 252 reg_syntax_t re_syntax_options; 253 254 255 /* Specify the precise syntax of regexps for compilation. This provides 256 for compatibility for various utilities which historically have 257 different, incompatible syntaxes. 258 259 The argument SYNTAX is a bit mask comprised of the various bits 260 defined in regex.h. We return the old syntax. */ 261 262 reg_syntax_t 263 re_set_syntax (syntax) 264 reg_syntax_t syntax; 265 { 266 reg_syntax_t ret = re_syntax_options; 267 268 re_syntax_options = syntax; 269 return ret; 270 } 271 #ifdef _LIBC 272 weak_alias (__re_set_syntax, re_set_syntax) 273 #endif 274 275 int 276 re_compile_fastmap (bufp) 277 struct re_pattern_buffer *bufp; 278 { 279 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 280 char *fastmap = bufp->fastmap; 281 282 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 283 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 284 if (dfa->init_state != dfa->init_state_word) 285 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 286 if (dfa->init_state != dfa->init_state_nl) 287 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 288 if (dfa->init_state != dfa->init_state_begbuf) 289 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 290 bufp->fastmap_accurate = 1; 291 return 0; 292 } 293 #ifdef _LIBC 294 weak_alias (__re_compile_fastmap, re_compile_fastmap) 295 #endif 296 297 static inline void 298 __attribute ((always_inline)) 299 re_set_fastmap (char *fastmap, bool icase, int ch) 300 { 301 fastmap[ch] = 1; 302 if (icase) 303 fastmap[tolower (ch)] = 1; 304 } 305 306 /* Helper function for re_compile_fastmap. 307 Compile fastmap for the initial_state INIT_STATE. */ 308 309 static void 310 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 311 char *fastmap) 312 { 313 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 314 Idx node_cnt; 315 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 316 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 317 { 318 Idx node = init_state->nodes.elems[node_cnt]; 319 re_token_type_t type = dfa->nodes[node].type; 320 321 if (type == CHARACTER) 322 { 323 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 324 #ifdef RE_ENABLE_I18N 325 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 326 { 327 unsigned char buf[MB_LEN_MAX]; 328 unsigned char *p; 329 wchar_t wc; 330 mbstate_t state; 331 332 p = buf; 333 *p++ = dfa->nodes[node].opr.c; 334 while (++node < dfa->nodes_len 335 && dfa->nodes[node].type == CHARACTER 336 && dfa->nodes[node].mb_partial) 337 *p++ = dfa->nodes[node].opr.c; 338 memset (&state, '\0', sizeof (state)); 339 if (__mbrtowc (&wc, (const char *) buf, p - buf, 340 &state) == p - buf 341 && (__wcrtomb ((char *) buf, towlower (wc), &state) 342 != (size_t) -1)) 343 re_set_fastmap (fastmap, false, buf[0]); 344 } 345 #endif 346 } 347 else if (type == SIMPLE_BRACKET) 348 { 349 int i, ch; 350 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 351 { 352 int j; 353 bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; 354 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 355 if (w & ((bitset_word_t) 1 << j)) 356 re_set_fastmap (fastmap, icase, ch); 357 } 358 } 359 #ifdef RE_ENABLE_I18N 360 else if (type == COMPLEX_BRACKET) 361 { 362 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 363 Idx i; 364 365 # ifdef _LIBC 366 /* See if we have to try all bytes which start multiple collation 367 elements. 368 e.g. In da_DK, we want to catch 'a' since "aa" is a valid 369 collation element, and don't catch 'b' since 'b' is 370 the only collation element which starts from 'b' (and 371 it is caught by SIMPLE_BRACKET). */ 372 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 373 && (cset->ncoll_syms || cset->nranges)) 374 { 375 const int32_t *table = (const int32_t *) 376 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 377 for (i = 0; i < SBC_MAX; ++i) 378 if (table[i] < 0) 379 re_set_fastmap (fastmap, icase, i); 380 } 381 # endif /* _LIBC */ 382 383 /* See if we have to start the match at all multibyte characters, 384 i.e. where we would not find an invalid sequence. This only 385 applies to multibyte character sets; for single byte character 386 sets, the SIMPLE_BRACKET again suffices. */ 387 if (dfa->mb_cur_max > 1 388 && (cset->nchar_classes || cset->non_match || cset->nranges 389 # ifdef _LIBC 390 || cset->nequiv_classes 391 # endif /* _LIBC */ 392 )) 393 { 394 unsigned char c = 0; 395 do 396 { 397 mbstate_t mbs; 398 memset (&mbs, 0, sizeof (mbs)); 399 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) 400 re_set_fastmap (fastmap, false, (int) c); 401 } 402 while (++c != 0); 403 } 404 405 else 406 { 407 /* ... Else catch all bytes which can start the mbchars. */ 408 for (i = 0; i < cset->nmbchars; ++i) 409 { 410 char buf[256]; 411 mbstate_t state; 412 memset (&state, '\0', sizeof (state)); 413 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 414 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 415 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 416 { 417 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 418 != (size_t) -1) 419 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 420 } 421 } 422 } 423 } 424 #endif /* RE_ENABLE_I18N */ 425 else if (type == OP_PERIOD 426 #ifdef RE_ENABLE_I18N 427 || type == OP_UTF8_PERIOD 428 #endif /* RE_ENABLE_I18N */ 429 || type == END_OF_RE) 430 { 431 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 432 if (type == END_OF_RE) 433 bufp->can_be_null = 1; 434 return; 435 } 436 } 437 } 438 439 /* Entry point for POSIX code. */ 440 /* regcomp takes a regular expression as a string and compiles it. 441 442 PREG is a regex_t *. We do not expect any fields to be initialized, 443 since POSIX says we shouldn't. Thus, we set 444 445 `buffer' to the compiled pattern; 446 `used' to the length of the compiled pattern; 447 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 448 REG_EXTENDED bit in CFLAGS is set; otherwise, to 449 RE_SYNTAX_POSIX_BASIC; 450 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 451 `fastmap' to an allocated space for the fastmap; 452 `fastmap_accurate' to zero; 453 `re_nsub' to the number of subexpressions in PATTERN. 454 455 PATTERN is the address of the pattern string. 456 457 CFLAGS is a series of bits which affect compilation. 458 459 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 460 use POSIX basic syntax. 461 462 If REG_NEWLINE is set, then . and [^...] don't match newline. 463 Also, regexec will try a match beginning after every newline. 464 465 If REG_ICASE is set, then we considers upper- and lowercase 466 versions of letters to be equivalent when matching. 467 468 If REG_NOSUB is set, then when PREG is passed to regexec, that 469 routine will report only success or failure, and nothing about the 470 registers. 471 472 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 473 the return codes and their meanings.) */ 474 475 int 476 regcomp (preg, pattern, cflags) 477 regex_t *_Restrict_ preg; 478 const char *_Restrict_ pattern; 479 int cflags; 480 { 481 reg_errcode_t ret; 482 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 483 : RE_SYNTAX_POSIX_BASIC); 484 485 preg->buffer = NULL; 486 preg->allocated = 0; 487 preg->used = 0; 488 489 /* Try to allocate space for the fastmap. */ 490 preg->fastmap = re_malloc (char, SBC_MAX); 491 if (BE (preg->fastmap == NULL, 0)) 492 return REG_ESPACE; 493 494 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 495 496 /* If REG_NEWLINE is set, newlines are treated differently. */ 497 if (cflags & REG_NEWLINE) 498 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 499 syntax &= ~RE_DOT_NEWLINE; 500 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 501 /* It also changes the matching behavior. */ 502 preg->newline_anchor = 1; 503 } 504 else 505 preg->newline_anchor = 0; 506 preg->no_sub = !!(cflags & REG_NOSUB); 507 preg->translate = NULL; 508 509 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 510 511 /* POSIX doesn't distinguish between an unmatched open-group and an 512 unmatched close-group: both are REG_EPAREN. */ 513 if (ret == REG_ERPAREN) 514 ret = REG_EPAREN; 515 516 /* We have already checked preg->fastmap != NULL. */ 517 if (BE (ret == REG_NOERROR, 1)) 518 /* Compute the fastmap now, since regexec cannot modify the pattern 519 buffer. This function never fails in this implementation. */ 520 (void) re_compile_fastmap (preg); 521 else 522 { 523 /* Some error occurred while compiling the expression. */ 524 re_free (preg->fastmap); 525 preg->fastmap = NULL; 526 } 527 528 return (int) ret; 529 } 530 #ifdef _LIBC 531 weak_alias (__regcomp, regcomp) 532 #endif 533 534 /* Returns a message corresponding to an error code, ERRCODE, returned 535 from either regcomp or regexec. We don't use PREG here. */ 536 537 #ifdef _LIBC 538 size_t 539 regerror (errcode, preg, errbuf, errbuf_size) 540 int errcode; 541 const regex_t *_Restrict_ preg; 542 char *_Restrict_ errbuf; 543 size_t errbuf_size; 544 #else /* size_t might promote */ 545 size_t 546 regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_, 547 char *_Restrict_ errbuf, size_t errbuf_size) 548 #endif 549 { 550 const char *msg; 551 size_t msg_size; 552 553 if (BE (errcode < 0 554 || errcode >= (int) (sizeof (__re_error_msgid_idx) 555 / sizeof (__re_error_msgid_idx[0])), 0)) 556 /* Only error codes returned by the rest of the code should be passed 557 to this routine. If we are given anything else, or if other regex 558 code generates an invalid error code, then the program has a bug. 559 Dump core so we can fix it. */ 560 abort (); 561 562 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 563 564 msg_size = strlen (msg) + 1; /* Includes the null. */ 565 566 if (BE (errbuf_size != 0, 1)) 567 { 568 size_t cpy_size = msg_size; 569 if (BE (msg_size > errbuf_size, 0)) 570 { 571 cpy_size = errbuf_size - 1; 572 errbuf[cpy_size] = '\0'; 573 } 574 memcpy (errbuf, msg, cpy_size); 575 } 576 577 return msg_size; 578 } 579 #ifdef _LIBC 580 weak_alias (__regerror, regerror) 581 #endif 582 583 584 #ifdef RE_ENABLE_I18N 585 /* This static array is used for the map to single-byte characters when 586 UTF-8 is used. Otherwise we would allocate memory just to initialize 587 it the same all the time. UTF-8 is the preferred encoding so this is 588 a worthwhile optimization. */ 589 static const bitset_t utf8_sb_map = 590 { 591 /* Set the first 128 bits. */ 592 # if 4 * BITSET_WORD_BITS < ASCII_CHARS 593 # error "bitset_word_t is narrower than 32 bits" 594 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS 595 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, 596 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS 597 BITSET_WORD_MAX, BITSET_WORD_MAX, 598 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS 599 BITSET_WORD_MAX, 600 # endif 601 (BITSET_WORD_MAX 602 >> (SBC_MAX % BITSET_WORD_BITS == 0 603 ? 0 604 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) 605 }; 606 #endif 607 608 609 static void 610 free_dfa_content (re_dfa_t *dfa) 611 { 612 Idx i, j; 613 614 if (dfa->nodes) 615 for (i = 0; i < dfa->nodes_len; ++i) 616 free_token (dfa->nodes + i); 617 re_free (dfa->nexts); 618 for (i = 0; i < dfa->nodes_len; ++i) 619 { 620 if (dfa->eclosures != NULL) 621 re_node_set_free (dfa->eclosures + i); 622 if (dfa->inveclosures != NULL) 623 re_node_set_free (dfa->inveclosures + i); 624 if (dfa->edests != NULL) 625 re_node_set_free (dfa->edests + i); 626 } 627 re_free (dfa->edests); 628 re_free (dfa->eclosures); 629 re_free (dfa->inveclosures); 630 re_free (dfa->nodes); 631 632 if (dfa->state_table) 633 for (i = 0; i <= dfa->state_hash_mask; ++i) 634 { 635 struct re_state_table_entry *entry = dfa->state_table + i; 636 for (j = 0; j < entry->num; ++j) 637 { 638 re_dfastate_t *state = entry->array[j]; 639 free_state (state); 640 } 641 re_free (entry->array); 642 } 643 re_free (dfa->state_table); 644 #ifdef RE_ENABLE_I18N 645 if (dfa->sb_char != utf8_sb_map) 646 re_free (dfa->sb_char); 647 #endif 648 re_free (dfa->subexp_map); 649 #ifdef DEBUG 650 re_free (dfa->re_str); 651 #endif 652 653 re_free (dfa); 654 } 655 656 657 /* Free dynamically allocated space used by PREG. */ 658 659 void 660 regfree (preg) 661 regex_t *preg; 662 { 663 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 664 if (BE (dfa != NULL, 1)) 665 free_dfa_content (dfa); 666 preg->buffer = NULL; 667 preg->allocated = 0; 668 669 re_free (preg->fastmap); 670 preg->fastmap = NULL; 671 672 re_free (preg->translate); 673 preg->translate = NULL; 674 } 675 #ifdef _LIBC 676 weak_alias (__regfree, regfree) 677 #endif 678 679 /* Entry points compatible with 4.2 BSD regex library. We don't define 680 them unless specifically requested. */ 681 682 #if defined _REGEX_RE_COMP || defined _LIBC 683 684 /* BSD has one and only one pattern buffer. */ 685 static struct re_pattern_buffer re_comp_buf; 686 687 char * 688 # ifdef _LIBC 689 /* Make these definitions weak in libc, so POSIX programs can redefine 690 these names if they don't use our functions, and still use 691 regcomp/regexec above without link errors. */ 692 weak_function 693 # endif 694 re_comp (s) 695 const char *s; 696 { 697 reg_errcode_t ret; 698 char *fastmap; 699 700 if (!s) 701 { 702 if (!re_comp_buf.buffer) 703 return gettext ("No previous regular expression"); 704 return 0; 705 } 706 707 if (re_comp_buf.buffer) 708 { 709 fastmap = re_comp_buf.fastmap; 710 re_comp_buf.fastmap = NULL; 711 __regfree (&re_comp_buf); 712 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 713 re_comp_buf.fastmap = fastmap; 714 } 715 716 if (re_comp_buf.fastmap == NULL) 717 { 718 re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 719 if (re_comp_buf.fastmap == NULL) 720 return (char *) gettext (__re_error_msgid 721 + __re_error_msgid_idx[(int) REG_ESPACE]); 722 } 723 724 /* Since `re_exec' always passes NULL for the `regs' argument, we 725 don't need to initialize the pattern buffer fields which affect it. */ 726 727 /* Match anchors at newlines. */ 728 re_comp_buf.newline_anchor = 1; 729 730 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 731 732 if (!ret) 733 return NULL; 734 735 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 736 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 737 } 738 739 #ifdef _LIBC 740 libc_freeres_fn (free_mem) 741 { 742 __regfree (&re_comp_buf); 743 } 744 #endif 745 746 #endif /* _REGEX_RE_COMP */ 747 748 /* Internal entry point. 749 Compile the regular expression PATTERN, whose length is LENGTH. 750 SYNTAX indicate regular expression's syntax. */ 751 752 static reg_errcode_t 753 re_compile_internal (regex_t *preg, const char * pattern, size_t length, 754 reg_syntax_t syntax) 755 { 756 reg_errcode_t err = REG_NOERROR; 757 re_dfa_t *dfa; 758 re_string_t regexp; 759 760 /* Initialize the pattern buffer. */ 761 preg->fastmap_accurate = 0; 762 preg->syntax = syntax; 763 preg->not_bol = preg->not_eol = 0; 764 preg->used = 0; 765 preg->re_nsub = 0; 766 preg->can_be_null = 0; 767 preg->regs_allocated = REGS_UNALLOCATED; 768 769 /* Initialize the dfa. */ 770 dfa = (re_dfa_t *) preg->buffer; 771 if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 772 { 773 /* If zero allocated, but buffer is non-null, try to realloc 774 enough space. This loses if buffer's address is bogus, but 775 that is the user's responsibility. If ->buffer is NULL this 776 is a simple allocation. */ 777 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 778 if (dfa == NULL) 779 return REG_ESPACE; 780 preg->allocated = sizeof (re_dfa_t); 781 preg->buffer = (unsigned char *) dfa; 782 } 783 preg->used = sizeof (re_dfa_t); 784 785 err = init_dfa (dfa, length); 786 if (BE (err != REG_NOERROR, 0)) 787 { 788 free_dfa_content (dfa); 789 preg->buffer = NULL; 790 preg->allocated = 0; 791 return err; 792 } 793 #ifdef DEBUG 794 /* Note: length+1 will not overflow since it is checked in init_dfa. */ 795 dfa->re_str = re_malloc (char, length + 1); 796 strncpy (dfa->re_str, pattern, length + 1); 797 #endif 798 799 __libc_lock_init (dfa->lock); 800 801 err = re_string_construct (®exp, pattern, length, preg->translate, 802 (syntax & RE_ICASE) != 0, dfa); 803 if (BE (err != REG_NOERROR, 0)) 804 { 805 re_compile_internal_free_return: 806 free_workarea_compile (preg); 807 re_string_destruct (®exp); 808 free_dfa_content (dfa); 809 preg->buffer = NULL; 810 preg->allocated = 0; 811 return err; 812 } 813 814 /* Parse the regular expression, and build a structure tree. */ 815 preg->re_nsub = 0; 816 dfa->str_tree = parse (®exp, preg, syntax, &err); 817 if (BE (dfa->str_tree == NULL, 0)) 818 goto re_compile_internal_free_return; 819 820 /* Analyze the tree and create the nfa. */ 821 err = analyze (preg); 822 if (BE (err != REG_NOERROR, 0)) 823 goto re_compile_internal_free_return; 824 825 #ifdef RE_ENABLE_I18N 826 /* If possible, do searching in single byte encoding to speed things up. */ 827 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 828 optimize_utf8 (dfa); 829 #endif 830 831 /* Then create the initial state of the dfa. */ 832 err = create_initial_state (dfa); 833 834 /* Release work areas. */ 835 free_workarea_compile (preg); 836 re_string_destruct (®exp); 837 838 if (BE (err != REG_NOERROR, 0)) 839 { 840 free_dfa_content (dfa); 841 preg->buffer = NULL; 842 preg->allocated = 0; 843 } 844 845 return err; 846 } 847 848 /* Initialize DFA. We use the length of the regular expression PAT_LEN 849 as the initial length of some arrays. */ 850 851 static reg_errcode_t 852 init_dfa (re_dfa_t *dfa, size_t pat_len) 853 { 854 __re_size_t table_size; 855 #ifndef _LIBC 856 char *codeset_name; 857 #endif 858 #ifdef RE_ENABLE_I18N 859 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); 860 #else 861 size_t max_i18n_object_size = 0; 862 #endif 863 size_t max_object_size = 864 MAX (sizeof (struct re_state_table_entry), 865 MAX (sizeof (re_token_t), 866 MAX (sizeof (re_node_set), 867 MAX (sizeof (regmatch_t), 868 max_i18n_object_size)))); 869 870 memset (dfa, '\0', sizeof (re_dfa_t)); 871 872 /* Force allocation of str_tree_storage the first time. */ 873 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 874 875 /* Avoid overflows. The extra "/ 2" is for the table_size doubling 876 calculation below, and for similar doubling calculations 877 elsewhere. And it's <= rather than <, because some of the 878 doubling calculations add 1 afterwards. */ 879 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0)) 880 return REG_ESPACE; 881 882 dfa->nodes_alloc = pat_len + 1; 883 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 884 885 /* table_size = 2 ^ ceil(log pat_len) */ 886 for (table_size = 1; ; table_size <<= 1) 887 if (table_size > pat_len) 888 break; 889 890 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 891 dfa->state_hash_mask = table_size - 1; 892 893 dfa->mb_cur_max = MB_CUR_MAX; 894 #ifdef _LIBC 895 if (dfa->mb_cur_max == 6 896 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 897 dfa->is_utf8 = 1; 898 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 899 != 0); 900 #else 901 codeset_name = nl_langinfo (CODESET); 902 if (strcasecmp (codeset_name, "UTF-8") == 0 903 || strcasecmp (codeset_name, "UTF8") == 0) 904 dfa->is_utf8 = 1; 905 906 /* We check exhaustively in the loop below if this charset is a 907 superset of ASCII. */ 908 dfa->map_notascii = 0; 909 #endif 910 911 #ifdef RE_ENABLE_I18N 912 if (dfa->mb_cur_max > 1) 913 { 914 if (dfa->is_utf8) 915 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 916 else 917 { 918 int i, j, ch; 919 920 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 921 if (BE (dfa->sb_char == NULL, 0)) 922 return REG_ESPACE; 923 924 /* Set the bits corresponding to single byte chars. */ 925 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 926 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 927 { 928 wint_t wch = __btowc (ch); 929 if (wch != WEOF) 930 dfa->sb_char[i] |= (bitset_word_t) 1 << j; 931 # ifndef _LIBC 932 if (isascii (ch) && wch != ch) 933 dfa->map_notascii = 1; 934 # endif 935 } 936 } 937 } 938 #endif 939 940 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 941 return REG_ESPACE; 942 return REG_NOERROR; 943 } 944 945 /* Initialize WORD_CHAR table, which indicate which character is 946 "word". In this case "word" means that it is the word construction 947 character used by some operators like "\<", "\>", etc. */ 948 949 static void 950 internal_function 951 init_word_char (re_dfa_t *dfa) 952 { 953 int i, j, ch; 954 dfa->word_ops_used = 1; 955 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 956 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 957 if (isalnum (ch) || ch == '_') 958 dfa->word_char[i] |= (bitset_word_t) 1 << j; 959 } 960 961 /* Free the work area which are only used while compiling. */ 962 963 static void 964 free_workarea_compile (regex_t *preg) 965 { 966 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 967 bin_tree_storage_t *storage, *next; 968 for (storage = dfa->str_tree_storage; storage; storage = next) 969 { 970 next = storage->next; 971 re_free (storage); 972 } 973 dfa->str_tree_storage = NULL; 974 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 975 dfa->str_tree = NULL; 976 re_free (dfa->org_indices); 977 dfa->org_indices = NULL; 978 } 979 980 /* Create initial states for all contexts. */ 981 982 static reg_errcode_t 983 create_initial_state (re_dfa_t *dfa) 984 { 985 Idx first, i; 986 reg_errcode_t err; 987 re_node_set init_nodes; 988 989 /* Initial states have the epsilon closure of the node which is 990 the first node of the regular expression. */ 991 first = dfa->str_tree->first->node_idx; 992 dfa->init_node = first; 993 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 994 if (BE (err != REG_NOERROR, 0)) 995 return err; 996 997 /* The back-references which are in initial states can epsilon transit, 998 since in this case all of the subexpressions can be null. 999 Then we add epsilon closures of the nodes which are the next nodes of 1000 the back-references. */ 1001 if (dfa->nbackref > 0) 1002 for (i = 0; i < init_nodes.nelem; ++i) 1003 { 1004 Idx node_idx = init_nodes.elems[i]; 1005 re_token_type_t type = dfa->nodes[node_idx].type; 1006 1007 Idx clexp_idx; 1008 if (type != OP_BACK_REF) 1009 continue; 1010 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1011 { 1012 re_token_t *clexp_node; 1013 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1014 if (clexp_node->type == OP_CLOSE_SUBEXP 1015 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1016 break; 1017 } 1018 if (clexp_idx == init_nodes.nelem) 1019 continue; 1020 1021 if (type == OP_BACK_REF) 1022 { 1023 Idx dest_idx = dfa->edests[node_idx].elems[0]; 1024 if (!re_node_set_contains (&init_nodes, dest_idx)) 1025 { 1026 reg_errcode_t merge_err 1027 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 1028 if (merge_err != REG_NOERROR) 1029 return merge_err; 1030 i = 0; 1031 } 1032 } 1033 } 1034 1035 /* It must be the first time to invoke acquire_state. */ 1036 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1037 /* We don't check ERR here, since the initial state must not be NULL. */ 1038 if (BE (dfa->init_state == NULL, 0)) 1039 return err; 1040 if (dfa->init_state->has_constraint) 1041 { 1042 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1043 CONTEXT_WORD); 1044 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1045 CONTEXT_NEWLINE); 1046 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1047 &init_nodes, 1048 CONTEXT_NEWLINE 1049 | CONTEXT_BEGBUF); 1050 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1051 || dfa->init_state_begbuf == NULL, 0)) 1052 return err; 1053 } 1054 else 1055 dfa->init_state_word = dfa->init_state_nl 1056 = dfa->init_state_begbuf = dfa->init_state; 1057 1058 re_node_set_free (&init_nodes); 1059 return REG_NOERROR; 1060 } 1061 1062 #ifdef RE_ENABLE_I18N 1063 /* If it is possible to do searching in single byte encoding instead of UTF-8 1064 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1065 DFA nodes where needed. */ 1066 1067 static void 1068 optimize_utf8 (re_dfa_t *dfa) 1069 { 1070 Idx node; 1071 int i; 1072 bool mb_chars = false; 1073 bool has_period = false; 1074 1075 for (node = 0; node < dfa->nodes_len; ++node) 1076 switch (dfa->nodes[node].type) 1077 { 1078 case CHARACTER: 1079 if (dfa->nodes[node].opr.c >= ASCII_CHARS) 1080 mb_chars = true; 1081 break; 1082 case ANCHOR: 1083 switch (dfa->nodes[node].opr.ctx_type) 1084 { 1085 case LINE_FIRST: 1086 case LINE_LAST: 1087 case BUF_FIRST: 1088 case BUF_LAST: 1089 break; 1090 default: 1091 /* Word anchors etc. cannot be handled. It's okay to test 1092 opr.ctx_type since constraints (for all DFA nodes) are 1093 created by ORing one or more opr.ctx_type values. */ 1094 return; 1095 } 1096 break; 1097 case OP_PERIOD: 1098 has_period = true; 1099 break; 1100 case OP_BACK_REF: 1101 case OP_ALT: 1102 case END_OF_RE: 1103 case OP_DUP_ASTERISK: 1104 case OP_OPEN_SUBEXP: 1105 case OP_CLOSE_SUBEXP: 1106 break; 1107 case COMPLEX_BRACKET: 1108 return; 1109 case SIMPLE_BRACKET: 1110 /* Just double check. */ 1111 { 1112 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0 1113 ? 0 1114 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS); 1115 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1116 { 1117 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) 1118 return; 1119 rshift = 0; 1120 } 1121 } 1122 break; 1123 default: 1124 abort (); 1125 } 1126 1127 if (mb_chars || has_period) 1128 for (node = 0; node < dfa->nodes_len; ++node) 1129 { 1130 if (dfa->nodes[node].type == CHARACTER 1131 && dfa->nodes[node].opr.c >= ASCII_CHARS) 1132 dfa->nodes[node].mb_partial = 0; 1133 else if (dfa->nodes[node].type == OP_PERIOD) 1134 dfa->nodes[node].type = OP_UTF8_PERIOD; 1135 } 1136 1137 /* The search can be in single byte locale. */ 1138 dfa->mb_cur_max = 1; 1139 dfa->is_utf8 = 0; 1140 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1141 } 1142 #endif 1143 1144 /* Analyze the structure tree, and calculate "first", "next", "edest", 1145 "eclosure", and "inveclosure". */ 1146 1147 static reg_errcode_t 1148 analyze (regex_t *preg) 1149 { 1150 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1151 reg_errcode_t ret; 1152 1153 /* Allocate arrays. */ 1154 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc); 1155 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); 1156 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1157 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1158 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1159 || dfa->eclosures == NULL, 0)) 1160 return REG_ESPACE; 1161 1162 dfa->subexp_map = re_malloc (Idx, preg->re_nsub); 1163 if (dfa->subexp_map != NULL) 1164 { 1165 Idx i; 1166 for (i = 0; i < preg->re_nsub; i++) 1167 dfa->subexp_map[i] = i; 1168 preorder (dfa->str_tree, optimize_subexps, dfa); 1169 for (i = 0; i < preg->re_nsub; i++) 1170 if (dfa->subexp_map[i] != i) 1171 break; 1172 if (i == preg->re_nsub) 1173 { 1174 free (dfa->subexp_map); 1175 dfa->subexp_map = NULL; 1176 } 1177 } 1178 1179 ret = postorder (dfa->str_tree, lower_subexps, preg); 1180 if (BE (ret != REG_NOERROR, 0)) 1181 return ret; 1182 ret = postorder (dfa->str_tree, calc_first, dfa); 1183 if (BE (ret != REG_NOERROR, 0)) 1184 return ret; 1185 preorder (dfa->str_tree, calc_next, dfa); 1186 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1187 if (BE (ret != REG_NOERROR, 0)) 1188 return ret; 1189 ret = calc_eclosure (dfa); 1190 if (BE (ret != REG_NOERROR, 0)) 1191 return ret; 1192 1193 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1194 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1195 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1196 || dfa->nbackref) 1197 { 1198 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1199 if (BE (dfa->inveclosures == NULL, 0)) 1200 return REG_ESPACE; 1201 ret = calc_inveclosure (dfa); 1202 } 1203 1204 return ret; 1205 } 1206 1207 /* Our parse trees are very unbalanced, so we cannot use a stack to 1208 implement parse tree visits. Instead, we use parent pointers and 1209 some hairy code in these two functions. */ 1210 static reg_errcode_t 1211 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1212 void *extra) 1213 { 1214 bin_tree_t *node, *prev; 1215 1216 for (node = root; ; ) 1217 { 1218 /* Descend down the tree, preferably to the left (or to the right 1219 if that's the only child). */ 1220 while (node->left || node->right) 1221 if (node->left) 1222 node = node->left; 1223 else 1224 node = node->right; 1225 1226 do 1227 { 1228 reg_errcode_t err = fn (extra, node); 1229 if (BE (err != REG_NOERROR, 0)) 1230 return err; 1231 if (node->parent == NULL) 1232 return REG_NOERROR; 1233 prev = node; 1234 node = node->parent; 1235 } 1236 /* Go up while we have a node that is reached from the right. */ 1237 while (node->right == prev || node->right == NULL); 1238 node = node->right; 1239 } 1240 } 1241 1242 static reg_errcode_t 1243 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1244 void *extra) 1245 { 1246 bin_tree_t *node; 1247 1248 for (node = root; ; ) 1249 { 1250 reg_errcode_t err = fn (extra, node); 1251 if (BE (err != REG_NOERROR, 0)) 1252 return err; 1253 1254 /* Go to the left node, or up and to the right. */ 1255 if (node->left) 1256 node = node->left; 1257 else 1258 { 1259 bin_tree_t *prev = NULL; 1260 while (node->right == prev || node->right == NULL) 1261 { 1262 prev = node; 1263 node = node->parent; 1264 if (!node) 1265 return REG_NOERROR; 1266 } 1267 node = node->right; 1268 } 1269 } 1270 } 1271 1272 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1273 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1274 backreferences as well. Requires a preorder visit. */ 1275 static reg_errcode_t 1276 optimize_subexps (void *extra, bin_tree_t *node) 1277 { 1278 re_dfa_t *dfa = (re_dfa_t *) extra; 1279 1280 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1281 { 1282 int idx = node->token.opr.idx; 1283 node->token.opr.idx = dfa->subexp_map[idx]; 1284 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1285 } 1286 1287 else if (node->token.type == SUBEXP 1288 && node->left && node->left->token.type == SUBEXP) 1289 { 1290 Idx other_idx = node->left->token.opr.idx; 1291 1292 node->left = node->left->left; 1293 if (node->left) 1294 node->left->parent = node; 1295 1296 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1297 if (other_idx < BITSET_WORD_BITS) 1298 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); 1299 } 1300 1301 return REG_NOERROR; 1302 } 1303 1304 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1305 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1306 static reg_errcode_t 1307 lower_subexps (void *extra, bin_tree_t *node) 1308 { 1309 regex_t *preg = (regex_t *) extra; 1310 reg_errcode_t err = REG_NOERROR; 1311 1312 if (node->left && node->left->token.type == SUBEXP) 1313 { 1314 node->left = lower_subexp (&err, preg, node->left); 1315 if (node->left) 1316 node->left->parent = node; 1317 } 1318 if (node->right && node->right->token.type == SUBEXP) 1319 { 1320 node->right = lower_subexp (&err, preg, node->right); 1321 if (node->right) 1322 node->right->parent = node; 1323 } 1324 1325 return err; 1326 } 1327 1328 static bin_tree_t * 1329 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1330 { 1331 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1332 bin_tree_t *body = node->left; 1333 bin_tree_t *op, *cls, *tree1, *tree; 1334 1335 if (preg->no_sub 1336 /* We do not optimize empty subexpressions, because otherwise we may 1337 have bad CONCAT nodes with NULL children. This is obviously not 1338 very common, so we do not lose much. An example that triggers 1339 this case is the sed "script" /\(\)/x. */ 1340 && node->left != NULL 1341 && (node->token.opr.idx >= BITSET_WORD_BITS 1342 || !(dfa->used_bkref_map 1343 & ((bitset_word_t) 1 << node->token.opr.idx)))) 1344 return node->left; 1345 1346 /* Convert the SUBEXP node to the concatenation of an 1347 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1348 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1349 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1350 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1351 tree = create_tree (dfa, op, tree1, CONCAT); 1352 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1353 { 1354 *err = REG_ESPACE; 1355 return NULL; 1356 } 1357 1358 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1359 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1360 return tree; 1361 } 1362 1363 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1364 nodes. Requires a postorder visit. */ 1365 static reg_errcode_t 1366 calc_first (void *extra, bin_tree_t *node) 1367 { 1368 re_dfa_t *dfa = (re_dfa_t *) extra; 1369 if (node->token.type == CONCAT) 1370 { 1371 node->first = node->left->first; 1372 node->node_idx = node->left->node_idx; 1373 } 1374 else 1375 { 1376 node->first = node; 1377 node->node_idx = re_dfa_add_node (dfa, node->token); 1378 if (BE (node->node_idx == REG_MISSING, 0)) 1379 return REG_ESPACE; 1380 if (node->token.type == ANCHOR) 1381 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; 1382 } 1383 return REG_NOERROR; 1384 } 1385 1386 /* Pass 2: compute NEXT on the tree. Preorder visit. */ 1387 static reg_errcode_t 1388 calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node) 1389 { 1390 switch (node->token.type) 1391 { 1392 case OP_DUP_ASTERISK: 1393 node->left->next = node; 1394 break; 1395 case CONCAT: 1396 node->left->next = node->right->first; 1397 node->right->next = node->next; 1398 break; 1399 default: 1400 if (node->left) 1401 node->left->next = node->next; 1402 if (node->right) 1403 node->right->next = node->next; 1404 break; 1405 } 1406 return REG_NOERROR; 1407 } 1408 1409 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1410 static reg_errcode_t 1411 link_nfa_nodes (void *extra, bin_tree_t *node) 1412 { 1413 re_dfa_t *dfa = (re_dfa_t *) extra; 1414 Idx idx = node->node_idx; 1415 reg_errcode_t err = REG_NOERROR; 1416 1417 switch (node->token.type) 1418 { 1419 case CONCAT: 1420 break; 1421 1422 case END_OF_RE: 1423 assert (node->next == NULL); 1424 break; 1425 1426 case OP_DUP_ASTERISK: 1427 case OP_ALT: 1428 { 1429 Idx left, right; 1430 dfa->has_plural_match = 1; 1431 if (node->left != NULL) 1432 left = node->left->first->node_idx; 1433 else 1434 left = node->next->node_idx; 1435 if (node->right != NULL) 1436 right = node->right->first->node_idx; 1437 else 1438 right = node->next->node_idx; 1439 assert (REG_VALID_INDEX (left)); 1440 assert (REG_VALID_INDEX (right)); 1441 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1442 } 1443 break; 1444 1445 case ANCHOR: 1446 case OP_OPEN_SUBEXP: 1447 case OP_CLOSE_SUBEXP: 1448 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1449 break; 1450 1451 case OP_BACK_REF: 1452 dfa->nexts[idx] = node->next->node_idx; 1453 if (node->token.type == OP_BACK_REF) 1454 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1455 break; 1456 1457 default: 1458 assert (!IS_EPSILON_NODE (node->token.type)); 1459 dfa->nexts[idx] = node->next->node_idx; 1460 break; 1461 } 1462 1463 return err; 1464 } 1465 1466 /* Duplicate the epsilon closure of the node ROOT_NODE. 1467 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1468 to their own constraint. */ 1469 1470 static reg_errcode_t 1471 internal_function 1472 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, 1473 Idx root_node, unsigned int init_constraint) 1474 { 1475 Idx org_node, clone_node; 1476 bool ok; 1477 unsigned int constraint = init_constraint; 1478 for (org_node = top_org_node, clone_node = top_clone_node;;) 1479 { 1480 Idx org_dest, clone_dest; 1481 if (dfa->nodes[org_node].type == OP_BACK_REF) 1482 { 1483 /* If the back reference epsilon-transit, its destination must 1484 also have the constraint. Then duplicate the epsilon closure 1485 of the destination of the back reference, and store it in 1486 edests of the back reference. */ 1487 org_dest = dfa->nexts[org_node]; 1488 re_node_set_empty (dfa->edests + clone_node); 1489 clone_dest = duplicate_node (dfa, org_dest, constraint); 1490 if (BE (clone_dest == REG_MISSING, 0)) 1491 return REG_ESPACE; 1492 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1493 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1494 if (BE (! ok, 0)) 1495 return REG_ESPACE; 1496 } 1497 else if (dfa->edests[org_node].nelem == 0) 1498 { 1499 /* In case of the node can't epsilon-transit, don't duplicate the 1500 destination and store the original destination as the 1501 destination of the node. */ 1502 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1503 break; 1504 } 1505 else if (dfa->edests[org_node].nelem == 1) 1506 { 1507 /* In case of the node can epsilon-transit, and it has only one 1508 destination. */ 1509 org_dest = dfa->edests[org_node].elems[0]; 1510 re_node_set_empty (dfa->edests + clone_node); 1511 /* If the node is root_node itself, it means the epsilon closure 1512 has a loop. Then tie it to the destination of the root_node. */ 1513 if (org_node == root_node && clone_node != org_node) 1514 { 1515 ok = re_node_set_insert (dfa->edests + clone_node, org_dest); 1516 if (BE (! ok, 0)) 1517 return REG_ESPACE; 1518 break; 1519 } 1520 /* In case the node has another constraint, append it. */ 1521 constraint |= dfa->nodes[org_node].constraint; 1522 clone_dest = duplicate_node (dfa, org_dest, constraint); 1523 if (BE (clone_dest == REG_MISSING, 0)) 1524 return REG_ESPACE; 1525 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1526 if (BE (! ok, 0)) 1527 return REG_ESPACE; 1528 } 1529 else /* dfa->edests[org_node].nelem == 2 */ 1530 { 1531 /* In case of the node can epsilon-transit, and it has two 1532 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1533 org_dest = dfa->edests[org_node].elems[0]; 1534 re_node_set_empty (dfa->edests + clone_node); 1535 /* Search for a duplicated node which satisfies the constraint. */ 1536 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1537 if (clone_dest == REG_MISSING) 1538 { 1539 /* There is no such duplicated node, create a new one. */ 1540 reg_errcode_t err; 1541 clone_dest = duplicate_node (dfa, org_dest, constraint); 1542 if (BE (clone_dest == REG_MISSING, 0)) 1543 return REG_ESPACE; 1544 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1545 if (BE (! ok, 0)) 1546 return REG_ESPACE; 1547 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1548 root_node, constraint); 1549 if (BE (err != REG_NOERROR, 0)) 1550 return err; 1551 } 1552 else 1553 { 1554 /* There is a duplicated node which satisfies the constraint, 1555 use it to avoid infinite loop. */ 1556 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1557 if (BE (! ok, 0)) 1558 return REG_ESPACE; 1559 } 1560 1561 org_dest = dfa->edests[org_node].elems[1]; 1562 clone_dest = duplicate_node (dfa, org_dest, constraint); 1563 if (BE (clone_dest == REG_MISSING, 0)) 1564 return REG_ESPACE; 1565 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1566 if (BE (! ok, 0)) 1567 return REG_ESPACE; 1568 } 1569 org_node = org_dest; 1570 clone_node = clone_dest; 1571 } 1572 return REG_NOERROR; 1573 } 1574 1575 /* Search for a node which is duplicated from the node ORG_NODE, and 1576 satisfies the constraint CONSTRAINT. */ 1577 1578 static Idx 1579 search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 1580 unsigned int constraint) 1581 { 1582 Idx idx; 1583 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1584 { 1585 if (org_node == dfa->org_indices[idx] 1586 && constraint == dfa->nodes[idx].constraint) 1587 return idx; /* Found. */ 1588 } 1589 return REG_MISSING; /* Not found. */ 1590 } 1591 1592 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1593 Return the index of the new node, or REG_MISSING if insufficient storage is 1594 available. */ 1595 1596 static Idx 1597 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) 1598 { 1599 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1600 if (BE (dup_idx != REG_MISSING, 1)) 1601 { 1602 dfa->nodes[dup_idx].constraint = constraint; 1603 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; 1604 dfa->nodes[dup_idx].duplicated = 1; 1605 1606 /* Store the index of the original node. */ 1607 dfa->org_indices[dup_idx] = org_idx; 1608 } 1609 return dup_idx; 1610 } 1611 1612 static reg_errcode_t 1613 calc_inveclosure (re_dfa_t *dfa) 1614 { 1615 Idx src, idx; 1616 bool ok; 1617 for (idx = 0; idx < dfa->nodes_len; ++idx) 1618 re_node_set_init_empty (dfa->inveclosures + idx); 1619 1620 for (src = 0; src < dfa->nodes_len; ++src) 1621 { 1622 Idx *elems = dfa->eclosures[src].elems; 1623 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1624 { 1625 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1626 if (BE (! ok, 0)) 1627 return REG_ESPACE; 1628 } 1629 } 1630 1631 return REG_NOERROR; 1632 } 1633 1634 /* Calculate "eclosure" for all the node in DFA. */ 1635 1636 static reg_errcode_t 1637 calc_eclosure (re_dfa_t *dfa) 1638 { 1639 Idx node_idx; 1640 bool incomplete; 1641 #ifdef DEBUG 1642 assert (dfa->nodes_len > 0); 1643 #endif 1644 incomplete = false; 1645 /* For each nodes, calculate epsilon closure. */ 1646 for (node_idx = 0; ; ++node_idx) 1647 { 1648 reg_errcode_t err; 1649 re_node_set eclosure_elem; 1650 if (node_idx == dfa->nodes_len) 1651 { 1652 if (!incomplete) 1653 break; 1654 incomplete = false; 1655 node_idx = 0; 1656 } 1657 1658 #ifdef DEBUG 1659 assert (dfa->eclosures[node_idx].nelem != REG_MISSING); 1660 #endif 1661 1662 /* If we have already calculated, skip it. */ 1663 if (dfa->eclosures[node_idx].nelem != 0) 1664 continue; 1665 /* Calculate epsilon closure of `node_idx'. */ 1666 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); 1667 if (BE (err != REG_NOERROR, 0)) 1668 return err; 1669 1670 if (dfa->eclosures[node_idx].nelem == 0) 1671 { 1672 incomplete = true; 1673 re_node_set_free (&eclosure_elem); 1674 } 1675 } 1676 return REG_NOERROR; 1677 } 1678 1679 /* Calculate epsilon closure of NODE. */ 1680 1681 static reg_errcode_t 1682 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) 1683 { 1684 reg_errcode_t err; 1685 Idx i; 1686 re_node_set eclosure; 1687 bool ok; 1688 bool incomplete = false; 1689 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1690 if (BE (err != REG_NOERROR, 0)) 1691 return err; 1692 1693 /* This indicates that we are calculating this node now. 1694 We reference this value to avoid infinite loop. */ 1695 dfa->eclosures[node].nelem = REG_MISSING; 1696 1697 /* If the current node has constraints, duplicate all nodes 1698 since they must inherit the constraints. */ 1699 if (dfa->nodes[node].constraint 1700 && dfa->edests[node].nelem 1701 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1702 { 1703 err = duplicate_node_closure (dfa, node, node, node, 1704 dfa->nodes[node].constraint); 1705 if (BE (err != REG_NOERROR, 0)) 1706 return err; 1707 } 1708 1709 /* Expand each epsilon destination nodes. */ 1710 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1711 for (i = 0; i < dfa->edests[node].nelem; ++i) 1712 { 1713 re_node_set eclosure_elem; 1714 Idx edest = dfa->edests[node].elems[i]; 1715 /* If calculating the epsilon closure of `edest' is in progress, 1716 return intermediate result. */ 1717 if (dfa->eclosures[edest].nelem == REG_MISSING) 1718 { 1719 incomplete = true; 1720 continue; 1721 } 1722 /* If we haven't calculated the epsilon closure of `edest' yet, 1723 calculate now. Otherwise use calculated epsilon closure. */ 1724 if (dfa->eclosures[edest].nelem == 0) 1725 { 1726 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false); 1727 if (BE (err != REG_NOERROR, 0)) 1728 return err; 1729 } 1730 else 1731 eclosure_elem = dfa->eclosures[edest]; 1732 /* Merge the epsilon closure of `edest'. */ 1733 err = re_node_set_merge (&eclosure, &eclosure_elem); 1734 if (BE (err != REG_NOERROR, 0)) 1735 return err; 1736 /* If the epsilon closure of `edest' is incomplete, 1737 the epsilon closure of this node is also incomplete. */ 1738 if (dfa->eclosures[edest].nelem == 0) 1739 { 1740 incomplete = true; 1741 re_node_set_free (&eclosure_elem); 1742 } 1743 } 1744 1745 /* An epsilon closure includes itself. */ 1746 ok = re_node_set_insert (&eclosure, node); 1747 if (BE (! ok, 0)) 1748 return REG_ESPACE; 1749 if (incomplete && !root) 1750 dfa->eclosures[node].nelem = 0; 1751 else 1752 dfa->eclosures[node] = eclosure; 1753 *new_set = eclosure; 1754 return REG_NOERROR; 1755 } 1756 1757 /* Functions for token which are used in the parser. */ 1758 1759 /* Fetch a token from INPUT. 1760 We must not use this function inside bracket expressions. */ 1761 1762 static void 1763 internal_function 1764 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1765 { 1766 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1767 } 1768 1769 /* Peek a token from INPUT, and return the length of the token. 1770 We must not use this function inside bracket expressions. */ 1771 1772 static int 1773 internal_function 1774 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1775 { 1776 unsigned char c; 1777 1778 if (re_string_eoi (input)) 1779 { 1780 token->type = END_OF_RE; 1781 return 0; 1782 } 1783 1784 c = re_string_peek_byte (input, 0); 1785 token->opr.c = c; 1786 1787 token->word_char = 0; 1788 #ifdef RE_ENABLE_I18N 1789 token->mb_partial = 0; 1790 if (input->mb_cur_max > 1 && 1791 !re_string_first_byte (input, re_string_cur_idx (input))) 1792 { 1793 token->type = CHARACTER; 1794 token->mb_partial = 1; 1795 return 1; 1796 } 1797 #endif 1798 if (c == '\\') 1799 { 1800 unsigned char c2; 1801 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1802 { 1803 token->type = BACK_SLASH; 1804 return 1; 1805 } 1806 1807 c2 = re_string_peek_byte_case (input, 1); 1808 token->opr.c = c2; 1809 token->type = CHARACTER; 1810 #ifdef RE_ENABLE_I18N 1811 if (input->mb_cur_max > 1) 1812 { 1813 wint_t wc = re_string_wchar_at (input, 1814 re_string_cur_idx (input) + 1); 1815 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1816 } 1817 else 1818 #endif 1819 token->word_char = IS_WORD_CHAR (c2) != 0; 1820 1821 switch (c2) 1822 { 1823 case '|': 1824 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1825 token->type = OP_ALT; 1826 break; 1827 case '1': case '2': case '3': case '4': case '5': 1828 case '6': case '7': case '8': case '9': 1829 if (!(syntax & RE_NO_BK_REFS)) 1830 { 1831 token->type = OP_BACK_REF; 1832 token->opr.idx = c2 - '1'; 1833 } 1834 break; 1835 case '<': 1836 if (!(syntax & RE_NO_GNU_OPS)) 1837 { 1838 token->type = ANCHOR; 1839 token->opr.ctx_type = WORD_FIRST; 1840 } 1841 break; 1842 case '>': 1843 if (!(syntax & RE_NO_GNU_OPS)) 1844 { 1845 token->type = ANCHOR; 1846 token->opr.ctx_type = WORD_LAST; 1847 } 1848 break; 1849 case 'b': 1850 if (!(syntax & RE_NO_GNU_OPS)) 1851 { 1852 token->type = ANCHOR; 1853 token->opr.ctx_type = WORD_DELIM; 1854 } 1855 break; 1856 case 'B': 1857 if (!(syntax & RE_NO_GNU_OPS)) 1858 { 1859 token->type = ANCHOR; 1860 token->opr.ctx_type = NOT_WORD_DELIM; 1861 } 1862 break; 1863 case 'w': 1864 if (!(syntax & RE_NO_GNU_OPS)) 1865 token->type = OP_WORD; 1866 break; 1867 case 'W': 1868 if (!(syntax & RE_NO_GNU_OPS)) 1869 token->type = OP_NOTWORD; 1870 break; 1871 case 's': 1872 if (!(syntax & RE_NO_GNU_OPS)) 1873 token->type = OP_SPACE; 1874 break; 1875 case 'S': 1876 if (!(syntax & RE_NO_GNU_OPS)) 1877 token->type = OP_NOTSPACE; 1878 break; 1879 case '`': 1880 if (!(syntax & RE_NO_GNU_OPS)) 1881 { 1882 token->type = ANCHOR; 1883 token->opr.ctx_type = BUF_FIRST; 1884 } 1885 break; 1886 case '\'': 1887 if (!(syntax & RE_NO_GNU_OPS)) 1888 { 1889 token->type = ANCHOR; 1890 token->opr.ctx_type = BUF_LAST; 1891 } 1892 break; 1893 case '(': 1894 if (!(syntax & RE_NO_BK_PARENS)) 1895 token->type = OP_OPEN_SUBEXP; 1896 break; 1897 case ')': 1898 if (!(syntax & RE_NO_BK_PARENS)) 1899 token->type = OP_CLOSE_SUBEXP; 1900 break; 1901 case '+': 1902 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1903 token->type = OP_DUP_PLUS; 1904 break; 1905 case '?': 1906 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1907 token->type = OP_DUP_QUESTION; 1908 break; 1909 case '{': 1910 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1911 token->type = OP_OPEN_DUP_NUM; 1912 break; 1913 case '}': 1914 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1915 token->type = OP_CLOSE_DUP_NUM; 1916 break; 1917 default: 1918 break; 1919 } 1920 return 2; 1921 } 1922 1923 token->type = CHARACTER; 1924 #ifdef RE_ENABLE_I18N 1925 if (input->mb_cur_max > 1) 1926 { 1927 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1928 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1929 } 1930 else 1931 #endif 1932 token->word_char = IS_WORD_CHAR (token->opr.c); 1933 1934 switch (c) 1935 { 1936 case '\n': 1937 if (syntax & RE_NEWLINE_ALT) 1938 token->type = OP_ALT; 1939 break; 1940 case '|': 1941 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1942 token->type = OP_ALT; 1943 break; 1944 case '*': 1945 token->type = OP_DUP_ASTERISK; 1946 break; 1947 case '+': 1948 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1949 token->type = OP_DUP_PLUS; 1950 break; 1951 case '?': 1952 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1953 token->type = OP_DUP_QUESTION; 1954 break; 1955 case '{': 1956 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1957 token->type = OP_OPEN_DUP_NUM; 1958 break; 1959 case '}': 1960 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1961 token->type = OP_CLOSE_DUP_NUM; 1962 break; 1963 case '(': 1964 if (syntax & RE_NO_BK_PARENS) 1965 token->type = OP_OPEN_SUBEXP; 1966 break; 1967 case ')': 1968 if (syntax & RE_NO_BK_PARENS) 1969 token->type = OP_CLOSE_SUBEXP; 1970 break; 1971 case '[': 1972 token->type = OP_OPEN_BRACKET; 1973 break; 1974 case '.': 1975 token->type = OP_PERIOD; 1976 break; 1977 case '^': 1978 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 1979 re_string_cur_idx (input) != 0) 1980 { 1981 char prev = re_string_peek_byte (input, -1); 1982 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 1983 break; 1984 } 1985 token->type = ANCHOR; 1986 token->opr.ctx_type = LINE_FIRST; 1987 break; 1988 case '$': 1989 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 1990 re_string_cur_idx (input) + 1 != re_string_length (input)) 1991 { 1992 re_token_t next; 1993 re_string_skip_bytes (input, 1); 1994 peek_token (&next, input, syntax); 1995 re_string_skip_bytes (input, -1); 1996 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 1997 break; 1998 } 1999 token->type = ANCHOR; 2000 token->opr.ctx_type = LINE_LAST; 2001 break; 2002 default: 2003 break; 2004 } 2005 return 1; 2006 } 2007 2008 /* Peek a token from INPUT, and return the length of the token. 2009 We must not use this function out of bracket expressions. */ 2010 2011 static int 2012 internal_function 2013 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 2014 { 2015 unsigned char c; 2016 if (re_string_eoi (input)) 2017 { 2018 token->type = END_OF_RE; 2019 return 0; 2020 } 2021 c = re_string_peek_byte (input, 0); 2022 token->opr.c = c; 2023 2024 #ifdef RE_ENABLE_I18N 2025 if (input->mb_cur_max > 1 && 2026 !re_string_first_byte (input, re_string_cur_idx (input))) 2027 { 2028 token->type = CHARACTER; 2029 return 1; 2030 } 2031 #endif /* RE_ENABLE_I18N */ 2032 2033 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2034 && re_string_cur_idx (input) + 1 < re_string_length (input)) 2035 { 2036 /* In this case, '\' escape a character. */ 2037 unsigned char c2; 2038 re_string_skip_bytes (input, 1); 2039 c2 = re_string_peek_byte (input, 0); 2040 token->opr.c = c2; 2041 token->type = CHARACTER; 2042 return 1; 2043 } 2044 if (c == '[') /* '[' is a special char in a bracket exps. */ 2045 { 2046 unsigned char c2; 2047 int token_len; 2048 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2049 c2 = re_string_peek_byte (input, 1); 2050 else 2051 c2 = 0; 2052 token->opr.c = c2; 2053 token_len = 2; 2054 switch (c2) 2055 { 2056 case '.': 2057 token->type = OP_OPEN_COLL_ELEM; 2058 break; 2059 case '=': 2060 token->type = OP_OPEN_EQUIV_CLASS; 2061 break; 2062 case ':': 2063 if (syntax & RE_CHAR_CLASSES) 2064 { 2065 token->type = OP_OPEN_CHAR_CLASS; 2066 break; 2067 } 2068 /* else fall through. */ 2069 default: 2070 token->type = CHARACTER; 2071 token->opr.c = c; 2072 token_len = 1; 2073 break; 2074 } 2075 return token_len; 2076 } 2077 switch (c) 2078 { 2079 case '-': 2080 token->type = OP_CHARSET_RANGE; 2081 break; 2082 case ']': 2083 token->type = OP_CLOSE_BRACKET; 2084 break; 2085 case '^': 2086 token->type = OP_NON_MATCH_LIST; 2087 break; 2088 default: 2089 token->type = CHARACTER; 2090 } 2091 return 1; 2092 } 2093 2094 /* Functions for parser. */ 2095 2096 /* Entry point of the parser. 2097 Parse the regular expression REGEXP and return the structure tree. 2098 If an error is occured, ERR is set by error code, and return NULL. 2099 This function build the following tree, from regular expression <reg_exp>: 2100 CAT 2101 / \ 2102 / \ 2103 <reg_exp> EOR 2104 2105 CAT means concatenation. 2106 EOR means end of regular expression. */ 2107 2108 static bin_tree_t * 2109 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2110 reg_errcode_t *err) 2111 { 2112 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2113 bin_tree_t *tree, *eor, *root; 2114 re_token_t current_token; 2115 dfa->syntax = syntax; 2116 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2117 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2118 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2119 return NULL; 2120 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2121 if (tree != NULL) 2122 root = create_tree (dfa, tree, eor, CONCAT); 2123 else 2124 root = eor; 2125 if (BE (eor == NULL || root == NULL, 0)) 2126 { 2127 *err = REG_ESPACE; 2128 return NULL; 2129 } 2130 return root; 2131 } 2132 2133 /* This function build the following tree, from regular expression 2134 <branch1>|<branch2>: 2135 ALT 2136 / \ 2137 / \ 2138 <branch1> <branch2> 2139 2140 ALT means alternative, which represents the operator `|'. */ 2141 2142 static bin_tree_t * 2143 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2144 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2145 { 2146 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2147 bin_tree_t *tree, *branch = NULL; 2148 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2149 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2150 return NULL; 2151 2152 while (token->type == OP_ALT) 2153 { 2154 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2155 if (token->type != OP_ALT && token->type != END_OF_RE 2156 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2157 { 2158 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2159 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2160 return NULL; 2161 } 2162 else 2163 branch = NULL; 2164 tree = create_tree (dfa, tree, branch, OP_ALT); 2165 if (BE (tree == NULL, 0)) 2166 { 2167 *err = REG_ESPACE; 2168 return NULL; 2169 } 2170 } 2171 return tree; 2172 } 2173 2174 /* This function build the following tree, from regular expression 2175 <exp1><exp2>: 2176 CAT 2177 / \ 2178 / \ 2179 <exp1> <exp2> 2180 2181 CAT means concatenation. */ 2182 2183 static bin_tree_t * 2184 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2185 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2186 { 2187 bin_tree_t *tree, *expr; 2188 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2189 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2190 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2191 return NULL; 2192 2193 while (token->type != OP_ALT && token->type != END_OF_RE 2194 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2195 { 2196 expr = parse_expression (regexp, preg, token, syntax, nest, err); 2197 if (BE (*err != REG_NOERROR && expr == NULL, 0)) 2198 { 2199 return NULL; 2200 } 2201 if (tree != NULL && expr != NULL) 2202 { 2203 tree = create_tree (dfa, tree, expr, CONCAT); 2204 if (tree == NULL) 2205 { 2206 *err = REG_ESPACE; 2207 return NULL; 2208 } 2209 } 2210 else if (tree == NULL) 2211 tree = expr; 2212 /* Otherwise expr == NULL, we don't need to create new tree. */ 2213 } 2214 return tree; 2215 } 2216 2217 /* This function build the following tree, from regular expression a*: 2218 * 2219 | 2220 a 2221 */ 2222 2223 static bin_tree_t * 2224 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2225 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2226 { 2227 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2228 bin_tree_t *tree; 2229 switch (token->type) 2230 { 2231 case CHARACTER: 2232 tree = create_token_tree (dfa, NULL, NULL, token); 2233 if (BE (tree == NULL, 0)) 2234 { 2235 *err = REG_ESPACE; 2236 return NULL; 2237 } 2238 #ifdef RE_ENABLE_I18N 2239 if (dfa->mb_cur_max > 1) 2240 { 2241 while (!re_string_eoi (regexp) 2242 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2243 { 2244 bin_tree_t *mbc_remain; 2245 fetch_token (token, regexp, syntax); 2246 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2247 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2248 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2249 { 2250 *err = REG_ESPACE; 2251 return NULL; 2252 } 2253 } 2254 } 2255 #endif 2256 break; 2257 case OP_OPEN_SUBEXP: 2258 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2259 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2260 return NULL; 2261 break; 2262 case OP_OPEN_BRACKET: 2263 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2264 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2265 return NULL; 2266 break; 2267 case OP_BACK_REF: 2268 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2269 { 2270 *err = REG_ESUBREG; 2271 return NULL; 2272 } 2273 dfa->used_bkref_map |= 1 << token->opr.idx; 2274 tree = create_token_tree (dfa, NULL, NULL, token); 2275 if (BE (tree == NULL, 0)) 2276 { 2277 *err = REG_ESPACE; 2278 return NULL; 2279 } 2280 ++dfa->nbackref; 2281 dfa->has_mb_node = 1; 2282 break; 2283 case OP_OPEN_DUP_NUM: 2284 if (syntax & RE_CONTEXT_INVALID_DUP) 2285 { 2286 *err = REG_BADRPT; 2287 return NULL; 2288 } 2289 /* FALLTHROUGH */ 2290 case OP_DUP_ASTERISK: 2291 case OP_DUP_PLUS: 2292 case OP_DUP_QUESTION: 2293 if (syntax & RE_CONTEXT_INVALID_OPS) 2294 { 2295 *err = REG_BADRPT; 2296 return NULL; 2297 } 2298 else if (syntax & RE_CONTEXT_INDEP_OPS) 2299 { 2300 fetch_token (token, regexp, syntax); 2301 return parse_expression (regexp, preg, token, syntax, nest, err); 2302 } 2303 /* else fall through */ 2304 case OP_CLOSE_SUBEXP: 2305 if ((token->type == OP_CLOSE_SUBEXP) && 2306 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2307 { 2308 *err = REG_ERPAREN; 2309 return NULL; 2310 } 2311 /* else fall through */ 2312 case OP_CLOSE_DUP_NUM: 2313 /* We treat it as a normal character. */ 2314 2315 /* Then we can these characters as normal characters. */ 2316 token->type = CHARACTER; 2317 /* mb_partial and word_char bits should be initialized already 2318 by peek_token. */ 2319 tree = create_token_tree (dfa, NULL, NULL, token); 2320 if (BE (tree == NULL, 0)) 2321 { 2322 *err = REG_ESPACE; 2323 return NULL; 2324 } 2325 break; 2326 case ANCHOR: 2327 if ((token->opr.ctx_type 2328 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2329 && dfa->word_ops_used == 0) 2330 init_word_char (dfa); 2331 if (token->opr.ctx_type == WORD_DELIM 2332 || token->opr.ctx_type == NOT_WORD_DELIM) 2333 { 2334 bin_tree_t *tree_first, *tree_last; 2335 if (token->opr.ctx_type == WORD_DELIM) 2336 { 2337 token->opr.ctx_type = WORD_FIRST; 2338 tree_first = create_token_tree (dfa, NULL, NULL, token); 2339 token->opr.ctx_type = WORD_LAST; 2340 } 2341 else 2342 { 2343 token->opr.ctx_type = INSIDE_WORD; 2344 tree_first = create_token_tree (dfa, NULL, NULL, token); 2345 token->opr.ctx_type = INSIDE_NOTWORD; 2346 } 2347 tree_last = create_token_tree (dfa, NULL, NULL, token); 2348 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2349 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2350 { 2351 *err = REG_ESPACE; 2352 return NULL; 2353 } 2354 } 2355 else 2356 { 2357 tree = create_token_tree (dfa, NULL, NULL, token); 2358 if (BE (tree == NULL, 0)) 2359 { 2360 *err = REG_ESPACE; 2361 return NULL; 2362 } 2363 } 2364 /* We must return here, since ANCHORs can't be followed 2365 by repetition operators. 2366 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2367 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2368 fetch_token (token, regexp, syntax); 2369 return tree; 2370 case OP_PERIOD: 2371 tree = create_token_tree (dfa, NULL, NULL, token); 2372 if (BE (tree == NULL, 0)) 2373 { 2374 *err = REG_ESPACE; 2375 return NULL; 2376 } 2377 if (dfa->mb_cur_max > 1) 2378 dfa->has_mb_node = 1; 2379 break; 2380 case OP_WORD: 2381 case OP_NOTWORD: 2382 tree = build_charclass_op (dfa, regexp->trans, 2383 (const unsigned char *) "alnum", 2384 (const unsigned char *) "_", 2385 token->type == OP_NOTWORD, err); 2386 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2387 return NULL; 2388 break; 2389 case OP_SPACE: 2390 case OP_NOTSPACE: 2391 tree = build_charclass_op (dfa, regexp->trans, 2392 (const unsigned char *) "space", 2393 (const unsigned char *) "", 2394 token->type == OP_NOTSPACE, err); 2395 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2396 return NULL; 2397 break; 2398 case OP_ALT: 2399 case END_OF_RE: 2400 return NULL; 2401 case BACK_SLASH: 2402 *err = REG_EESCAPE; 2403 return NULL; 2404 default: 2405 /* Must not happen? */ 2406 #ifdef DEBUG 2407 assert (0); 2408 #endif 2409 return NULL; 2410 } 2411 fetch_token (token, regexp, syntax); 2412 2413 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2414 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2415 { 2416 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2417 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2418 return NULL; 2419 /* In BRE consecutive duplications are not allowed. */ 2420 if ((syntax & RE_CONTEXT_INVALID_DUP) 2421 && (token->type == OP_DUP_ASTERISK 2422 || token->type == OP_OPEN_DUP_NUM)) 2423 { 2424 *err = REG_BADRPT; 2425 return NULL; 2426 } 2427 } 2428 2429 return tree; 2430 } 2431 2432 /* This function build the following tree, from regular expression 2433 (<reg_exp>): 2434 SUBEXP 2435 | 2436 <reg_exp> 2437 */ 2438 2439 static bin_tree_t * 2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2441 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2442 { 2443 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2444 bin_tree_t *tree; 2445 size_t cur_nsub; 2446 cur_nsub = preg->re_nsub++; 2447 2448 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2449 2450 /* The subexpression may be a null string. */ 2451 if (token->type == OP_CLOSE_SUBEXP) 2452 tree = NULL; 2453 else 2454 { 2455 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2456 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2457 *err = REG_EPAREN; 2458 if (BE (*err != REG_NOERROR, 0)) 2459 return NULL; 2460 } 2461 2462 if (cur_nsub <= '9' - '1') 2463 dfa->completed_bkref_map |= 1 << cur_nsub; 2464 2465 tree = create_tree (dfa, tree, NULL, SUBEXP); 2466 if (BE (tree == NULL, 0)) 2467 { 2468 *err = REG_ESPACE; 2469 return NULL; 2470 } 2471 tree->token.opr.idx = cur_nsub; 2472 return tree; 2473 } 2474 2475 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2476 2477 static bin_tree_t * 2478 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2479 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2480 { 2481 bin_tree_t *tree = NULL, *old_tree = NULL; 2482 Idx i, start, end, start_idx = re_string_cur_idx (regexp); 2483 re_token_t start_token = *token; 2484 2485 if (token->type == OP_OPEN_DUP_NUM) 2486 { 2487 end = 0; 2488 start = fetch_number (regexp, token, syntax); 2489 if (start == REG_MISSING) 2490 { 2491 if (token->type == CHARACTER && token->opr.c == ',') 2492 start = 0; /* We treat "{,m}" as "{0,m}". */ 2493 else 2494 { 2495 *err = REG_BADBR; /* <re>{} is invalid. */ 2496 return NULL; 2497 } 2498 } 2499 if (BE (start != REG_ERROR, 1)) 2500 { 2501 /* We treat "{n}" as "{n,n}". */ 2502 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2503 : ((token->type == CHARACTER && token->opr.c == ',') 2504 ? fetch_number (regexp, token, syntax) : REG_ERROR)); 2505 } 2506 if (BE (start == REG_ERROR || end == REG_ERROR, 0)) 2507 { 2508 /* Invalid sequence. */ 2509 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2510 { 2511 if (token->type == END_OF_RE) 2512 *err = REG_EBRACE; 2513 else 2514 *err = REG_BADBR; 2515 2516 return NULL; 2517 } 2518 2519 /* If the syntax bit is set, rollback. */ 2520 re_string_set_index (regexp, start_idx); 2521 *token = start_token; 2522 token->type = CHARACTER; 2523 /* mb_partial and word_char bits should be already initialized by 2524 peek_token. */ 2525 return elem; 2526 } 2527 2528 if (BE ((end != REG_MISSING && start > end) 2529 || token->type != OP_CLOSE_DUP_NUM, 0)) 2530 { 2531 /* First number greater than second. */ 2532 *err = REG_BADBR; 2533 return NULL; 2534 } 2535 } 2536 else 2537 { 2538 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2539 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING; 2540 } 2541 2542 fetch_token (token, regexp, syntax); 2543 2544 if (BE (elem == NULL, 0)) 2545 return NULL; 2546 if (BE (start == 0 && end == 0, 0)) 2547 { 2548 postorder (elem, free_tree, NULL); 2549 return NULL; 2550 } 2551 2552 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2553 if (BE (start > 0, 0)) 2554 { 2555 tree = elem; 2556 for (i = 2; i <= start; ++i) 2557 { 2558 elem = duplicate_tree (elem, dfa); 2559 tree = create_tree (dfa, tree, elem, CONCAT); 2560 if (BE (elem == NULL || tree == NULL, 0)) 2561 goto parse_dup_op_espace; 2562 } 2563 2564 if (start == end) 2565 return tree; 2566 2567 /* Duplicate ELEM before it is marked optional. */ 2568 elem = duplicate_tree (elem, dfa); 2569 old_tree = tree; 2570 } 2571 else 2572 old_tree = NULL; 2573 2574 if (elem->token.type == SUBEXP) 2575 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); 2576 2577 tree = create_tree (dfa, elem, NULL, 2578 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); 2579 if (BE (tree == NULL, 0)) 2580 goto parse_dup_op_espace; 2581 2582 /* From gnulib's "intprops.h": 2583 True if the arithmetic type T is signed. */ 2584 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1)) 2585 2586 /* This loop is actually executed only when end != REG_MISSING, 2587 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2588 already created the start+1-th copy. */ 2589 if (TYPE_SIGNED (Idx) || end != REG_MISSING) 2590 for (i = start + 2; i <= end; ++i) 2591 { 2592 elem = duplicate_tree (elem, dfa); 2593 tree = create_tree (dfa, tree, elem, CONCAT); 2594 if (BE (elem == NULL || tree == NULL, 0)) 2595 goto parse_dup_op_espace; 2596 2597 tree = create_tree (dfa, tree, NULL, OP_ALT); 2598 if (BE (tree == NULL, 0)) 2599 goto parse_dup_op_espace; 2600 } 2601 2602 if (old_tree) 2603 tree = create_tree (dfa, old_tree, tree, CONCAT); 2604 2605 return tree; 2606 2607 parse_dup_op_espace: 2608 *err = REG_ESPACE; 2609 return NULL; 2610 } 2611 2612 /* Size of the names for collating symbol/equivalence_class/character_class. 2613 I'm not sure, but maybe enough. */ 2614 #define BRACKET_NAME_BUF_SIZE 32 2615 2616 #ifndef _LIBC 2617 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2618 Build the range expression which starts from START_ELEM, and ends 2619 at END_ELEM. The result are written to MBCSET and SBCSET. 2620 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2621 mbcset->range_ends, is a pointer argument sinse we may 2622 update it. */ 2623 2624 static reg_errcode_t 2625 internal_function 2626 # ifdef RE_ENABLE_I18N 2627 build_range_exp (const reg_syntax_t syntax, 2628 bitset_t sbcset, 2629 re_charset_t *mbcset, 2630 Idx *range_alloc, 2631 const bracket_elem_t *start_elem, 2632 const bracket_elem_t *end_elem) 2633 # else /* not RE_ENABLE_I18N */ 2634 build_range_exp (const reg_syntax_t syntax, 2635 bitset_t sbcset, 2636 const bracket_elem_t *start_elem, 2637 const bracket_elem_t *end_elem) 2638 # endif /* not RE_ENABLE_I18N */ 2639 { 2640 unsigned int start_ch, end_ch; 2641 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2642 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2643 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2644 0)) 2645 return REG_ERANGE; 2646 2647 /* We can handle no multi character collating elements without libc 2648 support. */ 2649 if (BE ((start_elem->type == COLL_SYM 2650 && strlen ((char *) start_elem->opr.name) > 1) 2651 || (end_elem->type == COLL_SYM 2652 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2653 return REG_ECOLLATE; 2654 2655 # ifdef RE_ENABLE_I18N 2656 { 2657 wchar_t wc; 2658 wint_t start_wc; 2659 wint_t end_wc; 2660 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2661 2662 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2663 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2664 : 0)); 2665 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2666 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2667 : 0)); 2668 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2669 ? __btowc (start_ch) : start_elem->opr.wch); 2670 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2671 ? __btowc (end_ch) : end_elem->opr.wch); 2672 if (start_wc == WEOF || end_wc == WEOF) 2673 return REG_ECOLLATE; 2674 cmp_buf[0] = start_wc; 2675 cmp_buf[4] = end_wc; 2676 2677 if (BE ((syntax & RE_NO_EMPTY_RANGES) 2678 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0)) 2679 return REG_ERANGE; 2680 2681 /* Got valid collation sequence values, add them as a new entry. 2682 However, for !_LIBC we have no collation elements: if the 2683 character set is single byte, the single byte character set 2684 that we build below suffices. parse_bracket_exp passes 2685 no MBCSET if dfa->mb_cur_max == 1. */ 2686 if (mbcset) 2687 { 2688 /* Check the space of the arrays. */ 2689 if (BE (*range_alloc == mbcset->nranges, 0)) 2690 { 2691 /* There is not enough space, need realloc. */ 2692 wchar_t *new_array_start, *new_array_end; 2693 Idx new_nranges; 2694 2695 /* +1 in case of mbcset->nranges is 0. */ 2696 new_nranges = 2 * mbcset->nranges + 1; 2697 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2698 are NULL if *range_alloc == 0. */ 2699 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2700 new_nranges); 2701 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2702 new_nranges); 2703 2704 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2705 return REG_ESPACE; 2706 2707 mbcset->range_starts = new_array_start; 2708 mbcset->range_ends = new_array_end; 2709 *range_alloc = new_nranges; 2710 } 2711 2712 mbcset->range_starts[mbcset->nranges] = start_wc; 2713 mbcset->range_ends[mbcset->nranges++] = end_wc; 2714 } 2715 2716 /* Build the table for single byte characters. */ 2717 for (wc = 0; wc < SBC_MAX; ++wc) 2718 { 2719 cmp_buf[2] = wc; 2720 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2721 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2722 bitset_set (sbcset, wc); 2723 } 2724 } 2725 # else /* not RE_ENABLE_I18N */ 2726 { 2727 unsigned int ch; 2728 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2729 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2730 : 0)); 2731 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2732 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2733 : 0)); 2734 if (start_ch > end_ch) 2735 return REG_ERANGE; 2736 /* Build the table for single byte characters. */ 2737 for (ch = 0; ch < SBC_MAX; ++ch) 2738 if (start_ch <= ch && ch <= end_ch) 2739 bitset_set (sbcset, ch); 2740 } 2741 # endif /* not RE_ENABLE_I18N */ 2742 return REG_NOERROR; 2743 } 2744 #endif /* not _LIBC */ 2745 2746 #ifndef _LIBC 2747 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2748 Build the collating element which is represented by NAME. 2749 The result are written to MBCSET and SBCSET. 2750 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2751 pointer argument since we may update it. */ 2752 2753 static reg_errcode_t 2754 internal_function 2755 build_collating_symbol (bitset_t sbcset, 2756 # ifdef RE_ENABLE_I18N 2757 re_charset_t *mbcset _UNUSED_PARAMETER_, 2758 Idx *coll_sym_alloc _UNUSED_PARAMETER_, 2759 # endif 2760 const unsigned char *name) 2761 { 2762 size_t name_len = strlen ((const char *) name); 2763 if (BE (name_len != 1, 0)) 2764 return REG_ECOLLATE; 2765 else 2766 { 2767 bitset_set (sbcset, name[0]); 2768 return REG_NOERROR; 2769 } 2770 } 2771 #endif /* not _LIBC */ 2772 2773 /* This function parse bracket expression like "[abc]", "[a-c]", 2774 "[[.a-a.]]" etc. */ 2775 2776 static bin_tree_t * 2777 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2778 reg_syntax_t syntax, reg_errcode_t *err) 2779 { 2780 #ifdef _LIBC 2781 const unsigned char *collseqmb; 2782 const char *collseqwc; 2783 uint32_t nrules; 2784 int32_t table_size; 2785 const int32_t *symb_table; 2786 const unsigned char *extra; 2787 2788 /* Local function for parse_bracket_exp used in _LIBC environement. 2789 Seek the collating symbol entry correspondings to NAME. 2790 Return the index of the symbol in the SYMB_TABLE. */ 2791 2792 auto inline int32_t 2793 __attribute ((always_inline)) 2794 seek_collating_symbol_entry (name, name_len) 2795 const unsigned char *name; 2796 size_t name_len; 2797 { 2798 int32_t hash = elem_hash ((const char *) name, name_len); 2799 int32_t elem = hash % table_size; 2800 if (symb_table[2 * elem] != 0) 2801 { 2802 int32_t second = hash % (table_size - 2) + 1; 2803 2804 do 2805 { 2806 /* First compare the hashing value. */ 2807 if (symb_table[2 * elem] == hash 2808 /* Compare the length of the name. */ 2809 && name_len == extra[symb_table[2 * elem + 1]] 2810 /* Compare the name. */ 2811 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2812 name_len) == 0) 2813 { 2814 /* Yep, this is the entry. */ 2815 break; 2816 } 2817 2818 /* Next entry. */ 2819 elem += second; 2820 } 2821 while (symb_table[2 * elem] != 0); 2822 } 2823 return elem; 2824 } 2825 2826 /* Local function for parse_bracket_exp used in _LIBC environment. 2827 Look up the collation sequence value of BR_ELEM. 2828 Return the value if succeeded, UINT_MAX otherwise. */ 2829 2830 auto inline unsigned int 2831 __attribute ((always_inline)) 2832 lookup_collation_sequence_value (br_elem) 2833 bracket_elem_t *br_elem; 2834 { 2835 if (br_elem->type == SB_CHAR) 2836 { 2837 /* 2838 if (MB_CUR_MAX == 1) 2839 */ 2840 if (nrules == 0) 2841 return collseqmb[br_elem->opr.ch]; 2842 else 2843 { 2844 wint_t wc = __btowc (br_elem->opr.ch); 2845 return __collseq_table_lookup (collseqwc, wc); 2846 } 2847 } 2848 else if (br_elem->type == MB_CHAR) 2849 { 2850 if (nrules != 0) 2851 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2852 } 2853 else if (br_elem->type == COLL_SYM) 2854 { 2855 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2856 if (nrules != 0) 2857 { 2858 int32_t elem, idx; 2859 elem = seek_collating_symbol_entry (br_elem->opr.name, 2860 sym_name_len); 2861 if (symb_table[2 * elem] != 0) 2862 { 2863 /* We found the entry. */ 2864 idx = symb_table[2 * elem + 1]; 2865 /* Skip the name of collating element name. */ 2866 idx += 1 + extra[idx]; 2867 /* Skip the byte sequence of the collating element. */ 2868 idx += 1 + extra[idx]; 2869 /* Adjust for the alignment. */ 2870 idx = (idx + 3) & ~3; 2871 /* Skip the multibyte collation sequence value. */ 2872 idx += sizeof (unsigned int); 2873 /* Skip the wide char sequence of the collating element. */ 2874 idx += sizeof (unsigned int) * 2875 (1 + *(unsigned int *) (extra + idx)); 2876 /* Return the collation sequence value. */ 2877 return *(unsigned int *) (extra + idx); 2878 } 2879 else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2880 { 2881 /* No valid character. Match it as a single byte 2882 character. */ 2883 return collseqmb[br_elem->opr.name[0]]; 2884 } 2885 } 2886 else if (sym_name_len == 1) 2887 return collseqmb[br_elem->opr.name[0]]; 2888 } 2889 return UINT_MAX; 2890 } 2891 2892 /* Local function for parse_bracket_exp used in _LIBC environement. 2893 Build the range expression which starts from START_ELEM, and ends 2894 at END_ELEM. The result are written to MBCSET and SBCSET. 2895 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2896 mbcset->range_ends, is a pointer argument sinse we may 2897 update it. */ 2898 2899 auto inline reg_errcode_t 2900 __attribute ((always_inline)) 2901 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2902 re_charset_t *mbcset; 2903 Idx *range_alloc; 2904 bitset_t sbcset; 2905 bracket_elem_t *start_elem, *end_elem; 2906 { 2907 unsigned int ch; 2908 uint32_t start_collseq; 2909 uint32_t end_collseq; 2910 2911 /* Equivalence Classes and Character Classes can't be a range 2912 start/end. */ 2913 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2914 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2915 0)) 2916 return REG_ERANGE; 2917 2918 start_collseq = lookup_collation_sequence_value (start_elem); 2919 end_collseq = lookup_collation_sequence_value (end_elem); 2920 /* Check start/end collation sequence values. */ 2921 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2922 return REG_ECOLLATE; 2923 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2924 return REG_ERANGE; 2925 2926 /* Got valid collation sequence values, add them as a new entry. 2927 However, if we have no collation elements, and the character set 2928 is single byte, the single byte character set that we 2929 build below suffices. */ 2930 if (nrules > 0 || dfa->mb_cur_max > 1) 2931 { 2932 /* Check the space of the arrays. */ 2933 if (BE (*range_alloc == mbcset->nranges, 0)) 2934 { 2935 /* There is not enough space, need realloc. */ 2936 uint32_t *new_array_start; 2937 uint32_t *new_array_end; 2938 Idx new_nranges; 2939 2940 /* +1 in case of mbcset->nranges is 0. */ 2941 new_nranges = 2 * mbcset->nranges + 1; 2942 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2943 new_nranges); 2944 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2945 new_nranges); 2946 2947 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2948 return REG_ESPACE; 2949 2950 mbcset->range_starts = new_array_start; 2951 mbcset->range_ends = new_array_end; 2952 *range_alloc = new_nranges; 2953 } 2954 2955 mbcset->range_starts[mbcset->nranges] = start_collseq; 2956 mbcset->range_ends[mbcset->nranges++] = end_collseq; 2957 } 2958 2959 /* Build the table for single byte characters. */ 2960 for (ch = 0; ch < SBC_MAX; ch++) 2961 { 2962 uint32_t ch_collseq; 2963 /* 2964 if (MB_CUR_MAX == 1) 2965 */ 2966 if (nrules == 0) 2967 ch_collseq = collseqmb[ch]; 2968 else 2969 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 2970 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2971 bitset_set (sbcset, ch); 2972 } 2973 return REG_NOERROR; 2974 } 2975 2976 /* Local function for parse_bracket_exp used in _LIBC environement. 2977 Build the collating element which is represented by NAME. 2978 The result are written to MBCSET and SBCSET. 2979 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2980 pointer argument sinse we may update it. */ 2981 2982 auto inline reg_errcode_t 2983 __attribute ((always_inline)) 2984 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 2985 re_charset_t *mbcset; 2986 Idx *coll_sym_alloc; 2987 bitset_t sbcset; 2988 const unsigned char *name; 2989 { 2990 int32_t elem, idx; 2991 size_t name_len = strlen ((const char *) name); 2992 if (nrules != 0) 2993 { 2994 elem = seek_collating_symbol_entry (name, name_len); 2995 if (symb_table[2 * elem] != 0) 2996 { 2997 /* We found the entry. */ 2998 idx = symb_table[2 * elem + 1]; 2999 /* Skip the name of collating element name. */ 3000 idx += 1 + extra[idx]; 3001 } 3002 else if (symb_table[2 * elem] == 0 && name_len == 1) 3003 { 3004 /* No valid character, treat it as a normal 3005 character. */ 3006 bitset_set (sbcset, name[0]); 3007 return REG_NOERROR; 3008 } 3009 else 3010 return REG_ECOLLATE; 3011 3012 /* Got valid collation sequence, add it as a new entry. */ 3013 /* Check the space of the arrays. */ 3014 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 3015 { 3016 /* Not enough, realloc it. */ 3017 /* +1 in case of mbcset->ncoll_syms is 0. */ 3018 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 3019 /* Use realloc since mbcset->coll_syms is NULL 3020 if *alloc == 0. */ 3021 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 3022 new_coll_sym_alloc); 3023 if (BE (new_coll_syms == NULL, 0)) 3024 return REG_ESPACE; 3025 mbcset->coll_syms = new_coll_syms; 3026 *coll_sym_alloc = new_coll_sym_alloc; 3027 } 3028 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3029 return REG_NOERROR; 3030 } 3031 else 3032 { 3033 if (BE (name_len != 1, 0)) 3034 return REG_ECOLLATE; 3035 else 3036 { 3037 bitset_set (sbcset, name[0]); 3038 return REG_NOERROR; 3039 } 3040 } 3041 } 3042 #endif 3043 3044 re_token_t br_token; 3045 re_bitset_ptr_t sbcset; 3046 #ifdef RE_ENABLE_I18N 3047 re_charset_t *mbcset; 3048 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3049 Idx equiv_class_alloc = 0, char_class_alloc = 0; 3050 #endif /* not RE_ENABLE_I18N */ 3051 bool non_match = false; 3052 bin_tree_t *work_tree; 3053 int token_len; 3054 bool first_round = true; 3055 #ifdef _LIBC 3056 collseqmb = (const unsigned char *) 3057 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3058 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3059 if (nrules) 3060 { 3061 /* 3062 if (MB_CUR_MAX > 1) 3063 */ 3064 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3065 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3066 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3067 _NL_COLLATE_SYMB_TABLEMB); 3068 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3069 _NL_COLLATE_SYMB_EXTRAMB); 3070 } 3071 #endif 3072 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3073 #ifdef RE_ENABLE_I18N 3074 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3075 #endif /* RE_ENABLE_I18N */ 3076 #ifdef RE_ENABLE_I18N 3077 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3078 #else 3079 if (BE (sbcset == NULL, 0)) 3080 #endif /* RE_ENABLE_I18N */ 3081 { 3082 *err = REG_ESPACE; 3083 return NULL; 3084 } 3085 3086 token_len = peek_token_bracket (token, regexp, syntax); 3087 if (BE (token->type == END_OF_RE, 0)) 3088 { 3089 *err = REG_BADPAT; 3090 goto parse_bracket_exp_free_return; 3091 } 3092 if (token->type == OP_NON_MATCH_LIST) 3093 { 3094 #ifdef RE_ENABLE_I18N 3095 mbcset->non_match = 1; 3096 #endif /* not RE_ENABLE_I18N */ 3097 non_match = true; 3098 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3099 bitset_set (sbcset, '\n'); 3100 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3101 token_len = peek_token_bracket (token, regexp, syntax); 3102 if (BE (token->type == END_OF_RE, 0)) 3103 { 3104 *err = REG_BADPAT; 3105 goto parse_bracket_exp_free_return; 3106 } 3107 } 3108 3109 /* We treat the first ']' as a normal character. */ 3110 if (token->type == OP_CLOSE_BRACKET) 3111 token->type = CHARACTER; 3112 3113 while (1) 3114 { 3115 bracket_elem_t start_elem, end_elem; 3116 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3117 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3118 reg_errcode_t ret; 3119 int token_len2 = 0; 3120 bool is_range_exp = false; 3121 re_token_t token2; 3122 3123 start_elem.opr.name = start_name_buf; 3124 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3125 syntax, first_round); 3126 if (BE (ret != REG_NOERROR, 0)) 3127 { 3128 *err = ret; 3129 goto parse_bracket_exp_free_return; 3130 } 3131 first_round = false; 3132 3133 /* Get information about the next token. We need it in any case. */ 3134 token_len = peek_token_bracket (token, regexp, syntax); 3135 3136 /* Do not check for ranges if we know they are not allowed. */ 3137 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3138 { 3139 if (BE (token->type == END_OF_RE, 0)) 3140 { 3141 *err = REG_EBRACK; 3142 goto parse_bracket_exp_free_return; 3143 } 3144 if (token->type == OP_CHARSET_RANGE) 3145 { 3146 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3147 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3148 if (BE (token2.type == END_OF_RE, 0)) 3149 { 3150 *err = REG_EBRACK; 3151 goto parse_bracket_exp_free_return; 3152 } 3153 if (token2.type == OP_CLOSE_BRACKET) 3154 { 3155 /* We treat the last '-' as a normal character. */ 3156 re_string_skip_bytes (regexp, -token_len); 3157 token->type = CHARACTER; 3158 } 3159 else 3160 is_range_exp = true; 3161 } 3162 } 3163 3164 if (is_range_exp == true) 3165 { 3166 end_elem.opr.name = end_name_buf; 3167 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3168 dfa, syntax, true); 3169 if (BE (ret != REG_NOERROR, 0)) 3170 { 3171 *err = ret; 3172 goto parse_bracket_exp_free_return; 3173 } 3174 3175 token_len = peek_token_bracket (token, regexp, syntax); 3176 3177 #ifdef _LIBC 3178 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3179 &start_elem, &end_elem); 3180 #else 3181 # ifdef RE_ENABLE_I18N 3182 *err = build_range_exp (syntax, sbcset, 3183 dfa->mb_cur_max > 1 ? mbcset : NULL, 3184 &range_alloc, &start_elem, &end_elem); 3185 # else 3186 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); 3187 # endif 3188 #endif /* RE_ENABLE_I18N */ 3189 if (BE (*err != REG_NOERROR, 0)) 3190 goto parse_bracket_exp_free_return; 3191 } 3192 else 3193 { 3194 switch (start_elem.type) 3195 { 3196 case SB_CHAR: 3197 bitset_set (sbcset, start_elem.opr.ch); 3198 break; 3199 #ifdef RE_ENABLE_I18N 3200 case MB_CHAR: 3201 /* Check whether the array has enough space. */ 3202 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3203 { 3204 wchar_t *new_mbchars; 3205 /* Not enough, realloc it. */ 3206 /* +1 in case of mbcset->nmbchars is 0. */ 3207 mbchar_alloc = 2 * mbcset->nmbchars + 1; 3208 /* Use realloc since array is NULL if *alloc == 0. */ 3209 new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3210 mbchar_alloc); 3211 if (BE (new_mbchars == NULL, 0)) 3212 goto parse_bracket_exp_espace; 3213 mbcset->mbchars = new_mbchars; 3214 } 3215 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3216 break; 3217 #endif /* RE_ENABLE_I18N */ 3218 case EQUIV_CLASS: 3219 *err = build_equiv_class (sbcset, 3220 #ifdef RE_ENABLE_I18N 3221 mbcset, &equiv_class_alloc, 3222 #endif /* RE_ENABLE_I18N */ 3223 start_elem.opr.name); 3224 if (BE (*err != REG_NOERROR, 0)) 3225 goto parse_bracket_exp_free_return; 3226 break; 3227 case COLL_SYM: 3228 *err = build_collating_symbol (sbcset, 3229 #ifdef RE_ENABLE_I18N 3230 mbcset, &coll_sym_alloc, 3231 #endif /* RE_ENABLE_I18N */ 3232 start_elem.opr.name); 3233 if (BE (*err != REG_NOERROR, 0)) 3234 goto parse_bracket_exp_free_return; 3235 break; 3236 case CHAR_CLASS: 3237 *err = build_charclass (regexp->trans, sbcset, 3238 #ifdef RE_ENABLE_I18N 3239 mbcset, &char_class_alloc, 3240 #endif /* RE_ENABLE_I18N */ 3241 start_elem.opr.name, syntax); 3242 if (BE (*err != REG_NOERROR, 0)) 3243 goto parse_bracket_exp_free_return; 3244 break; 3245 default: 3246 assert (0); 3247 break; 3248 } 3249 } 3250 if (BE (token->type == END_OF_RE, 0)) 3251 { 3252 *err = REG_EBRACK; 3253 goto parse_bracket_exp_free_return; 3254 } 3255 if (token->type == OP_CLOSE_BRACKET) 3256 break; 3257 } 3258 3259 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3260 3261 /* If it is non-matching list. */ 3262 if (non_match) 3263 bitset_not (sbcset); 3264 3265 #ifdef RE_ENABLE_I18N 3266 /* Ensure only single byte characters are set. */ 3267 if (dfa->mb_cur_max > 1) 3268 bitset_mask (sbcset, dfa->sb_char); 3269 3270 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3271 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3272 || mbcset->non_match))) 3273 { 3274 bin_tree_t *mbc_tree; 3275 int sbc_idx; 3276 /* Build a tree for complex bracket. */ 3277 dfa->has_mb_node = 1; 3278 br_token.type = COMPLEX_BRACKET; 3279 br_token.opr.mbcset = mbcset; 3280 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3281 if (BE (mbc_tree == NULL, 0)) 3282 goto parse_bracket_exp_espace; 3283 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3284 if (sbcset[sbc_idx]) 3285 break; 3286 /* If there are no bits set in sbcset, there is no point 3287 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3288 if (sbc_idx < BITSET_WORDS) 3289 { 3290 /* Build a tree for simple bracket. */ 3291 br_token.type = SIMPLE_BRACKET; 3292 br_token.opr.sbcset = sbcset; 3293 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3294 if (BE (work_tree == NULL, 0)) 3295 goto parse_bracket_exp_espace; 3296 3297 /* Then join them by ALT node. */ 3298 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3299 if (BE (work_tree == NULL, 0)) 3300 goto parse_bracket_exp_espace; 3301 } 3302 else 3303 { 3304 re_free (sbcset); 3305 work_tree = mbc_tree; 3306 } 3307 } 3308 else 3309 #endif /* not RE_ENABLE_I18N */ 3310 { 3311 #ifdef RE_ENABLE_I18N 3312 free_charset (mbcset); 3313 #endif 3314 /* Build a tree for simple bracket. */ 3315 br_token.type = SIMPLE_BRACKET; 3316 br_token.opr.sbcset = sbcset; 3317 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3318 if (BE (work_tree == NULL, 0)) 3319 goto parse_bracket_exp_espace; 3320 } 3321 return work_tree; 3322 3323 parse_bracket_exp_espace: 3324 *err = REG_ESPACE; 3325 parse_bracket_exp_free_return: 3326 re_free (sbcset); 3327 #ifdef RE_ENABLE_I18N 3328 free_charset (mbcset); 3329 #endif /* RE_ENABLE_I18N */ 3330 return NULL; 3331 } 3332 3333 /* Parse an element in the bracket expression. */ 3334 3335 static reg_errcode_t 3336 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3337 re_token_t *token, int token_len, 3338 re_dfa_t *dfa _UNUSED_PARAMETER_, 3339 reg_syntax_t syntax, bool accept_hyphen) 3340 { 3341 #ifdef RE_ENABLE_I18N 3342 int cur_char_size; 3343 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3344 if (cur_char_size > 1) 3345 { 3346 elem->type = MB_CHAR; 3347 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3348 re_string_skip_bytes (regexp, cur_char_size); 3349 return REG_NOERROR; 3350 } 3351 #endif /* RE_ENABLE_I18N */ 3352 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3353 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3354 || token->type == OP_OPEN_EQUIV_CLASS) 3355 return parse_bracket_symbol (elem, regexp, token); 3356 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3357 { 3358 /* A '-' must only appear as anything but a range indicator before 3359 the closing bracket. Everything else is an error. */ 3360 re_token_t token2; 3361 (void) peek_token_bracket (&token2, regexp, syntax); 3362 if (token2.type != OP_CLOSE_BRACKET) 3363 /* The actual error value is not standardized since this whole 3364 case is undefined. But ERANGE makes good sense. */ 3365 return REG_ERANGE; 3366 } 3367 elem->type = SB_CHAR; 3368 elem->opr.ch = token->opr.c; 3369 return REG_NOERROR; 3370 } 3371 3372 /* Parse a bracket symbol in the bracket expression. Bracket symbols are 3373 such as [:<character_class>:], [.<collating_element>.], and 3374 [=<equivalent_class>=]. */ 3375 3376 static reg_errcode_t 3377 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3378 re_token_t *token) 3379 { 3380 unsigned char ch, delim = token->opr.c; 3381 int i = 0; 3382 if (re_string_eoi(regexp)) 3383 return REG_EBRACK; 3384 for (;; ++i) 3385 { 3386 if (i >= BRACKET_NAME_BUF_SIZE) 3387 return REG_EBRACK; 3388 if (token->type == OP_OPEN_CHAR_CLASS) 3389 ch = re_string_fetch_byte_case (regexp); 3390 else 3391 ch = re_string_fetch_byte (regexp); 3392 if (re_string_eoi(regexp)) 3393 return REG_EBRACK; 3394 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3395 break; 3396 elem->opr.name[i] = ch; 3397 } 3398 re_string_skip_bytes (regexp, 1); 3399 elem->opr.name[i] = '\0'; 3400 switch (token->type) 3401 { 3402 case OP_OPEN_COLL_ELEM: 3403 elem->type = COLL_SYM; 3404 break; 3405 case OP_OPEN_EQUIV_CLASS: 3406 elem->type = EQUIV_CLASS; 3407 break; 3408 case OP_OPEN_CHAR_CLASS: 3409 elem->type = CHAR_CLASS; 3410 break; 3411 default: 3412 break; 3413 } 3414 return REG_NOERROR; 3415 } 3416 3417 /* Helper function for parse_bracket_exp. 3418 Build the equivalence class which is represented by NAME. 3419 The result are written to MBCSET and SBCSET. 3420 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3421 is a pointer argument sinse we may update it. */ 3422 3423 static reg_errcode_t 3424 #ifdef RE_ENABLE_I18N 3425 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset _UNUSED_PARAMETER_, 3426 Idx *equiv_class_alloc _UNUSED_PARAMETER_, 3427 const unsigned char *name) 3428 #else /* not RE_ENABLE_I18N */ 3429 build_equiv_class (bitset_t sbcset, const unsigned char *name) 3430 #endif /* not RE_ENABLE_I18N */ 3431 { 3432 #ifdef _LIBC 3433 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3434 if (nrules != 0) 3435 { 3436 const int32_t *table, *indirect; 3437 const unsigned char *weights, *extra, *cp; 3438 unsigned char char_buf[2]; 3439 int32_t idx1, idx2; 3440 unsigned int ch; 3441 size_t len; 3442 /* This #include defines a local function! */ 3443 # include <locale/weight.h> 3444 /* Calculate the index for equivalence class. */ 3445 cp = name; 3446 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3447 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3448 _NL_COLLATE_WEIGHTMB); 3449 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3450 _NL_COLLATE_EXTRAMB); 3451 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3452 _NL_COLLATE_INDIRECTMB); 3453 idx1 = findidx (&cp); 3454 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) 3455 /* This isn't a valid character. */ 3456 return REG_ECOLLATE; 3457 3458 /* Build single byte matcing table for this equivalence class. */ 3459 char_buf[1] = (unsigned char) '\0'; 3460 len = weights[idx1 & 0xffffff]; 3461 for (ch = 0; ch < SBC_MAX; ++ch) 3462 { 3463 char_buf[0] = ch; 3464 cp = char_buf; 3465 idx2 = findidx (&cp); 3466 /* 3467 idx2 = table[ch]; 3468 */ 3469 if (idx2 == 0) 3470 /* This isn't a valid character. */ 3471 continue; 3472 /* Compare only if the length matches and the collation rule 3473 index is the same. */ 3474 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) 3475 { 3476 int cnt = 0; 3477 3478 while (cnt <= len && 3479 weights[(idx1 & 0xffffff) + 1 + cnt] 3480 == weights[(idx2 & 0xffffff) + 1 + cnt]) 3481 ++cnt; 3482 3483 if (cnt > len) 3484 bitset_set (sbcset, ch); 3485 } 3486 } 3487 /* Check whether the array has enough space. */ 3488 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3489 { 3490 /* Not enough, realloc it. */ 3491 /* +1 in case of mbcset->nequiv_classes is 0. */ 3492 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3493 /* Use realloc since the array is NULL if *alloc == 0. */ 3494 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3495 int32_t, 3496 new_equiv_class_alloc); 3497 if (BE (new_equiv_classes == NULL, 0)) 3498 return REG_ESPACE; 3499 mbcset->equiv_classes = new_equiv_classes; 3500 *equiv_class_alloc = new_equiv_class_alloc; 3501 } 3502 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3503 } 3504 else 3505 #endif /* _LIBC */ 3506 { 3507 if (BE (strlen ((const char *) name) != 1, 0)) 3508 return REG_ECOLLATE; 3509 bitset_set (sbcset, *name); 3510 } 3511 return REG_NOERROR; 3512 } 3513 3514 /* Helper function for parse_bracket_exp. 3515 Build the character class which is represented by NAME. 3516 The result are written to MBCSET and SBCSET. 3517 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3518 is a pointer argument sinse we may update it. */ 3519 3520 static reg_errcode_t 3521 #ifdef RE_ENABLE_I18N 3522 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3523 re_charset_t *mbcset, Idx *char_class_alloc, 3524 const unsigned char *class_name, reg_syntax_t syntax) 3525 #else /* not RE_ENABLE_I18N */ 3526 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3527 const unsigned char *class_name, reg_syntax_t syntax) 3528 #endif /* not RE_ENABLE_I18N */ 3529 { 3530 int i; 3531 const char *name = (const char *) class_name; 3532 3533 /* In case of REG_ICASE "upper" and "lower" match the both of 3534 upper and lower cases. */ 3535 if ((syntax & RE_ICASE) 3536 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3537 name = "alpha"; 3538 3539 #ifdef RE_ENABLE_I18N 3540 /* Check the space of the arrays. */ 3541 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3542 { 3543 /* Not enough, realloc it. */ 3544 /* +1 in case of mbcset->nchar_classes is 0. */ 3545 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3546 /* Use realloc since array is NULL if *alloc == 0. */ 3547 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3548 new_char_class_alloc); 3549 if (BE (new_char_classes == NULL, 0)) 3550 return REG_ESPACE; 3551 mbcset->char_classes = new_char_classes; 3552 *char_class_alloc = new_char_class_alloc; 3553 } 3554 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3555 #endif /* RE_ENABLE_I18N */ 3556 3557 #define BUILD_CHARCLASS_LOOP(ctype_func) \ 3558 do { \ 3559 if (BE (trans != NULL, 0)) \ 3560 { \ 3561 for (i = 0; i < SBC_MAX; ++i) \ 3562 if (ctype_func (i)) \ 3563 bitset_set (sbcset, trans[i]); \ 3564 } \ 3565 else \ 3566 { \ 3567 for (i = 0; i < SBC_MAX; ++i) \ 3568 if (ctype_func (i)) \ 3569 bitset_set (sbcset, i); \ 3570 } \ 3571 } while (0) 3572 3573 if (strcmp (name, "alnum") == 0) 3574 BUILD_CHARCLASS_LOOP (isalnum); 3575 else if (strcmp (name, "cntrl") == 0) 3576 BUILD_CHARCLASS_LOOP (iscntrl); 3577 else if (strcmp (name, "lower") == 0) 3578 BUILD_CHARCLASS_LOOP (islower); 3579 else if (strcmp (name, "space") == 0) 3580 BUILD_CHARCLASS_LOOP (isspace); 3581 else if (strcmp (name, "alpha") == 0) 3582 BUILD_CHARCLASS_LOOP (isalpha); 3583 else if (strcmp (name, "digit") == 0) 3584 BUILD_CHARCLASS_LOOP (isdigit); 3585 else if (strcmp (name, "print") == 0) 3586 BUILD_CHARCLASS_LOOP (isprint); 3587 else if (strcmp (name, "upper") == 0) 3588 BUILD_CHARCLASS_LOOP (isupper); 3589 else if (strcmp (name, "blank") == 0) 3590 BUILD_CHARCLASS_LOOP (isblank); 3591 else if (strcmp (name, "graph") == 0) 3592 BUILD_CHARCLASS_LOOP (isgraph); 3593 else if (strcmp (name, "punct") == 0) 3594 BUILD_CHARCLASS_LOOP (ispunct); 3595 else if (strcmp (name, "xdigit") == 0) 3596 BUILD_CHARCLASS_LOOP (isxdigit); 3597 else 3598 return REG_ECTYPE; 3599 3600 return REG_NOERROR; 3601 } 3602 3603 static bin_tree_t * 3604 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3605 const unsigned char *class_name, 3606 const unsigned char *extra, bool non_match, 3607 reg_errcode_t *err) 3608 { 3609 re_bitset_ptr_t sbcset; 3610 #ifdef RE_ENABLE_I18N 3611 re_charset_t *mbcset; 3612 Idx alloc = 0; 3613 #endif /* not RE_ENABLE_I18N */ 3614 reg_errcode_t ret; 3615 re_token_t br_token; 3616 bin_tree_t *tree; 3617 3618 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3619 #ifdef RE_ENABLE_I18N 3620 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3621 #endif /* RE_ENABLE_I18N */ 3622 3623 #ifdef RE_ENABLE_I18N 3624 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3625 #else /* not RE_ENABLE_I18N */ 3626 if (BE (sbcset == NULL, 0)) 3627 #endif /* not RE_ENABLE_I18N */ 3628 { 3629 *err = REG_ESPACE; 3630 return NULL; 3631 } 3632 3633 if (non_match) 3634 { 3635 #ifdef RE_ENABLE_I18N 3636 mbcset->non_match = 1; 3637 #endif /* not RE_ENABLE_I18N */ 3638 } 3639 3640 /* We don't care the syntax in this case. */ 3641 ret = build_charclass (trans, sbcset, 3642 #ifdef RE_ENABLE_I18N 3643 mbcset, &alloc, 3644 #endif /* RE_ENABLE_I18N */ 3645 class_name, 0); 3646 3647 if (BE (ret != REG_NOERROR, 0)) 3648 { 3649 re_free (sbcset); 3650 #ifdef RE_ENABLE_I18N 3651 free_charset (mbcset); 3652 #endif /* RE_ENABLE_I18N */ 3653 *err = ret; 3654 return NULL; 3655 } 3656 /* \w match '_' also. */ 3657 for (; *extra; extra++) 3658 bitset_set (sbcset, *extra); 3659 3660 /* If it is non-matching list. */ 3661 if (non_match) 3662 bitset_not (sbcset); 3663 3664 #ifdef RE_ENABLE_I18N 3665 /* Ensure only single byte characters are set. */ 3666 if (dfa->mb_cur_max > 1) 3667 bitset_mask (sbcset, dfa->sb_char); 3668 #endif 3669 3670 /* Build a tree for simple bracket. */ 3671 br_token.type = SIMPLE_BRACKET; 3672 br_token.opr.sbcset = sbcset; 3673 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3674 if (BE (tree == NULL, 0)) 3675 goto build_word_op_espace; 3676 3677 #ifdef RE_ENABLE_I18N 3678 if (dfa->mb_cur_max > 1) 3679 { 3680 bin_tree_t *mbc_tree; 3681 /* Build a tree for complex bracket. */ 3682 br_token.type = COMPLEX_BRACKET; 3683 br_token.opr.mbcset = mbcset; 3684 dfa->has_mb_node = 1; 3685 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3686 if (BE (mbc_tree == NULL, 0)) 3687 goto build_word_op_espace; 3688 /* Then join them by ALT node. */ 3689 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3690 if (BE (mbc_tree != NULL, 1)) 3691 return tree; 3692 } 3693 else 3694 { 3695 free_charset (mbcset); 3696 return tree; 3697 } 3698 #else /* not RE_ENABLE_I18N */ 3699 return tree; 3700 #endif /* not RE_ENABLE_I18N */ 3701 3702 build_word_op_espace: 3703 re_free (sbcset); 3704 #ifdef RE_ENABLE_I18N 3705 free_charset (mbcset); 3706 #endif /* RE_ENABLE_I18N */ 3707 *err = REG_ESPACE; 3708 return NULL; 3709 } 3710 3711 /* This is intended for the expressions like "a{1,3}". 3712 Fetch a number from `input', and return the number. 3713 Return REG_MISSING if the number field is empty like "{,1}". 3714 Return REG_ERROR if an error occurred. */ 3715 3716 static Idx 3717 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3718 { 3719 Idx num = REG_MISSING; 3720 unsigned char c; 3721 while (1) 3722 { 3723 fetch_token (token, input, syntax); 3724 c = token->opr.c; 3725 if (BE (token->type == END_OF_RE, 0)) 3726 return REG_ERROR; 3727 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3728 break; 3729 num = ((token->type != CHARACTER || c < '0' || '9' < c 3730 || num == REG_ERROR) 3731 ? REG_ERROR 3732 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0')); 3733 num = (num > RE_DUP_MAX) ? REG_ERROR : num; 3734 } 3735 return num; 3736 } 3737 3738 #ifdef RE_ENABLE_I18N 3739 static void 3740 free_charset (re_charset_t *cset) 3741 { 3742 re_free (cset->mbchars); 3743 # ifdef _LIBC 3744 re_free (cset->coll_syms); 3745 re_free (cset->equiv_classes); 3746 re_free (cset->range_starts); 3747 re_free (cset->range_ends); 3748 # endif 3749 re_free (cset->char_classes); 3750 re_free (cset); 3751 } 3752 #endif /* RE_ENABLE_I18N */ 3753 3754 /* Functions for binary tree operation. */ 3755 3756 /* Create a tree node. */ 3757 3758 static bin_tree_t * 3759 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3760 re_token_type_t type) 3761 { 3762 re_token_t t; 3763 t.type = type; 3764 return create_token_tree (dfa, left, right, &t); 3765 } 3766 3767 static bin_tree_t * 3768 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3769 const re_token_t *token) 3770 { 3771 bin_tree_t *tree; 3772 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3773 { 3774 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3775 3776 if (storage == NULL) 3777 return NULL; 3778 storage->next = dfa->str_tree_storage; 3779 dfa->str_tree_storage = storage; 3780 dfa->str_tree_storage_idx = 0; 3781 } 3782 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3783 3784 tree->parent = NULL; 3785 tree->left = left; 3786 tree->right = right; 3787 tree->token = *token; 3788 tree->token.duplicated = 0; 3789 tree->token.opt_subexp = 0; 3790 tree->first = NULL; 3791 tree->next = NULL; 3792 tree->node_idx = REG_MISSING; 3793 3794 if (left != NULL) 3795 left->parent = tree; 3796 if (right != NULL) 3797 right->parent = tree; 3798 return tree; 3799 } 3800 3801 /* Mark the tree SRC as an optional subexpression. 3802 To be called from preorder or postorder. */ 3803 3804 static reg_errcode_t 3805 mark_opt_subexp (void *extra, bin_tree_t *node) 3806 { 3807 Idx idx = (Idx) (long) extra; 3808 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3809 node->token.opt_subexp = 1; 3810 3811 return REG_NOERROR; 3812 } 3813 3814 /* Free the allocated memory inside NODE. */ 3815 3816 static void 3817 free_token (re_token_t *node) 3818 { 3819 #ifdef RE_ENABLE_I18N 3820 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3821 free_charset (node->opr.mbcset); 3822 else 3823 #endif /* RE_ENABLE_I18N */ 3824 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3825 re_free (node->opr.sbcset); 3826 } 3827 3828 /* Worker function for tree walking. Free the allocated memory inside NODE 3829 and its children. */ 3830 3831 static reg_errcode_t 3832 free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node) 3833 { 3834 free_token (&node->token); 3835 return REG_NOERROR; 3836 } 3837 3838 3839 /* Duplicate the node SRC, and return new node. This is a preorder 3840 visit similar to the one implemented by the generic visitor, but 3841 we need more infrastructure to maintain two parallel trees --- so, 3842 it's easier to duplicate. */ 3843 3844 static bin_tree_t * 3845 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3846 { 3847 const bin_tree_t *node; 3848 bin_tree_t *dup_root; 3849 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3850 3851 for (node = root; ; ) 3852 { 3853 /* Create a new tree and link it back to the current parent. */ 3854 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3855 if (*p_new == NULL) 3856 return NULL; 3857 (*p_new)->parent = dup_node; 3858 (*p_new)->token.duplicated = 1; 3859 dup_node = *p_new; 3860 3861 /* Go to the left node, or up and to the right. */ 3862 if (node->left) 3863 { 3864 node = node->left; 3865 p_new = &dup_node->left; 3866 } 3867 else 3868 { 3869 const bin_tree_t *prev = NULL; 3870 while (node->right == prev || node->right == NULL) 3871 { 3872 prev = node; 3873 node = node->parent; 3874 dup_node = dup_node->parent; 3875 if (!node) 3876 return dup_root; 3877 } 3878 node = node->right; 3879 p_new = &dup_node->right; 3880 } 3881 } 3882 } 3883