1 /* Extended regular expression matching and search library. 2 Copyright (C) 2002-2012 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License along 17 with this program; if not, see <http://www.gnu.org/licenses/>. */ 18 19 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 20 size_t length, reg_syntax_t syntax); 21 static void re_compile_fastmap_iter (regex_t *bufp, 22 const re_dfastate_t *init_state, 23 char *fastmap); 24 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); 25 #ifdef RE_ENABLE_I18N 26 static void free_charset (re_charset_t *cset); 27 #endif /* RE_ENABLE_I18N */ 28 static void free_workarea_compile (regex_t *preg); 29 static reg_errcode_t create_initial_state (re_dfa_t *dfa); 30 #ifdef RE_ENABLE_I18N 31 static void optimize_utf8 (re_dfa_t *dfa); 32 #endif 33 static reg_errcode_t analyze (regex_t *preg); 34 static reg_errcode_t preorder (bin_tree_t *root, 35 reg_errcode_t (fn (void *, bin_tree_t *)), 36 void *extra); 37 static reg_errcode_t postorder (bin_tree_t *root, 38 reg_errcode_t (fn (void *, bin_tree_t *)), 39 void *extra); 40 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 41 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 42 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 43 bin_tree_t *node); 44 static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 45 static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 46 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 47 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint); 48 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 49 unsigned int constraint); 50 static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 51 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 52 Idx node, bool root); 53 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 54 static Idx fetch_number (re_string_t *input, re_token_t *token, 55 reg_syntax_t syntax); 56 static int peek_token (re_token_t *token, re_string_t *input, 57 reg_syntax_t syntax) internal_function; 58 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 59 reg_syntax_t syntax, reg_errcode_t *err); 60 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 61 re_token_t *token, reg_syntax_t syntax, 62 Idx nest, reg_errcode_t *err); 63 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 64 re_token_t *token, reg_syntax_t syntax, 65 Idx nest, reg_errcode_t *err); 66 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 67 re_token_t *token, reg_syntax_t syntax, 68 Idx nest, reg_errcode_t *err); 69 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 70 re_token_t *token, reg_syntax_t syntax, 71 Idx nest, reg_errcode_t *err); 72 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 73 re_dfa_t *dfa, re_token_t *token, 74 reg_syntax_t syntax, reg_errcode_t *err); 75 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 76 re_token_t *token, reg_syntax_t syntax, 77 reg_errcode_t *err); 78 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 79 re_string_t *regexp, 80 re_token_t *token, int token_len, 81 re_dfa_t *dfa, 82 reg_syntax_t syntax, 83 bool accept_hyphen); 84 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 85 re_string_t *regexp, 86 re_token_t *token); 87 #ifdef RE_ENABLE_I18N 88 static reg_errcode_t build_equiv_class (bitset_t sbcset, 89 re_charset_t *mbcset, 90 Idx *equiv_class_alloc, 91 const unsigned char *name); 92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 93 bitset_t sbcset, 94 re_charset_t *mbcset, 95 Idx *char_class_alloc, 96 const unsigned char *class_name, 97 reg_syntax_t syntax); 98 #else /* not RE_ENABLE_I18N */ 99 static reg_errcode_t build_equiv_class (bitset_t sbcset, 100 const unsigned char *name); 101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 102 bitset_t sbcset, 103 const unsigned char *class_name, 104 reg_syntax_t syntax); 105 #endif /* not RE_ENABLE_I18N */ 106 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 107 RE_TRANSLATE_TYPE trans, 108 const unsigned char *class_name, 109 const unsigned char *extra, 110 bool non_match, reg_errcode_t *err); 111 static bin_tree_t *create_tree (re_dfa_t *dfa, 112 bin_tree_t *left, bin_tree_t *right, 113 re_token_type_t type); 114 static bin_tree_t *create_token_tree (re_dfa_t *dfa, 115 bin_tree_t *left, bin_tree_t *right, 116 const re_token_t *token); 117 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 118 static void free_token (re_token_t *node); 119 static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 120 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 121 122 /* This table gives an error message for each of the error codes listed 123 in regex.h. Obviously the order here has to be same as there. 124 POSIX doesn't require that we do anything for REG_NOERROR, 125 but why not be nice? */ 126 127 static const char __re_error_msgid[] = 128 { 129 #define REG_NOERROR_IDX 0 130 gettext_noop ("Success") /* REG_NOERROR */ 131 "\0" 132 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 133 gettext_noop ("No match") /* REG_NOMATCH */ 134 "\0" 135 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 136 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 137 "\0" 138 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 139 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 140 "\0" 141 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 142 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 143 "\0" 144 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 145 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 146 "\0" 147 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 148 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 149 "\0" 150 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 151 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 152 "\0" 153 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 154 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 155 "\0" 156 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 157 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 158 "\0" 159 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 160 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 161 "\0" 162 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 163 gettext_noop ("Invalid range end") /* REG_ERANGE */ 164 "\0" 165 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 166 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 167 "\0" 168 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 169 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 170 "\0" 171 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 172 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 173 "\0" 174 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 175 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 176 "\0" 177 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 178 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 179 }; 180 181 static const size_t __re_error_msgid_idx[] = 182 { 183 REG_NOERROR_IDX, 184 REG_NOMATCH_IDX, 185 REG_BADPAT_IDX, 186 REG_ECOLLATE_IDX, 187 REG_ECTYPE_IDX, 188 REG_EESCAPE_IDX, 189 REG_ESUBREG_IDX, 190 REG_EBRACK_IDX, 191 REG_EPAREN_IDX, 192 REG_EBRACE_IDX, 193 REG_BADBR_IDX, 194 REG_ERANGE_IDX, 195 REG_ESPACE_IDX, 196 REG_BADRPT_IDX, 197 REG_EEND_IDX, 198 REG_ESIZE_IDX, 199 REG_ERPAREN_IDX 200 }; 201 202 /* Entry points for GNU code. */ 203 204 /* re_compile_pattern is the GNU regular expression compiler: it 205 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 206 Returns 0 if the pattern was valid, otherwise an error string. 207 208 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields 209 are set in BUFP on entry. */ 210 211 #ifdef _LIBC 212 const char * 213 re_compile_pattern (pattern, length, bufp) 214 const char *pattern; 215 size_t length; 216 struct re_pattern_buffer *bufp; 217 #else /* size_t might promote */ 218 const char * 219 re_compile_pattern (const char *pattern, size_t length, 220 struct re_pattern_buffer *bufp) 221 #endif 222 { 223 reg_errcode_t ret; 224 225 /* And GNU code determines whether or not to get register information 226 by passing null for the REGS argument to re_match, etc., not by 227 setting no_sub, unless RE_NO_SUB is set. */ 228 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 229 230 /* Match anchors at newline. */ 231 bufp->newline_anchor = 1; 232 233 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 234 235 if (!ret) 236 return NULL; 237 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 238 } 239 #ifdef _LIBC 240 weak_alias (__re_compile_pattern, re_compile_pattern) 241 #endif 242 243 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can 244 also be assigned to arbitrarily: each pattern buffer stores its own 245 syntax, so it can be changed between regex compilations. */ 246 /* This has no initializer because initialized variables in Emacs 247 become read-only after dumping. */ 248 reg_syntax_t re_syntax_options; 249 250 251 /* Specify the precise syntax of regexps for compilation. This provides 252 for compatibility for various utilities which historically have 253 different, incompatible syntaxes. 254 255 The argument SYNTAX is a bit mask comprised of the various bits 256 defined in regex.h. We return the old syntax. */ 257 258 reg_syntax_t 259 re_set_syntax (syntax) 260 reg_syntax_t syntax; 261 { 262 reg_syntax_t ret = re_syntax_options; 263 264 re_syntax_options = syntax; 265 return ret; 266 } 267 #ifdef _LIBC 268 weak_alias (__re_set_syntax, re_set_syntax) 269 #endif 270 271 int 272 re_compile_fastmap (bufp) 273 struct re_pattern_buffer *bufp; 274 { 275 re_dfa_t *dfa = bufp->buffer; 276 char *fastmap = bufp->fastmap; 277 278 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 279 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 280 if (dfa->init_state != dfa->init_state_word) 281 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 282 if (dfa->init_state != dfa->init_state_nl) 283 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 284 if (dfa->init_state != dfa->init_state_begbuf) 285 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 286 bufp->fastmap_accurate = 1; 287 return 0; 288 } 289 #ifdef _LIBC 290 weak_alias (__re_compile_fastmap, re_compile_fastmap) 291 #endif 292 293 static inline void 294 __attribute ((always_inline)) 295 re_set_fastmap (char *fastmap, bool icase, int ch) 296 { 297 fastmap[ch] = 1; 298 if (icase) 299 fastmap[tolower (ch)] = 1; 300 } 301 302 /* Helper function for re_compile_fastmap. 303 Compile fastmap for the initial_state INIT_STATE. */ 304 305 static void 306 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 307 char *fastmap) 308 { 309 re_dfa_t *dfa = bufp->buffer; 310 Idx node_cnt; 311 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 312 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 313 { 314 Idx node = init_state->nodes.elems[node_cnt]; 315 re_token_type_t type = dfa->nodes[node].type; 316 317 if (type == CHARACTER) 318 { 319 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 320 #ifdef RE_ENABLE_I18N 321 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 322 { 323 unsigned char buf[MB_LEN_MAX]; 324 unsigned char *p; 325 wchar_t wc; 326 mbstate_t state; 327 328 p = buf; 329 *p++ = dfa->nodes[node].opr.c; 330 while (++node < dfa->nodes_len 331 && dfa->nodes[node].type == CHARACTER 332 && dfa->nodes[node].mb_partial) 333 *p++ = dfa->nodes[node].opr.c; 334 memset (&state, '\0', sizeof (state)); 335 if (__mbrtowc (&wc, (const char *) buf, p - buf, 336 &state) == p - buf 337 && (__wcrtomb ((char *) buf, towlower (wc), &state) 338 != (size_t) -1)) 339 re_set_fastmap (fastmap, false, buf[0]); 340 } 341 #endif 342 } 343 else if (type == SIMPLE_BRACKET) 344 { 345 int i, ch; 346 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 347 { 348 int j; 349 bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; 350 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 351 if (w & ((bitset_word_t) 1 << j)) 352 re_set_fastmap (fastmap, icase, ch); 353 } 354 } 355 #ifdef RE_ENABLE_I18N 356 else if (type == COMPLEX_BRACKET) 357 { 358 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 359 Idx i; 360 361 # ifdef _LIBC 362 /* See if we have to try all bytes which start multiple collation 363 elements. 364 e.g. In da_DK, we want to catch 'a' since "aa" is a valid 365 collation element, and don't catch 'b' since 'b' is 366 the only collation element which starts from 'b' (and 367 it is caught by SIMPLE_BRACKET). */ 368 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 369 && (cset->ncoll_syms || cset->nranges)) 370 { 371 const int32_t *table = (const int32_t *) 372 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 373 for (i = 0; i < SBC_MAX; ++i) 374 if (table[i] < 0) 375 re_set_fastmap (fastmap, icase, i); 376 } 377 # endif /* _LIBC */ 378 379 /* See if we have to start the match at all multibyte characters, 380 i.e. where we would not find an invalid sequence. This only 381 applies to multibyte character sets; for single byte character 382 sets, the SIMPLE_BRACKET again suffices. */ 383 if (dfa->mb_cur_max > 1 384 && (cset->nchar_classes || cset->non_match || cset->nranges 385 # ifdef _LIBC 386 || cset->nequiv_classes 387 # endif /* _LIBC */ 388 )) 389 { 390 unsigned char c = 0; 391 do 392 { 393 mbstate_t mbs; 394 memset (&mbs, 0, sizeof (mbs)); 395 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) 396 re_set_fastmap (fastmap, false, (int) c); 397 } 398 while (++c != 0); 399 } 400 401 else 402 { 403 /* ... Else catch all bytes which can start the mbchars. */ 404 for (i = 0; i < cset->nmbchars; ++i) 405 { 406 char buf[256]; 407 mbstate_t state; 408 memset (&state, '\0', sizeof (state)); 409 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 410 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 411 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 412 { 413 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 414 != (size_t) -1) 415 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 416 } 417 } 418 } 419 } 420 #endif /* RE_ENABLE_I18N */ 421 else if (type == OP_PERIOD 422 #ifdef RE_ENABLE_I18N 423 || type == OP_UTF8_PERIOD 424 #endif /* RE_ENABLE_I18N */ 425 || type == END_OF_RE) 426 { 427 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 428 if (type == END_OF_RE) 429 bufp->can_be_null = 1; 430 return; 431 } 432 } 433 } 434 435 /* Entry point for POSIX code. */ 436 /* regcomp takes a regular expression as a string and compiles it. 437 438 PREG is a regex_t *. We do not expect any fields to be initialized, 439 since POSIX says we shouldn't. Thus, we set 440 441 'buffer' to the compiled pattern; 442 'used' to the length of the compiled pattern; 443 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the 444 REG_EXTENDED bit in CFLAGS is set; otherwise, to 445 RE_SYNTAX_POSIX_BASIC; 446 'newline_anchor' to REG_NEWLINE being set in CFLAGS; 447 'fastmap' to an allocated space for the fastmap; 448 'fastmap_accurate' to zero; 449 're_nsub' to the number of subexpressions in PATTERN. 450 451 PATTERN is the address of the pattern string. 452 453 CFLAGS is a series of bits which affect compilation. 454 455 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 456 use POSIX basic syntax. 457 458 If REG_NEWLINE is set, then . and [^...] don't match newline. 459 Also, regexec will try a match beginning after every newline. 460 461 If REG_ICASE is set, then we considers upper- and lowercase 462 versions of letters to be equivalent when matching. 463 464 If REG_NOSUB is set, then when PREG is passed to regexec, that 465 routine will report only success or failure, and nothing about the 466 registers. 467 468 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 469 the return codes and their meanings.) */ 470 471 int 472 regcomp (preg, pattern, cflags) 473 regex_t *_Restrict_ preg; 474 const char *_Restrict_ pattern; 475 int cflags; 476 { 477 reg_errcode_t ret; 478 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 479 : RE_SYNTAX_POSIX_BASIC); 480 481 preg->buffer = NULL; 482 preg->allocated = 0; 483 preg->used = 0; 484 485 /* Try to allocate space for the fastmap. */ 486 preg->fastmap = re_malloc (char, SBC_MAX); 487 if (BE (preg->fastmap == NULL, 0)) 488 return REG_ESPACE; 489 490 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 491 492 /* If REG_NEWLINE is set, newlines are treated differently. */ 493 if (cflags & REG_NEWLINE) 494 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 495 syntax &= ~RE_DOT_NEWLINE; 496 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 497 /* It also changes the matching behavior. */ 498 preg->newline_anchor = 1; 499 } 500 else 501 preg->newline_anchor = 0; 502 preg->no_sub = !!(cflags & REG_NOSUB); 503 preg->translate = NULL; 504 505 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 506 507 /* POSIX doesn't distinguish between an unmatched open-group and an 508 unmatched close-group: both are REG_EPAREN. */ 509 if (ret == REG_ERPAREN) 510 ret = REG_EPAREN; 511 512 /* We have already checked preg->fastmap != NULL. */ 513 if (BE (ret == REG_NOERROR, 1)) 514 /* Compute the fastmap now, since regexec cannot modify the pattern 515 buffer. This function never fails in this implementation. */ 516 (void) re_compile_fastmap (preg); 517 else 518 { 519 /* Some error occurred while compiling the expression. */ 520 re_free (preg->fastmap); 521 preg->fastmap = NULL; 522 } 523 524 return (int) ret; 525 } 526 #ifdef _LIBC 527 weak_alias (__regcomp, regcomp) 528 #endif 529 530 /* Returns a message corresponding to an error code, ERRCODE, returned 531 from either regcomp or regexec. We don't use PREG here. */ 532 533 #ifdef _LIBC 534 size_t 535 regerror (errcode, preg, errbuf, errbuf_size) 536 int errcode; 537 const regex_t *_Restrict_ preg; 538 char *_Restrict_ errbuf; 539 size_t errbuf_size; 540 #else /* size_t might promote */ 541 size_t 542 regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_, 543 char *_Restrict_ errbuf, size_t errbuf_size) 544 #endif 545 { 546 const char *msg; 547 size_t msg_size; 548 549 if (BE (errcode < 0 550 || errcode >= (int) (sizeof (__re_error_msgid_idx) 551 / sizeof (__re_error_msgid_idx[0])), 0)) 552 /* Only error codes returned by the rest of the code should be passed 553 to this routine. If we are given anything else, or if other regex 554 code generates an invalid error code, then the program has a bug. 555 Dump core so we can fix it. */ 556 abort (); 557 558 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 559 560 msg_size = strlen (msg) + 1; /* Includes the null. */ 561 562 if (BE (errbuf_size != 0, 1)) 563 { 564 size_t cpy_size = msg_size; 565 if (BE (msg_size > errbuf_size, 0)) 566 { 567 cpy_size = errbuf_size - 1; 568 errbuf[cpy_size] = '\0'; 569 } 570 memcpy (errbuf, msg, cpy_size); 571 } 572 573 return msg_size; 574 } 575 #ifdef _LIBC 576 weak_alias (__regerror, regerror) 577 #endif 578 579 580 #ifdef RE_ENABLE_I18N 581 /* This static array is used for the map to single-byte characters when 582 UTF-8 is used. Otherwise we would allocate memory just to initialize 583 it the same all the time. UTF-8 is the preferred encoding so this is 584 a worthwhile optimization. */ 585 static const bitset_t utf8_sb_map = 586 { 587 /* Set the first 128 bits. */ 588 # ifdef __GNUC__ 589 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX 590 # else 591 # if 4 * BITSET_WORD_BITS < ASCII_CHARS 592 # error "bitset_word_t is narrower than 32 bits" 593 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS 594 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, 595 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS 596 BITSET_WORD_MAX, BITSET_WORD_MAX, 597 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS 598 BITSET_WORD_MAX, 599 # endif 600 (BITSET_WORD_MAX 601 >> (SBC_MAX % BITSET_WORD_BITS == 0 602 ? 0 603 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) 604 # endif 605 }; 606 #endif 607 608 609 static void 610 free_dfa_content (re_dfa_t *dfa) 611 { 612 Idx i, j; 613 614 if (dfa->nodes) 615 for (i = 0; i < dfa->nodes_len; ++i) 616 free_token (dfa->nodes + i); 617 re_free (dfa->nexts); 618 for (i = 0; i < dfa->nodes_len; ++i) 619 { 620 if (dfa->eclosures != NULL) 621 re_node_set_free (dfa->eclosures + i); 622 if (dfa->inveclosures != NULL) 623 re_node_set_free (dfa->inveclosures + i); 624 if (dfa->edests != NULL) 625 re_node_set_free (dfa->edests + i); 626 } 627 re_free (dfa->edests); 628 re_free (dfa->eclosures); 629 re_free (dfa->inveclosures); 630 re_free (dfa->nodes); 631 632 if (dfa->state_table) 633 for (i = 0; i <= dfa->state_hash_mask; ++i) 634 { 635 struct re_state_table_entry *entry = dfa->state_table + i; 636 for (j = 0; j < entry->num; ++j) 637 { 638 re_dfastate_t *state = entry->array[j]; 639 free_state (state); 640 } 641 re_free (entry->array); 642 } 643 re_free (dfa->state_table); 644 #ifdef RE_ENABLE_I18N 645 if (dfa->sb_char != utf8_sb_map) 646 re_free (dfa->sb_char); 647 #endif 648 re_free (dfa->subexp_map); 649 #ifdef DEBUG 650 re_free (dfa->re_str); 651 #endif 652 653 re_free (dfa); 654 } 655 656 657 /* Free dynamically allocated space used by PREG. */ 658 659 void 660 regfree (preg) 661 regex_t *preg; 662 { 663 re_dfa_t *dfa = preg->buffer; 664 if (BE (dfa != NULL, 1)) 665 free_dfa_content (dfa); 666 preg->buffer = NULL; 667 preg->allocated = 0; 668 669 re_free (preg->fastmap); 670 preg->fastmap = NULL; 671 672 re_free (preg->translate); 673 preg->translate = NULL; 674 } 675 #ifdef _LIBC 676 weak_alias (__regfree, regfree) 677 #endif 678 679 /* Entry points compatible with 4.2 BSD regex library. We don't define 680 them unless specifically requested. */ 681 682 #if defined _REGEX_RE_COMP || defined _LIBC 683 684 /* BSD has one and only one pattern buffer. */ 685 static struct re_pattern_buffer re_comp_buf; 686 687 char * 688 # ifdef _LIBC 689 /* Make these definitions weak in libc, so POSIX programs can redefine 690 these names if they don't use our functions, and still use 691 regcomp/regexec above without link errors. */ 692 weak_function 693 # endif 694 re_comp (s) 695 const char *s; 696 { 697 reg_errcode_t ret; 698 char *fastmap; 699 700 if (!s) 701 { 702 if (!re_comp_buf.buffer) 703 return gettext ("No previous regular expression"); 704 return 0; 705 } 706 707 if (re_comp_buf.buffer) 708 { 709 fastmap = re_comp_buf.fastmap; 710 re_comp_buf.fastmap = NULL; 711 __regfree (&re_comp_buf); 712 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 713 re_comp_buf.fastmap = fastmap; 714 } 715 716 if (re_comp_buf.fastmap == NULL) 717 { 718 re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 719 if (re_comp_buf.fastmap == NULL) 720 return (char *) gettext (__re_error_msgid 721 + __re_error_msgid_idx[(int) REG_ESPACE]); 722 } 723 724 /* Since 're_exec' always passes NULL for the 'regs' argument, we 725 don't need to initialize the pattern buffer fields which affect it. */ 726 727 /* Match anchors at newlines. */ 728 re_comp_buf.newline_anchor = 1; 729 730 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 731 732 if (!ret) 733 return NULL; 734 735 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */ 736 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 737 } 738 739 #ifdef _LIBC 740 libc_freeres_fn (free_mem) 741 { 742 __regfree (&re_comp_buf); 743 } 744 #endif 745 746 #endif /* _REGEX_RE_COMP */ 747 748 /* Internal entry point. 749 Compile the regular expression PATTERN, whose length is LENGTH. 750 SYNTAX indicate regular expression's syntax. */ 751 752 static reg_errcode_t 753 re_compile_internal (regex_t *preg, const char * pattern, size_t length, 754 reg_syntax_t syntax) 755 { 756 reg_errcode_t err = REG_NOERROR; 757 re_dfa_t *dfa; 758 re_string_t regexp; 759 760 /* Initialize the pattern buffer. */ 761 preg->fastmap_accurate = 0; 762 preg->syntax = syntax; 763 preg->not_bol = preg->not_eol = 0; 764 preg->used = 0; 765 preg->re_nsub = 0; 766 preg->can_be_null = 0; 767 preg->regs_allocated = REGS_UNALLOCATED; 768 769 /* Initialize the dfa. */ 770 dfa = preg->buffer; 771 if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 772 { 773 /* If zero allocated, but buffer is non-null, try to realloc 774 enough space. This loses if buffer's address is bogus, but 775 that is the user's responsibility. If ->buffer is NULL this 776 is a simple allocation. */ 777 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 778 if (dfa == NULL) 779 return REG_ESPACE; 780 preg->allocated = sizeof (re_dfa_t); 781 preg->buffer = dfa; 782 } 783 preg->used = sizeof (re_dfa_t); 784 785 err = init_dfa (dfa, length); 786 if (BE (err != REG_NOERROR, 0)) 787 { 788 free_dfa_content (dfa); 789 preg->buffer = NULL; 790 preg->allocated = 0; 791 return err; 792 } 793 #ifdef DEBUG 794 /* Note: length+1 will not overflow since it is checked in init_dfa. */ 795 dfa->re_str = re_malloc (char, length + 1); 796 strncpy (dfa->re_str, pattern, length + 1); 797 #endif 798 799 __libc_lock_init (dfa->lock); 800 801 err = re_string_construct (®exp, pattern, length, preg->translate, 802 (syntax & RE_ICASE) != 0, dfa); 803 if (BE (err != REG_NOERROR, 0)) 804 { 805 re_compile_internal_free_return: 806 free_workarea_compile (preg); 807 re_string_destruct (®exp); 808 free_dfa_content (dfa); 809 preg->buffer = NULL; 810 preg->allocated = 0; 811 return err; 812 } 813 814 /* Parse the regular expression, and build a structure tree. */ 815 preg->re_nsub = 0; 816 dfa->str_tree = parse (®exp, preg, syntax, &err); 817 if (BE (dfa->str_tree == NULL, 0)) 818 goto re_compile_internal_free_return; 819 820 /* Analyze the tree and create the nfa. */ 821 err = analyze (preg); 822 if (BE (err != REG_NOERROR, 0)) 823 goto re_compile_internal_free_return; 824 825 #ifdef RE_ENABLE_I18N 826 /* If possible, do searching in single byte encoding to speed things up. */ 827 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 828 optimize_utf8 (dfa); 829 #endif 830 831 /* Then create the initial state of the dfa. */ 832 err = create_initial_state (dfa); 833 834 /* Release work areas. */ 835 free_workarea_compile (preg); 836 re_string_destruct (®exp); 837 838 if (BE (err != REG_NOERROR, 0)) 839 { 840 free_dfa_content (dfa); 841 preg->buffer = NULL; 842 preg->allocated = 0; 843 } 844 845 return err; 846 } 847 848 /* Initialize DFA. We use the length of the regular expression PAT_LEN 849 as the initial length of some arrays. */ 850 851 static reg_errcode_t 852 init_dfa (re_dfa_t *dfa, size_t pat_len) 853 { 854 __re_size_t table_size; 855 #ifndef _LIBC 856 const char *codeset_name; 857 #endif 858 #ifdef RE_ENABLE_I18N 859 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); 860 #else 861 size_t max_i18n_object_size = 0; 862 #endif 863 size_t max_object_size = 864 MAX (sizeof (struct re_state_table_entry), 865 MAX (sizeof (re_token_t), 866 MAX (sizeof (re_node_set), 867 MAX (sizeof (regmatch_t), 868 max_i18n_object_size)))); 869 870 memset (dfa, '\0', sizeof (re_dfa_t)); 871 872 /* Force allocation of str_tree_storage the first time. */ 873 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 874 875 /* Avoid overflows. The extra "/ 2" is for the table_size doubling 876 calculation below, and for similar doubling calculations 877 elsewhere. And it's <= rather than <, because some of the 878 doubling calculations add 1 afterwards. */ 879 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0)) 880 return REG_ESPACE; 881 882 dfa->nodes_alloc = pat_len + 1; 883 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 884 885 /* table_size = 2 ^ ceil(log pat_len) */ 886 for (table_size = 1; ; table_size <<= 1) 887 if (table_size > pat_len) 888 break; 889 890 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 891 dfa->state_hash_mask = table_size - 1; 892 893 dfa->mb_cur_max = MB_CUR_MAX; 894 #ifdef _LIBC 895 if (dfa->mb_cur_max == 6 896 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 897 dfa->is_utf8 = 1; 898 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 899 != 0); 900 #else 901 codeset_name = nl_langinfo (CODESET); 902 if (strcasecmp (codeset_name, "UTF-8") == 0 903 || strcasecmp (codeset_name, "UTF8") == 0) 904 dfa->is_utf8 = 1; 905 906 /* We check exhaustively in the loop below if this charset is a 907 superset of ASCII. */ 908 dfa->map_notascii = 0; 909 #endif 910 911 #ifdef RE_ENABLE_I18N 912 if (dfa->mb_cur_max > 1) 913 { 914 if (dfa->is_utf8) 915 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 916 else 917 { 918 int i, j, ch; 919 920 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 921 if (BE (dfa->sb_char == NULL, 0)) 922 return REG_ESPACE; 923 924 /* Set the bits corresponding to single byte chars. */ 925 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 926 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 927 { 928 wint_t wch = __btowc (ch); 929 if (wch != WEOF) 930 dfa->sb_char[i] |= (bitset_word_t) 1 << j; 931 # ifndef _LIBC 932 if (isascii (ch) && wch != ch) 933 dfa->map_notascii = 1; 934 # endif 935 } 936 } 937 } 938 #endif 939 940 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 941 return REG_ESPACE; 942 return REG_NOERROR; 943 } 944 945 /* Initialize WORD_CHAR table, which indicate which character is 946 "word". In this case "word" means that it is the word construction 947 character used by some operators like "\<", "\>", etc. */ 948 949 static void 950 internal_function 951 init_word_char (re_dfa_t *dfa) 952 { 953 dfa->word_ops_used = 1; 954 int i = 0; 955 int j; 956 int ch = 0; 957 if (BE (dfa->map_notascii == 0, 1)) 958 { 959 if (BITSET_WORD_BITS == 64) 960 { 961 dfa->word_char[0] = UINT64_C (0x03ff000000000000); 962 dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe); 963 i = 2; 964 } 965 else if (BITSET_WORD_BITS == 32) 966 { 967 dfa->word_char[0] = UINT32_C (0x00000000); 968 dfa->word_char[1] = UINT32_C (0x03ff0000); 969 dfa->word_char[2] = UINT32_C (0x87fffffe); 970 dfa->word_char[3] = UINT32_C (0x07fffffe); 971 i = 4; 972 } 973 else 974 goto general_case; 975 ch = 128; 976 977 if (BE (dfa->is_utf8, 1)) 978 { 979 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); 980 return; 981 } 982 } 983 984 general_case: 985 for (; i < BITSET_WORDS; ++i) 986 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 987 if (isalnum (ch) || ch == '_') 988 dfa->word_char[i] |= (bitset_word_t) 1 << j; 989 } 990 991 /* Free the work area which are only used while compiling. */ 992 993 static void 994 free_workarea_compile (regex_t *preg) 995 { 996 re_dfa_t *dfa = preg->buffer; 997 bin_tree_storage_t *storage, *next; 998 for (storage = dfa->str_tree_storage; storage; storage = next) 999 { 1000 next = storage->next; 1001 re_free (storage); 1002 } 1003 dfa->str_tree_storage = NULL; 1004 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 1005 dfa->str_tree = NULL; 1006 re_free (dfa->org_indices); 1007 dfa->org_indices = NULL; 1008 } 1009 1010 /* Create initial states for all contexts. */ 1011 1012 static reg_errcode_t 1013 create_initial_state (re_dfa_t *dfa) 1014 { 1015 Idx first, i; 1016 reg_errcode_t err; 1017 re_node_set init_nodes; 1018 1019 /* Initial states have the epsilon closure of the node which is 1020 the first node of the regular expression. */ 1021 first = dfa->str_tree->first->node_idx; 1022 dfa->init_node = first; 1023 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 1024 if (BE (err != REG_NOERROR, 0)) 1025 return err; 1026 1027 /* The back-references which are in initial states can epsilon transit, 1028 since in this case all of the subexpressions can be null. 1029 Then we add epsilon closures of the nodes which are the next nodes of 1030 the back-references. */ 1031 if (dfa->nbackref > 0) 1032 for (i = 0; i < init_nodes.nelem; ++i) 1033 { 1034 Idx node_idx = init_nodes.elems[i]; 1035 re_token_type_t type = dfa->nodes[node_idx].type; 1036 1037 Idx clexp_idx; 1038 if (type != OP_BACK_REF) 1039 continue; 1040 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1041 { 1042 re_token_t *clexp_node; 1043 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1044 if (clexp_node->type == OP_CLOSE_SUBEXP 1045 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1046 break; 1047 } 1048 if (clexp_idx == init_nodes.nelem) 1049 continue; 1050 1051 if (type == OP_BACK_REF) 1052 { 1053 Idx dest_idx = dfa->edests[node_idx].elems[0]; 1054 if (!re_node_set_contains (&init_nodes, dest_idx)) 1055 { 1056 reg_errcode_t merge_err 1057 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 1058 if (merge_err != REG_NOERROR) 1059 return merge_err; 1060 i = 0; 1061 } 1062 } 1063 } 1064 1065 /* It must be the first time to invoke acquire_state. */ 1066 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1067 /* We don't check ERR here, since the initial state must not be NULL. */ 1068 if (BE (dfa->init_state == NULL, 0)) 1069 return err; 1070 if (dfa->init_state->has_constraint) 1071 { 1072 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1073 CONTEXT_WORD); 1074 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1075 CONTEXT_NEWLINE); 1076 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1077 &init_nodes, 1078 CONTEXT_NEWLINE 1079 | CONTEXT_BEGBUF); 1080 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1081 || dfa->init_state_begbuf == NULL, 0)) 1082 return err; 1083 } 1084 else 1085 dfa->init_state_word = dfa->init_state_nl 1086 = dfa->init_state_begbuf = dfa->init_state; 1087 1088 re_node_set_free (&init_nodes); 1089 return REG_NOERROR; 1090 } 1091 1092 #ifdef RE_ENABLE_I18N 1093 /* If it is possible to do searching in single byte encoding instead of UTF-8 1094 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1095 DFA nodes where needed. */ 1096 1097 static void 1098 optimize_utf8 (re_dfa_t *dfa) 1099 { 1100 Idx node; 1101 int i; 1102 bool mb_chars = false; 1103 bool has_period = false; 1104 1105 for (node = 0; node < dfa->nodes_len; ++node) 1106 switch (dfa->nodes[node].type) 1107 { 1108 case CHARACTER: 1109 if (dfa->nodes[node].opr.c >= ASCII_CHARS) 1110 mb_chars = true; 1111 break; 1112 case ANCHOR: 1113 switch (dfa->nodes[node].opr.ctx_type) 1114 { 1115 case LINE_FIRST: 1116 case LINE_LAST: 1117 case BUF_FIRST: 1118 case BUF_LAST: 1119 break; 1120 default: 1121 /* Word anchors etc. cannot be handled. It's okay to test 1122 opr.ctx_type since constraints (for all DFA nodes) are 1123 created by ORing one or more opr.ctx_type values. */ 1124 return; 1125 } 1126 break; 1127 case OP_PERIOD: 1128 has_period = true; 1129 break; 1130 case OP_BACK_REF: 1131 case OP_ALT: 1132 case END_OF_RE: 1133 case OP_DUP_ASTERISK: 1134 case OP_OPEN_SUBEXP: 1135 case OP_CLOSE_SUBEXP: 1136 break; 1137 case COMPLEX_BRACKET: 1138 return; 1139 case SIMPLE_BRACKET: 1140 /* Just double check. */ 1141 { 1142 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0 1143 ? 0 1144 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS); 1145 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1146 { 1147 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) 1148 return; 1149 rshift = 0; 1150 } 1151 } 1152 break; 1153 default: 1154 abort (); 1155 } 1156 1157 if (mb_chars || has_period) 1158 for (node = 0; node < dfa->nodes_len; ++node) 1159 { 1160 if (dfa->nodes[node].type == CHARACTER 1161 && dfa->nodes[node].opr.c >= ASCII_CHARS) 1162 dfa->nodes[node].mb_partial = 0; 1163 else if (dfa->nodes[node].type == OP_PERIOD) 1164 dfa->nodes[node].type = OP_UTF8_PERIOD; 1165 } 1166 1167 /* The search can be in single byte locale. */ 1168 dfa->mb_cur_max = 1; 1169 dfa->is_utf8 = 0; 1170 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1171 } 1172 #endif 1173 1174 /* Analyze the structure tree, and calculate "first", "next", "edest", 1175 "eclosure", and "inveclosure". */ 1176 1177 static reg_errcode_t 1178 analyze (regex_t *preg) 1179 { 1180 re_dfa_t *dfa = preg->buffer; 1181 reg_errcode_t ret; 1182 1183 /* Allocate arrays. */ 1184 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc); 1185 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); 1186 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1187 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1188 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1189 || dfa->eclosures == NULL, 0)) 1190 return REG_ESPACE; 1191 1192 dfa->subexp_map = re_malloc (Idx, preg->re_nsub); 1193 if (dfa->subexp_map != NULL) 1194 { 1195 Idx i; 1196 for (i = 0; i < preg->re_nsub; i++) 1197 dfa->subexp_map[i] = i; 1198 preorder (dfa->str_tree, optimize_subexps, dfa); 1199 for (i = 0; i < preg->re_nsub; i++) 1200 if (dfa->subexp_map[i] != i) 1201 break; 1202 if (i == preg->re_nsub) 1203 { 1204 free (dfa->subexp_map); 1205 dfa->subexp_map = NULL; 1206 } 1207 } 1208 1209 ret = postorder (dfa->str_tree, lower_subexps, preg); 1210 if (BE (ret != REG_NOERROR, 0)) 1211 return ret; 1212 ret = postorder (dfa->str_tree, calc_first, dfa); 1213 if (BE (ret != REG_NOERROR, 0)) 1214 return ret; 1215 preorder (dfa->str_tree, calc_next, dfa); 1216 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1217 if (BE (ret != REG_NOERROR, 0)) 1218 return ret; 1219 ret = calc_eclosure (dfa); 1220 if (BE (ret != REG_NOERROR, 0)) 1221 return ret; 1222 1223 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1224 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1225 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1226 || dfa->nbackref) 1227 { 1228 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1229 if (BE (dfa->inveclosures == NULL, 0)) 1230 return REG_ESPACE; 1231 ret = calc_inveclosure (dfa); 1232 } 1233 1234 return ret; 1235 } 1236 1237 /* Our parse trees are very unbalanced, so we cannot use a stack to 1238 implement parse tree visits. Instead, we use parent pointers and 1239 some hairy code in these two functions. */ 1240 static reg_errcode_t 1241 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1242 void *extra) 1243 { 1244 bin_tree_t *node, *prev; 1245 1246 for (node = root; ; ) 1247 { 1248 /* Descend down the tree, preferably to the left (or to the right 1249 if that's the only child). */ 1250 while (node->left || node->right) 1251 if (node->left) 1252 node = node->left; 1253 else 1254 node = node->right; 1255 1256 do 1257 { 1258 reg_errcode_t err = fn (extra, node); 1259 if (BE (err != REG_NOERROR, 0)) 1260 return err; 1261 if (node->parent == NULL) 1262 return REG_NOERROR; 1263 prev = node; 1264 node = node->parent; 1265 } 1266 /* Go up while we have a node that is reached from the right. */ 1267 while (node->right == prev || node->right == NULL); 1268 node = node->right; 1269 } 1270 } 1271 1272 static reg_errcode_t 1273 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1274 void *extra) 1275 { 1276 bin_tree_t *node; 1277 1278 for (node = root; ; ) 1279 { 1280 reg_errcode_t err = fn (extra, node); 1281 if (BE (err != REG_NOERROR, 0)) 1282 return err; 1283 1284 /* Go to the left node, or up and to the right. */ 1285 if (node->left) 1286 node = node->left; 1287 else 1288 { 1289 bin_tree_t *prev = NULL; 1290 while (node->right == prev || node->right == NULL) 1291 { 1292 prev = node; 1293 node = node->parent; 1294 if (!node) 1295 return REG_NOERROR; 1296 } 1297 node = node->right; 1298 } 1299 } 1300 } 1301 1302 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1303 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1304 backreferences as well. Requires a preorder visit. */ 1305 static reg_errcode_t 1306 optimize_subexps (void *extra, bin_tree_t *node) 1307 { 1308 re_dfa_t *dfa = (re_dfa_t *) extra; 1309 1310 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1311 { 1312 int idx = node->token.opr.idx; 1313 node->token.opr.idx = dfa->subexp_map[idx]; 1314 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1315 } 1316 1317 else if (node->token.type == SUBEXP 1318 && node->left && node->left->token.type == SUBEXP) 1319 { 1320 Idx other_idx = node->left->token.opr.idx; 1321 1322 node->left = node->left->left; 1323 if (node->left) 1324 node->left->parent = node; 1325 1326 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1327 if (other_idx < BITSET_WORD_BITS) 1328 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); 1329 } 1330 1331 return REG_NOERROR; 1332 } 1333 1334 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1335 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1336 static reg_errcode_t 1337 lower_subexps (void *extra, bin_tree_t *node) 1338 { 1339 regex_t *preg = (regex_t *) extra; 1340 reg_errcode_t err = REG_NOERROR; 1341 1342 if (node->left && node->left->token.type == SUBEXP) 1343 { 1344 node->left = lower_subexp (&err, preg, node->left); 1345 if (node->left) 1346 node->left->parent = node; 1347 } 1348 if (node->right && node->right->token.type == SUBEXP) 1349 { 1350 node->right = lower_subexp (&err, preg, node->right); 1351 if (node->right) 1352 node->right->parent = node; 1353 } 1354 1355 return err; 1356 } 1357 1358 static bin_tree_t * 1359 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1360 { 1361 re_dfa_t *dfa = preg->buffer; 1362 bin_tree_t *body = node->left; 1363 bin_tree_t *op, *cls, *tree1, *tree; 1364 1365 if (preg->no_sub 1366 /* We do not optimize empty subexpressions, because otherwise we may 1367 have bad CONCAT nodes with NULL children. This is obviously not 1368 very common, so we do not lose much. An example that triggers 1369 this case is the sed "script" /\(\)/x. */ 1370 && node->left != NULL 1371 && (node->token.opr.idx >= BITSET_WORD_BITS 1372 || !(dfa->used_bkref_map 1373 & ((bitset_word_t) 1 << node->token.opr.idx)))) 1374 return node->left; 1375 1376 /* Convert the SUBEXP node to the concatenation of an 1377 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1378 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1379 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1380 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1381 tree = create_tree (dfa, op, tree1, CONCAT); 1382 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1383 { 1384 *err = REG_ESPACE; 1385 return NULL; 1386 } 1387 1388 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1389 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1390 return tree; 1391 } 1392 1393 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1394 nodes. Requires a postorder visit. */ 1395 static reg_errcode_t 1396 calc_first (void *extra, bin_tree_t *node) 1397 { 1398 re_dfa_t *dfa = (re_dfa_t *) extra; 1399 if (node->token.type == CONCAT) 1400 { 1401 node->first = node->left->first; 1402 node->node_idx = node->left->node_idx; 1403 } 1404 else 1405 { 1406 node->first = node; 1407 node->node_idx = re_dfa_add_node (dfa, node->token); 1408 if (BE (node->node_idx == REG_MISSING, 0)) 1409 return REG_ESPACE; 1410 if (node->token.type == ANCHOR) 1411 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; 1412 } 1413 return REG_NOERROR; 1414 } 1415 1416 /* Pass 2: compute NEXT on the tree. Preorder visit. */ 1417 static reg_errcode_t 1418 calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node) 1419 { 1420 switch (node->token.type) 1421 { 1422 case OP_DUP_ASTERISK: 1423 node->left->next = node; 1424 break; 1425 case CONCAT: 1426 node->left->next = node->right->first; 1427 node->right->next = node->next; 1428 break; 1429 default: 1430 if (node->left) 1431 node->left->next = node->next; 1432 if (node->right) 1433 node->right->next = node->next; 1434 break; 1435 } 1436 return REG_NOERROR; 1437 } 1438 1439 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1440 static reg_errcode_t 1441 link_nfa_nodes (void *extra, bin_tree_t *node) 1442 { 1443 re_dfa_t *dfa = (re_dfa_t *) extra; 1444 Idx idx = node->node_idx; 1445 reg_errcode_t err = REG_NOERROR; 1446 1447 switch (node->token.type) 1448 { 1449 case CONCAT: 1450 break; 1451 1452 case END_OF_RE: 1453 assert (node->next == NULL); 1454 break; 1455 1456 case OP_DUP_ASTERISK: 1457 case OP_ALT: 1458 { 1459 Idx left, right; 1460 dfa->has_plural_match = 1; 1461 if (node->left != NULL) 1462 left = node->left->first->node_idx; 1463 else 1464 left = node->next->node_idx; 1465 if (node->right != NULL) 1466 right = node->right->first->node_idx; 1467 else 1468 right = node->next->node_idx; 1469 assert (REG_VALID_INDEX (left)); 1470 assert (REG_VALID_INDEX (right)); 1471 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1472 } 1473 break; 1474 1475 case ANCHOR: 1476 case OP_OPEN_SUBEXP: 1477 case OP_CLOSE_SUBEXP: 1478 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1479 break; 1480 1481 case OP_BACK_REF: 1482 dfa->nexts[idx] = node->next->node_idx; 1483 if (node->token.type == OP_BACK_REF) 1484 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1485 break; 1486 1487 default: 1488 assert (!IS_EPSILON_NODE (node->token.type)); 1489 dfa->nexts[idx] = node->next->node_idx; 1490 break; 1491 } 1492 1493 return err; 1494 } 1495 1496 /* Duplicate the epsilon closure of the node ROOT_NODE. 1497 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1498 to their own constraint. */ 1499 1500 static reg_errcode_t 1501 internal_function 1502 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, 1503 Idx root_node, unsigned int init_constraint) 1504 { 1505 Idx org_node, clone_node; 1506 bool ok; 1507 unsigned int constraint = init_constraint; 1508 for (org_node = top_org_node, clone_node = top_clone_node;;) 1509 { 1510 Idx org_dest, clone_dest; 1511 if (dfa->nodes[org_node].type == OP_BACK_REF) 1512 { 1513 /* If the back reference epsilon-transit, its destination must 1514 also have the constraint. Then duplicate the epsilon closure 1515 of the destination of the back reference, and store it in 1516 edests of the back reference. */ 1517 org_dest = dfa->nexts[org_node]; 1518 re_node_set_empty (dfa->edests + clone_node); 1519 clone_dest = duplicate_node (dfa, org_dest, constraint); 1520 if (BE (clone_dest == REG_MISSING, 0)) 1521 return REG_ESPACE; 1522 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1523 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1524 if (BE (! ok, 0)) 1525 return REG_ESPACE; 1526 } 1527 else if (dfa->edests[org_node].nelem == 0) 1528 { 1529 /* In case of the node can't epsilon-transit, don't duplicate the 1530 destination and store the original destination as the 1531 destination of the node. */ 1532 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1533 break; 1534 } 1535 else if (dfa->edests[org_node].nelem == 1) 1536 { 1537 /* In case of the node can epsilon-transit, and it has only one 1538 destination. */ 1539 org_dest = dfa->edests[org_node].elems[0]; 1540 re_node_set_empty (dfa->edests + clone_node); 1541 /* If the node is root_node itself, it means the epsilon closure 1542 has a loop. Then tie it to the destination of the root_node. */ 1543 if (org_node == root_node && clone_node != org_node) 1544 { 1545 ok = re_node_set_insert (dfa->edests + clone_node, org_dest); 1546 if (BE (! ok, 0)) 1547 return REG_ESPACE; 1548 break; 1549 } 1550 /* In case the node has another constraint, append it. */ 1551 constraint |= dfa->nodes[org_node].constraint; 1552 clone_dest = duplicate_node (dfa, org_dest, constraint); 1553 if (BE (clone_dest == REG_MISSING, 0)) 1554 return REG_ESPACE; 1555 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1556 if (BE (! ok, 0)) 1557 return REG_ESPACE; 1558 } 1559 else /* dfa->edests[org_node].nelem == 2 */ 1560 { 1561 /* In case of the node can epsilon-transit, and it has two 1562 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1563 org_dest = dfa->edests[org_node].elems[0]; 1564 re_node_set_empty (dfa->edests + clone_node); 1565 /* Search for a duplicated node which satisfies the constraint. */ 1566 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1567 if (clone_dest == REG_MISSING) 1568 { 1569 /* There is no such duplicated node, create a new one. */ 1570 reg_errcode_t err; 1571 clone_dest = duplicate_node (dfa, org_dest, constraint); 1572 if (BE (clone_dest == REG_MISSING, 0)) 1573 return REG_ESPACE; 1574 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1575 if (BE (! ok, 0)) 1576 return REG_ESPACE; 1577 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1578 root_node, constraint); 1579 if (BE (err != REG_NOERROR, 0)) 1580 return err; 1581 } 1582 else 1583 { 1584 /* There is a duplicated node which satisfies the constraint, 1585 use it to avoid infinite loop. */ 1586 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1587 if (BE (! ok, 0)) 1588 return REG_ESPACE; 1589 } 1590 1591 org_dest = dfa->edests[org_node].elems[1]; 1592 clone_dest = duplicate_node (dfa, org_dest, constraint); 1593 if (BE (clone_dest == REG_MISSING, 0)) 1594 return REG_ESPACE; 1595 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1596 if (BE (! ok, 0)) 1597 return REG_ESPACE; 1598 } 1599 org_node = org_dest; 1600 clone_node = clone_dest; 1601 } 1602 return REG_NOERROR; 1603 } 1604 1605 /* Search for a node which is duplicated from the node ORG_NODE, and 1606 satisfies the constraint CONSTRAINT. */ 1607 1608 static Idx 1609 search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 1610 unsigned int constraint) 1611 { 1612 Idx idx; 1613 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1614 { 1615 if (org_node == dfa->org_indices[idx] 1616 && constraint == dfa->nodes[idx].constraint) 1617 return idx; /* Found. */ 1618 } 1619 return REG_MISSING; /* Not found. */ 1620 } 1621 1622 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1623 Return the index of the new node, or REG_MISSING if insufficient storage is 1624 available. */ 1625 1626 static Idx 1627 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) 1628 { 1629 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1630 if (BE (dup_idx != REG_MISSING, 1)) 1631 { 1632 dfa->nodes[dup_idx].constraint = constraint; 1633 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; 1634 dfa->nodes[dup_idx].duplicated = 1; 1635 1636 /* Store the index of the original node. */ 1637 dfa->org_indices[dup_idx] = org_idx; 1638 } 1639 return dup_idx; 1640 } 1641 1642 static reg_errcode_t 1643 calc_inveclosure (re_dfa_t *dfa) 1644 { 1645 Idx src, idx; 1646 bool ok; 1647 for (idx = 0; idx < dfa->nodes_len; ++idx) 1648 re_node_set_init_empty (dfa->inveclosures + idx); 1649 1650 for (src = 0; src < dfa->nodes_len; ++src) 1651 { 1652 Idx *elems = dfa->eclosures[src].elems; 1653 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1654 { 1655 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1656 if (BE (! ok, 0)) 1657 return REG_ESPACE; 1658 } 1659 } 1660 1661 return REG_NOERROR; 1662 } 1663 1664 /* Calculate "eclosure" for all the node in DFA. */ 1665 1666 static reg_errcode_t 1667 calc_eclosure (re_dfa_t *dfa) 1668 { 1669 Idx node_idx; 1670 bool incomplete; 1671 #ifdef DEBUG 1672 assert (dfa->nodes_len > 0); 1673 #endif 1674 incomplete = false; 1675 /* For each nodes, calculate epsilon closure. */ 1676 for (node_idx = 0; ; ++node_idx) 1677 { 1678 reg_errcode_t err; 1679 re_node_set eclosure_elem; 1680 if (node_idx == dfa->nodes_len) 1681 { 1682 if (!incomplete) 1683 break; 1684 incomplete = false; 1685 node_idx = 0; 1686 } 1687 1688 #ifdef DEBUG 1689 assert (dfa->eclosures[node_idx].nelem != REG_MISSING); 1690 #endif 1691 1692 /* If we have already calculated, skip it. */ 1693 if (dfa->eclosures[node_idx].nelem != 0) 1694 continue; 1695 /* Calculate epsilon closure of 'node_idx'. */ 1696 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); 1697 if (BE (err != REG_NOERROR, 0)) 1698 return err; 1699 1700 if (dfa->eclosures[node_idx].nelem == 0) 1701 { 1702 incomplete = true; 1703 re_node_set_free (&eclosure_elem); 1704 } 1705 } 1706 return REG_NOERROR; 1707 } 1708 1709 /* Calculate epsilon closure of NODE. */ 1710 1711 static reg_errcode_t 1712 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) 1713 { 1714 reg_errcode_t err; 1715 Idx i; 1716 re_node_set eclosure; 1717 bool ok; 1718 bool incomplete = false; 1719 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1720 if (BE (err != REG_NOERROR, 0)) 1721 return err; 1722 1723 /* This indicates that we are calculating this node now. 1724 We reference this value to avoid infinite loop. */ 1725 dfa->eclosures[node].nelem = REG_MISSING; 1726 1727 /* If the current node has constraints, duplicate all nodes 1728 since they must inherit the constraints. */ 1729 if (dfa->nodes[node].constraint 1730 && dfa->edests[node].nelem 1731 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1732 { 1733 err = duplicate_node_closure (dfa, node, node, node, 1734 dfa->nodes[node].constraint); 1735 if (BE (err != REG_NOERROR, 0)) 1736 return err; 1737 } 1738 1739 /* Expand each epsilon destination nodes. */ 1740 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1741 for (i = 0; i < dfa->edests[node].nelem; ++i) 1742 { 1743 re_node_set eclosure_elem; 1744 Idx edest = dfa->edests[node].elems[i]; 1745 /* If calculating the epsilon closure of 'edest' is in progress, 1746 return intermediate result. */ 1747 if (dfa->eclosures[edest].nelem == REG_MISSING) 1748 { 1749 incomplete = true; 1750 continue; 1751 } 1752 /* If we haven't calculated the epsilon closure of 'edest' yet, 1753 calculate now. Otherwise use calculated epsilon closure. */ 1754 if (dfa->eclosures[edest].nelem == 0) 1755 { 1756 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false); 1757 if (BE (err != REG_NOERROR, 0)) 1758 return err; 1759 } 1760 else 1761 eclosure_elem = dfa->eclosures[edest]; 1762 /* Merge the epsilon closure of 'edest'. */ 1763 err = re_node_set_merge (&eclosure, &eclosure_elem); 1764 if (BE (err != REG_NOERROR, 0)) 1765 return err; 1766 /* If the epsilon closure of 'edest' is incomplete, 1767 the epsilon closure of this node is also incomplete. */ 1768 if (dfa->eclosures[edest].nelem == 0) 1769 { 1770 incomplete = true; 1771 re_node_set_free (&eclosure_elem); 1772 } 1773 } 1774 1775 /* An epsilon closure includes itself. */ 1776 ok = re_node_set_insert (&eclosure, node); 1777 if (BE (! ok, 0)) 1778 return REG_ESPACE; 1779 if (incomplete && !root) 1780 dfa->eclosures[node].nelem = 0; 1781 else 1782 dfa->eclosures[node] = eclosure; 1783 *new_set = eclosure; 1784 return REG_NOERROR; 1785 } 1786 1787 /* Functions for token which are used in the parser. */ 1788 1789 /* Fetch a token from INPUT. 1790 We must not use this function inside bracket expressions. */ 1791 1792 static void 1793 internal_function 1794 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1795 { 1796 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1797 } 1798 1799 /* Peek a token from INPUT, and return the length of the token. 1800 We must not use this function inside bracket expressions. */ 1801 1802 static int 1803 internal_function 1804 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1805 { 1806 unsigned char c; 1807 1808 if (re_string_eoi (input)) 1809 { 1810 token->type = END_OF_RE; 1811 return 0; 1812 } 1813 1814 c = re_string_peek_byte (input, 0); 1815 token->opr.c = c; 1816 1817 token->word_char = 0; 1818 #ifdef RE_ENABLE_I18N 1819 token->mb_partial = 0; 1820 if (input->mb_cur_max > 1 && 1821 !re_string_first_byte (input, re_string_cur_idx (input))) 1822 { 1823 token->type = CHARACTER; 1824 token->mb_partial = 1; 1825 return 1; 1826 } 1827 #endif 1828 if (c == '\\') 1829 { 1830 unsigned char c2; 1831 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1832 { 1833 token->type = BACK_SLASH; 1834 return 1; 1835 } 1836 1837 c2 = re_string_peek_byte_case (input, 1); 1838 token->opr.c = c2; 1839 token->type = CHARACTER; 1840 #ifdef RE_ENABLE_I18N 1841 if (input->mb_cur_max > 1) 1842 { 1843 wint_t wc = re_string_wchar_at (input, 1844 re_string_cur_idx (input) + 1); 1845 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1846 } 1847 else 1848 #endif 1849 token->word_char = IS_WORD_CHAR (c2) != 0; 1850 1851 switch (c2) 1852 { 1853 case '|': 1854 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1855 token->type = OP_ALT; 1856 break; 1857 case '1': case '2': case '3': case '4': case '5': 1858 case '6': case '7': case '8': case '9': 1859 if (!(syntax & RE_NO_BK_REFS)) 1860 { 1861 token->type = OP_BACK_REF; 1862 token->opr.idx = c2 - '1'; 1863 } 1864 break; 1865 case '<': 1866 if (!(syntax & RE_NO_GNU_OPS)) 1867 { 1868 token->type = ANCHOR; 1869 token->opr.ctx_type = WORD_FIRST; 1870 } 1871 break; 1872 case '>': 1873 if (!(syntax & RE_NO_GNU_OPS)) 1874 { 1875 token->type = ANCHOR; 1876 token->opr.ctx_type = WORD_LAST; 1877 } 1878 break; 1879 case 'b': 1880 if (!(syntax & RE_NO_GNU_OPS)) 1881 { 1882 token->type = ANCHOR; 1883 token->opr.ctx_type = WORD_DELIM; 1884 } 1885 break; 1886 case 'B': 1887 if (!(syntax & RE_NO_GNU_OPS)) 1888 { 1889 token->type = ANCHOR; 1890 token->opr.ctx_type = NOT_WORD_DELIM; 1891 } 1892 break; 1893 case 'w': 1894 if (!(syntax & RE_NO_GNU_OPS)) 1895 token->type = OP_WORD; 1896 break; 1897 case 'W': 1898 if (!(syntax & RE_NO_GNU_OPS)) 1899 token->type = OP_NOTWORD; 1900 break; 1901 case 's': 1902 if (!(syntax & RE_NO_GNU_OPS)) 1903 token->type = OP_SPACE; 1904 break; 1905 case 'S': 1906 if (!(syntax & RE_NO_GNU_OPS)) 1907 token->type = OP_NOTSPACE; 1908 break; 1909 case '`': 1910 if (!(syntax & RE_NO_GNU_OPS)) 1911 { 1912 token->type = ANCHOR; 1913 token->opr.ctx_type = BUF_FIRST; 1914 } 1915 break; 1916 case '\'': 1917 if (!(syntax & RE_NO_GNU_OPS)) 1918 { 1919 token->type = ANCHOR; 1920 token->opr.ctx_type = BUF_LAST; 1921 } 1922 break; 1923 case '(': 1924 if (!(syntax & RE_NO_BK_PARENS)) 1925 token->type = OP_OPEN_SUBEXP; 1926 break; 1927 case ')': 1928 if (!(syntax & RE_NO_BK_PARENS)) 1929 token->type = OP_CLOSE_SUBEXP; 1930 break; 1931 case '+': 1932 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1933 token->type = OP_DUP_PLUS; 1934 break; 1935 case '?': 1936 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1937 token->type = OP_DUP_QUESTION; 1938 break; 1939 case '{': 1940 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1941 token->type = OP_OPEN_DUP_NUM; 1942 break; 1943 case '}': 1944 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1945 token->type = OP_CLOSE_DUP_NUM; 1946 break; 1947 default: 1948 break; 1949 } 1950 return 2; 1951 } 1952 1953 token->type = CHARACTER; 1954 #ifdef RE_ENABLE_I18N 1955 if (input->mb_cur_max > 1) 1956 { 1957 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1958 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1959 } 1960 else 1961 #endif 1962 token->word_char = IS_WORD_CHAR (token->opr.c); 1963 1964 switch (c) 1965 { 1966 case '\n': 1967 if (syntax & RE_NEWLINE_ALT) 1968 token->type = OP_ALT; 1969 break; 1970 case '|': 1971 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1972 token->type = OP_ALT; 1973 break; 1974 case '*': 1975 token->type = OP_DUP_ASTERISK; 1976 break; 1977 case '+': 1978 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1979 token->type = OP_DUP_PLUS; 1980 break; 1981 case '?': 1982 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1983 token->type = OP_DUP_QUESTION; 1984 break; 1985 case '{': 1986 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1987 token->type = OP_OPEN_DUP_NUM; 1988 break; 1989 case '}': 1990 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1991 token->type = OP_CLOSE_DUP_NUM; 1992 break; 1993 case '(': 1994 if (syntax & RE_NO_BK_PARENS) 1995 token->type = OP_OPEN_SUBEXP; 1996 break; 1997 case ')': 1998 if (syntax & RE_NO_BK_PARENS) 1999 token->type = OP_CLOSE_SUBEXP; 2000 break; 2001 case '[': 2002 token->type = OP_OPEN_BRACKET; 2003 break; 2004 case '.': 2005 token->type = OP_PERIOD; 2006 break; 2007 case '^': 2008 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 2009 re_string_cur_idx (input) != 0) 2010 { 2011 char prev = re_string_peek_byte (input, -1); 2012 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 2013 break; 2014 } 2015 token->type = ANCHOR; 2016 token->opr.ctx_type = LINE_FIRST; 2017 break; 2018 case '$': 2019 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 2020 re_string_cur_idx (input) + 1 != re_string_length (input)) 2021 { 2022 re_token_t next; 2023 re_string_skip_bytes (input, 1); 2024 peek_token (&next, input, syntax); 2025 re_string_skip_bytes (input, -1); 2026 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 2027 break; 2028 } 2029 token->type = ANCHOR; 2030 token->opr.ctx_type = LINE_LAST; 2031 break; 2032 default: 2033 break; 2034 } 2035 return 1; 2036 } 2037 2038 /* Peek a token from INPUT, and return the length of the token. 2039 We must not use this function out of bracket expressions. */ 2040 2041 static int 2042 internal_function 2043 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 2044 { 2045 unsigned char c; 2046 if (re_string_eoi (input)) 2047 { 2048 token->type = END_OF_RE; 2049 return 0; 2050 } 2051 c = re_string_peek_byte (input, 0); 2052 token->opr.c = c; 2053 2054 #ifdef RE_ENABLE_I18N 2055 if (input->mb_cur_max > 1 && 2056 !re_string_first_byte (input, re_string_cur_idx (input))) 2057 { 2058 token->type = CHARACTER; 2059 return 1; 2060 } 2061 #endif /* RE_ENABLE_I18N */ 2062 2063 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2064 && re_string_cur_idx (input) + 1 < re_string_length (input)) 2065 { 2066 /* In this case, '\' escape a character. */ 2067 unsigned char c2; 2068 re_string_skip_bytes (input, 1); 2069 c2 = re_string_peek_byte (input, 0); 2070 token->opr.c = c2; 2071 token->type = CHARACTER; 2072 return 1; 2073 } 2074 if (c == '[') /* '[' is a special char in a bracket exps. */ 2075 { 2076 unsigned char c2; 2077 int token_len; 2078 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2079 c2 = re_string_peek_byte (input, 1); 2080 else 2081 c2 = 0; 2082 token->opr.c = c2; 2083 token_len = 2; 2084 switch (c2) 2085 { 2086 case '.': 2087 token->type = OP_OPEN_COLL_ELEM; 2088 break; 2089 case '=': 2090 token->type = OP_OPEN_EQUIV_CLASS; 2091 break; 2092 case ':': 2093 if (syntax & RE_CHAR_CLASSES) 2094 { 2095 token->type = OP_OPEN_CHAR_CLASS; 2096 break; 2097 } 2098 /* else fall through. */ 2099 default: 2100 token->type = CHARACTER; 2101 token->opr.c = c; 2102 token_len = 1; 2103 break; 2104 } 2105 return token_len; 2106 } 2107 switch (c) 2108 { 2109 case '-': 2110 token->type = OP_CHARSET_RANGE; 2111 break; 2112 case ']': 2113 token->type = OP_CLOSE_BRACKET; 2114 break; 2115 case '^': 2116 token->type = OP_NON_MATCH_LIST; 2117 break; 2118 default: 2119 token->type = CHARACTER; 2120 } 2121 return 1; 2122 } 2123 2124 /* Functions for parser. */ 2125 2126 /* Entry point of the parser. 2127 Parse the regular expression REGEXP and return the structure tree. 2128 If an error occurs, ERR is set by error code, and return NULL. 2129 This function build the following tree, from regular expression <reg_exp>: 2130 CAT 2131 / \ 2132 / \ 2133 <reg_exp> EOR 2134 2135 CAT means concatenation. 2136 EOR means end of regular expression. */ 2137 2138 static bin_tree_t * 2139 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2140 reg_errcode_t *err) 2141 { 2142 re_dfa_t *dfa = preg->buffer; 2143 bin_tree_t *tree, *eor, *root; 2144 re_token_t current_token; 2145 dfa->syntax = syntax; 2146 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2147 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2148 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2149 return NULL; 2150 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2151 if (tree != NULL) 2152 root = create_tree (dfa, tree, eor, CONCAT); 2153 else 2154 root = eor; 2155 if (BE (eor == NULL || root == NULL, 0)) 2156 { 2157 *err = REG_ESPACE; 2158 return NULL; 2159 } 2160 return root; 2161 } 2162 2163 /* This function build the following tree, from regular expression 2164 <branch1>|<branch2>: 2165 ALT 2166 / \ 2167 / \ 2168 <branch1> <branch2> 2169 2170 ALT means alternative, which represents the operator '|'. */ 2171 2172 static bin_tree_t * 2173 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2174 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2175 { 2176 re_dfa_t *dfa = preg->buffer; 2177 bin_tree_t *tree, *branch = NULL; 2178 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2179 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2180 return NULL; 2181 2182 while (token->type == OP_ALT) 2183 { 2184 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2185 if (token->type != OP_ALT && token->type != END_OF_RE 2186 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2187 { 2188 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2189 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2190 return NULL; 2191 } 2192 else 2193 branch = NULL; 2194 tree = create_tree (dfa, tree, branch, OP_ALT); 2195 if (BE (tree == NULL, 0)) 2196 { 2197 *err = REG_ESPACE; 2198 return NULL; 2199 } 2200 } 2201 return tree; 2202 } 2203 2204 /* This function build the following tree, from regular expression 2205 <exp1><exp2>: 2206 CAT 2207 / \ 2208 / \ 2209 <exp1> <exp2> 2210 2211 CAT means concatenation. */ 2212 2213 static bin_tree_t * 2214 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2215 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2216 { 2217 bin_tree_t *tree, *expr; 2218 re_dfa_t *dfa = preg->buffer; 2219 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2220 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2221 return NULL; 2222 2223 while (token->type != OP_ALT && token->type != END_OF_RE 2224 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2225 { 2226 expr = parse_expression (regexp, preg, token, syntax, nest, err); 2227 if (BE (*err != REG_NOERROR && expr == NULL, 0)) 2228 { 2229 if (tree != NULL) 2230 postorder (tree, free_tree, NULL); 2231 return NULL; 2232 } 2233 if (tree != NULL && expr != NULL) 2234 { 2235 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT); 2236 if (newtree == NULL) 2237 { 2238 postorder (expr, free_tree, NULL); 2239 postorder (tree, free_tree, NULL); 2240 *err = REG_ESPACE; 2241 return NULL; 2242 } 2243 tree = newtree; 2244 } 2245 else if (tree == NULL) 2246 tree = expr; 2247 /* Otherwise expr == NULL, we don't need to create new tree. */ 2248 } 2249 return tree; 2250 } 2251 2252 /* This function build the following tree, from regular expression a*: 2253 * 2254 | 2255 a 2256 */ 2257 2258 static bin_tree_t * 2259 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2260 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2261 { 2262 re_dfa_t *dfa = preg->buffer; 2263 bin_tree_t *tree; 2264 switch (token->type) 2265 { 2266 case CHARACTER: 2267 tree = create_token_tree (dfa, NULL, NULL, token); 2268 if (BE (tree == NULL, 0)) 2269 { 2270 *err = REG_ESPACE; 2271 return NULL; 2272 } 2273 #ifdef RE_ENABLE_I18N 2274 if (dfa->mb_cur_max > 1) 2275 { 2276 while (!re_string_eoi (regexp) 2277 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2278 { 2279 bin_tree_t *mbc_remain; 2280 fetch_token (token, regexp, syntax); 2281 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2282 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2283 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2284 { 2285 *err = REG_ESPACE; 2286 return NULL; 2287 } 2288 } 2289 } 2290 #endif 2291 break; 2292 case OP_OPEN_SUBEXP: 2293 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2294 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2295 return NULL; 2296 break; 2297 case OP_OPEN_BRACKET: 2298 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2299 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2300 return NULL; 2301 break; 2302 case OP_BACK_REF: 2303 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2304 { 2305 *err = REG_ESUBREG; 2306 return NULL; 2307 } 2308 dfa->used_bkref_map |= 1 << token->opr.idx; 2309 tree = create_token_tree (dfa, NULL, NULL, token); 2310 if (BE (tree == NULL, 0)) 2311 { 2312 *err = REG_ESPACE; 2313 return NULL; 2314 } 2315 ++dfa->nbackref; 2316 dfa->has_mb_node = 1; 2317 break; 2318 case OP_OPEN_DUP_NUM: 2319 if (syntax & RE_CONTEXT_INVALID_DUP) 2320 { 2321 *err = REG_BADRPT; 2322 return NULL; 2323 } 2324 /* FALLTHROUGH */ 2325 case OP_DUP_ASTERISK: 2326 case OP_DUP_PLUS: 2327 case OP_DUP_QUESTION: 2328 if (syntax & RE_CONTEXT_INVALID_OPS) 2329 { 2330 *err = REG_BADRPT; 2331 return NULL; 2332 } 2333 else if (syntax & RE_CONTEXT_INDEP_OPS) 2334 { 2335 fetch_token (token, regexp, syntax); 2336 return parse_expression (regexp, preg, token, syntax, nest, err); 2337 } 2338 /* else fall through */ 2339 case OP_CLOSE_SUBEXP: 2340 if ((token->type == OP_CLOSE_SUBEXP) && 2341 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2342 { 2343 *err = REG_ERPAREN; 2344 return NULL; 2345 } 2346 /* else fall through */ 2347 case OP_CLOSE_DUP_NUM: 2348 /* We treat it as a normal character. */ 2349 2350 /* Then we can these characters as normal characters. */ 2351 token->type = CHARACTER; 2352 /* mb_partial and word_char bits should be initialized already 2353 by peek_token. */ 2354 tree = create_token_tree (dfa, NULL, NULL, token); 2355 if (BE (tree == NULL, 0)) 2356 { 2357 *err = REG_ESPACE; 2358 return NULL; 2359 } 2360 break; 2361 case ANCHOR: 2362 if ((token->opr.ctx_type 2363 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2364 && dfa->word_ops_used == 0) 2365 init_word_char (dfa); 2366 if (token->opr.ctx_type == WORD_DELIM 2367 || token->opr.ctx_type == NOT_WORD_DELIM) 2368 { 2369 bin_tree_t *tree_first, *tree_last; 2370 if (token->opr.ctx_type == WORD_DELIM) 2371 { 2372 token->opr.ctx_type = WORD_FIRST; 2373 tree_first = create_token_tree (dfa, NULL, NULL, token); 2374 token->opr.ctx_type = WORD_LAST; 2375 } 2376 else 2377 { 2378 token->opr.ctx_type = INSIDE_WORD; 2379 tree_first = create_token_tree (dfa, NULL, NULL, token); 2380 token->opr.ctx_type = INSIDE_NOTWORD; 2381 } 2382 tree_last = create_token_tree (dfa, NULL, NULL, token); 2383 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2384 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2385 { 2386 *err = REG_ESPACE; 2387 return NULL; 2388 } 2389 } 2390 else 2391 { 2392 tree = create_token_tree (dfa, NULL, NULL, token); 2393 if (BE (tree == NULL, 0)) 2394 { 2395 *err = REG_ESPACE; 2396 return NULL; 2397 } 2398 } 2399 /* We must return here, since ANCHORs can't be followed 2400 by repetition operators. 2401 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2402 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2403 fetch_token (token, regexp, syntax); 2404 return tree; 2405 case OP_PERIOD: 2406 tree = create_token_tree (dfa, NULL, NULL, token); 2407 if (BE (tree == NULL, 0)) 2408 { 2409 *err = REG_ESPACE; 2410 return NULL; 2411 } 2412 if (dfa->mb_cur_max > 1) 2413 dfa->has_mb_node = 1; 2414 break; 2415 case OP_WORD: 2416 case OP_NOTWORD: 2417 tree = build_charclass_op (dfa, regexp->trans, 2418 (const unsigned char *) "alnum", 2419 (const unsigned char *) "_", 2420 token->type == OP_NOTWORD, err); 2421 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2422 return NULL; 2423 break; 2424 case OP_SPACE: 2425 case OP_NOTSPACE: 2426 tree = build_charclass_op (dfa, regexp->trans, 2427 (const unsigned char *) "space", 2428 (const unsigned char *) "", 2429 token->type == OP_NOTSPACE, err); 2430 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2431 return NULL; 2432 break; 2433 case OP_ALT: 2434 case END_OF_RE: 2435 return NULL; 2436 case BACK_SLASH: 2437 *err = REG_EESCAPE; 2438 return NULL; 2439 default: 2440 /* Must not happen? */ 2441 #ifdef DEBUG 2442 assert (0); 2443 #endif 2444 return NULL; 2445 } 2446 fetch_token (token, regexp, syntax); 2447 2448 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2449 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2450 { 2451 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2452 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2453 return NULL; 2454 /* In BRE consecutive duplications are not allowed. */ 2455 if ((syntax & RE_CONTEXT_INVALID_DUP) 2456 && (token->type == OP_DUP_ASTERISK 2457 || token->type == OP_OPEN_DUP_NUM)) 2458 { 2459 *err = REG_BADRPT; 2460 return NULL; 2461 } 2462 } 2463 2464 return tree; 2465 } 2466 2467 /* This function build the following tree, from regular expression 2468 (<reg_exp>): 2469 SUBEXP 2470 | 2471 <reg_exp> 2472 */ 2473 2474 static bin_tree_t * 2475 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2476 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2477 { 2478 re_dfa_t *dfa = preg->buffer; 2479 bin_tree_t *tree; 2480 size_t cur_nsub; 2481 cur_nsub = preg->re_nsub++; 2482 2483 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2484 2485 /* The subexpression may be a null string. */ 2486 if (token->type == OP_CLOSE_SUBEXP) 2487 tree = NULL; 2488 else 2489 { 2490 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2491 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2492 { 2493 if (tree != NULL) 2494 postorder (tree, free_tree, NULL); 2495 *err = REG_EPAREN; 2496 } 2497 if (BE (*err != REG_NOERROR, 0)) 2498 return NULL; 2499 } 2500 2501 if (cur_nsub <= '9' - '1') 2502 dfa->completed_bkref_map |= 1 << cur_nsub; 2503 2504 tree = create_tree (dfa, tree, NULL, SUBEXP); 2505 if (BE (tree == NULL, 0)) 2506 { 2507 *err = REG_ESPACE; 2508 return NULL; 2509 } 2510 tree->token.opr.idx = cur_nsub; 2511 return tree; 2512 } 2513 2514 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2515 2516 static bin_tree_t * 2517 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2518 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2519 { 2520 bin_tree_t *tree = NULL, *old_tree = NULL; 2521 Idx i, start, end, start_idx = re_string_cur_idx (regexp); 2522 re_token_t start_token = *token; 2523 2524 if (token->type == OP_OPEN_DUP_NUM) 2525 { 2526 end = 0; 2527 start = fetch_number (regexp, token, syntax); 2528 if (start == REG_MISSING) 2529 { 2530 if (token->type == CHARACTER && token->opr.c == ',') 2531 start = 0; /* We treat "{,m}" as "{0,m}". */ 2532 else 2533 { 2534 *err = REG_BADBR; /* <re>{} is invalid. */ 2535 return NULL; 2536 } 2537 } 2538 if (BE (start != REG_ERROR, 1)) 2539 { 2540 /* We treat "{n}" as "{n,n}". */ 2541 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2542 : ((token->type == CHARACTER && token->opr.c == ',') 2543 ? fetch_number (regexp, token, syntax) : REG_ERROR)); 2544 } 2545 if (BE (start == REG_ERROR || end == REG_ERROR, 0)) 2546 { 2547 /* Invalid sequence. */ 2548 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2549 { 2550 if (token->type == END_OF_RE) 2551 *err = REG_EBRACE; 2552 else 2553 *err = REG_BADBR; 2554 2555 return NULL; 2556 } 2557 2558 /* If the syntax bit is set, rollback. */ 2559 re_string_set_index (regexp, start_idx); 2560 *token = start_token; 2561 token->type = CHARACTER; 2562 /* mb_partial and word_char bits should be already initialized by 2563 peek_token. */ 2564 return elem; 2565 } 2566 2567 if (BE ((end != REG_MISSING && start > end) 2568 || token->type != OP_CLOSE_DUP_NUM, 0)) 2569 { 2570 /* First number greater than second. */ 2571 *err = REG_BADBR; 2572 return NULL; 2573 } 2574 2575 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0)) 2576 { 2577 *err = REG_ESIZE; 2578 return NULL; 2579 } 2580 } 2581 else 2582 { 2583 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2584 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING; 2585 } 2586 2587 fetch_token (token, regexp, syntax); 2588 2589 if (BE (elem == NULL, 0)) 2590 return NULL; 2591 if (BE (start == 0 && end == 0, 0)) 2592 { 2593 postorder (elem, free_tree, NULL); 2594 return NULL; 2595 } 2596 2597 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2598 if (BE (start > 0, 0)) 2599 { 2600 tree = elem; 2601 for (i = 2; i <= start; ++i) 2602 { 2603 elem = duplicate_tree (elem, dfa); 2604 tree = create_tree (dfa, tree, elem, CONCAT); 2605 if (BE (elem == NULL || tree == NULL, 0)) 2606 goto parse_dup_op_espace; 2607 } 2608 2609 if (start == end) 2610 return tree; 2611 2612 /* Duplicate ELEM before it is marked optional. */ 2613 elem = duplicate_tree (elem, dfa); 2614 old_tree = tree; 2615 } 2616 else 2617 old_tree = NULL; 2618 2619 if (elem->token.type == SUBEXP) 2620 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); 2621 2622 tree = create_tree (dfa, elem, NULL, 2623 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); 2624 if (BE (tree == NULL, 0)) 2625 goto parse_dup_op_espace; 2626 2627 /* From gnulib's "intprops.h": 2628 True if the arithmetic type T is signed. */ 2629 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1)) 2630 2631 /* This loop is actually executed only when end != REG_MISSING, 2632 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2633 already created the start+1-th copy. */ 2634 if (TYPE_SIGNED (Idx) || end != REG_MISSING) 2635 for (i = start + 2; i <= end; ++i) 2636 { 2637 elem = duplicate_tree (elem, dfa); 2638 tree = create_tree (dfa, tree, elem, CONCAT); 2639 if (BE (elem == NULL || tree == NULL, 0)) 2640 goto parse_dup_op_espace; 2641 2642 tree = create_tree (dfa, tree, NULL, OP_ALT); 2643 if (BE (tree == NULL, 0)) 2644 goto parse_dup_op_espace; 2645 } 2646 2647 if (old_tree) 2648 tree = create_tree (dfa, old_tree, tree, CONCAT); 2649 2650 return tree; 2651 2652 parse_dup_op_espace: 2653 *err = REG_ESPACE; 2654 return NULL; 2655 } 2656 2657 /* Size of the names for collating symbol/equivalence_class/character_class. 2658 I'm not sure, but maybe enough. */ 2659 #define BRACKET_NAME_BUF_SIZE 32 2660 2661 #ifndef _LIBC 2662 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2663 Build the range expression which starts from START_ELEM, and ends 2664 at END_ELEM. The result are written to MBCSET and SBCSET. 2665 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2666 mbcset->range_ends, is a pointer argument since we may 2667 update it. */ 2668 2669 static reg_errcode_t 2670 internal_function 2671 # ifdef RE_ENABLE_I18N 2672 build_range_exp (const reg_syntax_t syntax, 2673 bitset_t sbcset, 2674 re_charset_t *mbcset, 2675 Idx *range_alloc, 2676 const bracket_elem_t *start_elem, 2677 const bracket_elem_t *end_elem) 2678 # else /* not RE_ENABLE_I18N */ 2679 build_range_exp (const reg_syntax_t syntax, 2680 bitset_t sbcset, 2681 const bracket_elem_t *start_elem, 2682 const bracket_elem_t *end_elem) 2683 # endif /* not RE_ENABLE_I18N */ 2684 { 2685 unsigned int start_ch, end_ch; 2686 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2687 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2688 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2689 0)) 2690 return REG_ERANGE; 2691 2692 /* We can handle no multi character collating elements without libc 2693 support. */ 2694 if (BE ((start_elem->type == COLL_SYM 2695 && strlen ((char *) start_elem->opr.name) > 1) 2696 || (end_elem->type == COLL_SYM 2697 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2698 return REG_ECOLLATE; 2699 2700 # ifdef RE_ENABLE_I18N 2701 { 2702 wchar_t wc; 2703 wint_t start_wc; 2704 wint_t end_wc; 2705 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2706 2707 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2708 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2709 : 0)); 2710 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2711 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2712 : 0)); 2713 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2714 ? __btowc (start_ch) : start_elem->opr.wch); 2715 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2716 ? __btowc (end_ch) : end_elem->opr.wch); 2717 if (start_wc == WEOF || end_wc == WEOF) 2718 return REG_ECOLLATE; 2719 cmp_buf[0] = start_wc; 2720 cmp_buf[4] = end_wc; 2721 2722 if (BE ((syntax & RE_NO_EMPTY_RANGES) 2723 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0)) 2724 return REG_ERANGE; 2725 2726 /* Got valid collation sequence values, add them as a new entry. 2727 However, for !_LIBC we have no collation elements: if the 2728 character set is single byte, the single byte character set 2729 that we build below suffices. parse_bracket_exp passes 2730 no MBCSET if dfa->mb_cur_max == 1. */ 2731 if (mbcset) 2732 { 2733 /* Check the space of the arrays. */ 2734 if (BE (*range_alloc == mbcset->nranges, 0)) 2735 { 2736 /* There is not enough space, need realloc. */ 2737 wchar_t *new_array_start, *new_array_end; 2738 Idx new_nranges; 2739 2740 /* +1 in case of mbcset->nranges is 0. */ 2741 new_nranges = 2 * mbcset->nranges + 1; 2742 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2743 are NULL if *range_alloc == 0. */ 2744 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2745 new_nranges); 2746 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2747 new_nranges); 2748 2749 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2750 return REG_ESPACE; 2751 2752 mbcset->range_starts = new_array_start; 2753 mbcset->range_ends = new_array_end; 2754 *range_alloc = new_nranges; 2755 } 2756 2757 mbcset->range_starts[mbcset->nranges] = start_wc; 2758 mbcset->range_ends[mbcset->nranges++] = end_wc; 2759 } 2760 2761 /* Build the table for single byte characters. */ 2762 for (wc = 0; wc < SBC_MAX; ++wc) 2763 { 2764 cmp_buf[2] = wc; 2765 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2766 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2767 bitset_set (sbcset, wc); 2768 } 2769 } 2770 # else /* not RE_ENABLE_I18N */ 2771 { 2772 unsigned int ch; 2773 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2774 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2775 : 0)); 2776 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2777 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2778 : 0)); 2779 if (start_ch > end_ch) 2780 return REG_ERANGE; 2781 /* Build the table for single byte characters. */ 2782 for (ch = 0; ch < SBC_MAX; ++ch) 2783 if (start_ch <= ch && ch <= end_ch) 2784 bitset_set (sbcset, ch); 2785 } 2786 # endif /* not RE_ENABLE_I18N */ 2787 return REG_NOERROR; 2788 } 2789 #endif /* not _LIBC */ 2790 2791 #ifndef _LIBC 2792 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2793 Build the collating element which is represented by NAME. 2794 The result are written to MBCSET and SBCSET. 2795 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2796 pointer argument since we may update it. */ 2797 2798 static reg_errcode_t 2799 internal_function 2800 # ifdef RE_ENABLE_I18N 2801 build_collating_symbol (bitset_t sbcset, 2802 re_charset_t *mbcset _UNUSED_PARAMETER_, 2803 Idx *coll_sym_alloc _UNUSED_PARAMETER_, 2804 const unsigned char *name) 2805 # else /* not RE_ENABLE_I18N */ 2806 build_collating_symbol (bitset_t sbcset, const unsigned char *name) 2807 # endif /* not RE_ENABLE_I18N */ 2808 { 2809 size_t name_len = strlen ((const char *) name); 2810 if (BE (name_len != 1, 0)) 2811 return REG_ECOLLATE; 2812 else 2813 { 2814 bitset_set (sbcset, name[0]); 2815 return REG_NOERROR; 2816 } 2817 } 2818 #endif /* not _LIBC */ 2819 2820 /* This function parse bracket expression like "[abc]", "[a-c]", 2821 "[[.a-a.]]" etc. */ 2822 2823 static bin_tree_t * 2824 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2825 reg_syntax_t syntax, reg_errcode_t *err) 2826 { 2827 #ifdef _LIBC 2828 const unsigned char *collseqmb; 2829 const char *collseqwc; 2830 uint32_t nrules; 2831 int32_t table_size; 2832 const int32_t *symb_table; 2833 const unsigned char *extra; 2834 2835 /* Local function for parse_bracket_exp used in _LIBC environment. 2836 Seek the collating symbol entry corresponding to NAME. 2837 Return the index of the symbol in the SYMB_TABLE. */ 2838 2839 auto inline int32_t 2840 __attribute ((always_inline)) 2841 seek_collating_symbol_entry (name, name_len) 2842 const unsigned char *name; 2843 size_t name_len; 2844 { 2845 int32_t hash = elem_hash ((const char *) name, name_len); 2846 int32_t elem = hash % table_size; 2847 if (symb_table[2 * elem] != 0) 2848 { 2849 int32_t second = hash % (table_size - 2) + 1; 2850 2851 do 2852 { 2853 /* First compare the hashing value. */ 2854 if (symb_table[2 * elem] == hash 2855 /* Compare the length of the name. */ 2856 && name_len == extra[symb_table[2 * elem + 1]] 2857 /* Compare the name. */ 2858 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2859 name_len) == 0) 2860 { 2861 /* Yep, this is the entry. */ 2862 break; 2863 } 2864 2865 /* Next entry. */ 2866 elem += second; 2867 } 2868 while (symb_table[2 * elem] != 0); 2869 } 2870 return elem; 2871 } 2872 2873 /* Local function for parse_bracket_exp used in _LIBC environment. 2874 Look up the collation sequence value of BR_ELEM. 2875 Return the value if succeeded, UINT_MAX otherwise. */ 2876 2877 auto inline unsigned int 2878 __attribute ((always_inline)) 2879 lookup_collation_sequence_value (br_elem) 2880 bracket_elem_t *br_elem; 2881 { 2882 if (br_elem->type == SB_CHAR) 2883 { 2884 /* 2885 if (MB_CUR_MAX == 1) 2886 */ 2887 if (nrules == 0) 2888 return collseqmb[br_elem->opr.ch]; 2889 else 2890 { 2891 wint_t wc = __btowc (br_elem->opr.ch); 2892 return __collseq_table_lookup (collseqwc, wc); 2893 } 2894 } 2895 else if (br_elem->type == MB_CHAR) 2896 { 2897 if (nrules != 0) 2898 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2899 } 2900 else if (br_elem->type == COLL_SYM) 2901 { 2902 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2903 if (nrules != 0) 2904 { 2905 int32_t elem, idx; 2906 elem = seek_collating_symbol_entry (br_elem->opr.name, 2907 sym_name_len); 2908 if (symb_table[2 * elem] != 0) 2909 { 2910 /* We found the entry. */ 2911 idx = symb_table[2 * elem + 1]; 2912 /* Skip the name of collating element name. */ 2913 idx += 1 + extra[idx]; 2914 /* Skip the byte sequence of the collating element. */ 2915 idx += 1 + extra[idx]; 2916 /* Adjust for the alignment. */ 2917 idx = (idx + 3) & ~3; 2918 /* Skip the multibyte collation sequence value. */ 2919 idx += sizeof (unsigned int); 2920 /* Skip the wide char sequence of the collating element. */ 2921 idx += sizeof (unsigned int) * 2922 (1 + *(unsigned int *) (extra + idx)); 2923 /* Return the collation sequence value. */ 2924 return *(unsigned int *) (extra + idx); 2925 } 2926 else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2927 { 2928 /* No valid character. Match it as a single byte 2929 character. */ 2930 return collseqmb[br_elem->opr.name[0]]; 2931 } 2932 } 2933 else if (sym_name_len == 1) 2934 return collseqmb[br_elem->opr.name[0]]; 2935 } 2936 return UINT_MAX; 2937 } 2938 2939 /* Local function for parse_bracket_exp used in _LIBC environment. 2940 Build the range expression which starts from START_ELEM, and ends 2941 at END_ELEM. The result are written to MBCSET and SBCSET. 2942 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2943 mbcset->range_ends, is a pointer argument since we may 2944 update it. */ 2945 2946 auto inline reg_errcode_t 2947 __attribute ((always_inline)) 2948 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2949 re_charset_t *mbcset; 2950 Idx *range_alloc; 2951 bitset_t sbcset; 2952 bracket_elem_t *start_elem, *end_elem; 2953 { 2954 unsigned int ch; 2955 uint32_t start_collseq; 2956 uint32_t end_collseq; 2957 2958 /* Equivalence Classes and Character Classes can't be a range 2959 start/end. */ 2960 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2961 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2962 0)) 2963 return REG_ERANGE; 2964 2965 start_collseq = lookup_collation_sequence_value (start_elem); 2966 end_collseq = lookup_collation_sequence_value (end_elem); 2967 /* Check start/end collation sequence values. */ 2968 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2969 return REG_ECOLLATE; 2970 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2971 return REG_ERANGE; 2972 2973 /* Got valid collation sequence values, add them as a new entry. 2974 However, if we have no collation elements, and the character set 2975 is single byte, the single byte character set that we 2976 build below suffices. */ 2977 if (nrules > 0 || dfa->mb_cur_max > 1) 2978 { 2979 /* Check the space of the arrays. */ 2980 if (BE (*range_alloc == mbcset->nranges, 0)) 2981 { 2982 /* There is not enough space, need realloc. */ 2983 uint32_t *new_array_start; 2984 uint32_t *new_array_end; 2985 Idx new_nranges; 2986 2987 /* +1 in case of mbcset->nranges is 0. */ 2988 new_nranges = 2 * mbcset->nranges + 1; 2989 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2990 new_nranges); 2991 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2992 new_nranges); 2993 2994 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2995 return REG_ESPACE; 2996 2997 mbcset->range_starts = new_array_start; 2998 mbcset->range_ends = new_array_end; 2999 *range_alloc = new_nranges; 3000 } 3001 3002 mbcset->range_starts[mbcset->nranges] = start_collseq; 3003 mbcset->range_ends[mbcset->nranges++] = end_collseq; 3004 } 3005 3006 /* Build the table for single byte characters. */ 3007 for (ch = 0; ch < SBC_MAX; ch++) 3008 { 3009 uint32_t ch_collseq; 3010 /* 3011 if (MB_CUR_MAX == 1) 3012 */ 3013 if (nrules == 0) 3014 ch_collseq = collseqmb[ch]; 3015 else 3016 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 3017 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 3018 bitset_set (sbcset, ch); 3019 } 3020 return REG_NOERROR; 3021 } 3022 3023 /* Local function for parse_bracket_exp used in _LIBC environment. 3024 Build the collating element which is represented by NAME. 3025 The result are written to MBCSET and SBCSET. 3026 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 3027 pointer argument since we may update it. */ 3028 3029 auto inline reg_errcode_t 3030 __attribute ((always_inline)) 3031 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 3032 re_charset_t *mbcset; 3033 Idx *coll_sym_alloc; 3034 bitset_t sbcset; 3035 const unsigned char *name; 3036 { 3037 int32_t elem, idx; 3038 size_t name_len = strlen ((const char *) name); 3039 if (nrules != 0) 3040 { 3041 elem = seek_collating_symbol_entry (name, name_len); 3042 if (symb_table[2 * elem] != 0) 3043 { 3044 /* We found the entry. */ 3045 idx = symb_table[2 * elem + 1]; 3046 /* Skip the name of collating element name. */ 3047 idx += 1 + extra[idx]; 3048 } 3049 else if (symb_table[2 * elem] == 0 && name_len == 1) 3050 { 3051 /* No valid character, treat it as a normal 3052 character. */ 3053 bitset_set (sbcset, name[0]); 3054 return REG_NOERROR; 3055 } 3056 else 3057 return REG_ECOLLATE; 3058 3059 /* Got valid collation sequence, add it as a new entry. */ 3060 /* Check the space of the arrays. */ 3061 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 3062 { 3063 /* Not enough, realloc it. */ 3064 /* +1 in case of mbcset->ncoll_syms is 0. */ 3065 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 3066 /* Use realloc since mbcset->coll_syms is NULL 3067 if *alloc == 0. */ 3068 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 3069 new_coll_sym_alloc); 3070 if (BE (new_coll_syms == NULL, 0)) 3071 return REG_ESPACE; 3072 mbcset->coll_syms = new_coll_syms; 3073 *coll_sym_alloc = new_coll_sym_alloc; 3074 } 3075 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3076 return REG_NOERROR; 3077 } 3078 else 3079 { 3080 if (BE (name_len != 1, 0)) 3081 return REG_ECOLLATE; 3082 else 3083 { 3084 bitset_set (sbcset, name[0]); 3085 return REG_NOERROR; 3086 } 3087 } 3088 } 3089 #endif 3090 3091 re_token_t br_token; 3092 re_bitset_ptr_t sbcset; 3093 #ifdef RE_ENABLE_I18N 3094 re_charset_t *mbcset; 3095 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3096 Idx equiv_class_alloc = 0, char_class_alloc = 0; 3097 #endif /* not RE_ENABLE_I18N */ 3098 bool non_match = false; 3099 bin_tree_t *work_tree; 3100 int token_len; 3101 bool first_round = true; 3102 #ifdef _LIBC 3103 collseqmb = (const unsigned char *) 3104 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3105 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3106 if (nrules) 3107 { 3108 /* 3109 if (MB_CUR_MAX > 1) 3110 */ 3111 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3112 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3113 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3114 _NL_COLLATE_SYMB_TABLEMB); 3115 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3116 _NL_COLLATE_SYMB_EXTRAMB); 3117 } 3118 #endif 3119 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3120 #ifdef RE_ENABLE_I18N 3121 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3122 #endif /* RE_ENABLE_I18N */ 3123 #ifdef RE_ENABLE_I18N 3124 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3125 #else 3126 if (BE (sbcset == NULL, 0)) 3127 #endif /* RE_ENABLE_I18N */ 3128 { 3129 re_free (sbcset); 3130 #ifdef RE_ENABLE_I18N 3131 re_free (mbcset); 3132 #endif 3133 *err = REG_ESPACE; 3134 return NULL; 3135 } 3136 3137 token_len = peek_token_bracket (token, regexp, syntax); 3138 if (BE (token->type == END_OF_RE, 0)) 3139 { 3140 *err = REG_BADPAT; 3141 goto parse_bracket_exp_free_return; 3142 } 3143 if (token->type == OP_NON_MATCH_LIST) 3144 { 3145 #ifdef RE_ENABLE_I18N 3146 mbcset->non_match = 1; 3147 #endif /* not RE_ENABLE_I18N */ 3148 non_match = true; 3149 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3150 bitset_set (sbcset, '\n'); 3151 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3152 token_len = peek_token_bracket (token, regexp, syntax); 3153 if (BE (token->type == END_OF_RE, 0)) 3154 { 3155 *err = REG_BADPAT; 3156 goto parse_bracket_exp_free_return; 3157 } 3158 } 3159 3160 /* We treat the first ']' as a normal character. */ 3161 if (token->type == OP_CLOSE_BRACKET) 3162 token->type = CHARACTER; 3163 3164 while (1) 3165 { 3166 bracket_elem_t start_elem, end_elem; 3167 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3168 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3169 reg_errcode_t ret; 3170 int token_len2 = 0; 3171 bool is_range_exp = false; 3172 re_token_t token2; 3173 3174 start_elem.opr.name = start_name_buf; 3175 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3176 syntax, first_round); 3177 if (BE (ret != REG_NOERROR, 0)) 3178 { 3179 *err = ret; 3180 goto parse_bracket_exp_free_return; 3181 } 3182 first_round = false; 3183 3184 /* Get information about the next token. We need it in any case. */ 3185 token_len = peek_token_bracket (token, regexp, syntax); 3186 3187 /* Do not check for ranges if we know they are not allowed. */ 3188 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3189 { 3190 if (BE (token->type == END_OF_RE, 0)) 3191 { 3192 *err = REG_EBRACK; 3193 goto parse_bracket_exp_free_return; 3194 } 3195 if (token->type == OP_CHARSET_RANGE) 3196 { 3197 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3198 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3199 if (BE (token2.type == END_OF_RE, 0)) 3200 { 3201 *err = REG_EBRACK; 3202 goto parse_bracket_exp_free_return; 3203 } 3204 if (token2.type == OP_CLOSE_BRACKET) 3205 { 3206 /* We treat the last '-' as a normal character. */ 3207 re_string_skip_bytes (regexp, -token_len); 3208 token->type = CHARACTER; 3209 } 3210 else 3211 is_range_exp = true; 3212 } 3213 } 3214 3215 if (is_range_exp == true) 3216 { 3217 end_elem.opr.name = end_name_buf; 3218 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3219 dfa, syntax, true); 3220 if (BE (ret != REG_NOERROR, 0)) 3221 { 3222 *err = ret; 3223 goto parse_bracket_exp_free_return; 3224 } 3225 3226 token_len = peek_token_bracket (token, regexp, syntax); 3227 3228 #ifdef _LIBC 3229 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3230 &start_elem, &end_elem); 3231 #else 3232 # ifdef RE_ENABLE_I18N 3233 *err = build_range_exp (syntax, sbcset, 3234 dfa->mb_cur_max > 1 ? mbcset : NULL, 3235 &range_alloc, &start_elem, &end_elem); 3236 # else 3237 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); 3238 # endif 3239 #endif /* RE_ENABLE_I18N */ 3240 if (BE (*err != REG_NOERROR, 0)) 3241 goto parse_bracket_exp_free_return; 3242 } 3243 else 3244 { 3245 switch (start_elem.type) 3246 { 3247 case SB_CHAR: 3248 bitset_set (sbcset, start_elem.opr.ch); 3249 break; 3250 #ifdef RE_ENABLE_I18N 3251 case MB_CHAR: 3252 /* Check whether the array has enough space. */ 3253 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3254 { 3255 wchar_t *new_mbchars; 3256 /* Not enough, realloc it. */ 3257 /* +1 in case of mbcset->nmbchars is 0. */ 3258 mbchar_alloc = 2 * mbcset->nmbchars + 1; 3259 /* Use realloc since array is NULL if *alloc == 0. */ 3260 new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3261 mbchar_alloc); 3262 if (BE (new_mbchars == NULL, 0)) 3263 goto parse_bracket_exp_espace; 3264 mbcset->mbchars = new_mbchars; 3265 } 3266 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3267 break; 3268 #endif /* RE_ENABLE_I18N */ 3269 case EQUIV_CLASS: 3270 *err = build_equiv_class (sbcset, 3271 #ifdef RE_ENABLE_I18N 3272 mbcset, &equiv_class_alloc, 3273 #endif /* RE_ENABLE_I18N */ 3274 start_elem.opr.name); 3275 if (BE (*err != REG_NOERROR, 0)) 3276 goto parse_bracket_exp_free_return; 3277 break; 3278 case COLL_SYM: 3279 *err = build_collating_symbol (sbcset, 3280 #ifdef RE_ENABLE_I18N 3281 mbcset, &coll_sym_alloc, 3282 #endif /* RE_ENABLE_I18N */ 3283 start_elem.opr.name); 3284 if (BE (*err != REG_NOERROR, 0)) 3285 goto parse_bracket_exp_free_return; 3286 break; 3287 case CHAR_CLASS: 3288 *err = build_charclass (regexp->trans, sbcset, 3289 #ifdef RE_ENABLE_I18N 3290 mbcset, &char_class_alloc, 3291 #endif /* RE_ENABLE_I18N */ 3292 start_elem.opr.name, syntax); 3293 if (BE (*err != REG_NOERROR, 0)) 3294 goto parse_bracket_exp_free_return; 3295 break; 3296 default: 3297 assert (0); 3298 break; 3299 } 3300 } 3301 if (BE (token->type == END_OF_RE, 0)) 3302 { 3303 *err = REG_EBRACK; 3304 goto parse_bracket_exp_free_return; 3305 } 3306 if (token->type == OP_CLOSE_BRACKET) 3307 break; 3308 } 3309 3310 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3311 3312 /* If it is non-matching list. */ 3313 if (non_match) 3314 bitset_not (sbcset); 3315 3316 #ifdef RE_ENABLE_I18N 3317 /* Ensure only single byte characters are set. */ 3318 if (dfa->mb_cur_max > 1) 3319 bitset_mask (sbcset, dfa->sb_char); 3320 3321 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3322 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3323 || mbcset->non_match))) 3324 { 3325 bin_tree_t *mbc_tree; 3326 int sbc_idx; 3327 /* Build a tree for complex bracket. */ 3328 dfa->has_mb_node = 1; 3329 br_token.type = COMPLEX_BRACKET; 3330 br_token.opr.mbcset = mbcset; 3331 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3332 if (BE (mbc_tree == NULL, 0)) 3333 goto parse_bracket_exp_espace; 3334 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3335 if (sbcset[sbc_idx]) 3336 break; 3337 /* If there are no bits set in sbcset, there is no point 3338 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3339 if (sbc_idx < BITSET_WORDS) 3340 { 3341 /* Build a tree for simple bracket. */ 3342 br_token.type = SIMPLE_BRACKET; 3343 br_token.opr.sbcset = sbcset; 3344 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3345 if (BE (work_tree == NULL, 0)) 3346 goto parse_bracket_exp_espace; 3347 3348 /* Then join them by ALT node. */ 3349 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3350 if (BE (work_tree == NULL, 0)) 3351 goto parse_bracket_exp_espace; 3352 } 3353 else 3354 { 3355 re_free (sbcset); 3356 work_tree = mbc_tree; 3357 } 3358 } 3359 else 3360 #endif /* not RE_ENABLE_I18N */ 3361 { 3362 #ifdef RE_ENABLE_I18N 3363 free_charset (mbcset); 3364 #endif 3365 /* Build a tree for simple bracket. */ 3366 br_token.type = SIMPLE_BRACKET; 3367 br_token.opr.sbcset = sbcset; 3368 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3369 if (BE (work_tree == NULL, 0)) 3370 goto parse_bracket_exp_espace; 3371 } 3372 return work_tree; 3373 3374 parse_bracket_exp_espace: 3375 *err = REG_ESPACE; 3376 parse_bracket_exp_free_return: 3377 re_free (sbcset); 3378 #ifdef RE_ENABLE_I18N 3379 free_charset (mbcset); 3380 #endif /* RE_ENABLE_I18N */ 3381 return NULL; 3382 } 3383 3384 /* Parse an element in the bracket expression. */ 3385 3386 static reg_errcode_t 3387 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3388 re_token_t *token, int token_len, 3389 re_dfa_t *dfa _UNUSED_PARAMETER_, 3390 reg_syntax_t syntax, bool accept_hyphen) 3391 { 3392 #ifdef RE_ENABLE_I18N 3393 int cur_char_size; 3394 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3395 if (cur_char_size > 1) 3396 { 3397 elem->type = MB_CHAR; 3398 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3399 re_string_skip_bytes (regexp, cur_char_size); 3400 return REG_NOERROR; 3401 } 3402 #endif /* RE_ENABLE_I18N */ 3403 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3404 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3405 || token->type == OP_OPEN_EQUIV_CLASS) 3406 return parse_bracket_symbol (elem, regexp, token); 3407 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3408 { 3409 /* A '-' must only appear as anything but a range indicator before 3410 the closing bracket. Everything else is an error. */ 3411 re_token_t token2; 3412 (void) peek_token_bracket (&token2, regexp, syntax); 3413 if (token2.type != OP_CLOSE_BRACKET) 3414 /* The actual error value is not standardized since this whole 3415 case is undefined. But ERANGE makes good sense. */ 3416 return REG_ERANGE; 3417 } 3418 elem->type = SB_CHAR; 3419 elem->opr.ch = token->opr.c; 3420 return REG_NOERROR; 3421 } 3422 3423 /* Parse a bracket symbol in the bracket expression. Bracket symbols are 3424 such as [:<character_class>:], [.<collating_element>.], and 3425 [=<equivalent_class>=]. */ 3426 3427 static reg_errcode_t 3428 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3429 re_token_t *token) 3430 { 3431 unsigned char ch, delim = token->opr.c; 3432 int i = 0; 3433 if (re_string_eoi(regexp)) 3434 return REG_EBRACK; 3435 for (;; ++i) 3436 { 3437 if (i >= BRACKET_NAME_BUF_SIZE) 3438 return REG_EBRACK; 3439 if (token->type == OP_OPEN_CHAR_CLASS) 3440 ch = re_string_fetch_byte_case (regexp); 3441 else 3442 ch = re_string_fetch_byte (regexp); 3443 if (re_string_eoi(regexp)) 3444 return REG_EBRACK; 3445 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3446 break; 3447 elem->opr.name[i] = ch; 3448 } 3449 re_string_skip_bytes (regexp, 1); 3450 elem->opr.name[i] = '\0'; 3451 switch (token->type) 3452 { 3453 case OP_OPEN_COLL_ELEM: 3454 elem->type = COLL_SYM; 3455 break; 3456 case OP_OPEN_EQUIV_CLASS: 3457 elem->type = EQUIV_CLASS; 3458 break; 3459 case OP_OPEN_CHAR_CLASS: 3460 elem->type = CHAR_CLASS; 3461 break; 3462 default: 3463 break; 3464 } 3465 return REG_NOERROR; 3466 } 3467 3468 /* Helper function for parse_bracket_exp. 3469 Build the equivalence class which is represented by NAME. 3470 The result are written to MBCSET and SBCSET. 3471 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3472 is a pointer argument since we may update it. */ 3473 3474 static reg_errcode_t 3475 #ifdef RE_ENABLE_I18N 3476 build_equiv_class (bitset_t sbcset, 3477 re_charset_t *mbcset _UNUSED_PARAMETER_, 3478 Idx *equiv_class_alloc _UNUSED_PARAMETER_, 3479 const unsigned char *name) 3480 #else /* not RE_ENABLE_I18N */ 3481 build_equiv_class (bitset_t sbcset, const unsigned char *name) 3482 #endif /* not RE_ENABLE_I18N */ 3483 { 3484 #ifdef _LIBC 3485 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3486 if (nrules != 0) 3487 { 3488 const int32_t *table, *indirect; 3489 const unsigned char *weights, *extra, *cp; 3490 unsigned char char_buf[2]; 3491 int32_t idx1, idx2; 3492 unsigned int ch; 3493 size_t len; 3494 /* This #include defines a local function! */ 3495 # include <locale/weight.h> 3496 /* Calculate the index for equivalence class. */ 3497 cp = name; 3498 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3499 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3500 _NL_COLLATE_WEIGHTMB); 3501 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3502 _NL_COLLATE_EXTRAMB); 3503 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3504 _NL_COLLATE_INDIRECTMB); 3505 idx1 = findidx (&cp, -1); 3506 if (BE (idx1 == 0 || *cp != '\0', 0)) 3507 /* This isn't a valid character. */ 3508 return REG_ECOLLATE; 3509 3510 /* Build single byte matching table for this equivalence class. */ 3511 len = weights[idx1 & 0xffffff]; 3512 for (ch = 0; ch < SBC_MAX; ++ch) 3513 { 3514 char_buf[0] = ch; 3515 cp = char_buf; 3516 idx2 = findidx (&cp, 1); 3517 /* 3518 idx2 = table[ch]; 3519 */ 3520 if (idx2 == 0) 3521 /* This isn't a valid character. */ 3522 continue; 3523 /* Compare only if the length matches and the collation rule 3524 index is the same. */ 3525 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) 3526 { 3527 int cnt = 0; 3528 3529 while (cnt <= len && 3530 weights[(idx1 & 0xffffff) + 1 + cnt] 3531 == weights[(idx2 & 0xffffff) + 1 + cnt]) 3532 ++cnt; 3533 3534 if (cnt > len) 3535 bitset_set (sbcset, ch); 3536 } 3537 } 3538 /* Check whether the array has enough space. */ 3539 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3540 { 3541 /* Not enough, realloc it. */ 3542 /* +1 in case of mbcset->nequiv_classes is 0. */ 3543 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3544 /* Use realloc since the array is NULL if *alloc == 0. */ 3545 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3546 int32_t, 3547 new_equiv_class_alloc); 3548 if (BE (new_equiv_classes == NULL, 0)) 3549 return REG_ESPACE; 3550 mbcset->equiv_classes = new_equiv_classes; 3551 *equiv_class_alloc = new_equiv_class_alloc; 3552 } 3553 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3554 } 3555 else 3556 #endif /* _LIBC */ 3557 { 3558 if (BE (strlen ((const char *) name) != 1, 0)) 3559 return REG_ECOLLATE; 3560 bitset_set (sbcset, *name); 3561 } 3562 return REG_NOERROR; 3563 } 3564 3565 /* Helper function for parse_bracket_exp. 3566 Build the character class which is represented by NAME. 3567 The result are written to MBCSET and SBCSET. 3568 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3569 is a pointer argument since we may update it. */ 3570 3571 static reg_errcode_t 3572 #ifdef RE_ENABLE_I18N 3573 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3574 re_charset_t *mbcset, Idx *char_class_alloc, 3575 const unsigned char *class_name, reg_syntax_t syntax) 3576 #else /* not RE_ENABLE_I18N */ 3577 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3578 const unsigned char *class_name, reg_syntax_t syntax) 3579 #endif /* not RE_ENABLE_I18N */ 3580 { 3581 int i; 3582 const char *name = (const char *) class_name; 3583 3584 /* In case of REG_ICASE "upper" and "lower" match the both of 3585 upper and lower cases. */ 3586 if ((syntax & RE_ICASE) 3587 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3588 name = "alpha"; 3589 3590 #ifdef RE_ENABLE_I18N 3591 /* Check the space of the arrays. */ 3592 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3593 { 3594 /* Not enough, realloc it. */ 3595 /* +1 in case of mbcset->nchar_classes is 0. */ 3596 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3597 /* Use realloc since array is NULL if *alloc == 0. */ 3598 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3599 new_char_class_alloc); 3600 if (BE (new_char_classes == NULL, 0)) 3601 return REG_ESPACE; 3602 mbcset->char_classes = new_char_classes; 3603 *char_class_alloc = new_char_class_alloc; 3604 } 3605 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3606 #endif /* RE_ENABLE_I18N */ 3607 3608 #define BUILD_CHARCLASS_LOOP(ctype_func) \ 3609 do { \ 3610 if (BE (trans != NULL, 0)) \ 3611 { \ 3612 for (i = 0; i < SBC_MAX; ++i) \ 3613 if (ctype_func (i)) \ 3614 bitset_set (sbcset, trans[i]); \ 3615 } \ 3616 else \ 3617 { \ 3618 for (i = 0; i < SBC_MAX; ++i) \ 3619 if (ctype_func (i)) \ 3620 bitset_set (sbcset, i); \ 3621 } \ 3622 } while (0) 3623 3624 if (strcmp (name, "alnum") == 0) 3625 BUILD_CHARCLASS_LOOP (isalnum); 3626 else if (strcmp (name, "cntrl") == 0) 3627 BUILD_CHARCLASS_LOOP (iscntrl); 3628 else if (strcmp (name, "lower") == 0) 3629 BUILD_CHARCLASS_LOOP (islower); 3630 else if (strcmp (name, "space") == 0) 3631 BUILD_CHARCLASS_LOOP (isspace); 3632 else if (strcmp (name, "alpha") == 0) 3633 BUILD_CHARCLASS_LOOP (isalpha); 3634 else if (strcmp (name, "digit") == 0) 3635 BUILD_CHARCLASS_LOOP (isdigit); 3636 else if (strcmp (name, "print") == 0) 3637 BUILD_CHARCLASS_LOOP (isprint); 3638 else if (strcmp (name, "upper") == 0) 3639 BUILD_CHARCLASS_LOOP (isupper); 3640 else if (strcmp (name, "blank") == 0) 3641 BUILD_CHARCLASS_LOOP (isblank); 3642 else if (strcmp (name, "graph") == 0) 3643 BUILD_CHARCLASS_LOOP (isgraph); 3644 else if (strcmp (name, "punct") == 0) 3645 BUILD_CHARCLASS_LOOP (ispunct); 3646 else if (strcmp (name, "xdigit") == 0) 3647 BUILD_CHARCLASS_LOOP (isxdigit); 3648 else 3649 return REG_ECTYPE; 3650 3651 return REG_NOERROR; 3652 } 3653 3654 static bin_tree_t * 3655 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3656 const unsigned char *class_name, 3657 const unsigned char *extra, bool non_match, 3658 reg_errcode_t *err) 3659 { 3660 re_bitset_ptr_t sbcset; 3661 #ifdef RE_ENABLE_I18N 3662 re_charset_t *mbcset; 3663 Idx alloc = 0; 3664 #endif /* not RE_ENABLE_I18N */ 3665 reg_errcode_t ret; 3666 re_token_t br_token; 3667 bin_tree_t *tree; 3668 3669 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3670 #ifdef RE_ENABLE_I18N 3671 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3672 #endif /* RE_ENABLE_I18N */ 3673 3674 #ifdef RE_ENABLE_I18N 3675 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3676 #else /* not RE_ENABLE_I18N */ 3677 if (BE (sbcset == NULL, 0)) 3678 #endif /* not RE_ENABLE_I18N */ 3679 { 3680 *err = REG_ESPACE; 3681 return NULL; 3682 } 3683 3684 if (non_match) 3685 { 3686 #ifdef RE_ENABLE_I18N 3687 mbcset->non_match = 1; 3688 #endif /* not RE_ENABLE_I18N */ 3689 } 3690 3691 /* We don't care the syntax in this case. */ 3692 ret = build_charclass (trans, sbcset, 3693 #ifdef RE_ENABLE_I18N 3694 mbcset, &alloc, 3695 #endif /* RE_ENABLE_I18N */ 3696 class_name, 0); 3697 3698 if (BE (ret != REG_NOERROR, 0)) 3699 { 3700 re_free (sbcset); 3701 #ifdef RE_ENABLE_I18N 3702 free_charset (mbcset); 3703 #endif /* RE_ENABLE_I18N */ 3704 *err = ret; 3705 return NULL; 3706 } 3707 /* \w match '_' also. */ 3708 for (; *extra; extra++) 3709 bitset_set (sbcset, *extra); 3710 3711 /* If it is non-matching list. */ 3712 if (non_match) 3713 bitset_not (sbcset); 3714 3715 #ifdef RE_ENABLE_I18N 3716 /* Ensure only single byte characters are set. */ 3717 if (dfa->mb_cur_max > 1) 3718 bitset_mask (sbcset, dfa->sb_char); 3719 #endif 3720 3721 /* Build a tree for simple bracket. */ 3722 br_token.type = SIMPLE_BRACKET; 3723 br_token.opr.sbcset = sbcset; 3724 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3725 if (BE (tree == NULL, 0)) 3726 goto build_word_op_espace; 3727 3728 #ifdef RE_ENABLE_I18N 3729 if (dfa->mb_cur_max > 1) 3730 { 3731 bin_tree_t *mbc_tree; 3732 /* Build a tree for complex bracket. */ 3733 br_token.type = COMPLEX_BRACKET; 3734 br_token.opr.mbcset = mbcset; 3735 dfa->has_mb_node = 1; 3736 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3737 if (BE (mbc_tree == NULL, 0)) 3738 goto build_word_op_espace; 3739 /* Then join them by ALT node. */ 3740 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3741 if (BE (mbc_tree != NULL, 1)) 3742 return tree; 3743 } 3744 else 3745 { 3746 free_charset (mbcset); 3747 return tree; 3748 } 3749 #else /* not RE_ENABLE_I18N */ 3750 return tree; 3751 #endif /* not RE_ENABLE_I18N */ 3752 3753 build_word_op_espace: 3754 re_free (sbcset); 3755 #ifdef RE_ENABLE_I18N 3756 free_charset (mbcset); 3757 #endif /* RE_ENABLE_I18N */ 3758 *err = REG_ESPACE; 3759 return NULL; 3760 } 3761 3762 /* This is intended for the expressions like "a{1,3}". 3763 Fetch a number from 'input', and return the number. 3764 Return REG_MISSING if the number field is empty like "{,1}". 3765 Return RE_DUP_MAX + 1 if the number field is too large. 3766 Return REG_ERROR if an error occurred. */ 3767 3768 static Idx 3769 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3770 { 3771 Idx num = REG_MISSING; 3772 unsigned char c; 3773 while (1) 3774 { 3775 fetch_token (token, input, syntax); 3776 c = token->opr.c; 3777 if (BE (token->type == END_OF_RE, 0)) 3778 return REG_ERROR; 3779 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3780 break; 3781 num = ((token->type != CHARACTER || c < '0' || '9' < c 3782 || num == REG_ERROR) 3783 ? REG_ERROR 3784 : num == REG_MISSING 3785 ? c - '0' 3786 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0')); 3787 } 3788 return num; 3789 } 3790 3791 #ifdef RE_ENABLE_I18N 3792 static void 3793 free_charset (re_charset_t *cset) 3794 { 3795 re_free (cset->mbchars); 3796 # ifdef _LIBC 3797 re_free (cset->coll_syms); 3798 re_free (cset->equiv_classes); 3799 re_free (cset->range_starts); 3800 re_free (cset->range_ends); 3801 # endif 3802 re_free (cset->char_classes); 3803 re_free (cset); 3804 } 3805 #endif /* RE_ENABLE_I18N */ 3806 3807 /* Functions for binary tree operation. */ 3808 3809 /* Create a tree node. */ 3810 3811 static bin_tree_t * 3812 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3813 re_token_type_t type) 3814 { 3815 re_token_t t; 3816 t.type = type; 3817 return create_token_tree (dfa, left, right, &t); 3818 } 3819 3820 static bin_tree_t * 3821 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3822 const re_token_t *token) 3823 { 3824 bin_tree_t *tree; 3825 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3826 { 3827 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3828 3829 if (storage == NULL) 3830 return NULL; 3831 storage->next = dfa->str_tree_storage; 3832 dfa->str_tree_storage = storage; 3833 dfa->str_tree_storage_idx = 0; 3834 } 3835 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3836 3837 tree->parent = NULL; 3838 tree->left = left; 3839 tree->right = right; 3840 tree->token = *token; 3841 tree->token.duplicated = 0; 3842 tree->token.opt_subexp = 0; 3843 tree->first = NULL; 3844 tree->next = NULL; 3845 tree->node_idx = REG_MISSING; 3846 3847 if (left != NULL) 3848 left->parent = tree; 3849 if (right != NULL) 3850 right->parent = tree; 3851 return tree; 3852 } 3853 3854 /* Mark the tree SRC as an optional subexpression. 3855 To be called from preorder or postorder. */ 3856 3857 static reg_errcode_t 3858 mark_opt_subexp (void *extra, bin_tree_t *node) 3859 { 3860 Idx idx = (Idx) (long) extra; 3861 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3862 node->token.opt_subexp = 1; 3863 3864 return REG_NOERROR; 3865 } 3866 3867 /* Free the allocated memory inside NODE. */ 3868 3869 static void 3870 free_token (re_token_t *node) 3871 { 3872 #ifdef RE_ENABLE_I18N 3873 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3874 free_charset (node->opr.mbcset); 3875 else 3876 #endif /* RE_ENABLE_I18N */ 3877 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3878 re_free (node->opr.sbcset); 3879 } 3880 3881 /* Worker function for tree walking. Free the allocated memory inside NODE 3882 and its children. */ 3883 3884 static reg_errcode_t 3885 free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node) 3886 { 3887 free_token (&node->token); 3888 return REG_NOERROR; 3889 } 3890 3891 3892 /* Duplicate the node SRC, and return new node. This is a preorder 3893 visit similar to the one implemented by the generic visitor, but 3894 we need more infrastructure to maintain two parallel trees --- so, 3895 it's easier to duplicate. */ 3896 3897 static bin_tree_t * 3898 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3899 { 3900 const bin_tree_t *node; 3901 bin_tree_t *dup_root; 3902 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3903 3904 for (node = root; ; ) 3905 { 3906 /* Create a new tree and link it back to the current parent. */ 3907 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3908 if (*p_new == NULL) 3909 return NULL; 3910 (*p_new)->parent = dup_node; 3911 (*p_new)->token.duplicated = 1; 3912 dup_node = *p_new; 3913 3914 /* Go to the left node, or up and to the right. */ 3915 if (node->left) 3916 { 3917 node = node->left; 3918 p_new = &dup_node->left; 3919 } 3920 else 3921 { 3922 const bin_tree_t *prev = NULL; 3923 while (node->right == prev || node->right == NULL) 3924 { 3925 prev = node; 3926 node = node->parent; 3927 dup_node = dup_node->parent; 3928 if (!node) 3929 return dup_root; 3930 } 3931 node = node->right; 3932 p_new = &dup_node->right; 3933 } 3934 } 3935 } 3936