1 /* Extended regular expression matching and search library. 2 Copyright (C) 2002-2015 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU General Public 8 License as published by the Free Software Foundation; either 9 version 3 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15 16 You should have received a copy of the GNU General Public 17 License along with the GNU C Library; if not, see 18 <http://www.gnu.org/licenses/>. */ 19 20 #ifdef _LIBC 21 # include <locale/weight.h> 22 #endif 23 24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 25 size_t length, reg_syntax_t syntax); 26 static void re_compile_fastmap_iter (regex_t *bufp, 27 const re_dfastate_t *init_state, 28 char *fastmap); 29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); 30 #ifdef RE_ENABLE_I18N 31 static void free_charset (re_charset_t *cset); 32 #endif /* RE_ENABLE_I18N */ 33 static void free_workarea_compile (regex_t *preg); 34 static reg_errcode_t create_initial_state (re_dfa_t *dfa); 35 #ifdef RE_ENABLE_I18N 36 static void optimize_utf8 (re_dfa_t *dfa); 37 #endif 38 static reg_errcode_t analyze (regex_t *preg); 39 static reg_errcode_t preorder (bin_tree_t *root, 40 reg_errcode_t (fn (void *, bin_tree_t *)), 41 void *extra); 42 static reg_errcode_t postorder (bin_tree_t *root, 43 reg_errcode_t (fn (void *, bin_tree_t *)), 44 void *extra); 45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 48 bin_tree_t *node); 49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint); 53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 54 unsigned int constraint); 55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 57 Idx node, bool root); 58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 59 static Idx fetch_number (re_string_t *input, re_token_t *token, 60 reg_syntax_t syntax); 61 static int peek_token (re_token_t *token, re_string_t *input, 62 reg_syntax_t syntax) internal_function; 63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 64 reg_syntax_t syntax, reg_errcode_t *err); 65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 66 re_token_t *token, reg_syntax_t syntax, 67 Idx nest, reg_errcode_t *err); 68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 69 re_token_t *token, reg_syntax_t syntax, 70 Idx nest, reg_errcode_t *err); 71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 72 re_token_t *token, reg_syntax_t syntax, 73 Idx nest, reg_errcode_t *err); 74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 75 re_token_t *token, reg_syntax_t syntax, 76 Idx nest, reg_errcode_t *err); 77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 78 re_dfa_t *dfa, re_token_t *token, 79 reg_syntax_t syntax, reg_errcode_t *err); 80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 81 re_token_t *token, reg_syntax_t syntax, 82 reg_errcode_t *err); 83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 84 re_string_t *regexp, 85 re_token_t *token, int token_len, 86 re_dfa_t *dfa, 87 reg_syntax_t syntax, 88 bool accept_hyphen); 89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 90 re_string_t *regexp, 91 re_token_t *token); 92 #ifdef RE_ENABLE_I18N 93 static reg_errcode_t build_equiv_class (bitset_t sbcset, 94 re_charset_t *mbcset, 95 Idx *equiv_class_alloc, 96 const unsigned char *name); 97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 98 bitset_t sbcset, 99 re_charset_t *mbcset, 100 Idx *char_class_alloc, 101 const char *class_name, 102 reg_syntax_t syntax); 103 #else /* not RE_ENABLE_I18N */ 104 static reg_errcode_t build_equiv_class (bitset_t sbcset, 105 const unsigned char *name); 106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 107 bitset_t sbcset, 108 const char *class_name, 109 reg_syntax_t syntax); 110 #endif /* not RE_ENABLE_I18N */ 111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 112 RE_TRANSLATE_TYPE trans, 113 const char *class_name, 114 const char *extra, 115 bool non_match, reg_errcode_t *err); 116 static bin_tree_t *create_tree (re_dfa_t *dfa, 117 bin_tree_t *left, bin_tree_t *right, 118 re_token_type_t type); 119 static bin_tree_t *create_token_tree (re_dfa_t *dfa, 120 bin_tree_t *left, bin_tree_t *right, 121 const re_token_t *token); 122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 123 static void free_token (re_token_t *node); 124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 126 127 /* This table gives an error message for each of the error codes listed 128 in regex.h. Obviously the order here has to be same as there. 129 POSIX doesn't require that we do anything for REG_NOERROR, 130 but why not be nice? */ 131 132 static const char __re_error_msgid[] = 133 { 134 #define REG_NOERROR_IDX 0 135 gettext_noop ("Success") /* REG_NOERROR */ 136 "\0" 137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 138 gettext_noop ("No match") /* REG_NOMATCH */ 139 "\0" 140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 142 "\0" 143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 145 "\0" 146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 148 "\0" 149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 151 "\0" 152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 154 "\0" 155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 156 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 157 "\0" 158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 160 "\0" 161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 163 "\0" 164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 166 "\0" 167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 168 gettext_noop ("Invalid range end") /* REG_ERANGE */ 169 "\0" 170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 171 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 172 "\0" 173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 175 "\0" 176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 177 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 178 "\0" 179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 180 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 181 "\0" 182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 184 }; 185 186 static const size_t __re_error_msgid_idx[] = 187 { 188 REG_NOERROR_IDX, 189 REG_NOMATCH_IDX, 190 REG_BADPAT_IDX, 191 REG_ECOLLATE_IDX, 192 REG_ECTYPE_IDX, 193 REG_EESCAPE_IDX, 194 REG_ESUBREG_IDX, 195 REG_EBRACK_IDX, 196 REG_EPAREN_IDX, 197 REG_EBRACE_IDX, 198 REG_BADBR_IDX, 199 REG_ERANGE_IDX, 200 REG_ESPACE_IDX, 201 REG_BADRPT_IDX, 202 REG_EEND_IDX, 203 REG_ESIZE_IDX, 204 REG_ERPAREN_IDX 205 }; 206 207 /* Entry points for GNU code. */ 208 209 /* re_compile_pattern is the GNU regular expression compiler: it 210 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 211 Returns 0 if the pattern was valid, otherwise an error string. 212 213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields 214 are set in BUFP on entry. */ 215 216 #ifdef _LIBC 217 const char * 218 re_compile_pattern (pattern, length, bufp) 219 const char *pattern; 220 size_t length; 221 struct re_pattern_buffer *bufp; 222 #else /* size_t might promote */ 223 const char * 224 re_compile_pattern (const char *pattern, size_t length, 225 struct re_pattern_buffer *bufp) 226 #endif 227 { 228 reg_errcode_t ret; 229 230 /* And GNU code determines whether or not to get register information 231 by passing null for the REGS argument to re_match, etc., not by 232 setting no_sub, unless RE_NO_SUB is set. */ 233 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 234 235 /* Match anchors at newline. */ 236 bufp->newline_anchor = 1; 237 238 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 239 240 if (!ret) 241 return NULL; 242 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 243 } 244 #ifdef _LIBC 245 weak_alias (__re_compile_pattern, re_compile_pattern) 246 #endif 247 248 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can 249 also be assigned to arbitrarily: each pattern buffer stores its own 250 syntax, so it can be changed between regex compilations. */ 251 /* This has no initializer because initialized variables in Emacs 252 become read-only after dumping. */ 253 reg_syntax_t re_syntax_options; 254 255 256 /* Specify the precise syntax of regexps for compilation. This provides 257 for compatibility for various utilities which historically have 258 different, incompatible syntaxes. 259 260 The argument SYNTAX is a bit mask comprised of the various bits 261 defined in regex.h. We return the old syntax. */ 262 263 reg_syntax_t 264 re_set_syntax (syntax) 265 reg_syntax_t syntax; 266 { 267 reg_syntax_t ret = re_syntax_options; 268 269 re_syntax_options = syntax; 270 return ret; 271 } 272 #ifdef _LIBC 273 weak_alias (__re_set_syntax, re_set_syntax) 274 #endif 275 276 int 277 re_compile_fastmap (bufp) 278 struct re_pattern_buffer *bufp; 279 { 280 re_dfa_t *dfa = bufp->buffer; 281 char *fastmap = bufp->fastmap; 282 283 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 284 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 285 if (dfa->init_state != dfa->init_state_word) 286 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 287 if (dfa->init_state != dfa->init_state_nl) 288 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 289 if (dfa->init_state != dfa->init_state_begbuf) 290 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 291 bufp->fastmap_accurate = 1; 292 return 0; 293 } 294 #ifdef _LIBC 295 weak_alias (__re_compile_fastmap, re_compile_fastmap) 296 #endif 297 298 static inline void 299 __attribute__ ((always_inline)) 300 re_set_fastmap (char *fastmap, bool icase, int ch) 301 { 302 fastmap[ch] = 1; 303 if (icase) 304 fastmap[tolower (ch)] = 1; 305 } 306 307 /* Helper function for re_compile_fastmap. 308 Compile fastmap for the initial_state INIT_STATE. */ 309 310 static void 311 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 312 char *fastmap) 313 { 314 re_dfa_t *dfa = bufp->buffer; 315 Idx node_cnt; 316 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 317 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 318 { 319 Idx node = init_state->nodes.elems[node_cnt]; 320 re_token_type_t type = dfa->nodes[node].type; 321 322 if (type == CHARACTER) 323 { 324 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 325 #ifdef RE_ENABLE_I18N 326 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 327 { 328 unsigned char buf[MB_LEN_MAX]; 329 unsigned char *p; 330 wchar_t wc; 331 mbstate_t state; 332 333 p = buf; 334 *p++ = dfa->nodes[node].opr.c; 335 while (++node < dfa->nodes_len 336 && dfa->nodes[node].type == CHARACTER 337 && dfa->nodes[node].mb_partial) 338 *p++ = dfa->nodes[node].opr.c; 339 memset (&state, '\0', sizeof (state)); 340 if (__mbrtowc (&wc, (const char *) buf, p - buf, 341 &state) == p - buf 342 && (__wcrtomb ((char *) buf, __towlower (wc), &state) 343 != (size_t) -1)) 344 re_set_fastmap (fastmap, false, buf[0]); 345 } 346 #endif 347 } 348 else if (type == SIMPLE_BRACKET) 349 { 350 int i, ch; 351 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 352 { 353 int j; 354 bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; 355 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 356 if (w & ((bitset_word_t) 1 << j)) 357 re_set_fastmap (fastmap, icase, ch); 358 } 359 } 360 #ifdef RE_ENABLE_I18N 361 else if (type == COMPLEX_BRACKET) 362 { 363 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 364 Idx i; 365 366 # ifdef _LIBC 367 /* See if we have to try all bytes which start multiple collation 368 elements. 369 e.g. In da_DK, we want to catch 'a' since "aa" is a valid 370 collation element, and don't catch 'b' since 'b' is 371 the only collation element which starts from 'b' (and 372 it is caught by SIMPLE_BRACKET). */ 373 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 374 && (cset->ncoll_syms || cset->nranges)) 375 { 376 const int32_t *table = (const int32_t *) 377 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 378 for (i = 0; i < SBC_MAX; ++i) 379 if (table[i] < 0) 380 re_set_fastmap (fastmap, icase, i); 381 } 382 # endif /* _LIBC */ 383 384 /* See if we have to start the match at all multibyte characters, 385 i.e. where we would not find an invalid sequence. This only 386 applies to multibyte character sets; for single byte character 387 sets, the SIMPLE_BRACKET again suffices. */ 388 if (dfa->mb_cur_max > 1 389 && (cset->nchar_classes || cset->non_match || cset->nranges 390 # ifdef _LIBC 391 || cset->nequiv_classes 392 # endif /* _LIBC */ 393 )) 394 { 395 unsigned char c = 0; 396 do 397 { 398 mbstate_t mbs; 399 memset (&mbs, 0, sizeof (mbs)); 400 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) 401 re_set_fastmap (fastmap, false, (int) c); 402 } 403 while (++c != 0); 404 } 405 406 else 407 { 408 /* ... Else catch all bytes which can start the mbchars. */ 409 for (i = 0; i < cset->nmbchars; ++i) 410 { 411 char buf[256]; 412 mbstate_t state; 413 memset (&state, '\0', sizeof (state)); 414 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 415 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 416 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 417 { 418 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state) 419 != (size_t) -1) 420 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 421 } 422 } 423 } 424 } 425 #endif /* RE_ENABLE_I18N */ 426 else if (type == OP_PERIOD 427 #ifdef RE_ENABLE_I18N 428 || type == OP_UTF8_PERIOD 429 #endif /* RE_ENABLE_I18N */ 430 || type == END_OF_RE) 431 { 432 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 433 if (type == END_OF_RE) 434 bufp->can_be_null = 1; 435 return; 436 } 437 } 438 } 439 440 /* Entry point for POSIX code. */ 441 /* regcomp takes a regular expression as a string and compiles it. 442 443 PREG is a regex_t *. We do not expect any fields to be initialized, 444 since POSIX says we shouldn't. Thus, we set 445 446 'buffer' to the compiled pattern; 447 'used' to the length of the compiled pattern; 448 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the 449 REG_EXTENDED bit in CFLAGS is set; otherwise, to 450 RE_SYNTAX_POSIX_BASIC; 451 'newline_anchor' to REG_NEWLINE being set in CFLAGS; 452 'fastmap' to an allocated space for the fastmap; 453 'fastmap_accurate' to zero; 454 're_nsub' to the number of subexpressions in PATTERN. 455 456 PATTERN is the address of the pattern string. 457 458 CFLAGS is a series of bits which affect compilation. 459 460 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 461 use POSIX basic syntax. 462 463 If REG_NEWLINE is set, then . and [^...] don't match newline. 464 Also, regexec will try a match beginning after every newline. 465 466 If REG_ICASE is set, then we considers upper- and lowercase 467 versions of letters to be equivalent when matching. 468 469 If REG_NOSUB is set, then when PREG is passed to regexec, that 470 routine will report only success or failure, and nothing about the 471 registers. 472 473 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 474 the return codes and their meanings.) */ 475 476 int 477 regcomp (preg, pattern, cflags) 478 regex_t *_Restrict_ preg; 479 const char *_Restrict_ pattern; 480 int cflags; 481 { 482 reg_errcode_t ret; 483 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 484 : RE_SYNTAX_POSIX_BASIC); 485 486 preg->buffer = NULL; 487 preg->allocated = 0; 488 preg->used = 0; 489 490 /* Try to allocate space for the fastmap. */ 491 preg->fastmap = re_malloc (char, SBC_MAX); 492 if (BE (preg->fastmap == NULL, 0)) 493 return REG_ESPACE; 494 495 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 496 497 /* If REG_NEWLINE is set, newlines are treated differently. */ 498 if (cflags & REG_NEWLINE) 499 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 500 syntax &= ~RE_DOT_NEWLINE; 501 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 502 /* It also changes the matching behavior. */ 503 preg->newline_anchor = 1; 504 } 505 else 506 preg->newline_anchor = 0; 507 preg->no_sub = !!(cflags & REG_NOSUB); 508 preg->translate = NULL; 509 510 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 511 512 /* POSIX doesn't distinguish between an unmatched open-group and an 513 unmatched close-group: both are REG_EPAREN. */ 514 if (ret == REG_ERPAREN) 515 ret = REG_EPAREN; 516 517 /* We have already checked preg->fastmap != NULL. */ 518 if (BE (ret == REG_NOERROR, 1)) 519 /* Compute the fastmap now, since regexec cannot modify the pattern 520 buffer. This function never fails in this implementation. */ 521 (void) re_compile_fastmap (preg); 522 else 523 { 524 /* Some error occurred while compiling the expression. */ 525 re_free (preg->fastmap); 526 preg->fastmap = NULL; 527 } 528 529 return (int) ret; 530 } 531 #ifdef _LIBC 532 weak_alias (__regcomp, regcomp) 533 #endif 534 535 /* Returns a message corresponding to an error code, ERRCODE, returned 536 from either regcomp or regexec. We don't use PREG here. */ 537 538 #ifdef _LIBC 539 size_t 540 regerror (errcode, preg, errbuf, errbuf_size) 541 int errcode; 542 const regex_t *_Restrict_ preg; 543 char *_Restrict_ errbuf; 544 size_t errbuf_size; 545 #else /* size_t might promote */ 546 size_t 547 regerror (int errcode, const regex_t *_Restrict_ preg, 548 char *_Restrict_ errbuf, size_t errbuf_size) 549 #endif 550 { 551 const char *msg; 552 size_t msg_size; 553 554 if (BE (errcode < 0 555 || errcode >= (int) (sizeof (__re_error_msgid_idx) 556 / sizeof (__re_error_msgid_idx[0])), 0)) 557 /* Only error codes returned by the rest of the code should be passed 558 to this routine. If we are given anything else, or if other regex 559 code generates an invalid error code, then the program has a bug. 560 Dump core so we can fix it. */ 561 abort (); 562 563 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 564 565 msg_size = strlen (msg) + 1; /* Includes the null. */ 566 567 if (BE (errbuf_size != 0, 1)) 568 { 569 size_t cpy_size = msg_size; 570 if (BE (msg_size > errbuf_size, 0)) 571 { 572 cpy_size = errbuf_size - 1; 573 errbuf[cpy_size] = '\0'; 574 } 575 memcpy (errbuf, msg, cpy_size); 576 } 577 578 return msg_size; 579 } 580 #ifdef _LIBC 581 weak_alias (__regerror, regerror) 582 #endif 583 584 585 #ifdef RE_ENABLE_I18N 586 /* This static array is used for the map to single-byte characters when 587 UTF-8 is used. Otherwise we would allocate memory just to initialize 588 it the same all the time. UTF-8 is the preferred encoding so this is 589 a worthwhile optimization. */ 590 static const bitset_t utf8_sb_map = 591 { 592 /* Set the first 128 bits. */ 593 # if defined __GNUC__ && !defined __STRICT_ANSI__ 594 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX 595 # else 596 # if 4 * BITSET_WORD_BITS < ASCII_CHARS 597 # error "bitset_word_t is narrower than 32 bits" 598 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS 599 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, 600 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS 601 BITSET_WORD_MAX, BITSET_WORD_MAX, 602 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS 603 BITSET_WORD_MAX, 604 # endif 605 (BITSET_WORD_MAX 606 >> (SBC_MAX % BITSET_WORD_BITS == 0 607 ? 0 608 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) 609 # endif 610 }; 611 #endif 612 613 614 static void 615 free_dfa_content (re_dfa_t *dfa) 616 { 617 Idx i, j; 618 619 if (dfa->nodes) 620 for (i = 0; i < dfa->nodes_len; ++i) 621 free_token (dfa->nodes + i); 622 re_free (dfa->nexts); 623 for (i = 0; i < dfa->nodes_len; ++i) 624 { 625 if (dfa->eclosures != NULL) 626 re_node_set_free (dfa->eclosures + i); 627 if (dfa->inveclosures != NULL) 628 re_node_set_free (dfa->inveclosures + i); 629 if (dfa->edests != NULL) 630 re_node_set_free (dfa->edests + i); 631 } 632 re_free (dfa->edests); 633 re_free (dfa->eclosures); 634 re_free (dfa->inveclosures); 635 re_free (dfa->nodes); 636 637 if (dfa->state_table) 638 for (i = 0; i <= dfa->state_hash_mask; ++i) 639 { 640 struct re_state_table_entry *entry = dfa->state_table + i; 641 for (j = 0; j < entry->num; ++j) 642 { 643 re_dfastate_t *state = entry->array[j]; 644 free_state (state); 645 } 646 re_free (entry->array); 647 } 648 re_free (dfa->state_table); 649 #ifdef RE_ENABLE_I18N 650 if (dfa->sb_char != utf8_sb_map) 651 re_free (dfa->sb_char); 652 #endif 653 re_free (dfa->subexp_map); 654 #ifdef DEBUG 655 re_free (dfa->re_str); 656 #endif 657 658 re_free (dfa); 659 } 660 661 662 /* Free dynamically allocated space used by PREG. */ 663 664 void 665 regfree (preg) 666 regex_t *preg; 667 { 668 re_dfa_t *dfa = preg->buffer; 669 if (BE (dfa != NULL, 1)) 670 { 671 lock_fini (dfa->lock); 672 free_dfa_content (dfa); 673 } 674 preg->buffer = NULL; 675 preg->allocated = 0; 676 677 re_free (preg->fastmap); 678 preg->fastmap = NULL; 679 680 re_free (preg->translate); 681 preg->translate = NULL; 682 } 683 #ifdef _LIBC 684 weak_alias (__regfree, regfree) 685 #endif 686 687 /* Entry points compatible with 4.2 BSD regex library. We don't define 688 them unless specifically requested. */ 689 690 #if defined _REGEX_RE_COMP || defined _LIBC 691 692 /* BSD has one and only one pattern buffer. */ 693 static struct re_pattern_buffer re_comp_buf; 694 695 char * 696 # ifdef _LIBC 697 /* Make these definitions weak in libc, so POSIX programs can redefine 698 these names if they don't use our functions, and still use 699 regcomp/regexec above without link errors. */ 700 weak_function 701 # endif 702 re_comp (s) 703 const char *s; 704 { 705 reg_errcode_t ret; 706 char *fastmap; 707 708 if (!s) 709 { 710 if (!re_comp_buf.buffer) 711 return gettext ("No previous regular expression"); 712 return 0; 713 } 714 715 if (re_comp_buf.buffer) 716 { 717 fastmap = re_comp_buf.fastmap; 718 re_comp_buf.fastmap = NULL; 719 __regfree (&re_comp_buf); 720 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 721 re_comp_buf.fastmap = fastmap; 722 } 723 724 if (re_comp_buf.fastmap == NULL) 725 { 726 re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 727 if (re_comp_buf.fastmap == NULL) 728 return (char *) gettext (__re_error_msgid 729 + __re_error_msgid_idx[(int) REG_ESPACE]); 730 } 731 732 /* Since 're_exec' always passes NULL for the 'regs' argument, we 733 don't need to initialize the pattern buffer fields which affect it. */ 734 735 /* Match anchors at newlines. */ 736 re_comp_buf.newline_anchor = 1; 737 738 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 739 740 if (!ret) 741 return NULL; 742 743 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */ 744 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 745 } 746 747 #ifdef _LIBC 748 libc_freeres_fn (free_mem) 749 { 750 __regfree (&re_comp_buf); 751 } 752 #endif 753 754 #endif /* _REGEX_RE_COMP */ 755 756 /* Internal entry point. 757 Compile the regular expression PATTERN, whose length is LENGTH. 758 SYNTAX indicate regular expression's syntax. */ 759 760 static reg_errcode_t 761 re_compile_internal (regex_t *preg, const char * pattern, size_t length, 762 reg_syntax_t syntax) 763 { 764 reg_errcode_t err = REG_NOERROR; 765 re_dfa_t *dfa; 766 re_string_t regexp; 767 768 /* Initialize the pattern buffer. */ 769 preg->fastmap_accurate = 0; 770 preg->syntax = syntax; 771 preg->not_bol = preg->not_eol = 0; 772 preg->used = 0; 773 preg->re_nsub = 0; 774 preg->can_be_null = 0; 775 preg->regs_allocated = REGS_UNALLOCATED; 776 777 /* Initialize the dfa. */ 778 dfa = preg->buffer; 779 if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 780 { 781 /* If zero allocated, but buffer is non-null, try to realloc 782 enough space. This loses if buffer's address is bogus, but 783 that is the user's responsibility. If ->buffer is NULL this 784 is a simple allocation. */ 785 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 786 if (dfa == NULL) 787 return REG_ESPACE; 788 preg->allocated = sizeof (re_dfa_t); 789 preg->buffer = dfa; 790 } 791 preg->used = sizeof (re_dfa_t); 792 793 err = init_dfa (dfa, length); 794 if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0)) 795 err = REG_ESPACE; 796 if (BE (err != REG_NOERROR, 0)) 797 { 798 free_dfa_content (dfa); 799 preg->buffer = NULL; 800 preg->allocated = 0; 801 return err; 802 } 803 #ifdef DEBUG 804 /* Note: length+1 will not overflow since it is checked in init_dfa. */ 805 dfa->re_str = re_malloc (char, length + 1); 806 strncpy (dfa->re_str, pattern, length + 1); 807 #endif 808 809 err = re_string_construct (®exp, pattern, length, preg->translate, 810 (syntax & RE_ICASE) != 0, dfa); 811 if (BE (err != REG_NOERROR, 0)) 812 { 813 re_compile_internal_free_return: 814 free_workarea_compile (preg); 815 re_string_destruct (®exp); 816 lock_fini (dfa->lock); 817 free_dfa_content (dfa); 818 preg->buffer = NULL; 819 preg->allocated = 0; 820 return err; 821 } 822 823 /* Parse the regular expression, and build a structure tree. */ 824 preg->re_nsub = 0; 825 dfa->str_tree = parse (®exp, preg, syntax, &err); 826 if (BE (dfa->str_tree == NULL, 0)) 827 goto re_compile_internal_free_return; 828 829 /* Analyze the tree and create the nfa. */ 830 err = analyze (preg); 831 if (BE (err != REG_NOERROR, 0)) 832 goto re_compile_internal_free_return; 833 834 #ifdef RE_ENABLE_I18N 835 /* If possible, do searching in single byte encoding to speed things up. */ 836 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 837 optimize_utf8 (dfa); 838 #endif 839 840 /* Then create the initial state of the dfa. */ 841 err = create_initial_state (dfa); 842 843 /* Release work areas. */ 844 free_workarea_compile (preg); 845 re_string_destruct (®exp); 846 847 if (BE (err != REG_NOERROR, 0)) 848 { 849 lock_fini (dfa->lock); 850 free_dfa_content (dfa); 851 preg->buffer = NULL; 852 preg->allocated = 0; 853 } 854 855 return err; 856 } 857 858 /* Initialize DFA. We use the length of the regular expression PAT_LEN 859 as the initial length of some arrays. */ 860 861 static reg_errcode_t 862 init_dfa (re_dfa_t *dfa, size_t pat_len) 863 { 864 __re_size_t table_size; 865 #ifndef _LIBC 866 const char *codeset_name; 867 #endif 868 #ifdef RE_ENABLE_I18N 869 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); 870 #else 871 size_t max_i18n_object_size = 0; 872 #endif 873 size_t max_object_size = 874 MAX (sizeof (struct re_state_table_entry), 875 MAX (sizeof (re_token_t), 876 MAX (sizeof (re_node_set), 877 MAX (sizeof (regmatch_t), 878 max_i18n_object_size)))); 879 880 memset (dfa, '\0', sizeof (re_dfa_t)); 881 882 /* Force allocation of str_tree_storage the first time. */ 883 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 884 885 /* Avoid overflows. The extra "/ 2" is for the table_size doubling 886 calculation below, and for similar doubling calculations 887 elsewhere. And it's <= rather than <, because some of the 888 doubling calculations add 1 afterwards. */ 889 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0)) 890 return REG_ESPACE; 891 892 dfa->nodes_alloc = pat_len + 1; 893 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 894 895 /* table_size = 2 ^ ceil(log pat_len) */ 896 for (table_size = 1; ; table_size <<= 1) 897 if (table_size > pat_len) 898 break; 899 900 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 901 dfa->state_hash_mask = table_size - 1; 902 903 dfa->mb_cur_max = MB_CUR_MAX; 904 #ifdef _LIBC 905 if (dfa->mb_cur_max == 6 906 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 907 dfa->is_utf8 = 1; 908 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 909 != 0); 910 #else 911 codeset_name = nl_langinfo (CODESET); 912 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u') 913 && (codeset_name[1] == 'T' || codeset_name[1] == 't') 914 && (codeset_name[2] == 'F' || codeset_name[2] == 'f') 915 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0) 916 dfa->is_utf8 = 1; 917 918 /* We check exhaustively in the loop below if this charset is a 919 superset of ASCII. */ 920 dfa->map_notascii = 0; 921 #endif 922 923 #ifdef RE_ENABLE_I18N 924 if (dfa->mb_cur_max > 1) 925 { 926 if (dfa->is_utf8) 927 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 928 else 929 { 930 int i, j, ch; 931 932 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 933 if (BE (dfa->sb_char == NULL, 0)) 934 return REG_ESPACE; 935 936 /* Set the bits corresponding to single byte chars. */ 937 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 938 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 939 { 940 wint_t wch = __btowc (ch); 941 if (wch != WEOF) 942 dfa->sb_char[i] |= (bitset_word_t) 1 << j; 943 # ifndef _LIBC 944 if (isascii (ch) && wch != ch) 945 dfa->map_notascii = 1; 946 # endif 947 } 948 } 949 } 950 #endif 951 952 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 953 return REG_ESPACE; 954 return REG_NOERROR; 955 } 956 957 /* Initialize WORD_CHAR table, which indicate which character is 958 "word". In this case "word" means that it is the word construction 959 character used by some operators like "\<", "\>", etc. */ 960 961 static void 962 internal_function 963 init_word_char (re_dfa_t *dfa) 964 { 965 int i = 0; 966 int j; 967 int ch = 0; 968 dfa->word_ops_used = 1; 969 if (BE (dfa->map_notascii == 0, 1)) 970 { 971 bitset_word_t bits0 = 0x00000000; 972 bitset_word_t bits1 = 0x03ff0000; 973 bitset_word_t bits2 = 0x87fffffe; 974 bitset_word_t bits3 = 0x07fffffe; 975 if (BITSET_WORD_BITS == 64) 976 { 977 dfa->word_char[0] = bits1 << 31 << 1 | bits0; 978 dfa->word_char[1] = bits3 << 31 << 1 | bits2; 979 i = 2; 980 } 981 else if (BITSET_WORD_BITS == 32) 982 { 983 dfa->word_char[0] = bits0; 984 dfa->word_char[1] = bits1; 985 dfa->word_char[2] = bits2; 986 dfa->word_char[3] = bits3; 987 i = 4; 988 } 989 else 990 goto general_case; 991 ch = 128; 992 993 if (BE (dfa->is_utf8, 1)) 994 { 995 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); 996 return; 997 } 998 } 999 1000 general_case: 1001 for (; i < BITSET_WORDS; ++i) 1002 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 1003 if (isalnum (ch) || ch == '_') 1004 dfa->word_char[i] |= (bitset_word_t) 1 << j; 1005 } 1006 1007 /* Free the work area which are only used while compiling. */ 1008 1009 static void 1010 free_workarea_compile (regex_t *preg) 1011 { 1012 re_dfa_t *dfa = preg->buffer; 1013 bin_tree_storage_t *storage, *next; 1014 for (storage = dfa->str_tree_storage; storage; storage = next) 1015 { 1016 next = storage->next; 1017 re_free (storage); 1018 } 1019 dfa->str_tree_storage = NULL; 1020 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 1021 dfa->str_tree = NULL; 1022 re_free (dfa->org_indices); 1023 dfa->org_indices = NULL; 1024 } 1025 1026 /* Create initial states for all contexts. */ 1027 1028 static reg_errcode_t 1029 create_initial_state (re_dfa_t *dfa) 1030 { 1031 Idx first, i; 1032 reg_errcode_t err; 1033 re_node_set init_nodes; 1034 1035 /* Initial states have the epsilon closure of the node which is 1036 the first node of the regular expression. */ 1037 first = dfa->str_tree->first->node_idx; 1038 dfa->init_node = first; 1039 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 1040 if (BE (err != REG_NOERROR, 0)) 1041 return err; 1042 1043 /* The back-references which are in initial states can epsilon transit, 1044 since in this case all of the subexpressions can be null. 1045 Then we add epsilon closures of the nodes which are the next nodes of 1046 the back-references. */ 1047 if (dfa->nbackref > 0) 1048 for (i = 0; i < init_nodes.nelem; ++i) 1049 { 1050 Idx node_idx = init_nodes.elems[i]; 1051 re_token_type_t type = dfa->nodes[node_idx].type; 1052 1053 Idx clexp_idx; 1054 if (type != OP_BACK_REF) 1055 continue; 1056 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1057 { 1058 re_token_t *clexp_node; 1059 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1060 if (clexp_node->type == OP_CLOSE_SUBEXP 1061 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1062 break; 1063 } 1064 if (clexp_idx == init_nodes.nelem) 1065 continue; 1066 1067 if (type == OP_BACK_REF) 1068 { 1069 Idx dest_idx = dfa->edests[node_idx].elems[0]; 1070 if (!re_node_set_contains (&init_nodes, dest_idx)) 1071 { 1072 reg_errcode_t merge_err 1073 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 1074 if (merge_err != REG_NOERROR) 1075 return merge_err; 1076 i = 0; 1077 } 1078 } 1079 } 1080 1081 /* It must be the first time to invoke acquire_state. */ 1082 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1083 /* We don't check ERR here, since the initial state must not be NULL. */ 1084 if (BE (dfa->init_state == NULL, 0)) 1085 return err; 1086 if (dfa->init_state->has_constraint) 1087 { 1088 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1089 CONTEXT_WORD); 1090 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1091 CONTEXT_NEWLINE); 1092 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1093 &init_nodes, 1094 CONTEXT_NEWLINE 1095 | CONTEXT_BEGBUF); 1096 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1097 || dfa->init_state_begbuf == NULL, 0)) 1098 return err; 1099 } 1100 else 1101 dfa->init_state_word = dfa->init_state_nl 1102 = dfa->init_state_begbuf = dfa->init_state; 1103 1104 re_node_set_free (&init_nodes); 1105 return REG_NOERROR; 1106 } 1107 1108 #ifdef RE_ENABLE_I18N 1109 /* If it is possible to do searching in single byte encoding instead of UTF-8 1110 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1111 DFA nodes where needed. */ 1112 1113 static void 1114 optimize_utf8 (re_dfa_t *dfa) 1115 { 1116 Idx node; 1117 int i; 1118 bool mb_chars = false; 1119 bool has_period = false; 1120 1121 for (node = 0; node < dfa->nodes_len; ++node) 1122 switch (dfa->nodes[node].type) 1123 { 1124 case CHARACTER: 1125 if (dfa->nodes[node].opr.c >= ASCII_CHARS) 1126 mb_chars = true; 1127 break; 1128 case ANCHOR: 1129 switch (dfa->nodes[node].opr.ctx_type) 1130 { 1131 case LINE_FIRST: 1132 case LINE_LAST: 1133 case BUF_FIRST: 1134 case BUF_LAST: 1135 break; 1136 default: 1137 /* Word anchors etc. cannot be handled. It's okay to test 1138 opr.ctx_type since constraints (for all DFA nodes) are 1139 created by ORing one or more opr.ctx_type values. */ 1140 return; 1141 } 1142 break; 1143 case OP_PERIOD: 1144 has_period = true; 1145 break; 1146 case OP_BACK_REF: 1147 case OP_ALT: 1148 case END_OF_RE: 1149 case OP_DUP_ASTERISK: 1150 case OP_OPEN_SUBEXP: 1151 case OP_CLOSE_SUBEXP: 1152 break; 1153 case COMPLEX_BRACKET: 1154 return; 1155 case SIMPLE_BRACKET: 1156 /* Just double check. */ 1157 { 1158 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0 1159 ? 0 1160 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS); 1161 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1162 { 1163 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) 1164 return; 1165 rshift = 0; 1166 } 1167 } 1168 break; 1169 default: 1170 abort (); 1171 } 1172 1173 if (mb_chars || has_period) 1174 for (node = 0; node < dfa->nodes_len; ++node) 1175 { 1176 if (dfa->nodes[node].type == CHARACTER 1177 && dfa->nodes[node].opr.c >= ASCII_CHARS) 1178 dfa->nodes[node].mb_partial = 0; 1179 else if (dfa->nodes[node].type == OP_PERIOD) 1180 dfa->nodes[node].type = OP_UTF8_PERIOD; 1181 } 1182 1183 /* The search can be in single byte locale. */ 1184 dfa->mb_cur_max = 1; 1185 dfa->is_utf8 = 0; 1186 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1187 } 1188 #endif 1189 1190 /* Analyze the structure tree, and calculate "first", "next", "edest", 1191 "eclosure", and "inveclosure". */ 1192 1193 static reg_errcode_t 1194 analyze (regex_t *preg) 1195 { 1196 re_dfa_t *dfa = preg->buffer; 1197 reg_errcode_t ret; 1198 1199 /* Allocate arrays. */ 1200 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc); 1201 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); 1202 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1203 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1204 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1205 || dfa->eclosures == NULL, 0)) 1206 return REG_ESPACE; 1207 1208 dfa->subexp_map = re_malloc (Idx, preg->re_nsub); 1209 if (dfa->subexp_map != NULL) 1210 { 1211 Idx i; 1212 for (i = 0; i < preg->re_nsub; i++) 1213 dfa->subexp_map[i] = i; 1214 preorder (dfa->str_tree, optimize_subexps, dfa); 1215 for (i = 0; i < preg->re_nsub; i++) 1216 if (dfa->subexp_map[i] != i) 1217 break; 1218 if (i == preg->re_nsub) 1219 { 1220 free (dfa->subexp_map); 1221 dfa->subexp_map = NULL; 1222 } 1223 } 1224 1225 ret = postorder (dfa->str_tree, lower_subexps, preg); 1226 if (BE (ret != REG_NOERROR, 0)) 1227 return ret; 1228 ret = postorder (dfa->str_tree, calc_first, dfa); 1229 if (BE (ret != REG_NOERROR, 0)) 1230 return ret; 1231 preorder (dfa->str_tree, calc_next, dfa); 1232 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1233 if (BE (ret != REG_NOERROR, 0)) 1234 return ret; 1235 ret = calc_eclosure (dfa); 1236 if (BE (ret != REG_NOERROR, 0)) 1237 return ret; 1238 1239 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1240 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1241 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1242 || dfa->nbackref) 1243 { 1244 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1245 if (BE (dfa->inveclosures == NULL, 0)) 1246 return REG_ESPACE; 1247 ret = calc_inveclosure (dfa); 1248 } 1249 1250 return ret; 1251 } 1252 1253 /* Our parse trees are very unbalanced, so we cannot use a stack to 1254 implement parse tree visits. Instead, we use parent pointers and 1255 some hairy code in these two functions. */ 1256 static reg_errcode_t 1257 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1258 void *extra) 1259 { 1260 bin_tree_t *node, *prev; 1261 1262 for (node = root; ; ) 1263 { 1264 /* Descend down the tree, preferably to the left (or to the right 1265 if that's the only child). */ 1266 while (node->left || node->right) 1267 if (node->left) 1268 node = node->left; 1269 else 1270 node = node->right; 1271 1272 do 1273 { 1274 reg_errcode_t err = fn (extra, node); 1275 if (BE (err != REG_NOERROR, 0)) 1276 return err; 1277 if (node->parent == NULL) 1278 return REG_NOERROR; 1279 prev = node; 1280 node = node->parent; 1281 } 1282 /* Go up while we have a node that is reached from the right. */ 1283 while (node->right == prev || node->right == NULL); 1284 node = node->right; 1285 } 1286 } 1287 1288 static reg_errcode_t 1289 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1290 void *extra) 1291 { 1292 bin_tree_t *node; 1293 1294 for (node = root; ; ) 1295 { 1296 reg_errcode_t err = fn (extra, node); 1297 if (BE (err != REG_NOERROR, 0)) 1298 return err; 1299 1300 /* Go to the left node, or up and to the right. */ 1301 if (node->left) 1302 node = node->left; 1303 else 1304 { 1305 bin_tree_t *prev = NULL; 1306 while (node->right == prev || node->right == NULL) 1307 { 1308 prev = node; 1309 node = node->parent; 1310 if (!node) 1311 return REG_NOERROR; 1312 } 1313 node = node->right; 1314 } 1315 } 1316 } 1317 1318 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1319 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1320 backreferences as well. Requires a preorder visit. */ 1321 static reg_errcode_t 1322 optimize_subexps (void *extra, bin_tree_t *node) 1323 { 1324 re_dfa_t *dfa = (re_dfa_t *) extra; 1325 1326 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1327 { 1328 int idx = node->token.opr.idx; 1329 node->token.opr.idx = dfa->subexp_map[idx]; 1330 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1331 } 1332 1333 else if (node->token.type == SUBEXP 1334 && node->left && node->left->token.type == SUBEXP) 1335 { 1336 Idx other_idx = node->left->token.opr.idx; 1337 1338 node->left = node->left->left; 1339 if (node->left) 1340 node->left->parent = node; 1341 1342 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1343 if (other_idx < BITSET_WORD_BITS) 1344 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); 1345 } 1346 1347 return REG_NOERROR; 1348 } 1349 1350 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1351 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1352 static reg_errcode_t 1353 lower_subexps (void *extra, bin_tree_t *node) 1354 { 1355 regex_t *preg = (regex_t *) extra; 1356 reg_errcode_t err = REG_NOERROR; 1357 1358 if (node->left && node->left->token.type == SUBEXP) 1359 { 1360 node->left = lower_subexp (&err, preg, node->left); 1361 if (node->left) 1362 node->left->parent = node; 1363 } 1364 if (node->right && node->right->token.type == SUBEXP) 1365 { 1366 node->right = lower_subexp (&err, preg, node->right); 1367 if (node->right) 1368 node->right->parent = node; 1369 } 1370 1371 return err; 1372 } 1373 1374 static bin_tree_t * 1375 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1376 { 1377 re_dfa_t *dfa = preg->buffer; 1378 bin_tree_t *body = node->left; 1379 bin_tree_t *op, *cls, *tree1, *tree; 1380 1381 if (preg->no_sub 1382 /* We do not optimize empty subexpressions, because otherwise we may 1383 have bad CONCAT nodes with NULL children. This is obviously not 1384 very common, so we do not lose much. An example that triggers 1385 this case is the sed "script" /\(\)/x. */ 1386 && node->left != NULL 1387 && (node->token.opr.idx >= BITSET_WORD_BITS 1388 || !(dfa->used_bkref_map 1389 & ((bitset_word_t) 1 << node->token.opr.idx)))) 1390 return node->left; 1391 1392 /* Convert the SUBEXP node to the concatenation of an 1393 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1394 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1395 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1396 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1397 tree = create_tree (dfa, op, tree1, CONCAT); 1398 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1399 { 1400 *err = REG_ESPACE; 1401 return NULL; 1402 } 1403 1404 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1405 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1406 return tree; 1407 } 1408 1409 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1410 nodes. Requires a postorder visit. */ 1411 static reg_errcode_t 1412 calc_first (void *extra, bin_tree_t *node) 1413 { 1414 re_dfa_t *dfa = (re_dfa_t *) extra; 1415 if (node->token.type == CONCAT) 1416 { 1417 node->first = node->left->first; 1418 node->node_idx = node->left->node_idx; 1419 } 1420 else 1421 { 1422 node->first = node; 1423 node->node_idx = re_dfa_add_node (dfa, node->token); 1424 if (BE (node->node_idx == REG_MISSING, 0)) 1425 return REG_ESPACE; 1426 if (node->token.type == ANCHOR) 1427 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; 1428 } 1429 return REG_NOERROR; 1430 } 1431 1432 /* Pass 2: compute NEXT on the tree. Preorder visit. */ 1433 static reg_errcode_t 1434 calc_next (void *extra, bin_tree_t *node) 1435 { 1436 switch (node->token.type) 1437 { 1438 case OP_DUP_ASTERISK: 1439 node->left->next = node; 1440 break; 1441 case CONCAT: 1442 node->left->next = node->right->first; 1443 node->right->next = node->next; 1444 break; 1445 default: 1446 if (node->left) 1447 node->left->next = node->next; 1448 if (node->right) 1449 node->right->next = node->next; 1450 break; 1451 } 1452 return REG_NOERROR; 1453 } 1454 1455 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1456 static reg_errcode_t 1457 link_nfa_nodes (void *extra, bin_tree_t *node) 1458 { 1459 re_dfa_t *dfa = (re_dfa_t *) extra; 1460 Idx idx = node->node_idx; 1461 reg_errcode_t err = REG_NOERROR; 1462 1463 switch (node->token.type) 1464 { 1465 case CONCAT: 1466 break; 1467 1468 case END_OF_RE: 1469 assert (node->next == NULL); 1470 break; 1471 1472 case OP_DUP_ASTERISK: 1473 case OP_ALT: 1474 { 1475 Idx left, right; 1476 dfa->has_plural_match = 1; 1477 if (node->left != NULL) 1478 left = node->left->first->node_idx; 1479 else 1480 left = node->next->node_idx; 1481 if (node->right != NULL) 1482 right = node->right->first->node_idx; 1483 else 1484 right = node->next->node_idx; 1485 assert (REG_VALID_INDEX (left)); 1486 assert (REG_VALID_INDEX (right)); 1487 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1488 } 1489 break; 1490 1491 case ANCHOR: 1492 case OP_OPEN_SUBEXP: 1493 case OP_CLOSE_SUBEXP: 1494 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1495 break; 1496 1497 case OP_BACK_REF: 1498 dfa->nexts[idx] = node->next->node_idx; 1499 if (node->token.type == OP_BACK_REF) 1500 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1501 break; 1502 1503 default: 1504 assert (!IS_EPSILON_NODE (node->token.type)); 1505 dfa->nexts[idx] = node->next->node_idx; 1506 break; 1507 } 1508 1509 return err; 1510 } 1511 1512 /* Duplicate the epsilon closure of the node ROOT_NODE. 1513 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1514 to their own constraint. */ 1515 1516 static reg_errcode_t 1517 internal_function 1518 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, 1519 Idx root_node, unsigned int init_constraint) 1520 { 1521 Idx org_node, clone_node; 1522 bool ok; 1523 unsigned int constraint = init_constraint; 1524 for (org_node = top_org_node, clone_node = top_clone_node;;) 1525 { 1526 Idx org_dest, clone_dest; 1527 if (dfa->nodes[org_node].type == OP_BACK_REF) 1528 { 1529 /* If the back reference epsilon-transit, its destination must 1530 also have the constraint. Then duplicate the epsilon closure 1531 of the destination of the back reference, and store it in 1532 edests of the back reference. */ 1533 org_dest = dfa->nexts[org_node]; 1534 re_node_set_empty (dfa->edests + clone_node); 1535 clone_dest = duplicate_node (dfa, org_dest, constraint); 1536 if (BE (clone_dest == REG_MISSING, 0)) 1537 return REG_ESPACE; 1538 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1539 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1540 if (BE (! ok, 0)) 1541 return REG_ESPACE; 1542 } 1543 else if (dfa->edests[org_node].nelem == 0) 1544 { 1545 /* In case of the node can't epsilon-transit, don't duplicate the 1546 destination and store the original destination as the 1547 destination of the node. */ 1548 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1549 break; 1550 } 1551 else if (dfa->edests[org_node].nelem == 1) 1552 { 1553 /* In case of the node can epsilon-transit, and it has only one 1554 destination. */ 1555 org_dest = dfa->edests[org_node].elems[0]; 1556 re_node_set_empty (dfa->edests + clone_node); 1557 /* If the node is root_node itself, it means the epsilon closure 1558 has a loop. Then tie it to the destination of the root_node. */ 1559 if (org_node == root_node && clone_node != org_node) 1560 { 1561 ok = re_node_set_insert (dfa->edests + clone_node, org_dest); 1562 if (BE (! ok, 0)) 1563 return REG_ESPACE; 1564 break; 1565 } 1566 /* In case the node has another constraint, append it. */ 1567 constraint |= dfa->nodes[org_node].constraint; 1568 clone_dest = duplicate_node (dfa, org_dest, constraint); 1569 if (BE (clone_dest == REG_MISSING, 0)) 1570 return REG_ESPACE; 1571 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1572 if (BE (! ok, 0)) 1573 return REG_ESPACE; 1574 } 1575 else /* dfa->edests[org_node].nelem == 2 */ 1576 { 1577 /* In case of the node can epsilon-transit, and it has two 1578 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1579 org_dest = dfa->edests[org_node].elems[0]; 1580 re_node_set_empty (dfa->edests + clone_node); 1581 /* Search for a duplicated node which satisfies the constraint. */ 1582 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1583 if (clone_dest == REG_MISSING) 1584 { 1585 /* There is no such duplicated node, create a new one. */ 1586 reg_errcode_t err; 1587 clone_dest = duplicate_node (dfa, org_dest, constraint); 1588 if (BE (clone_dest == REG_MISSING, 0)) 1589 return REG_ESPACE; 1590 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1591 if (BE (! ok, 0)) 1592 return REG_ESPACE; 1593 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1594 root_node, constraint); 1595 if (BE (err != REG_NOERROR, 0)) 1596 return err; 1597 } 1598 else 1599 { 1600 /* There is a duplicated node which satisfies the constraint, 1601 use it to avoid infinite loop. */ 1602 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1603 if (BE (! ok, 0)) 1604 return REG_ESPACE; 1605 } 1606 1607 org_dest = dfa->edests[org_node].elems[1]; 1608 clone_dest = duplicate_node (dfa, org_dest, constraint); 1609 if (BE (clone_dest == REG_MISSING, 0)) 1610 return REG_ESPACE; 1611 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1612 if (BE (! ok, 0)) 1613 return REG_ESPACE; 1614 } 1615 org_node = org_dest; 1616 clone_node = clone_dest; 1617 } 1618 return REG_NOERROR; 1619 } 1620 1621 /* Search for a node which is duplicated from the node ORG_NODE, and 1622 satisfies the constraint CONSTRAINT. */ 1623 1624 static Idx 1625 search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 1626 unsigned int constraint) 1627 { 1628 Idx idx; 1629 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1630 { 1631 if (org_node == dfa->org_indices[idx] 1632 && constraint == dfa->nodes[idx].constraint) 1633 return idx; /* Found. */ 1634 } 1635 return REG_MISSING; /* Not found. */ 1636 } 1637 1638 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1639 Return the index of the new node, or REG_MISSING if insufficient storage is 1640 available. */ 1641 1642 static Idx 1643 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) 1644 { 1645 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1646 if (BE (dup_idx != REG_MISSING, 1)) 1647 { 1648 dfa->nodes[dup_idx].constraint = constraint; 1649 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; 1650 dfa->nodes[dup_idx].duplicated = 1; 1651 1652 /* Store the index of the original node. */ 1653 dfa->org_indices[dup_idx] = org_idx; 1654 } 1655 return dup_idx; 1656 } 1657 1658 static reg_errcode_t 1659 calc_inveclosure (re_dfa_t *dfa) 1660 { 1661 Idx src, idx; 1662 bool ok; 1663 for (idx = 0; idx < dfa->nodes_len; ++idx) 1664 re_node_set_init_empty (dfa->inveclosures + idx); 1665 1666 for (src = 0; src < dfa->nodes_len; ++src) 1667 { 1668 Idx *elems = dfa->eclosures[src].elems; 1669 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1670 { 1671 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1672 if (BE (! ok, 0)) 1673 return REG_ESPACE; 1674 } 1675 } 1676 1677 return REG_NOERROR; 1678 } 1679 1680 /* Calculate "eclosure" for all the node in DFA. */ 1681 1682 static reg_errcode_t 1683 calc_eclosure (re_dfa_t *dfa) 1684 { 1685 Idx node_idx; 1686 bool incomplete; 1687 #ifdef DEBUG 1688 assert (dfa->nodes_len > 0); 1689 #endif 1690 incomplete = false; 1691 /* For each nodes, calculate epsilon closure. */ 1692 for (node_idx = 0; ; ++node_idx) 1693 { 1694 reg_errcode_t err; 1695 re_node_set eclosure_elem; 1696 if (node_idx == dfa->nodes_len) 1697 { 1698 if (!incomplete) 1699 break; 1700 incomplete = false; 1701 node_idx = 0; 1702 } 1703 1704 #ifdef DEBUG 1705 assert (dfa->eclosures[node_idx].nelem != REG_MISSING); 1706 #endif 1707 1708 /* If we have already calculated, skip it. */ 1709 if (dfa->eclosures[node_idx].nelem != 0) 1710 continue; 1711 /* Calculate epsilon closure of 'node_idx'. */ 1712 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); 1713 if (BE (err != REG_NOERROR, 0)) 1714 return err; 1715 1716 if (dfa->eclosures[node_idx].nelem == 0) 1717 { 1718 incomplete = true; 1719 re_node_set_free (&eclosure_elem); 1720 } 1721 } 1722 return REG_NOERROR; 1723 } 1724 1725 /* Calculate epsilon closure of NODE. */ 1726 1727 static reg_errcode_t 1728 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) 1729 { 1730 reg_errcode_t err; 1731 Idx i; 1732 re_node_set eclosure; 1733 bool ok; 1734 bool incomplete = false; 1735 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1736 if (BE (err != REG_NOERROR, 0)) 1737 return err; 1738 1739 /* This indicates that we are calculating this node now. 1740 We reference this value to avoid infinite loop. */ 1741 dfa->eclosures[node].nelem = REG_MISSING; 1742 1743 /* If the current node has constraints, duplicate all nodes 1744 since they must inherit the constraints. */ 1745 if (dfa->nodes[node].constraint 1746 && dfa->edests[node].nelem 1747 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1748 { 1749 err = duplicate_node_closure (dfa, node, node, node, 1750 dfa->nodes[node].constraint); 1751 if (BE (err != REG_NOERROR, 0)) 1752 return err; 1753 } 1754 1755 /* Expand each epsilon destination nodes. */ 1756 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1757 for (i = 0; i < dfa->edests[node].nelem; ++i) 1758 { 1759 re_node_set eclosure_elem; 1760 Idx edest = dfa->edests[node].elems[i]; 1761 /* If calculating the epsilon closure of 'edest' is in progress, 1762 return intermediate result. */ 1763 if (dfa->eclosures[edest].nelem == REG_MISSING) 1764 { 1765 incomplete = true; 1766 continue; 1767 } 1768 /* If we haven't calculated the epsilon closure of 'edest' yet, 1769 calculate now. Otherwise use calculated epsilon closure. */ 1770 if (dfa->eclosures[edest].nelem == 0) 1771 { 1772 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false); 1773 if (BE (err != REG_NOERROR, 0)) 1774 return err; 1775 } 1776 else 1777 eclosure_elem = dfa->eclosures[edest]; 1778 /* Merge the epsilon closure of 'edest'. */ 1779 err = re_node_set_merge (&eclosure, &eclosure_elem); 1780 if (BE (err != REG_NOERROR, 0)) 1781 return err; 1782 /* If the epsilon closure of 'edest' is incomplete, 1783 the epsilon closure of this node is also incomplete. */ 1784 if (dfa->eclosures[edest].nelem == 0) 1785 { 1786 incomplete = true; 1787 re_node_set_free (&eclosure_elem); 1788 } 1789 } 1790 1791 /* An epsilon closure includes itself. */ 1792 ok = re_node_set_insert (&eclosure, node); 1793 if (BE (! ok, 0)) 1794 return REG_ESPACE; 1795 if (incomplete && !root) 1796 dfa->eclosures[node].nelem = 0; 1797 else 1798 dfa->eclosures[node] = eclosure; 1799 *new_set = eclosure; 1800 return REG_NOERROR; 1801 } 1802 1803 /* Functions for token which are used in the parser. */ 1804 1805 /* Fetch a token from INPUT. 1806 We must not use this function inside bracket expressions. */ 1807 1808 static void 1809 internal_function 1810 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1811 { 1812 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1813 } 1814 1815 /* Peek a token from INPUT, and return the length of the token. 1816 We must not use this function inside bracket expressions. */ 1817 1818 static int 1819 internal_function 1820 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1821 { 1822 unsigned char c; 1823 1824 if (re_string_eoi (input)) 1825 { 1826 token->type = END_OF_RE; 1827 return 0; 1828 } 1829 1830 c = re_string_peek_byte (input, 0); 1831 token->opr.c = c; 1832 1833 token->word_char = 0; 1834 #ifdef RE_ENABLE_I18N 1835 token->mb_partial = 0; 1836 if (input->mb_cur_max > 1 && 1837 !re_string_first_byte (input, re_string_cur_idx (input))) 1838 { 1839 token->type = CHARACTER; 1840 token->mb_partial = 1; 1841 return 1; 1842 } 1843 #endif 1844 if (c == '\\') 1845 { 1846 unsigned char c2; 1847 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1848 { 1849 token->type = BACK_SLASH; 1850 return 1; 1851 } 1852 1853 c2 = re_string_peek_byte_case (input, 1); 1854 token->opr.c = c2; 1855 token->type = CHARACTER; 1856 #ifdef RE_ENABLE_I18N 1857 if (input->mb_cur_max > 1) 1858 { 1859 wint_t wc = re_string_wchar_at (input, 1860 re_string_cur_idx (input) + 1); 1861 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1862 } 1863 else 1864 #endif 1865 token->word_char = IS_WORD_CHAR (c2) != 0; 1866 1867 switch (c2) 1868 { 1869 case '|': 1870 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1871 token->type = OP_ALT; 1872 break; 1873 case '1': case '2': case '3': case '4': case '5': 1874 case '6': case '7': case '8': case '9': 1875 if (!(syntax & RE_NO_BK_REFS)) 1876 { 1877 token->type = OP_BACK_REF; 1878 token->opr.idx = c2 - '1'; 1879 } 1880 break; 1881 case '<': 1882 if (!(syntax & RE_NO_GNU_OPS)) 1883 { 1884 token->type = ANCHOR; 1885 token->opr.ctx_type = WORD_FIRST; 1886 } 1887 break; 1888 case '>': 1889 if (!(syntax & RE_NO_GNU_OPS)) 1890 { 1891 token->type = ANCHOR; 1892 token->opr.ctx_type = WORD_LAST; 1893 } 1894 break; 1895 case 'b': 1896 if (!(syntax & RE_NO_GNU_OPS)) 1897 { 1898 token->type = ANCHOR; 1899 token->opr.ctx_type = WORD_DELIM; 1900 } 1901 break; 1902 case 'B': 1903 if (!(syntax & RE_NO_GNU_OPS)) 1904 { 1905 token->type = ANCHOR; 1906 token->opr.ctx_type = NOT_WORD_DELIM; 1907 } 1908 break; 1909 case 'w': 1910 if (!(syntax & RE_NO_GNU_OPS)) 1911 token->type = OP_WORD; 1912 break; 1913 case 'W': 1914 if (!(syntax & RE_NO_GNU_OPS)) 1915 token->type = OP_NOTWORD; 1916 break; 1917 case 's': 1918 if (!(syntax & RE_NO_GNU_OPS)) 1919 token->type = OP_SPACE; 1920 break; 1921 case 'S': 1922 if (!(syntax & RE_NO_GNU_OPS)) 1923 token->type = OP_NOTSPACE; 1924 break; 1925 case '`': 1926 if (!(syntax & RE_NO_GNU_OPS)) 1927 { 1928 token->type = ANCHOR; 1929 token->opr.ctx_type = BUF_FIRST; 1930 } 1931 break; 1932 case '\'': 1933 if (!(syntax & RE_NO_GNU_OPS)) 1934 { 1935 token->type = ANCHOR; 1936 token->opr.ctx_type = BUF_LAST; 1937 } 1938 break; 1939 case '(': 1940 if (!(syntax & RE_NO_BK_PARENS)) 1941 token->type = OP_OPEN_SUBEXP; 1942 break; 1943 case ')': 1944 if (!(syntax & RE_NO_BK_PARENS)) 1945 token->type = OP_CLOSE_SUBEXP; 1946 break; 1947 case '+': 1948 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1949 token->type = OP_DUP_PLUS; 1950 break; 1951 case '?': 1952 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1953 token->type = OP_DUP_QUESTION; 1954 break; 1955 case '{': 1956 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1957 token->type = OP_OPEN_DUP_NUM; 1958 break; 1959 case '}': 1960 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1961 token->type = OP_CLOSE_DUP_NUM; 1962 break; 1963 default: 1964 break; 1965 } 1966 return 2; 1967 } 1968 1969 token->type = CHARACTER; 1970 #ifdef RE_ENABLE_I18N 1971 if (input->mb_cur_max > 1) 1972 { 1973 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1974 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1975 } 1976 else 1977 #endif 1978 token->word_char = IS_WORD_CHAR (token->opr.c); 1979 1980 switch (c) 1981 { 1982 case '\n': 1983 if (syntax & RE_NEWLINE_ALT) 1984 token->type = OP_ALT; 1985 break; 1986 case '|': 1987 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1988 token->type = OP_ALT; 1989 break; 1990 case '*': 1991 token->type = OP_DUP_ASTERISK; 1992 break; 1993 case '+': 1994 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1995 token->type = OP_DUP_PLUS; 1996 break; 1997 case '?': 1998 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1999 token->type = OP_DUP_QUESTION; 2000 break; 2001 case '{': 2002 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 2003 token->type = OP_OPEN_DUP_NUM; 2004 break; 2005 case '}': 2006 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 2007 token->type = OP_CLOSE_DUP_NUM; 2008 break; 2009 case '(': 2010 if (syntax & RE_NO_BK_PARENS) 2011 token->type = OP_OPEN_SUBEXP; 2012 break; 2013 case ')': 2014 if (syntax & RE_NO_BK_PARENS) 2015 token->type = OP_CLOSE_SUBEXP; 2016 break; 2017 case '[': 2018 token->type = OP_OPEN_BRACKET; 2019 break; 2020 case '.': 2021 token->type = OP_PERIOD; 2022 break; 2023 case '^': 2024 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 2025 re_string_cur_idx (input) != 0) 2026 { 2027 char prev = re_string_peek_byte (input, -1); 2028 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 2029 break; 2030 } 2031 token->type = ANCHOR; 2032 token->opr.ctx_type = LINE_FIRST; 2033 break; 2034 case '$': 2035 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 2036 re_string_cur_idx (input) + 1 != re_string_length (input)) 2037 { 2038 re_token_t next; 2039 re_string_skip_bytes (input, 1); 2040 peek_token (&next, input, syntax); 2041 re_string_skip_bytes (input, -1); 2042 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 2043 break; 2044 } 2045 token->type = ANCHOR; 2046 token->opr.ctx_type = LINE_LAST; 2047 break; 2048 default: 2049 break; 2050 } 2051 return 1; 2052 } 2053 2054 /* Peek a token from INPUT, and return the length of the token. 2055 We must not use this function out of bracket expressions. */ 2056 2057 static int 2058 internal_function 2059 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 2060 { 2061 unsigned char c; 2062 if (re_string_eoi (input)) 2063 { 2064 token->type = END_OF_RE; 2065 return 0; 2066 } 2067 c = re_string_peek_byte (input, 0); 2068 token->opr.c = c; 2069 2070 #ifdef RE_ENABLE_I18N 2071 if (input->mb_cur_max > 1 && 2072 !re_string_first_byte (input, re_string_cur_idx (input))) 2073 { 2074 token->type = CHARACTER; 2075 return 1; 2076 } 2077 #endif /* RE_ENABLE_I18N */ 2078 2079 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2080 && re_string_cur_idx (input) + 1 < re_string_length (input)) 2081 { 2082 /* In this case, '\' escape a character. */ 2083 unsigned char c2; 2084 re_string_skip_bytes (input, 1); 2085 c2 = re_string_peek_byte (input, 0); 2086 token->opr.c = c2; 2087 token->type = CHARACTER; 2088 return 1; 2089 } 2090 if (c == '[') /* '[' is a special char in a bracket exps. */ 2091 { 2092 unsigned char c2; 2093 int token_len; 2094 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2095 c2 = re_string_peek_byte (input, 1); 2096 else 2097 c2 = 0; 2098 token->opr.c = c2; 2099 token_len = 2; 2100 switch (c2) 2101 { 2102 case '.': 2103 token->type = OP_OPEN_COLL_ELEM; 2104 break; 2105 case '=': 2106 token->type = OP_OPEN_EQUIV_CLASS; 2107 break; 2108 case ':': 2109 if (syntax & RE_CHAR_CLASSES) 2110 { 2111 token->type = OP_OPEN_CHAR_CLASS; 2112 break; 2113 } 2114 /* else fall through. */ 2115 default: 2116 token->type = CHARACTER; 2117 token->opr.c = c; 2118 token_len = 1; 2119 break; 2120 } 2121 return token_len; 2122 } 2123 switch (c) 2124 { 2125 case '-': 2126 token->type = OP_CHARSET_RANGE; 2127 break; 2128 case ']': 2129 token->type = OP_CLOSE_BRACKET; 2130 break; 2131 case '^': 2132 token->type = OP_NON_MATCH_LIST; 2133 break; 2134 default: 2135 token->type = CHARACTER; 2136 } 2137 return 1; 2138 } 2139 2140 /* Functions for parser. */ 2141 2142 /* Entry point of the parser. 2143 Parse the regular expression REGEXP and return the structure tree. 2144 If an error occurs, ERR is set by error code, and return NULL. 2145 This function build the following tree, from regular expression <reg_exp>: 2146 CAT 2147 / \ 2148 / \ 2149 <reg_exp> EOR 2150 2151 CAT means concatenation. 2152 EOR means end of regular expression. */ 2153 2154 static bin_tree_t * 2155 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2156 reg_errcode_t *err) 2157 { 2158 re_dfa_t *dfa = preg->buffer; 2159 bin_tree_t *tree, *eor, *root; 2160 re_token_t current_token; 2161 dfa->syntax = syntax; 2162 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2163 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2164 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2165 return NULL; 2166 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2167 if (tree != NULL) 2168 root = create_tree (dfa, tree, eor, CONCAT); 2169 else 2170 root = eor; 2171 if (BE (eor == NULL || root == NULL, 0)) 2172 { 2173 *err = REG_ESPACE; 2174 return NULL; 2175 } 2176 return root; 2177 } 2178 2179 /* This function build the following tree, from regular expression 2180 <branch1>|<branch2>: 2181 ALT 2182 / \ 2183 / \ 2184 <branch1> <branch2> 2185 2186 ALT means alternative, which represents the operator '|'. */ 2187 2188 static bin_tree_t * 2189 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2190 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2191 { 2192 re_dfa_t *dfa = preg->buffer; 2193 bin_tree_t *tree, *branch = NULL; 2194 bitset_word_t initial_bkref_map = dfa->completed_bkref_map; 2195 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2196 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2197 return NULL; 2198 2199 while (token->type == OP_ALT) 2200 { 2201 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2202 if (token->type != OP_ALT && token->type != END_OF_RE 2203 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2204 { 2205 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map; 2206 dfa->completed_bkref_map = initial_bkref_map; 2207 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2208 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2209 { 2210 if (tree != NULL) 2211 postorder (tree, free_tree, NULL); 2212 return NULL; 2213 } 2214 dfa->completed_bkref_map |= accumulated_bkref_map; 2215 } 2216 else 2217 branch = NULL; 2218 tree = create_tree (dfa, tree, branch, OP_ALT); 2219 if (BE (tree == NULL, 0)) 2220 { 2221 *err = REG_ESPACE; 2222 return NULL; 2223 } 2224 } 2225 return tree; 2226 } 2227 2228 /* This function build the following tree, from regular expression 2229 <exp1><exp2>: 2230 CAT 2231 / \ 2232 / \ 2233 <exp1> <exp2> 2234 2235 CAT means concatenation. */ 2236 2237 static bin_tree_t * 2238 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2239 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2240 { 2241 bin_tree_t *tree, *expr; 2242 re_dfa_t *dfa = preg->buffer; 2243 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2244 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2245 return NULL; 2246 2247 while (token->type != OP_ALT && token->type != END_OF_RE 2248 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2249 { 2250 expr = parse_expression (regexp, preg, token, syntax, nest, err); 2251 if (BE (*err != REG_NOERROR && expr == NULL, 0)) 2252 { 2253 if (tree != NULL) 2254 postorder (tree, free_tree, NULL); 2255 return NULL; 2256 } 2257 if (tree != NULL && expr != NULL) 2258 { 2259 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT); 2260 if (newtree == NULL) 2261 { 2262 postorder (expr, free_tree, NULL); 2263 postorder (tree, free_tree, NULL); 2264 *err = REG_ESPACE; 2265 return NULL; 2266 } 2267 tree = newtree; 2268 } 2269 else if (tree == NULL) 2270 tree = expr; 2271 /* Otherwise expr == NULL, we don't need to create new tree. */ 2272 } 2273 return tree; 2274 } 2275 2276 /* This function build the following tree, from regular expression a*: 2277 * 2278 | 2279 a 2280 */ 2281 2282 static bin_tree_t * 2283 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2284 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2285 { 2286 re_dfa_t *dfa = preg->buffer; 2287 bin_tree_t *tree; 2288 switch (token->type) 2289 { 2290 case CHARACTER: 2291 tree = create_token_tree (dfa, NULL, NULL, token); 2292 if (BE (tree == NULL, 0)) 2293 { 2294 *err = REG_ESPACE; 2295 return NULL; 2296 } 2297 #ifdef RE_ENABLE_I18N 2298 if (dfa->mb_cur_max > 1) 2299 { 2300 while (!re_string_eoi (regexp) 2301 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2302 { 2303 bin_tree_t *mbc_remain; 2304 fetch_token (token, regexp, syntax); 2305 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2306 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2307 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2308 { 2309 *err = REG_ESPACE; 2310 return NULL; 2311 } 2312 } 2313 } 2314 #endif 2315 break; 2316 case OP_OPEN_SUBEXP: 2317 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2318 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2319 return NULL; 2320 break; 2321 case OP_OPEN_BRACKET: 2322 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2323 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2324 return NULL; 2325 break; 2326 case OP_BACK_REF: 2327 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2328 { 2329 *err = REG_ESUBREG; 2330 return NULL; 2331 } 2332 dfa->used_bkref_map |= 1 << token->opr.idx; 2333 tree = create_token_tree (dfa, NULL, NULL, token); 2334 if (BE (tree == NULL, 0)) 2335 { 2336 *err = REG_ESPACE; 2337 return NULL; 2338 } 2339 ++dfa->nbackref; 2340 dfa->has_mb_node = 1; 2341 break; 2342 case OP_OPEN_DUP_NUM: 2343 if (syntax & RE_CONTEXT_INVALID_DUP) 2344 { 2345 *err = REG_BADRPT; 2346 return NULL; 2347 } 2348 /* FALLTHROUGH */ 2349 case OP_DUP_ASTERISK: 2350 case OP_DUP_PLUS: 2351 case OP_DUP_QUESTION: 2352 if (syntax & RE_CONTEXT_INVALID_OPS) 2353 { 2354 *err = REG_BADRPT; 2355 return NULL; 2356 } 2357 else if (syntax & RE_CONTEXT_INDEP_OPS) 2358 { 2359 fetch_token (token, regexp, syntax); 2360 return parse_expression (regexp, preg, token, syntax, nest, err); 2361 } 2362 /* else fall through */ 2363 case OP_CLOSE_SUBEXP: 2364 if ((token->type == OP_CLOSE_SUBEXP) && 2365 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2366 { 2367 *err = REG_ERPAREN; 2368 return NULL; 2369 } 2370 /* else fall through */ 2371 case OP_CLOSE_DUP_NUM: 2372 /* We treat it as a normal character. */ 2373 2374 /* Then we can these characters as normal characters. */ 2375 token->type = CHARACTER; 2376 /* mb_partial and word_char bits should be initialized already 2377 by peek_token. */ 2378 tree = create_token_tree (dfa, NULL, NULL, token); 2379 if (BE (tree == NULL, 0)) 2380 { 2381 *err = REG_ESPACE; 2382 return NULL; 2383 } 2384 break; 2385 case ANCHOR: 2386 if ((token->opr.ctx_type 2387 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2388 && dfa->word_ops_used == 0) 2389 init_word_char (dfa); 2390 if (token->opr.ctx_type == WORD_DELIM 2391 || token->opr.ctx_type == NOT_WORD_DELIM) 2392 { 2393 bin_tree_t *tree_first, *tree_last; 2394 if (token->opr.ctx_type == WORD_DELIM) 2395 { 2396 token->opr.ctx_type = WORD_FIRST; 2397 tree_first = create_token_tree (dfa, NULL, NULL, token); 2398 token->opr.ctx_type = WORD_LAST; 2399 } 2400 else 2401 { 2402 token->opr.ctx_type = INSIDE_WORD; 2403 tree_first = create_token_tree (dfa, NULL, NULL, token); 2404 token->opr.ctx_type = INSIDE_NOTWORD; 2405 } 2406 tree_last = create_token_tree (dfa, NULL, NULL, token); 2407 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2408 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2409 { 2410 *err = REG_ESPACE; 2411 return NULL; 2412 } 2413 } 2414 else 2415 { 2416 tree = create_token_tree (dfa, NULL, NULL, token); 2417 if (BE (tree == NULL, 0)) 2418 { 2419 *err = REG_ESPACE; 2420 return NULL; 2421 } 2422 } 2423 /* We must return here, since ANCHORs can't be followed 2424 by repetition operators. 2425 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2426 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2427 fetch_token (token, regexp, syntax); 2428 return tree; 2429 case OP_PERIOD: 2430 tree = create_token_tree (dfa, NULL, NULL, token); 2431 if (BE (tree == NULL, 0)) 2432 { 2433 *err = REG_ESPACE; 2434 return NULL; 2435 } 2436 if (dfa->mb_cur_max > 1) 2437 dfa->has_mb_node = 1; 2438 break; 2439 case OP_WORD: 2440 case OP_NOTWORD: 2441 tree = build_charclass_op (dfa, regexp->trans, 2442 "alnum", 2443 "_", 2444 token->type == OP_NOTWORD, err); 2445 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2446 return NULL; 2447 break; 2448 case OP_SPACE: 2449 case OP_NOTSPACE: 2450 tree = build_charclass_op (dfa, regexp->trans, 2451 "space", 2452 "", 2453 token->type == OP_NOTSPACE, err); 2454 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2455 return NULL; 2456 break; 2457 case OP_ALT: 2458 case END_OF_RE: 2459 return NULL; 2460 case BACK_SLASH: 2461 *err = REG_EESCAPE; 2462 return NULL; 2463 default: 2464 /* Must not happen? */ 2465 #ifdef DEBUG 2466 assert (0); 2467 #endif 2468 return NULL; 2469 } 2470 fetch_token (token, regexp, syntax); 2471 2472 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2473 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2474 { 2475 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, 2476 syntax, err); 2477 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0)) 2478 { 2479 if (tree != NULL) 2480 postorder (tree, free_tree, NULL); 2481 return NULL; 2482 } 2483 tree = dup_tree; 2484 /* In BRE consecutive duplications are not allowed. */ 2485 if ((syntax & RE_CONTEXT_INVALID_DUP) 2486 && (token->type == OP_DUP_ASTERISK 2487 || token->type == OP_OPEN_DUP_NUM)) 2488 { 2489 if (tree != NULL) 2490 postorder (tree, free_tree, NULL); 2491 *err = REG_BADRPT; 2492 return NULL; 2493 } 2494 } 2495 2496 return tree; 2497 } 2498 2499 /* This function build the following tree, from regular expression 2500 (<reg_exp>): 2501 SUBEXP 2502 | 2503 <reg_exp> 2504 */ 2505 2506 static bin_tree_t * 2507 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2508 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2509 { 2510 re_dfa_t *dfa = preg->buffer; 2511 bin_tree_t *tree; 2512 size_t cur_nsub; 2513 cur_nsub = preg->re_nsub++; 2514 2515 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2516 2517 /* The subexpression may be a null string. */ 2518 if (token->type == OP_CLOSE_SUBEXP) 2519 tree = NULL; 2520 else 2521 { 2522 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2523 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2524 { 2525 if (tree != NULL) 2526 postorder (tree, free_tree, NULL); 2527 *err = REG_EPAREN; 2528 } 2529 if (BE (*err != REG_NOERROR, 0)) 2530 return NULL; 2531 } 2532 2533 if (cur_nsub <= '9' - '1') 2534 dfa->completed_bkref_map |= 1 << cur_nsub; 2535 2536 tree = create_tree (dfa, tree, NULL, SUBEXP); 2537 if (BE (tree == NULL, 0)) 2538 { 2539 *err = REG_ESPACE; 2540 return NULL; 2541 } 2542 tree->token.opr.idx = cur_nsub; 2543 return tree; 2544 } 2545 2546 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2547 2548 static bin_tree_t * 2549 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2550 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2551 { 2552 bin_tree_t *tree = NULL, *old_tree = NULL; 2553 Idx i, start, end, start_idx = re_string_cur_idx (regexp); 2554 re_token_t start_token = *token; 2555 2556 if (token->type == OP_OPEN_DUP_NUM) 2557 { 2558 end = 0; 2559 start = fetch_number (regexp, token, syntax); 2560 if (start == REG_MISSING) 2561 { 2562 if (token->type == CHARACTER && token->opr.c == ',') 2563 start = 0; /* We treat "{,m}" as "{0,m}". */ 2564 else 2565 { 2566 *err = REG_BADBR; /* <re>{} is invalid. */ 2567 return NULL; 2568 } 2569 } 2570 if (BE (start != REG_ERROR, 1)) 2571 { 2572 /* We treat "{n}" as "{n,n}". */ 2573 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2574 : ((token->type == CHARACTER && token->opr.c == ',') 2575 ? fetch_number (regexp, token, syntax) : REG_ERROR)); 2576 } 2577 if (BE (start == REG_ERROR || end == REG_ERROR, 0)) 2578 { 2579 /* Invalid sequence. */ 2580 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2581 { 2582 if (token->type == END_OF_RE) 2583 *err = REG_EBRACE; 2584 else 2585 *err = REG_BADBR; 2586 2587 return NULL; 2588 } 2589 2590 /* If the syntax bit is set, rollback. */ 2591 re_string_set_index (regexp, start_idx); 2592 *token = start_token; 2593 token->type = CHARACTER; 2594 /* mb_partial and word_char bits should be already initialized by 2595 peek_token. */ 2596 return elem; 2597 } 2598 2599 if (BE ((end != REG_MISSING && start > end) 2600 || token->type != OP_CLOSE_DUP_NUM, 0)) 2601 { 2602 /* First number greater than second. */ 2603 *err = REG_BADBR; 2604 return NULL; 2605 } 2606 2607 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0)) 2608 { 2609 *err = REG_ESIZE; 2610 return NULL; 2611 } 2612 } 2613 else 2614 { 2615 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2616 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING; 2617 } 2618 2619 fetch_token (token, regexp, syntax); 2620 2621 if (BE (elem == NULL, 0)) 2622 return NULL; 2623 if (BE (start == 0 && end == 0, 0)) 2624 { 2625 postorder (elem, free_tree, NULL); 2626 return NULL; 2627 } 2628 2629 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2630 if (BE (start > 0, 0)) 2631 { 2632 tree = elem; 2633 for (i = 2; i <= start; ++i) 2634 { 2635 elem = duplicate_tree (elem, dfa); 2636 tree = create_tree (dfa, tree, elem, CONCAT); 2637 if (BE (elem == NULL || tree == NULL, 0)) 2638 goto parse_dup_op_espace; 2639 } 2640 2641 if (start == end) 2642 return tree; 2643 2644 /* Duplicate ELEM before it is marked optional. */ 2645 elem = duplicate_tree (elem, dfa); 2646 if (BE (elem == NULL, 0)) 2647 goto parse_dup_op_espace; 2648 old_tree = tree; 2649 } 2650 else 2651 old_tree = NULL; 2652 2653 if (elem->token.type == SUBEXP) 2654 { 2655 uintptr_t subidx = elem->token.opr.idx; 2656 postorder (elem, mark_opt_subexp, (void *) subidx); 2657 } 2658 2659 tree = create_tree (dfa, elem, NULL, 2660 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); 2661 if (BE (tree == NULL, 0)) 2662 goto parse_dup_op_espace; 2663 2664 /* From gnulib's "intprops.h": 2665 True if the arithmetic type T is signed. */ 2666 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1)) 2667 2668 /* This loop is actually executed only when end != REG_MISSING, 2669 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2670 already created the start+1-th copy. */ 2671 if (TYPE_SIGNED (Idx) || end != REG_MISSING) 2672 for (i = start + 2; i <= end; ++i) 2673 { 2674 elem = duplicate_tree (elem, dfa); 2675 tree = create_tree (dfa, tree, elem, CONCAT); 2676 if (BE (elem == NULL || tree == NULL, 0)) 2677 goto parse_dup_op_espace; 2678 2679 tree = create_tree (dfa, tree, NULL, OP_ALT); 2680 if (BE (tree == NULL, 0)) 2681 goto parse_dup_op_espace; 2682 } 2683 2684 if (old_tree) 2685 tree = create_tree (dfa, old_tree, tree, CONCAT); 2686 2687 return tree; 2688 2689 parse_dup_op_espace: 2690 *err = REG_ESPACE; 2691 return NULL; 2692 } 2693 2694 /* Size of the names for collating symbol/equivalence_class/character_class. 2695 I'm not sure, but maybe enough. */ 2696 #define BRACKET_NAME_BUF_SIZE 32 2697 2698 #ifndef _LIBC 2699 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2700 Build the range expression which starts from START_ELEM, and ends 2701 at END_ELEM. The result are written to MBCSET and SBCSET. 2702 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2703 mbcset->range_ends, is a pointer argument since we may 2704 update it. */ 2705 2706 static reg_errcode_t 2707 internal_function 2708 # ifdef RE_ENABLE_I18N 2709 build_range_exp (const reg_syntax_t syntax, 2710 bitset_t sbcset, 2711 re_charset_t *mbcset, 2712 Idx *range_alloc, 2713 const bracket_elem_t *start_elem, 2714 const bracket_elem_t *end_elem) 2715 # else /* not RE_ENABLE_I18N */ 2716 build_range_exp (const reg_syntax_t syntax, 2717 bitset_t sbcset, 2718 const bracket_elem_t *start_elem, 2719 const bracket_elem_t *end_elem) 2720 # endif /* not RE_ENABLE_I18N */ 2721 { 2722 unsigned int start_ch, end_ch; 2723 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2724 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2725 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2726 0)) 2727 return REG_ERANGE; 2728 2729 /* We can handle no multi character collating elements without libc 2730 support. */ 2731 if (BE ((start_elem->type == COLL_SYM 2732 && strlen ((char *) start_elem->opr.name) > 1) 2733 || (end_elem->type == COLL_SYM 2734 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2735 return REG_ECOLLATE; 2736 2737 # ifdef RE_ENABLE_I18N 2738 { 2739 wchar_t wc; 2740 wint_t start_wc; 2741 wint_t end_wc; 2742 2743 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2744 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2745 : 0)); 2746 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2747 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2748 : 0)); 2749 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2750 ? __btowc (start_ch) : start_elem->opr.wch); 2751 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2752 ? __btowc (end_ch) : end_elem->opr.wch); 2753 if (start_wc == WEOF || end_wc == WEOF) 2754 return REG_ECOLLATE; 2755 else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0)) 2756 return REG_ERANGE; 2757 2758 /* Got valid collation sequence values, add them as a new entry. 2759 However, for !_LIBC we have no collation elements: if the 2760 character set is single byte, the single byte character set 2761 that we build below suffices. parse_bracket_exp passes 2762 no MBCSET if dfa->mb_cur_max == 1. */ 2763 if (mbcset) 2764 { 2765 /* Check the space of the arrays. */ 2766 if (BE (*range_alloc == mbcset->nranges, 0)) 2767 { 2768 /* There is not enough space, need realloc. */ 2769 wchar_t *new_array_start, *new_array_end; 2770 Idx new_nranges; 2771 2772 /* +1 in case of mbcset->nranges is 0. */ 2773 new_nranges = 2 * mbcset->nranges + 1; 2774 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2775 are NULL if *range_alloc == 0. */ 2776 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2777 new_nranges); 2778 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2779 new_nranges); 2780 2781 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2782 return REG_ESPACE; 2783 2784 mbcset->range_starts = new_array_start; 2785 mbcset->range_ends = new_array_end; 2786 *range_alloc = new_nranges; 2787 } 2788 2789 mbcset->range_starts[mbcset->nranges] = start_wc; 2790 mbcset->range_ends[mbcset->nranges++] = end_wc; 2791 } 2792 2793 /* Build the table for single byte characters. */ 2794 for (wc = 0; wc < SBC_MAX; ++wc) 2795 { 2796 if (start_wc <= wc && wc <= end_wc) 2797 bitset_set (sbcset, wc); 2798 } 2799 } 2800 # else /* not RE_ENABLE_I18N */ 2801 { 2802 unsigned int ch; 2803 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2804 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2805 : 0)); 2806 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2807 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2808 : 0)); 2809 if (start_ch > end_ch) 2810 return REG_ERANGE; 2811 /* Build the table for single byte characters. */ 2812 for (ch = 0; ch < SBC_MAX; ++ch) 2813 if (start_ch <= ch && ch <= end_ch) 2814 bitset_set (sbcset, ch); 2815 } 2816 # endif /* not RE_ENABLE_I18N */ 2817 return REG_NOERROR; 2818 } 2819 #endif /* not _LIBC */ 2820 2821 #ifndef _LIBC 2822 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2823 Build the collating element which is represented by NAME. 2824 The result are written to MBCSET and SBCSET. 2825 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2826 pointer argument since we may update it. */ 2827 2828 static reg_errcode_t 2829 internal_function 2830 # ifdef RE_ENABLE_I18N 2831 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, 2832 Idx *coll_sym_alloc, const unsigned char *name) 2833 # else /* not RE_ENABLE_I18N */ 2834 build_collating_symbol (bitset_t sbcset, const unsigned char *name) 2835 # endif /* not RE_ENABLE_I18N */ 2836 { 2837 size_t name_len = strlen ((const char *) name); 2838 if (BE (name_len != 1, 0)) 2839 return REG_ECOLLATE; 2840 else 2841 { 2842 bitset_set (sbcset, name[0]); 2843 return REG_NOERROR; 2844 } 2845 } 2846 #endif /* not _LIBC */ 2847 2848 /* This function parse bracket expression like "[abc]", "[a-c]", 2849 "[[.a-a.]]" etc. */ 2850 2851 static bin_tree_t * 2852 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2853 reg_syntax_t syntax, reg_errcode_t *err) 2854 { 2855 #ifdef _LIBC 2856 const unsigned char *collseqmb; 2857 const char *collseqwc; 2858 uint32_t nrules; 2859 int32_t table_size; 2860 const int32_t *symb_table; 2861 const unsigned char *extra; 2862 2863 /* Local function for parse_bracket_exp used in _LIBC environment. 2864 Seek the collating symbol entry corresponding to NAME. 2865 Return the index of the symbol in the SYMB_TABLE, 2866 or -1 if not found. */ 2867 2868 auto inline int32_t 2869 __attribute__ ((always_inline)) 2870 seek_collating_symbol_entry (const unsigned char *name, size_t name_len) 2871 { 2872 int32_t elem; 2873 2874 for (elem = 0; elem < table_size; elem++) 2875 if (symb_table[2 * elem] != 0) 2876 { 2877 int32_t idx = symb_table[2 * elem + 1]; 2878 /* Skip the name of collating element name. */ 2879 idx += 1 + extra[idx]; 2880 if (/* Compare the length of the name. */ 2881 name_len == extra[idx] 2882 /* Compare the name. */ 2883 && memcmp (name, &extra[idx + 1], name_len) == 0) 2884 /* Yep, this is the entry. */ 2885 return elem; 2886 } 2887 return -1; 2888 } 2889 2890 /* Local function for parse_bracket_exp used in _LIBC environment. 2891 Look up the collation sequence value of BR_ELEM. 2892 Return the value if succeeded, UINT_MAX otherwise. */ 2893 2894 auto inline unsigned int 2895 __attribute__ ((always_inline)) 2896 lookup_collation_sequence_value (bracket_elem_t *br_elem) 2897 { 2898 if (br_elem->type == SB_CHAR) 2899 { 2900 /* 2901 if (MB_CUR_MAX == 1) 2902 */ 2903 if (nrules == 0) 2904 return collseqmb[br_elem->opr.ch]; 2905 else 2906 { 2907 wint_t wc = __btowc (br_elem->opr.ch); 2908 return __collseq_table_lookup (collseqwc, wc); 2909 } 2910 } 2911 else if (br_elem->type == MB_CHAR) 2912 { 2913 if (nrules != 0) 2914 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2915 } 2916 else if (br_elem->type == COLL_SYM) 2917 { 2918 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2919 if (nrules != 0) 2920 { 2921 int32_t elem, idx; 2922 elem = seek_collating_symbol_entry (br_elem->opr.name, 2923 sym_name_len); 2924 if (elem != -1) 2925 { 2926 /* We found the entry. */ 2927 idx = symb_table[2 * elem + 1]; 2928 /* Skip the name of collating element name. */ 2929 idx += 1 + extra[idx]; 2930 /* Skip the byte sequence of the collating element. */ 2931 idx += 1 + extra[idx]; 2932 /* Adjust for the alignment. */ 2933 idx = (idx + 3) & ~3; 2934 /* Skip the multibyte collation sequence value. */ 2935 idx += sizeof (unsigned int); 2936 /* Skip the wide char sequence of the collating element. */ 2937 idx += sizeof (unsigned int) * 2938 (1 + *(unsigned int *) (extra + idx)); 2939 /* Return the collation sequence value. */ 2940 return *(unsigned int *) (extra + idx); 2941 } 2942 else if (sym_name_len == 1) 2943 { 2944 /* No valid character. Match it as a single byte 2945 character. */ 2946 return collseqmb[br_elem->opr.name[0]]; 2947 } 2948 } 2949 else if (sym_name_len == 1) 2950 return collseqmb[br_elem->opr.name[0]]; 2951 } 2952 return UINT_MAX; 2953 } 2954 2955 /* Local function for parse_bracket_exp used in _LIBC environment. 2956 Build the range expression which starts from START_ELEM, and ends 2957 at END_ELEM. The result are written to MBCSET and SBCSET. 2958 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2959 mbcset->range_ends, is a pointer argument since we may 2960 update it. */ 2961 2962 auto inline reg_errcode_t 2963 __attribute__ ((always_inline)) 2964 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc, 2965 bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2966 { 2967 unsigned int ch; 2968 uint32_t start_collseq; 2969 uint32_t end_collseq; 2970 2971 /* Equivalence Classes and Character Classes can't be a range 2972 start/end. */ 2973 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2974 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2975 0)) 2976 return REG_ERANGE; 2977 2978 /* FIXME: Implement rational ranges here, too. */ 2979 start_collseq = lookup_collation_sequence_value (start_elem); 2980 end_collseq = lookup_collation_sequence_value (end_elem); 2981 /* Check start/end collation sequence values. */ 2982 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2983 return REG_ECOLLATE; 2984 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2985 return REG_ERANGE; 2986 2987 /* Got valid collation sequence values, add them as a new entry. 2988 However, if we have no collation elements, and the character set 2989 is single byte, the single byte character set that we 2990 build below suffices. */ 2991 if (nrules > 0 || dfa->mb_cur_max > 1) 2992 { 2993 /* Check the space of the arrays. */ 2994 if (BE (*range_alloc == mbcset->nranges, 0)) 2995 { 2996 /* There is not enough space, need realloc. */ 2997 uint32_t *new_array_start; 2998 uint32_t *new_array_end; 2999 Idx new_nranges; 3000 3001 /* +1 in case of mbcset->nranges is 0. */ 3002 new_nranges = 2 * mbcset->nranges + 1; 3003 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 3004 new_nranges); 3005 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 3006 new_nranges); 3007 3008 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 3009 return REG_ESPACE; 3010 3011 mbcset->range_starts = new_array_start; 3012 mbcset->range_ends = new_array_end; 3013 *range_alloc = new_nranges; 3014 } 3015 3016 mbcset->range_starts[mbcset->nranges] = start_collseq; 3017 mbcset->range_ends[mbcset->nranges++] = end_collseq; 3018 } 3019 3020 /* Build the table for single byte characters. */ 3021 for (ch = 0; ch < SBC_MAX; ch++) 3022 { 3023 uint32_t ch_collseq; 3024 /* 3025 if (MB_CUR_MAX == 1) 3026 */ 3027 if (nrules == 0) 3028 ch_collseq = collseqmb[ch]; 3029 else 3030 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 3031 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 3032 bitset_set (sbcset, ch); 3033 } 3034 return REG_NOERROR; 3035 } 3036 3037 /* Local function for parse_bracket_exp used in _LIBC environment. 3038 Build the collating element which is represented by NAME. 3039 The result are written to MBCSET and SBCSET. 3040 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 3041 pointer argument since we may update it. */ 3042 3043 auto inline reg_errcode_t 3044 __attribute__ ((always_inline)) 3045 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset, 3046 Idx *coll_sym_alloc, const unsigned char *name) 3047 { 3048 int32_t elem, idx; 3049 size_t name_len = strlen ((const char *) name); 3050 if (nrules != 0) 3051 { 3052 elem = seek_collating_symbol_entry (name, name_len); 3053 if (elem != -1) 3054 { 3055 /* We found the entry. */ 3056 idx = symb_table[2 * elem + 1]; 3057 /* Skip the name of collating element name. */ 3058 idx += 1 + extra[idx]; 3059 } 3060 else if (name_len == 1) 3061 { 3062 /* No valid character, treat it as a normal 3063 character. */ 3064 bitset_set (sbcset, name[0]); 3065 return REG_NOERROR; 3066 } 3067 else 3068 return REG_ECOLLATE; 3069 3070 /* Got valid collation sequence, add it as a new entry. */ 3071 /* Check the space of the arrays. */ 3072 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 3073 { 3074 /* Not enough, realloc it. */ 3075 /* +1 in case of mbcset->ncoll_syms is 0. */ 3076 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 3077 /* Use realloc since mbcset->coll_syms is NULL 3078 if *alloc == 0. */ 3079 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 3080 new_coll_sym_alloc); 3081 if (BE (new_coll_syms == NULL, 0)) 3082 return REG_ESPACE; 3083 mbcset->coll_syms = new_coll_syms; 3084 *coll_sym_alloc = new_coll_sym_alloc; 3085 } 3086 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3087 return REG_NOERROR; 3088 } 3089 else 3090 { 3091 if (BE (name_len != 1, 0)) 3092 return REG_ECOLLATE; 3093 else 3094 { 3095 bitset_set (sbcset, name[0]); 3096 return REG_NOERROR; 3097 } 3098 } 3099 } 3100 #endif 3101 3102 re_token_t br_token; 3103 re_bitset_ptr_t sbcset; 3104 #ifdef RE_ENABLE_I18N 3105 re_charset_t *mbcset; 3106 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3107 Idx equiv_class_alloc = 0, char_class_alloc = 0; 3108 #endif /* not RE_ENABLE_I18N */ 3109 bool non_match = false; 3110 bin_tree_t *work_tree; 3111 int token_len; 3112 bool first_round = true; 3113 #ifdef _LIBC 3114 collseqmb = (const unsigned char *) 3115 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3116 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3117 if (nrules) 3118 { 3119 /* 3120 if (MB_CUR_MAX > 1) 3121 */ 3122 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3123 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3124 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3125 _NL_COLLATE_SYMB_TABLEMB); 3126 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3127 _NL_COLLATE_SYMB_EXTRAMB); 3128 } 3129 #endif 3130 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3131 #ifdef RE_ENABLE_I18N 3132 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3133 #endif /* RE_ENABLE_I18N */ 3134 #ifdef RE_ENABLE_I18N 3135 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3136 #else 3137 if (BE (sbcset == NULL, 0)) 3138 #endif /* RE_ENABLE_I18N */ 3139 { 3140 re_free (sbcset); 3141 #ifdef RE_ENABLE_I18N 3142 re_free (mbcset); 3143 #endif 3144 *err = REG_ESPACE; 3145 return NULL; 3146 } 3147 3148 token_len = peek_token_bracket (token, regexp, syntax); 3149 if (BE (token->type == END_OF_RE, 0)) 3150 { 3151 *err = REG_BADPAT; 3152 goto parse_bracket_exp_free_return; 3153 } 3154 if (token->type == OP_NON_MATCH_LIST) 3155 { 3156 #ifdef RE_ENABLE_I18N 3157 mbcset->non_match = 1; 3158 #endif /* not RE_ENABLE_I18N */ 3159 non_match = true; 3160 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3161 bitset_set (sbcset, '\n'); 3162 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3163 token_len = peek_token_bracket (token, regexp, syntax); 3164 if (BE (token->type == END_OF_RE, 0)) 3165 { 3166 *err = REG_BADPAT; 3167 goto parse_bracket_exp_free_return; 3168 } 3169 } 3170 3171 /* We treat the first ']' as a normal character. */ 3172 if (token->type == OP_CLOSE_BRACKET) 3173 token->type = CHARACTER; 3174 3175 while (1) 3176 { 3177 bracket_elem_t start_elem, end_elem; 3178 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3179 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3180 reg_errcode_t ret; 3181 int token_len2 = 0; 3182 bool is_range_exp = false; 3183 re_token_t token2; 3184 3185 start_elem.opr.name = start_name_buf; 3186 start_elem.type = COLL_SYM; 3187 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3188 syntax, first_round); 3189 if (BE (ret != REG_NOERROR, 0)) 3190 { 3191 *err = ret; 3192 goto parse_bracket_exp_free_return; 3193 } 3194 first_round = false; 3195 3196 /* Get information about the next token. We need it in any case. */ 3197 token_len = peek_token_bracket (token, regexp, syntax); 3198 3199 /* Do not check for ranges if we know they are not allowed. */ 3200 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3201 { 3202 if (BE (token->type == END_OF_RE, 0)) 3203 { 3204 *err = REG_EBRACK; 3205 goto parse_bracket_exp_free_return; 3206 } 3207 if (token->type == OP_CHARSET_RANGE) 3208 { 3209 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3210 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3211 if (BE (token2.type == END_OF_RE, 0)) 3212 { 3213 *err = REG_EBRACK; 3214 goto parse_bracket_exp_free_return; 3215 } 3216 if (token2.type == OP_CLOSE_BRACKET) 3217 { 3218 /* We treat the last '-' as a normal character. */ 3219 re_string_skip_bytes (regexp, -token_len); 3220 token->type = CHARACTER; 3221 } 3222 else 3223 is_range_exp = true; 3224 } 3225 } 3226 3227 if (is_range_exp == true) 3228 { 3229 end_elem.opr.name = end_name_buf; 3230 end_elem.type = COLL_SYM; 3231 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3232 dfa, syntax, true); 3233 if (BE (ret != REG_NOERROR, 0)) 3234 { 3235 *err = ret; 3236 goto parse_bracket_exp_free_return; 3237 } 3238 3239 token_len = peek_token_bracket (token, regexp, syntax); 3240 3241 #ifdef _LIBC 3242 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3243 &start_elem, &end_elem); 3244 #else 3245 # ifdef RE_ENABLE_I18N 3246 *err = build_range_exp (syntax, sbcset, 3247 dfa->mb_cur_max > 1 ? mbcset : NULL, 3248 &range_alloc, &start_elem, &end_elem); 3249 # else 3250 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); 3251 # endif 3252 #endif /* RE_ENABLE_I18N */ 3253 if (BE (*err != REG_NOERROR, 0)) 3254 goto parse_bracket_exp_free_return; 3255 } 3256 else 3257 { 3258 switch (start_elem.type) 3259 { 3260 case SB_CHAR: 3261 bitset_set (sbcset, start_elem.opr.ch); 3262 break; 3263 #ifdef RE_ENABLE_I18N 3264 case MB_CHAR: 3265 /* Check whether the array has enough space. */ 3266 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3267 { 3268 wchar_t *new_mbchars; 3269 /* Not enough, realloc it. */ 3270 /* +1 in case of mbcset->nmbchars is 0. */ 3271 mbchar_alloc = 2 * mbcset->nmbchars + 1; 3272 /* Use realloc since array is NULL if *alloc == 0. */ 3273 new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3274 mbchar_alloc); 3275 if (BE (new_mbchars == NULL, 0)) 3276 goto parse_bracket_exp_espace; 3277 mbcset->mbchars = new_mbchars; 3278 } 3279 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3280 break; 3281 #endif /* RE_ENABLE_I18N */ 3282 case EQUIV_CLASS: 3283 *err = build_equiv_class (sbcset, 3284 #ifdef RE_ENABLE_I18N 3285 mbcset, &equiv_class_alloc, 3286 #endif /* RE_ENABLE_I18N */ 3287 start_elem.opr.name); 3288 if (BE (*err != REG_NOERROR, 0)) 3289 goto parse_bracket_exp_free_return; 3290 break; 3291 case COLL_SYM: 3292 *err = build_collating_symbol (sbcset, 3293 #ifdef RE_ENABLE_I18N 3294 mbcset, &coll_sym_alloc, 3295 #endif /* RE_ENABLE_I18N */ 3296 start_elem.opr.name); 3297 if (BE (*err != REG_NOERROR, 0)) 3298 goto parse_bracket_exp_free_return; 3299 break; 3300 case CHAR_CLASS: 3301 *err = build_charclass (regexp->trans, sbcset, 3302 #ifdef RE_ENABLE_I18N 3303 mbcset, &char_class_alloc, 3304 #endif /* RE_ENABLE_I18N */ 3305 (const char *) start_elem.opr.name, 3306 syntax); 3307 if (BE (*err != REG_NOERROR, 0)) 3308 goto parse_bracket_exp_free_return; 3309 break; 3310 default: 3311 assert (0); 3312 break; 3313 } 3314 } 3315 if (BE (token->type == END_OF_RE, 0)) 3316 { 3317 *err = REG_EBRACK; 3318 goto parse_bracket_exp_free_return; 3319 } 3320 if (token->type == OP_CLOSE_BRACKET) 3321 break; 3322 } 3323 3324 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3325 3326 /* If it is non-matching list. */ 3327 if (non_match) 3328 bitset_not (sbcset); 3329 3330 #ifdef RE_ENABLE_I18N 3331 /* Ensure only single byte characters are set. */ 3332 if (dfa->mb_cur_max > 1) 3333 bitset_mask (sbcset, dfa->sb_char); 3334 3335 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3336 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3337 || mbcset->non_match))) 3338 { 3339 bin_tree_t *mbc_tree; 3340 int sbc_idx; 3341 /* Build a tree for complex bracket. */ 3342 dfa->has_mb_node = 1; 3343 br_token.type = COMPLEX_BRACKET; 3344 br_token.opr.mbcset = mbcset; 3345 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3346 if (BE (mbc_tree == NULL, 0)) 3347 goto parse_bracket_exp_espace; 3348 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3349 if (sbcset[sbc_idx]) 3350 break; 3351 /* If there are no bits set in sbcset, there is no point 3352 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3353 if (sbc_idx < BITSET_WORDS) 3354 { 3355 /* Build a tree for simple bracket. */ 3356 br_token.type = SIMPLE_BRACKET; 3357 br_token.opr.sbcset = sbcset; 3358 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3359 if (BE (work_tree == NULL, 0)) 3360 goto parse_bracket_exp_espace; 3361 3362 /* Then join them by ALT node. */ 3363 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3364 if (BE (work_tree == NULL, 0)) 3365 goto parse_bracket_exp_espace; 3366 } 3367 else 3368 { 3369 re_free (sbcset); 3370 work_tree = mbc_tree; 3371 } 3372 } 3373 else 3374 #endif /* not RE_ENABLE_I18N */ 3375 { 3376 #ifdef RE_ENABLE_I18N 3377 free_charset (mbcset); 3378 #endif 3379 /* Build a tree for simple bracket. */ 3380 br_token.type = SIMPLE_BRACKET; 3381 br_token.opr.sbcset = sbcset; 3382 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3383 if (BE (work_tree == NULL, 0)) 3384 goto parse_bracket_exp_espace; 3385 } 3386 return work_tree; 3387 3388 parse_bracket_exp_espace: 3389 *err = REG_ESPACE; 3390 parse_bracket_exp_free_return: 3391 re_free (sbcset); 3392 #ifdef RE_ENABLE_I18N 3393 free_charset (mbcset); 3394 #endif /* RE_ENABLE_I18N */ 3395 return NULL; 3396 } 3397 3398 /* Parse an element in the bracket expression. */ 3399 3400 static reg_errcode_t 3401 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3402 re_token_t *token, int token_len, re_dfa_t *dfa, 3403 reg_syntax_t syntax, bool accept_hyphen) 3404 { 3405 #ifdef RE_ENABLE_I18N 3406 int cur_char_size; 3407 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3408 if (cur_char_size > 1) 3409 { 3410 elem->type = MB_CHAR; 3411 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3412 re_string_skip_bytes (regexp, cur_char_size); 3413 return REG_NOERROR; 3414 } 3415 #endif /* RE_ENABLE_I18N */ 3416 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3417 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3418 || token->type == OP_OPEN_EQUIV_CLASS) 3419 return parse_bracket_symbol (elem, regexp, token); 3420 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3421 { 3422 /* A '-' must only appear as anything but a range indicator before 3423 the closing bracket. Everything else is an error. */ 3424 re_token_t token2; 3425 (void) peek_token_bracket (&token2, regexp, syntax); 3426 if (token2.type != OP_CLOSE_BRACKET) 3427 /* The actual error value is not standardized since this whole 3428 case is undefined. But ERANGE makes good sense. */ 3429 return REG_ERANGE; 3430 } 3431 elem->type = SB_CHAR; 3432 elem->opr.ch = token->opr.c; 3433 return REG_NOERROR; 3434 } 3435 3436 /* Parse a bracket symbol in the bracket expression. Bracket symbols are 3437 such as [:<character_class>:], [.<collating_element>.], and 3438 [=<equivalent_class>=]. */ 3439 3440 static reg_errcode_t 3441 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3442 re_token_t *token) 3443 { 3444 unsigned char ch, delim = token->opr.c; 3445 int i = 0; 3446 if (re_string_eoi(regexp)) 3447 return REG_EBRACK; 3448 for (;; ++i) 3449 { 3450 if (i >= BRACKET_NAME_BUF_SIZE) 3451 return REG_EBRACK; 3452 if (token->type == OP_OPEN_CHAR_CLASS) 3453 ch = re_string_fetch_byte_case (regexp); 3454 else 3455 ch = re_string_fetch_byte (regexp); 3456 if (re_string_eoi(regexp)) 3457 return REG_EBRACK; 3458 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3459 break; 3460 elem->opr.name[i] = ch; 3461 } 3462 re_string_skip_bytes (regexp, 1); 3463 elem->opr.name[i] = '\0'; 3464 switch (token->type) 3465 { 3466 case OP_OPEN_COLL_ELEM: 3467 elem->type = COLL_SYM; 3468 break; 3469 case OP_OPEN_EQUIV_CLASS: 3470 elem->type = EQUIV_CLASS; 3471 break; 3472 case OP_OPEN_CHAR_CLASS: 3473 elem->type = CHAR_CLASS; 3474 break; 3475 default: 3476 break; 3477 } 3478 return REG_NOERROR; 3479 } 3480 3481 /* Helper function for parse_bracket_exp. 3482 Build the equivalence class which is represented by NAME. 3483 The result are written to MBCSET and SBCSET. 3484 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3485 is a pointer argument since we may update it. */ 3486 3487 static reg_errcode_t 3488 #ifdef RE_ENABLE_I18N 3489 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, 3490 Idx *equiv_class_alloc, const unsigned char *name) 3491 #else /* not RE_ENABLE_I18N */ 3492 build_equiv_class (bitset_t sbcset, const unsigned char *name) 3493 #endif /* not RE_ENABLE_I18N */ 3494 { 3495 #ifdef _LIBC 3496 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3497 if (nrules != 0) 3498 { 3499 const int32_t *table, *indirect; 3500 const unsigned char *weights, *extra, *cp; 3501 unsigned char char_buf[2]; 3502 int32_t idx1, idx2; 3503 unsigned int ch; 3504 size_t len; 3505 /* Calculate the index for equivalence class. */ 3506 cp = name; 3507 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3508 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3509 _NL_COLLATE_WEIGHTMB); 3510 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3511 _NL_COLLATE_EXTRAMB); 3512 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3513 _NL_COLLATE_INDIRECTMB); 3514 idx1 = findidx (table, indirect, extra, &cp, -1); 3515 if (BE (idx1 == 0 || *cp != '\0', 0)) 3516 /* This isn't a valid character. */ 3517 return REG_ECOLLATE; 3518 3519 /* Build single byte matching table for this equivalence class. */ 3520 len = weights[idx1 & 0xffffff]; 3521 for (ch = 0; ch < SBC_MAX; ++ch) 3522 { 3523 char_buf[0] = ch; 3524 cp = char_buf; 3525 idx2 = findidx (table, indirect, extra, &cp, 1); 3526 /* 3527 idx2 = table[ch]; 3528 */ 3529 if (idx2 == 0) 3530 /* This isn't a valid character. */ 3531 continue; 3532 /* Compare only if the length matches and the collation rule 3533 index is the same. */ 3534 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) 3535 { 3536 int cnt = 0; 3537 3538 while (cnt <= len && 3539 weights[(idx1 & 0xffffff) + 1 + cnt] 3540 == weights[(idx2 & 0xffffff) + 1 + cnt]) 3541 ++cnt; 3542 3543 if (cnt > len) 3544 bitset_set (sbcset, ch); 3545 } 3546 } 3547 /* Check whether the array has enough space. */ 3548 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3549 { 3550 /* Not enough, realloc it. */ 3551 /* +1 in case of mbcset->nequiv_classes is 0. */ 3552 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3553 /* Use realloc since the array is NULL if *alloc == 0. */ 3554 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3555 int32_t, 3556 new_equiv_class_alloc); 3557 if (BE (new_equiv_classes == NULL, 0)) 3558 return REG_ESPACE; 3559 mbcset->equiv_classes = new_equiv_classes; 3560 *equiv_class_alloc = new_equiv_class_alloc; 3561 } 3562 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3563 } 3564 else 3565 #endif /* _LIBC */ 3566 { 3567 if (BE (strlen ((const char *) name) != 1, 0)) 3568 return REG_ECOLLATE; 3569 bitset_set (sbcset, *name); 3570 } 3571 return REG_NOERROR; 3572 } 3573 3574 /* Helper function for parse_bracket_exp. 3575 Build the character class which is represented by NAME. 3576 The result are written to MBCSET and SBCSET. 3577 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3578 is a pointer argument since we may update it. */ 3579 3580 static reg_errcode_t 3581 #ifdef RE_ENABLE_I18N 3582 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3583 re_charset_t *mbcset, Idx *char_class_alloc, 3584 const char *class_name, reg_syntax_t syntax) 3585 #else /* not RE_ENABLE_I18N */ 3586 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3587 const char *class_name, reg_syntax_t syntax) 3588 #endif /* not RE_ENABLE_I18N */ 3589 { 3590 int i; 3591 const char *name = class_name; 3592 3593 /* In case of REG_ICASE "upper" and "lower" match the both of 3594 upper and lower cases. */ 3595 if ((syntax & RE_ICASE) 3596 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3597 name = "alpha"; 3598 3599 #ifdef RE_ENABLE_I18N 3600 /* Check the space of the arrays. */ 3601 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3602 { 3603 /* Not enough, realloc it. */ 3604 /* +1 in case of mbcset->nchar_classes is 0. */ 3605 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3606 /* Use realloc since array is NULL if *alloc == 0. */ 3607 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3608 new_char_class_alloc); 3609 if (BE (new_char_classes == NULL, 0)) 3610 return REG_ESPACE; 3611 mbcset->char_classes = new_char_classes; 3612 *char_class_alloc = new_char_class_alloc; 3613 } 3614 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3615 #endif /* RE_ENABLE_I18N */ 3616 3617 #define BUILD_CHARCLASS_LOOP(ctype_func) \ 3618 do { \ 3619 if (BE (trans != NULL, 0)) \ 3620 { \ 3621 for (i = 0; i < SBC_MAX; ++i) \ 3622 if (ctype_func (i)) \ 3623 bitset_set (sbcset, trans[i]); \ 3624 } \ 3625 else \ 3626 { \ 3627 for (i = 0; i < SBC_MAX; ++i) \ 3628 if (ctype_func (i)) \ 3629 bitset_set (sbcset, i); \ 3630 } \ 3631 } while (0) 3632 3633 if (strcmp (name, "alnum") == 0) 3634 BUILD_CHARCLASS_LOOP (isalnum); 3635 else if (strcmp (name, "cntrl") == 0) 3636 BUILD_CHARCLASS_LOOP (iscntrl); 3637 else if (strcmp (name, "lower") == 0) 3638 BUILD_CHARCLASS_LOOP (islower); 3639 else if (strcmp (name, "space") == 0) 3640 BUILD_CHARCLASS_LOOP (isspace); 3641 else if (strcmp (name, "alpha") == 0) 3642 BUILD_CHARCLASS_LOOP (isalpha); 3643 else if (strcmp (name, "digit") == 0) 3644 BUILD_CHARCLASS_LOOP (isdigit); 3645 else if (strcmp (name, "print") == 0) 3646 BUILD_CHARCLASS_LOOP (isprint); 3647 else if (strcmp (name, "upper") == 0) 3648 BUILD_CHARCLASS_LOOP (isupper); 3649 else if (strcmp (name, "blank") == 0) 3650 BUILD_CHARCLASS_LOOP (isblank); 3651 else if (strcmp (name, "graph") == 0) 3652 BUILD_CHARCLASS_LOOP (isgraph); 3653 else if (strcmp (name, "punct") == 0) 3654 BUILD_CHARCLASS_LOOP (ispunct); 3655 else if (strcmp (name, "xdigit") == 0) 3656 BUILD_CHARCLASS_LOOP (isxdigit); 3657 else 3658 return REG_ECTYPE; 3659 3660 return REG_NOERROR; 3661 } 3662 3663 static bin_tree_t * 3664 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3665 const char *class_name, 3666 const char *extra, bool non_match, 3667 reg_errcode_t *err) 3668 { 3669 re_bitset_ptr_t sbcset; 3670 #ifdef RE_ENABLE_I18N 3671 re_charset_t *mbcset; 3672 Idx alloc = 0; 3673 #endif /* not RE_ENABLE_I18N */ 3674 reg_errcode_t ret; 3675 re_token_t br_token; 3676 bin_tree_t *tree; 3677 3678 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3679 #ifdef RE_ENABLE_I18N 3680 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3681 #endif /* RE_ENABLE_I18N */ 3682 3683 #ifdef RE_ENABLE_I18N 3684 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3685 #else /* not RE_ENABLE_I18N */ 3686 if (BE (sbcset == NULL, 0)) 3687 #endif /* not RE_ENABLE_I18N */ 3688 { 3689 *err = REG_ESPACE; 3690 return NULL; 3691 } 3692 3693 if (non_match) 3694 { 3695 #ifdef RE_ENABLE_I18N 3696 mbcset->non_match = 1; 3697 #endif /* not RE_ENABLE_I18N */ 3698 } 3699 3700 /* We don't care the syntax in this case. */ 3701 ret = build_charclass (trans, sbcset, 3702 #ifdef RE_ENABLE_I18N 3703 mbcset, &alloc, 3704 #endif /* RE_ENABLE_I18N */ 3705 class_name, 0); 3706 3707 if (BE (ret != REG_NOERROR, 0)) 3708 { 3709 re_free (sbcset); 3710 #ifdef RE_ENABLE_I18N 3711 free_charset (mbcset); 3712 #endif /* RE_ENABLE_I18N */ 3713 *err = ret; 3714 return NULL; 3715 } 3716 /* \w match '_' also. */ 3717 for (; *extra; extra++) 3718 bitset_set (sbcset, *extra); 3719 3720 /* If it is non-matching list. */ 3721 if (non_match) 3722 bitset_not (sbcset); 3723 3724 #ifdef RE_ENABLE_I18N 3725 /* Ensure only single byte characters are set. */ 3726 if (dfa->mb_cur_max > 1) 3727 bitset_mask (sbcset, dfa->sb_char); 3728 #endif 3729 3730 /* Build a tree for simple bracket. */ 3731 br_token.type = SIMPLE_BRACKET; 3732 br_token.opr.sbcset = sbcset; 3733 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3734 if (BE (tree == NULL, 0)) 3735 goto build_word_op_espace; 3736 3737 #ifdef RE_ENABLE_I18N 3738 if (dfa->mb_cur_max > 1) 3739 { 3740 bin_tree_t *mbc_tree; 3741 /* Build a tree for complex bracket. */ 3742 br_token.type = COMPLEX_BRACKET; 3743 br_token.opr.mbcset = mbcset; 3744 dfa->has_mb_node = 1; 3745 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3746 if (BE (mbc_tree == NULL, 0)) 3747 goto build_word_op_espace; 3748 /* Then join them by ALT node. */ 3749 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3750 if (BE (mbc_tree != NULL, 1)) 3751 return tree; 3752 } 3753 else 3754 { 3755 free_charset (mbcset); 3756 return tree; 3757 } 3758 #else /* not RE_ENABLE_I18N */ 3759 return tree; 3760 #endif /* not RE_ENABLE_I18N */ 3761 3762 build_word_op_espace: 3763 re_free (sbcset); 3764 #ifdef RE_ENABLE_I18N 3765 free_charset (mbcset); 3766 #endif /* RE_ENABLE_I18N */ 3767 *err = REG_ESPACE; 3768 return NULL; 3769 } 3770 3771 /* This is intended for the expressions like "a{1,3}". 3772 Fetch a number from 'input', and return the number. 3773 Return REG_MISSING if the number field is empty like "{,1}". 3774 Return RE_DUP_MAX + 1 if the number field is too large. 3775 Return REG_ERROR if an error occurred. */ 3776 3777 static Idx 3778 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3779 { 3780 Idx num = REG_MISSING; 3781 unsigned char c; 3782 while (1) 3783 { 3784 fetch_token (token, input, syntax); 3785 c = token->opr.c; 3786 if (BE (token->type == END_OF_RE, 0)) 3787 return REG_ERROR; 3788 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3789 break; 3790 num = ((token->type != CHARACTER || c < '0' || '9' < c 3791 || num == REG_ERROR) 3792 ? REG_ERROR 3793 : num == REG_MISSING 3794 ? c - '0' 3795 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0')); 3796 } 3797 return num; 3798 } 3799 3800 #ifdef RE_ENABLE_I18N 3801 static void 3802 free_charset (re_charset_t *cset) 3803 { 3804 re_free (cset->mbchars); 3805 # ifdef _LIBC 3806 re_free (cset->coll_syms); 3807 re_free (cset->equiv_classes); 3808 re_free (cset->range_starts); 3809 re_free (cset->range_ends); 3810 # endif 3811 re_free (cset->char_classes); 3812 re_free (cset); 3813 } 3814 #endif /* RE_ENABLE_I18N */ 3815 3816 /* Functions for binary tree operation. */ 3817 3818 /* Create a tree node. */ 3819 3820 static bin_tree_t * 3821 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3822 re_token_type_t type) 3823 { 3824 re_token_t t; 3825 t.type = type; 3826 return create_token_tree (dfa, left, right, &t); 3827 } 3828 3829 static bin_tree_t * 3830 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3831 const re_token_t *token) 3832 { 3833 bin_tree_t *tree; 3834 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3835 { 3836 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3837 3838 if (storage == NULL) 3839 return NULL; 3840 storage->next = dfa->str_tree_storage; 3841 dfa->str_tree_storage = storage; 3842 dfa->str_tree_storage_idx = 0; 3843 } 3844 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3845 3846 tree->parent = NULL; 3847 tree->left = left; 3848 tree->right = right; 3849 tree->token = *token; 3850 tree->token.duplicated = 0; 3851 tree->token.opt_subexp = 0; 3852 tree->first = NULL; 3853 tree->next = NULL; 3854 tree->node_idx = REG_MISSING; 3855 3856 if (left != NULL) 3857 left->parent = tree; 3858 if (right != NULL) 3859 right->parent = tree; 3860 return tree; 3861 } 3862 3863 /* Mark the tree SRC as an optional subexpression. 3864 To be called from preorder or postorder. */ 3865 3866 static reg_errcode_t 3867 mark_opt_subexp (void *extra, bin_tree_t *node) 3868 { 3869 Idx idx = (uintptr_t) extra; 3870 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3871 node->token.opt_subexp = 1; 3872 3873 return REG_NOERROR; 3874 } 3875 3876 /* Free the allocated memory inside NODE. */ 3877 3878 static void 3879 free_token (re_token_t *node) 3880 { 3881 #ifdef RE_ENABLE_I18N 3882 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3883 free_charset (node->opr.mbcset); 3884 else 3885 #endif /* RE_ENABLE_I18N */ 3886 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3887 re_free (node->opr.sbcset); 3888 } 3889 3890 /* Worker function for tree walking. Free the allocated memory inside NODE 3891 and its children. */ 3892 3893 static reg_errcode_t 3894 free_tree (void *extra, bin_tree_t *node) 3895 { 3896 free_token (&node->token); 3897 return REG_NOERROR; 3898 } 3899 3900 3901 /* Duplicate the node SRC, and return new node. This is a preorder 3902 visit similar to the one implemented by the generic visitor, but 3903 we need more infrastructure to maintain two parallel trees --- so, 3904 it's easier to duplicate. */ 3905 3906 static bin_tree_t * 3907 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3908 { 3909 const bin_tree_t *node; 3910 bin_tree_t *dup_root; 3911 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3912 3913 for (node = root; ; ) 3914 { 3915 /* Create a new tree and link it back to the current parent. */ 3916 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3917 if (*p_new == NULL) 3918 return NULL; 3919 (*p_new)->parent = dup_node; 3920 (*p_new)->token.duplicated = 1; 3921 dup_node = *p_new; 3922 3923 /* Go to the left node, or up and to the right. */ 3924 if (node->left) 3925 { 3926 node = node->left; 3927 p_new = &dup_node->left; 3928 } 3929 else 3930 { 3931 const bin_tree_t *prev = NULL; 3932 while (node->right == prev || node->right == NULL) 3933 { 3934 prev = node; 3935 node = node->parent; 3936 dup_node = dup_node->parent; 3937 if (!node) 3938 return dup_root; 3939 } 3940 node = node->right; 3941 p_new = &dup_node->right; 3942 } 3943 } 3944 } 3945