1 /* Extended regular expression matching and search library. 2 Copyright (C) 2002-2012 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License along 17 with this program; if not, see <http://www.gnu.org/licenses/>. */ 18 19 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 20 size_t length, reg_syntax_t syntax); 21 static void re_compile_fastmap_iter (regex_t *bufp, 22 const re_dfastate_t *init_state, 23 char *fastmap); 24 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); 25 #ifdef RE_ENABLE_I18N 26 static void free_charset (re_charset_t *cset); 27 #endif /* RE_ENABLE_I18N */ 28 static void free_workarea_compile (regex_t *preg); 29 static reg_errcode_t create_initial_state (re_dfa_t *dfa); 30 #ifdef RE_ENABLE_I18N 31 static void optimize_utf8 (re_dfa_t *dfa); 32 #endif 33 static reg_errcode_t analyze (regex_t *preg); 34 static reg_errcode_t preorder (bin_tree_t *root, 35 reg_errcode_t (fn (void *, bin_tree_t *)), 36 void *extra); 37 static reg_errcode_t postorder (bin_tree_t *root, 38 reg_errcode_t (fn (void *, bin_tree_t *)), 39 void *extra); 40 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 41 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 42 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 43 bin_tree_t *node); 44 static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 45 static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 46 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 47 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint); 48 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 49 unsigned int constraint); 50 static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 51 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 52 Idx node, bool root); 53 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 54 static Idx fetch_number (re_string_t *input, re_token_t *token, 55 reg_syntax_t syntax); 56 static int peek_token (re_token_t *token, re_string_t *input, 57 reg_syntax_t syntax) internal_function; 58 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 59 reg_syntax_t syntax, reg_errcode_t *err); 60 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 61 re_token_t *token, reg_syntax_t syntax, 62 Idx nest, reg_errcode_t *err); 63 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 64 re_token_t *token, reg_syntax_t syntax, 65 Idx nest, reg_errcode_t *err); 66 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 67 re_token_t *token, reg_syntax_t syntax, 68 Idx nest, reg_errcode_t *err); 69 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 70 re_token_t *token, reg_syntax_t syntax, 71 Idx nest, reg_errcode_t *err); 72 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 73 re_dfa_t *dfa, re_token_t *token, 74 reg_syntax_t syntax, reg_errcode_t *err); 75 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 76 re_token_t *token, reg_syntax_t syntax, 77 reg_errcode_t *err); 78 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 79 re_string_t *regexp, 80 re_token_t *token, int token_len, 81 re_dfa_t *dfa, 82 reg_syntax_t syntax, 83 bool accept_hyphen); 84 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 85 re_string_t *regexp, 86 re_token_t *token); 87 #ifdef RE_ENABLE_I18N 88 static reg_errcode_t build_equiv_class (bitset_t sbcset, 89 re_charset_t *mbcset, 90 Idx *equiv_class_alloc, 91 const unsigned char *name); 92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 93 bitset_t sbcset, 94 re_charset_t *mbcset, 95 Idx *char_class_alloc, 96 const unsigned char *class_name, 97 reg_syntax_t syntax); 98 #else /* not RE_ENABLE_I18N */ 99 static reg_errcode_t build_equiv_class (bitset_t sbcset, 100 const unsigned char *name); 101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 102 bitset_t sbcset, 103 const unsigned char *class_name, 104 reg_syntax_t syntax); 105 #endif /* not RE_ENABLE_I18N */ 106 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 107 RE_TRANSLATE_TYPE trans, 108 const unsigned char *class_name, 109 const unsigned char *extra, 110 bool non_match, reg_errcode_t *err); 111 static bin_tree_t *create_tree (re_dfa_t *dfa, 112 bin_tree_t *left, bin_tree_t *right, 113 re_token_type_t type); 114 static bin_tree_t *create_token_tree (re_dfa_t *dfa, 115 bin_tree_t *left, bin_tree_t *right, 116 const re_token_t *token); 117 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 118 static void free_token (re_token_t *node); 119 static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 120 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 121 122 /* This table gives an error message for each of the error codes listed 123 in regex.h. Obviously the order here has to be same as there. 124 POSIX doesn't require that we do anything for REG_NOERROR, 125 but why not be nice? */ 126 127 static const char __re_error_msgid[] = 128 { 129 #define REG_NOERROR_IDX 0 130 gettext_noop ("Success") /* REG_NOERROR */ 131 "\0" 132 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 133 gettext_noop ("No match") /* REG_NOMATCH */ 134 "\0" 135 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 136 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 137 "\0" 138 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 139 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 140 "\0" 141 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 142 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 143 "\0" 144 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 145 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 146 "\0" 147 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 148 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 149 "\0" 150 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 151 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 152 "\0" 153 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 154 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 155 "\0" 156 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 157 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 158 "\0" 159 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 160 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 161 "\0" 162 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 163 gettext_noop ("Invalid range end") /* REG_ERANGE */ 164 "\0" 165 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 166 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 167 "\0" 168 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 169 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 170 "\0" 171 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 172 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 173 "\0" 174 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 175 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 176 "\0" 177 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 178 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 179 }; 180 181 static const size_t __re_error_msgid_idx[] = 182 { 183 REG_NOERROR_IDX, 184 REG_NOMATCH_IDX, 185 REG_BADPAT_IDX, 186 REG_ECOLLATE_IDX, 187 REG_ECTYPE_IDX, 188 REG_EESCAPE_IDX, 189 REG_ESUBREG_IDX, 190 REG_EBRACK_IDX, 191 REG_EPAREN_IDX, 192 REG_EBRACE_IDX, 193 REG_BADBR_IDX, 194 REG_ERANGE_IDX, 195 REG_ESPACE_IDX, 196 REG_BADRPT_IDX, 197 REG_EEND_IDX, 198 REG_ESIZE_IDX, 199 REG_ERPAREN_IDX 200 }; 201 202 /* Entry points for GNU code. */ 203 204 /* re_compile_pattern is the GNU regular expression compiler: it 205 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 206 Returns 0 if the pattern was valid, otherwise an error string. 207 208 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields 209 are set in BUFP on entry. */ 210 211 #ifdef _LIBC 212 const char * 213 re_compile_pattern (pattern, length, bufp) 214 const char *pattern; 215 size_t length; 216 struct re_pattern_buffer *bufp; 217 #else /* size_t might promote */ 218 const char * 219 re_compile_pattern (const char *pattern, size_t length, 220 struct re_pattern_buffer *bufp) 221 #endif 222 { 223 reg_errcode_t ret; 224 225 /* And GNU code determines whether or not to get register information 226 by passing null for the REGS argument to re_match, etc., not by 227 setting no_sub, unless RE_NO_SUB is set. */ 228 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 229 230 /* Match anchors at newline. */ 231 bufp->newline_anchor = 1; 232 233 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 234 235 if (!ret) 236 return NULL; 237 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 238 } 239 #ifdef _LIBC 240 weak_alias (__re_compile_pattern, re_compile_pattern) 241 #endif 242 243 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can 244 also be assigned to arbitrarily: each pattern buffer stores its own 245 syntax, so it can be changed between regex compilations. */ 246 /* This has no initializer because initialized variables in Emacs 247 become read-only after dumping. */ 248 reg_syntax_t re_syntax_options; 249 250 251 /* Specify the precise syntax of regexps for compilation. This provides 252 for compatibility for various utilities which historically have 253 different, incompatible syntaxes. 254 255 The argument SYNTAX is a bit mask comprised of the various bits 256 defined in regex.h. We return the old syntax. */ 257 258 reg_syntax_t 259 re_set_syntax (syntax) 260 reg_syntax_t syntax; 261 { 262 reg_syntax_t ret = re_syntax_options; 263 264 re_syntax_options = syntax; 265 return ret; 266 } 267 #ifdef _LIBC 268 weak_alias (__re_set_syntax, re_set_syntax) 269 #endif 270 271 int 272 re_compile_fastmap (bufp) 273 struct re_pattern_buffer *bufp; 274 { 275 re_dfa_t *dfa = bufp->buffer; 276 char *fastmap = bufp->fastmap; 277 278 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 279 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 280 if (dfa->init_state != dfa->init_state_word) 281 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 282 if (dfa->init_state != dfa->init_state_nl) 283 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 284 if (dfa->init_state != dfa->init_state_begbuf) 285 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 286 bufp->fastmap_accurate = 1; 287 return 0; 288 } 289 #ifdef _LIBC 290 weak_alias (__re_compile_fastmap, re_compile_fastmap) 291 #endif 292 293 static inline void 294 __attribute ((always_inline)) 295 re_set_fastmap (char *fastmap, bool icase, int ch) 296 { 297 fastmap[ch] = 1; 298 if (icase) 299 fastmap[tolower (ch)] = 1; 300 } 301 302 /* Helper function for re_compile_fastmap. 303 Compile fastmap for the initial_state INIT_STATE. */ 304 305 static void 306 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 307 char *fastmap) 308 { 309 re_dfa_t *dfa = bufp->buffer; 310 Idx node_cnt; 311 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 312 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 313 { 314 Idx node = init_state->nodes.elems[node_cnt]; 315 re_token_type_t type = dfa->nodes[node].type; 316 317 if (type == CHARACTER) 318 { 319 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 320 #ifdef RE_ENABLE_I18N 321 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 322 { 323 unsigned char buf[MB_LEN_MAX]; 324 unsigned char *p; 325 wchar_t wc; 326 mbstate_t state; 327 328 p = buf; 329 *p++ = dfa->nodes[node].opr.c; 330 while (++node < dfa->nodes_len 331 && dfa->nodes[node].type == CHARACTER 332 && dfa->nodes[node].mb_partial) 333 *p++ = dfa->nodes[node].opr.c; 334 memset (&state, '\0', sizeof (state)); 335 if (__mbrtowc (&wc, (const char *) buf, p - buf, 336 &state) == p - buf 337 && (__wcrtomb ((char *) buf, towlower (wc), &state) 338 != (size_t) -1)) 339 re_set_fastmap (fastmap, false, buf[0]); 340 } 341 #endif 342 } 343 else if (type == SIMPLE_BRACKET) 344 { 345 int i, ch; 346 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 347 { 348 int j; 349 bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; 350 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 351 if (w & ((bitset_word_t) 1 << j)) 352 re_set_fastmap (fastmap, icase, ch); 353 } 354 } 355 #ifdef RE_ENABLE_I18N 356 else if (type == COMPLEX_BRACKET) 357 { 358 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 359 Idx i; 360 361 # ifdef _LIBC 362 /* See if we have to try all bytes which start multiple collation 363 elements. 364 e.g. In da_DK, we want to catch 'a' since "aa" is a valid 365 collation element, and don't catch 'b' since 'b' is 366 the only collation element which starts from 'b' (and 367 it is caught by SIMPLE_BRACKET). */ 368 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 369 && (cset->ncoll_syms || cset->nranges)) 370 { 371 const int32_t *table = (const int32_t *) 372 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 373 for (i = 0; i < SBC_MAX; ++i) 374 if (table[i] < 0) 375 re_set_fastmap (fastmap, icase, i); 376 } 377 # endif /* _LIBC */ 378 379 /* See if we have to start the match at all multibyte characters, 380 i.e. where we would not find an invalid sequence. This only 381 applies to multibyte character sets; for single byte character 382 sets, the SIMPLE_BRACKET again suffices. */ 383 if (dfa->mb_cur_max > 1 384 && (cset->nchar_classes || cset->non_match || cset->nranges 385 # ifdef _LIBC 386 || cset->nequiv_classes 387 # endif /* _LIBC */ 388 )) 389 { 390 unsigned char c = 0; 391 do 392 { 393 mbstate_t mbs; 394 memset (&mbs, 0, sizeof (mbs)); 395 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) 396 re_set_fastmap (fastmap, false, (int) c); 397 } 398 while (++c != 0); 399 } 400 401 else 402 { 403 /* ... Else catch all bytes which can start the mbchars. */ 404 for (i = 0; i < cset->nmbchars; ++i) 405 { 406 char buf[256]; 407 mbstate_t state; 408 memset (&state, '\0', sizeof (state)); 409 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 410 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 411 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 412 { 413 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 414 != (size_t) -1) 415 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 416 } 417 } 418 } 419 } 420 #endif /* RE_ENABLE_I18N */ 421 else if (type == OP_PERIOD 422 #ifdef RE_ENABLE_I18N 423 || type == OP_UTF8_PERIOD 424 #endif /* RE_ENABLE_I18N */ 425 || type == END_OF_RE) 426 { 427 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 428 if (type == END_OF_RE) 429 bufp->can_be_null = 1; 430 return; 431 } 432 } 433 } 434 435 /* Entry point for POSIX code. */ 436 /* regcomp takes a regular expression as a string and compiles it. 437 438 PREG is a regex_t *. We do not expect any fields to be initialized, 439 since POSIX says we shouldn't. Thus, we set 440 441 'buffer' to the compiled pattern; 442 'used' to the length of the compiled pattern; 443 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the 444 REG_EXTENDED bit in CFLAGS is set; otherwise, to 445 RE_SYNTAX_POSIX_BASIC; 446 'newline_anchor' to REG_NEWLINE being set in CFLAGS; 447 'fastmap' to an allocated space for the fastmap; 448 'fastmap_accurate' to zero; 449 're_nsub' to the number of subexpressions in PATTERN. 450 451 PATTERN is the address of the pattern string. 452 453 CFLAGS is a series of bits which affect compilation. 454 455 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 456 use POSIX basic syntax. 457 458 If REG_NEWLINE is set, then . and [^...] don't match newline. 459 Also, regexec will try a match beginning after every newline. 460 461 If REG_ICASE is set, then we considers upper- and lowercase 462 versions of letters to be equivalent when matching. 463 464 If REG_NOSUB is set, then when PREG is passed to regexec, that 465 routine will report only success or failure, and nothing about the 466 registers. 467 468 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 469 the return codes and their meanings.) */ 470 471 int 472 regcomp (preg, pattern, cflags) 473 regex_t *_Restrict_ preg; 474 const char *_Restrict_ pattern; 475 int cflags; 476 { 477 reg_errcode_t ret; 478 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 479 : RE_SYNTAX_POSIX_BASIC); 480 481 preg->buffer = NULL; 482 preg->allocated = 0; 483 preg->used = 0; 484 485 /* Try to allocate space for the fastmap. */ 486 preg->fastmap = re_malloc (char, SBC_MAX); 487 if (BE (preg->fastmap == NULL, 0)) 488 return REG_ESPACE; 489 490 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 491 492 /* If REG_NEWLINE is set, newlines are treated differently. */ 493 if (cflags & REG_NEWLINE) 494 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 495 syntax &= ~RE_DOT_NEWLINE; 496 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 497 /* It also changes the matching behavior. */ 498 preg->newline_anchor = 1; 499 } 500 else 501 preg->newline_anchor = 0; 502 preg->no_sub = !!(cflags & REG_NOSUB); 503 preg->translate = NULL; 504 505 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 506 507 /* POSIX doesn't distinguish between an unmatched open-group and an 508 unmatched close-group: both are REG_EPAREN. */ 509 if (ret == REG_ERPAREN) 510 ret = REG_EPAREN; 511 512 /* We have already checked preg->fastmap != NULL. */ 513 if (BE (ret == REG_NOERROR, 1)) 514 /* Compute the fastmap now, since regexec cannot modify the pattern 515 buffer. This function never fails in this implementation. */ 516 (void) re_compile_fastmap (preg); 517 else 518 { 519 /* Some error occurred while compiling the expression. */ 520 re_free (preg->fastmap); 521 preg->fastmap = NULL; 522 } 523 524 return (int) ret; 525 } 526 #ifdef _LIBC 527 weak_alias (__regcomp, regcomp) 528 #endif 529 530 /* Returns a message corresponding to an error code, ERRCODE, returned 531 from either regcomp or regexec. We don't use PREG here. */ 532 533 #ifdef _LIBC 534 size_t 535 regerror (errcode, preg, errbuf, errbuf_size) 536 int errcode; 537 const regex_t *_Restrict_ preg; 538 char *_Restrict_ errbuf; 539 size_t errbuf_size; 540 #else /* size_t might promote */ 541 size_t 542 regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_, 543 char *_Restrict_ errbuf, size_t errbuf_size) 544 #endif 545 { 546 const char *msg; 547 size_t msg_size; 548 549 if (BE (errcode < 0 550 || errcode >= (int) (sizeof (__re_error_msgid_idx) 551 / sizeof (__re_error_msgid_idx[0])), 0)) 552 /* Only error codes returned by the rest of the code should be passed 553 to this routine. If we are given anything else, or if other regex 554 code generates an invalid error code, then the program has a bug. 555 Dump core so we can fix it. */ 556 abort (); 557 558 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 559 560 msg_size = strlen (msg) + 1; /* Includes the null. */ 561 562 if (BE (errbuf_size != 0, 1)) 563 { 564 size_t cpy_size = msg_size; 565 if (BE (msg_size > errbuf_size, 0)) 566 { 567 cpy_size = errbuf_size - 1; 568 errbuf[cpy_size] = '\0'; 569 } 570 memcpy (errbuf, msg, cpy_size); 571 } 572 573 return msg_size; 574 } 575 #ifdef _LIBC 576 weak_alias (__regerror, regerror) 577 #endif 578 579 580 #ifdef RE_ENABLE_I18N 581 /* This static array is used for the map to single-byte characters when 582 UTF-8 is used. Otherwise we would allocate memory just to initialize 583 it the same all the time. UTF-8 is the preferred encoding so this is 584 a worthwhile optimization. */ 585 static const bitset_t utf8_sb_map = 586 { 587 /* Set the first 128 bits. */ 588 # ifdef __GNUC__ 589 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX 590 # else 591 # if 4 * BITSET_WORD_BITS < ASCII_CHARS 592 # error "bitset_word_t is narrower than 32 bits" 593 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS 594 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, 595 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS 596 BITSET_WORD_MAX, BITSET_WORD_MAX, 597 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS 598 BITSET_WORD_MAX, 599 # endif 600 (BITSET_WORD_MAX 601 >> (SBC_MAX % BITSET_WORD_BITS == 0 602 ? 0 603 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) 604 # endif 605 }; 606 #endif 607 608 609 static void 610 free_dfa_content (re_dfa_t *dfa) 611 { 612 Idx i, j; 613 614 if (dfa->nodes) 615 for (i = 0; i < dfa->nodes_len; ++i) 616 free_token (dfa->nodes + i); 617 re_free (dfa->nexts); 618 for (i = 0; i < dfa->nodes_len; ++i) 619 { 620 if (dfa->eclosures != NULL) 621 re_node_set_free (dfa->eclosures + i); 622 if (dfa->inveclosures != NULL) 623 re_node_set_free (dfa->inveclosures + i); 624 if (dfa->edests != NULL) 625 re_node_set_free (dfa->edests + i); 626 } 627 re_free (dfa->edests); 628 re_free (dfa->eclosures); 629 re_free (dfa->inveclosures); 630 re_free (dfa->nodes); 631 632 if (dfa->state_table) 633 for (i = 0; i <= dfa->state_hash_mask; ++i) 634 { 635 struct re_state_table_entry *entry = dfa->state_table + i; 636 for (j = 0; j < entry->num; ++j) 637 { 638 re_dfastate_t *state = entry->array[j]; 639 free_state (state); 640 } 641 re_free (entry->array); 642 } 643 re_free (dfa->state_table); 644 #ifdef RE_ENABLE_I18N 645 if (dfa->sb_char != utf8_sb_map) 646 re_free (dfa->sb_char); 647 #endif 648 re_free (dfa->subexp_map); 649 #ifdef DEBUG 650 re_free (dfa->re_str); 651 #endif 652 653 re_free (dfa); 654 } 655 656 657 /* Free dynamically allocated space used by PREG. */ 658 659 void 660 regfree (preg) 661 regex_t *preg; 662 { 663 re_dfa_t *dfa = preg->buffer; 664 if (BE (dfa != NULL, 1)) 665 free_dfa_content (dfa); 666 preg->buffer = NULL; 667 preg->allocated = 0; 668 669 re_free (preg->fastmap); 670 preg->fastmap = NULL; 671 672 re_free (preg->translate); 673 preg->translate = NULL; 674 } 675 #ifdef _LIBC 676 weak_alias (__regfree, regfree) 677 #endif 678 679 /* Entry points compatible with 4.2 BSD regex library. We don't define 680 them unless specifically requested. */ 681 682 #if defined _REGEX_RE_COMP || defined _LIBC 683 684 /* BSD has one and only one pattern buffer. */ 685 static struct re_pattern_buffer re_comp_buf; 686 687 char * 688 # ifdef _LIBC 689 /* Make these definitions weak in libc, so POSIX programs can redefine 690 these names if they don't use our functions, and still use 691 regcomp/regexec above without link errors. */ 692 weak_function 693 # endif 694 re_comp (s) 695 const char *s; 696 { 697 reg_errcode_t ret; 698 char *fastmap; 699 700 if (!s) 701 { 702 if (!re_comp_buf.buffer) 703 return gettext ("No previous regular expression"); 704 return 0; 705 } 706 707 if (re_comp_buf.buffer) 708 { 709 fastmap = re_comp_buf.fastmap; 710 re_comp_buf.fastmap = NULL; 711 __regfree (&re_comp_buf); 712 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 713 re_comp_buf.fastmap = fastmap; 714 } 715 716 if (re_comp_buf.fastmap == NULL) 717 { 718 re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 719 if (re_comp_buf.fastmap == NULL) 720 return (char *) gettext (__re_error_msgid 721 + __re_error_msgid_idx[(int) REG_ESPACE]); 722 } 723 724 /* Since 're_exec' always passes NULL for the 'regs' argument, we 725 don't need to initialize the pattern buffer fields which affect it. */ 726 727 /* Match anchors at newlines. */ 728 re_comp_buf.newline_anchor = 1; 729 730 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 731 732 if (!ret) 733 return NULL; 734 735 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */ 736 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 737 } 738 739 #ifdef _LIBC 740 libc_freeres_fn (free_mem) 741 { 742 __regfree (&re_comp_buf); 743 } 744 #endif 745 746 #endif /* _REGEX_RE_COMP */ 747 748 /* Internal entry point. 749 Compile the regular expression PATTERN, whose length is LENGTH. 750 SYNTAX indicate regular expression's syntax. */ 751 752 static reg_errcode_t 753 re_compile_internal (regex_t *preg, const char * pattern, size_t length, 754 reg_syntax_t syntax) 755 { 756 reg_errcode_t err = REG_NOERROR; 757 re_dfa_t *dfa; 758 re_string_t regexp; 759 760 /* Initialize the pattern buffer. */ 761 preg->fastmap_accurate = 0; 762 preg->syntax = syntax; 763 preg->not_bol = preg->not_eol = 0; 764 preg->used = 0; 765 preg->re_nsub = 0; 766 preg->can_be_null = 0; 767 preg->regs_allocated = REGS_UNALLOCATED; 768 769 /* Initialize the dfa. */ 770 dfa = preg->buffer; 771 if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 772 { 773 /* If zero allocated, but buffer is non-null, try to realloc 774 enough space. This loses if buffer's address is bogus, but 775 that is the user's responsibility. If ->buffer is NULL this 776 is a simple allocation. */ 777 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 778 if (dfa == NULL) 779 return REG_ESPACE; 780 preg->allocated = sizeof (re_dfa_t); 781 preg->buffer = dfa; 782 } 783 preg->used = sizeof (re_dfa_t); 784 785 err = init_dfa (dfa, length); 786 if (BE (err != REG_NOERROR, 0)) 787 { 788 free_dfa_content (dfa); 789 preg->buffer = NULL; 790 preg->allocated = 0; 791 return err; 792 } 793 #ifdef DEBUG 794 /* Note: length+1 will not overflow since it is checked in init_dfa. */ 795 dfa->re_str = re_malloc (char, length + 1); 796 strncpy (dfa->re_str, pattern, length + 1); 797 #endif 798 799 __libc_lock_init (dfa->lock); 800 801 err = re_string_construct (®exp, pattern, length, preg->translate, 802 (syntax & RE_ICASE) != 0, dfa); 803 if (BE (err != REG_NOERROR, 0)) 804 { 805 re_compile_internal_free_return: 806 free_workarea_compile (preg); 807 re_string_destruct (®exp); 808 free_dfa_content (dfa); 809 preg->buffer = NULL; 810 preg->allocated = 0; 811 return err; 812 } 813 814 /* Parse the regular expression, and build a structure tree. */ 815 preg->re_nsub = 0; 816 dfa->str_tree = parse (®exp, preg, syntax, &err); 817 if (BE (dfa->str_tree == NULL, 0)) 818 goto re_compile_internal_free_return; 819 820 /* Analyze the tree and create the nfa. */ 821 err = analyze (preg); 822 if (BE (err != REG_NOERROR, 0)) 823 goto re_compile_internal_free_return; 824 825 #ifdef RE_ENABLE_I18N 826 /* If possible, do searching in single byte encoding to speed things up. */ 827 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 828 optimize_utf8 (dfa); 829 #endif 830 831 /* Then create the initial state of the dfa. */ 832 err = create_initial_state (dfa); 833 834 /* Release work areas. */ 835 free_workarea_compile (preg); 836 re_string_destruct (®exp); 837 838 if (BE (err != REG_NOERROR, 0)) 839 { 840 free_dfa_content (dfa); 841 preg->buffer = NULL; 842 preg->allocated = 0; 843 } 844 845 return err; 846 } 847 848 /* Initialize DFA. We use the length of the regular expression PAT_LEN 849 as the initial length of some arrays. */ 850 851 static reg_errcode_t 852 init_dfa (re_dfa_t *dfa, size_t pat_len) 853 { 854 __re_size_t table_size; 855 #ifndef _LIBC 856 const char *codeset_name; 857 #endif 858 #ifdef RE_ENABLE_I18N 859 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); 860 #else 861 size_t max_i18n_object_size = 0; 862 #endif 863 size_t max_object_size = 864 MAX (sizeof (struct re_state_table_entry), 865 MAX (sizeof (re_token_t), 866 MAX (sizeof (re_node_set), 867 MAX (sizeof (regmatch_t), 868 max_i18n_object_size)))); 869 870 memset (dfa, '\0', sizeof (re_dfa_t)); 871 872 /* Force allocation of str_tree_storage the first time. */ 873 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 874 875 /* Avoid overflows. The extra "/ 2" is for the table_size doubling 876 calculation below, and for similar doubling calculations 877 elsewhere. And it's <= rather than <, because some of the 878 doubling calculations add 1 afterwards. */ 879 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0)) 880 return REG_ESPACE; 881 882 dfa->nodes_alloc = pat_len + 1; 883 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 884 885 /* table_size = 2 ^ ceil(log pat_len) */ 886 for (table_size = 1; ; table_size <<= 1) 887 if (table_size > pat_len) 888 break; 889 890 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 891 dfa->state_hash_mask = table_size - 1; 892 893 dfa->mb_cur_max = MB_CUR_MAX; 894 #ifdef _LIBC 895 if (dfa->mb_cur_max == 6 896 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 897 dfa->is_utf8 = 1; 898 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 899 != 0); 900 #else 901 codeset_name = nl_langinfo (CODESET); 902 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u') 903 && (codeset_name[1] == 'T' || codeset_name[1] == 't') 904 && (codeset_name[2] == 'F' || codeset_name[2] == 'f') 905 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0) 906 dfa->is_utf8 = 1; 907 908 /* We check exhaustively in the loop below if this charset is a 909 superset of ASCII. */ 910 dfa->map_notascii = 0; 911 #endif 912 913 #ifdef RE_ENABLE_I18N 914 if (dfa->mb_cur_max > 1) 915 { 916 if (dfa->is_utf8) 917 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 918 else 919 { 920 int i, j, ch; 921 922 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 923 if (BE (dfa->sb_char == NULL, 0)) 924 return REG_ESPACE; 925 926 /* Set the bits corresponding to single byte chars. */ 927 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 928 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 929 { 930 wint_t wch = __btowc (ch); 931 if (wch != WEOF) 932 dfa->sb_char[i] |= (bitset_word_t) 1 << j; 933 # ifndef _LIBC 934 if (isascii (ch) && wch != ch) 935 dfa->map_notascii = 1; 936 # endif 937 } 938 } 939 } 940 #endif 941 942 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 943 return REG_ESPACE; 944 return REG_NOERROR; 945 } 946 947 /* Initialize WORD_CHAR table, which indicate which character is 948 "word". In this case "word" means that it is the word construction 949 character used by some operators like "\<", "\>", etc. */ 950 951 static void 952 internal_function 953 init_word_char (re_dfa_t *dfa) 954 { 955 dfa->word_ops_used = 1; 956 int i = 0; 957 int j; 958 int ch = 0; 959 if (BE (dfa->map_notascii == 0, 1)) 960 { 961 bitset_word_t bits0 = 0x00000000; 962 bitset_word_t bits1 = 0x03ff0000; 963 bitset_word_t bits2 = 0x87fffffe; 964 bitset_word_t bits3 = 0x07fffffe; 965 if (BITSET_WORD_BITS == 64) 966 { 967 dfa->word_char[0] = bits1 << 31 << 1 | bits0; 968 dfa->word_char[1] = bits3 << 31 << 1 | bits2; 969 i = 2; 970 } 971 else if (BITSET_WORD_BITS == 32) 972 { 973 dfa->word_char[0] = bits0; 974 dfa->word_char[1] = bits1; 975 dfa->word_char[2] = bits2; 976 dfa->word_char[3] = bits3; 977 i = 4; 978 } 979 else 980 goto general_case; 981 ch = 128; 982 983 if (BE (dfa->is_utf8, 1)) 984 { 985 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8); 986 return; 987 } 988 } 989 990 general_case: 991 for (; i < BITSET_WORDS; ++i) 992 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 993 if (isalnum (ch) || ch == '_') 994 dfa->word_char[i] |= (bitset_word_t) 1 << j; 995 } 996 997 /* Free the work area which are only used while compiling. */ 998 999 static void 1000 free_workarea_compile (regex_t *preg) 1001 { 1002 re_dfa_t *dfa = preg->buffer; 1003 bin_tree_storage_t *storage, *next; 1004 for (storage = dfa->str_tree_storage; storage; storage = next) 1005 { 1006 next = storage->next; 1007 re_free (storage); 1008 } 1009 dfa->str_tree_storage = NULL; 1010 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 1011 dfa->str_tree = NULL; 1012 re_free (dfa->org_indices); 1013 dfa->org_indices = NULL; 1014 } 1015 1016 /* Create initial states for all contexts. */ 1017 1018 static reg_errcode_t 1019 create_initial_state (re_dfa_t *dfa) 1020 { 1021 Idx first, i; 1022 reg_errcode_t err; 1023 re_node_set init_nodes; 1024 1025 /* Initial states have the epsilon closure of the node which is 1026 the first node of the regular expression. */ 1027 first = dfa->str_tree->first->node_idx; 1028 dfa->init_node = first; 1029 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 1030 if (BE (err != REG_NOERROR, 0)) 1031 return err; 1032 1033 /* The back-references which are in initial states can epsilon transit, 1034 since in this case all of the subexpressions can be null. 1035 Then we add epsilon closures of the nodes which are the next nodes of 1036 the back-references. */ 1037 if (dfa->nbackref > 0) 1038 for (i = 0; i < init_nodes.nelem; ++i) 1039 { 1040 Idx node_idx = init_nodes.elems[i]; 1041 re_token_type_t type = dfa->nodes[node_idx].type; 1042 1043 Idx clexp_idx; 1044 if (type != OP_BACK_REF) 1045 continue; 1046 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1047 { 1048 re_token_t *clexp_node; 1049 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1050 if (clexp_node->type == OP_CLOSE_SUBEXP 1051 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1052 break; 1053 } 1054 if (clexp_idx == init_nodes.nelem) 1055 continue; 1056 1057 if (type == OP_BACK_REF) 1058 { 1059 Idx dest_idx = dfa->edests[node_idx].elems[0]; 1060 if (!re_node_set_contains (&init_nodes, dest_idx)) 1061 { 1062 reg_errcode_t merge_err 1063 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 1064 if (merge_err != REG_NOERROR) 1065 return merge_err; 1066 i = 0; 1067 } 1068 } 1069 } 1070 1071 /* It must be the first time to invoke acquire_state. */ 1072 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1073 /* We don't check ERR here, since the initial state must not be NULL. */ 1074 if (BE (dfa->init_state == NULL, 0)) 1075 return err; 1076 if (dfa->init_state->has_constraint) 1077 { 1078 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1079 CONTEXT_WORD); 1080 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1081 CONTEXT_NEWLINE); 1082 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1083 &init_nodes, 1084 CONTEXT_NEWLINE 1085 | CONTEXT_BEGBUF); 1086 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1087 || dfa->init_state_begbuf == NULL, 0)) 1088 return err; 1089 } 1090 else 1091 dfa->init_state_word = dfa->init_state_nl 1092 = dfa->init_state_begbuf = dfa->init_state; 1093 1094 re_node_set_free (&init_nodes); 1095 return REG_NOERROR; 1096 } 1097 1098 #ifdef RE_ENABLE_I18N 1099 /* If it is possible to do searching in single byte encoding instead of UTF-8 1100 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1101 DFA nodes where needed. */ 1102 1103 static void 1104 optimize_utf8 (re_dfa_t *dfa) 1105 { 1106 Idx node; 1107 int i; 1108 bool mb_chars = false; 1109 bool has_period = false; 1110 1111 for (node = 0; node < dfa->nodes_len; ++node) 1112 switch (dfa->nodes[node].type) 1113 { 1114 case CHARACTER: 1115 if (dfa->nodes[node].opr.c >= ASCII_CHARS) 1116 mb_chars = true; 1117 break; 1118 case ANCHOR: 1119 switch (dfa->nodes[node].opr.ctx_type) 1120 { 1121 case LINE_FIRST: 1122 case LINE_LAST: 1123 case BUF_FIRST: 1124 case BUF_LAST: 1125 break; 1126 default: 1127 /* Word anchors etc. cannot be handled. It's okay to test 1128 opr.ctx_type since constraints (for all DFA nodes) are 1129 created by ORing one or more opr.ctx_type values. */ 1130 return; 1131 } 1132 break; 1133 case OP_PERIOD: 1134 has_period = true; 1135 break; 1136 case OP_BACK_REF: 1137 case OP_ALT: 1138 case END_OF_RE: 1139 case OP_DUP_ASTERISK: 1140 case OP_OPEN_SUBEXP: 1141 case OP_CLOSE_SUBEXP: 1142 break; 1143 case COMPLEX_BRACKET: 1144 return; 1145 case SIMPLE_BRACKET: 1146 /* Just double check. */ 1147 { 1148 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0 1149 ? 0 1150 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS); 1151 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1152 { 1153 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) 1154 return; 1155 rshift = 0; 1156 } 1157 } 1158 break; 1159 default: 1160 abort (); 1161 } 1162 1163 if (mb_chars || has_period) 1164 for (node = 0; node < dfa->nodes_len; ++node) 1165 { 1166 if (dfa->nodes[node].type == CHARACTER 1167 && dfa->nodes[node].opr.c >= ASCII_CHARS) 1168 dfa->nodes[node].mb_partial = 0; 1169 else if (dfa->nodes[node].type == OP_PERIOD) 1170 dfa->nodes[node].type = OP_UTF8_PERIOD; 1171 } 1172 1173 /* The search can be in single byte locale. */ 1174 dfa->mb_cur_max = 1; 1175 dfa->is_utf8 = 0; 1176 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1177 } 1178 #endif 1179 1180 /* Analyze the structure tree, and calculate "first", "next", "edest", 1181 "eclosure", and "inveclosure". */ 1182 1183 static reg_errcode_t 1184 analyze (regex_t *preg) 1185 { 1186 re_dfa_t *dfa = preg->buffer; 1187 reg_errcode_t ret; 1188 1189 /* Allocate arrays. */ 1190 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc); 1191 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); 1192 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1193 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1194 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1195 || dfa->eclosures == NULL, 0)) 1196 return REG_ESPACE; 1197 1198 dfa->subexp_map = re_malloc (Idx, preg->re_nsub); 1199 if (dfa->subexp_map != NULL) 1200 { 1201 Idx i; 1202 for (i = 0; i < preg->re_nsub; i++) 1203 dfa->subexp_map[i] = i; 1204 preorder (dfa->str_tree, optimize_subexps, dfa); 1205 for (i = 0; i < preg->re_nsub; i++) 1206 if (dfa->subexp_map[i] != i) 1207 break; 1208 if (i == preg->re_nsub) 1209 { 1210 free (dfa->subexp_map); 1211 dfa->subexp_map = NULL; 1212 } 1213 } 1214 1215 ret = postorder (dfa->str_tree, lower_subexps, preg); 1216 if (BE (ret != REG_NOERROR, 0)) 1217 return ret; 1218 ret = postorder (dfa->str_tree, calc_first, dfa); 1219 if (BE (ret != REG_NOERROR, 0)) 1220 return ret; 1221 preorder (dfa->str_tree, calc_next, dfa); 1222 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1223 if (BE (ret != REG_NOERROR, 0)) 1224 return ret; 1225 ret = calc_eclosure (dfa); 1226 if (BE (ret != REG_NOERROR, 0)) 1227 return ret; 1228 1229 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1230 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1231 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1232 || dfa->nbackref) 1233 { 1234 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1235 if (BE (dfa->inveclosures == NULL, 0)) 1236 return REG_ESPACE; 1237 ret = calc_inveclosure (dfa); 1238 } 1239 1240 return ret; 1241 } 1242 1243 /* Our parse trees are very unbalanced, so we cannot use a stack to 1244 implement parse tree visits. Instead, we use parent pointers and 1245 some hairy code in these two functions. */ 1246 static reg_errcode_t 1247 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1248 void *extra) 1249 { 1250 bin_tree_t *node, *prev; 1251 1252 for (node = root; ; ) 1253 { 1254 /* Descend down the tree, preferably to the left (or to the right 1255 if that's the only child). */ 1256 while (node->left || node->right) 1257 if (node->left) 1258 node = node->left; 1259 else 1260 node = node->right; 1261 1262 do 1263 { 1264 reg_errcode_t err = fn (extra, node); 1265 if (BE (err != REG_NOERROR, 0)) 1266 return err; 1267 if (node->parent == NULL) 1268 return REG_NOERROR; 1269 prev = node; 1270 node = node->parent; 1271 } 1272 /* Go up while we have a node that is reached from the right. */ 1273 while (node->right == prev || node->right == NULL); 1274 node = node->right; 1275 } 1276 } 1277 1278 static reg_errcode_t 1279 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1280 void *extra) 1281 { 1282 bin_tree_t *node; 1283 1284 for (node = root; ; ) 1285 { 1286 reg_errcode_t err = fn (extra, node); 1287 if (BE (err != REG_NOERROR, 0)) 1288 return err; 1289 1290 /* Go to the left node, or up and to the right. */ 1291 if (node->left) 1292 node = node->left; 1293 else 1294 { 1295 bin_tree_t *prev = NULL; 1296 while (node->right == prev || node->right == NULL) 1297 { 1298 prev = node; 1299 node = node->parent; 1300 if (!node) 1301 return REG_NOERROR; 1302 } 1303 node = node->right; 1304 } 1305 } 1306 } 1307 1308 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1309 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1310 backreferences as well. Requires a preorder visit. */ 1311 static reg_errcode_t 1312 optimize_subexps (void *extra, bin_tree_t *node) 1313 { 1314 re_dfa_t *dfa = (re_dfa_t *) extra; 1315 1316 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1317 { 1318 int idx = node->token.opr.idx; 1319 node->token.opr.idx = dfa->subexp_map[idx]; 1320 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1321 } 1322 1323 else if (node->token.type == SUBEXP 1324 && node->left && node->left->token.type == SUBEXP) 1325 { 1326 Idx other_idx = node->left->token.opr.idx; 1327 1328 node->left = node->left->left; 1329 if (node->left) 1330 node->left->parent = node; 1331 1332 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1333 if (other_idx < BITSET_WORD_BITS) 1334 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); 1335 } 1336 1337 return REG_NOERROR; 1338 } 1339 1340 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1341 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1342 static reg_errcode_t 1343 lower_subexps (void *extra, bin_tree_t *node) 1344 { 1345 regex_t *preg = (regex_t *) extra; 1346 reg_errcode_t err = REG_NOERROR; 1347 1348 if (node->left && node->left->token.type == SUBEXP) 1349 { 1350 node->left = lower_subexp (&err, preg, node->left); 1351 if (node->left) 1352 node->left->parent = node; 1353 } 1354 if (node->right && node->right->token.type == SUBEXP) 1355 { 1356 node->right = lower_subexp (&err, preg, node->right); 1357 if (node->right) 1358 node->right->parent = node; 1359 } 1360 1361 return err; 1362 } 1363 1364 static bin_tree_t * 1365 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1366 { 1367 re_dfa_t *dfa = preg->buffer; 1368 bin_tree_t *body = node->left; 1369 bin_tree_t *op, *cls, *tree1, *tree; 1370 1371 if (preg->no_sub 1372 /* We do not optimize empty subexpressions, because otherwise we may 1373 have bad CONCAT nodes with NULL children. This is obviously not 1374 very common, so we do not lose much. An example that triggers 1375 this case is the sed "script" /\(\)/x. */ 1376 && node->left != NULL 1377 && (node->token.opr.idx >= BITSET_WORD_BITS 1378 || !(dfa->used_bkref_map 1379 & ((bitset_word_t) 1 << node->token.opr.idx)))) 1380 return node->left; 1381 1382 /* Convert the SUBEXP node to the concatenation of an 1383 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1384 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1385 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1386 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1387 tree = create_tree (dfa, op, tree1, CONCAT); 1388 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1389 { 1390 *err = REG_ESPACE; 1391 return NULL; 1392 } 1393 1394 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1395 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1396 return tree; 1397 } 1398 1399 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1400 nodes. Requires a postorder visit. */ 1401 static reg_errcode_t 1402 calc_first (void *extra, bin_tree_t *node) 1403 { 1404 re_dfa_t *dfa = (re_dfa_t *) extra; 1405 if (node->token.type == CONCAT) 1406 { 1407 node->first = node->left->first; 1408 node->node_idx = node->left->node_idx; 1409 } 1410 else 1411 { 1412 node->first = node; 1413 node->node_idx = re_dfa_add_node (dfa, node->token); 1414 if (BE (node->node_idx == REG_MISSING, 0)) 1415 return REG_ESPACE; 1416 if (node->token.type == ANCHOR) 1417 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; 1418 } 1419 return REG_NOERROR; 1420 } 1421 1422 /* Pass 2: compute NEXT on the tree. Preorder visit. */ 1423 static reg_errcode_t 1424 calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node) 1425 { 1426 switch (node->token.type) 1427 { 1428 case OP_DUP_ASTERISK: 1429 node->left->next = node; 1430 break; 1431 case CONCAT: 1432 node->left->next = node->right->first; 1433 node->right->next = node->next; 1434 break; 1435 default: 1436 if (node->left) 1437 node->left->next = node->next; 1438 if (node->right) 1439 node->right->next = node->next; 1440 break; 1441 } 1442 return REG_NOERROR; 1443 } 1444 1445 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1446 static reg_errcode_t 1447 link_nfa_nodes (void *extra, bin_tree_t *node) 1448 { 1449 re_dfa_t *dfa = (re_dfa_t *) extra; 1450 Idx idx = node->node_idx; 1451 reg_errcode_t err = REG_NOERROR; 1452 1453 switch (node->token.type) 1454 { 1455 case CONCAT: 1456 break; 1457 1458 case END_OF_RE: 1459 assert (node->next == NULL); 1460 break; 1461 1462 case OP_DUP_ASTERISK: 1463 case OP_ALT: 1464 { 1465 Idx left, right; 1466 dfa->has_plural_match = 1; 1467 if (node->left != NULL) 1468 left = node->left->first->node_idx; 1469 else 1470 left = node->next->node_idx; 1471 if (node->right != NULL) 1472 right = node->right->first->node_idx; 1473 else 1474 right = node->next->node_idx; 1475 assert (REG_VALID_INDEX (left)); 1476 assert (REG_VALID_INDEX (right)); 1477 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1478 } 1479 break; 1480 1481 case ANCHOR: 1482 case OP_OPEN_SUBEXP: 1483 case OP_CLOSE_SUBEXP: 1484 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1485 break; 1486 1487 case OP_BACK_REF: 1488 dfa->nexts[idx] = node->next->node_idx; 1489 if (node->token.type == OP_BACK_REF) 1490 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1491 break; 1492 1493 default: 1494 assert (!IS_EPSILON_NODE (node->token.type)); 1495 dfa->nexts[idx] = node->next->node_idx; 1496 break; 1497 } 1498 1499 return err; 1500 } 1501 1502 /* Duplicate the epsilon closure of the node ROOT_NODE. 1503 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1504 to their own constraint. */ 1505 1506 static reg_errcode_t 1507 internal_function 1508 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, 1509 Idx root_node, unsigned int init_constraint) 1510 { 1511 Idx org_node, clone_node; 1512 bool ok; 1513 unsigned int constraint = init_constraint; 1514 for (org_node = top_org_node, clone_node = top_clone_node;;) 1515 { 1516 Idx org_dest, clone_dest; 1517 if (dfa->nodes[org_node].type == OP_BACK_REF) 1518 { 1519 /* If the back reference epsilon-transit, its destination must 1520 also have the constraint. Then duplicate the epsilon closure 1521 of the destination of the back reference, and store it in 1522 edests of the back reference. */ 1523 org_dest = dfa->nexts[org_node]; 1524 re_node_set_empty (dfa->edests + clone_node); 1525 clone_dest = duplicate_node (dfa, org_dest, constraint); 1526 if (BE (clone_dest == REG_MISSING, 0)) 1527 return REG_ESPACE; 1528 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1529 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1530 if (BE (! ok, 0)) 1531 return REG_ESPACE; 1532 } 1533 else if (dfa->edests[org_node].nelem == 0) 1534 { 1535 /* In case of the node can't epsilon-transit, don't duplicate the 1536 destination and store the original destination as the 1537 destination of the node. */ 1538 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1539 break; 1540 } 1541 else if (dfa->edests[org_node].nelem == 1) 1542 { 1543 /* In case of the node can epsilon-transit, and it has only one 1544 destination. */ 1545 org_dest = dfa->edests[org_node].elems[0]; 1546 re_node_set_empty (dfa->edests + clone_node); 1547 /* If the node is root_node itself, it means the epsilon closure 1548 has a loop. Then tie it to the destination of the root_node. */ 1549 if (org_node == root_node && clone_node != org_node) 1550 { 1551 ok = re_node_set_insert (dfa->edests + clone_node, org_dest); 1552 if (BE (! ok, 0)) 1553 return REG_ESPACE; 1554 break; 1555 } 1556 /* In case the node has another constraint, append it. */ 1557 constraint |= dfa->nodes[org_node].constraint; 1558 clone_dest = duplicate_node (dfa, org_dest, constraint); 1559 if (BE (clone_dest == REG_MISSING, 0)) 1560 return REG_ESPACE; 1561 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1562 if (BE (! ok, 0)) 1563 return REG_ESPACE; 1564 } 1565 else /* dfa->edests[org_node].nelem == 2 */ 1566 { 1567 /* In case of the node can epsilon-transit, and it has two 1568 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1569 org_dest = dfa->edests[org_node].elems[0]; 1570 re_node_set_empty (dfa->edests + clone_node); 1571 /* Search for a duplicated node which satisfies the constraint. */ 1572 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1573 if (clone_dest == REG_MISSING) 1574 { 1575 /* There is no such duplicated node, create a new one. */ 1576 reg_errcode_t err; 1577 clone_dest = duplicate_node (dfa, org_dest, constraint); 1578 if (BE (clone_dest == REG_MISSING, 0)) 1579 return REG_ESPACE; 1580 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1581 if (BE (! ok, 0)) 1582 return REG_ESPACE; 1583 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1584 root_node, constraint); 1585 if (BE (err != REG_NOERROR, 0)) 1586 return err; 1587 } 1588 else 1589 { 1590 /* There is a duplicated node which satisfies the constraint, 1591 use it to avoid infinite loop. */ 1592 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1593 if (BE (! ok, 0)) 1594 return REG_ESPACE; 1595 } 1596 1597 org_dest = dfa->edests[org_node].elems[1]; 1598 clone_dest = duplicate_node (dfa, org_dest, constraint); 1599 if (BE (clone_dest == REG_MISSING, 0)) 1600 return REG_ESPACE; 1601 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1602 if (BE (! ok, 0)) 1603 return REG_ESPACE; 1604 } 1605 org_node = org_dest; 1606 clone_node = clone_dest; 1607 } 1608 return REG_NOERROR; 1609 } 1610 1611 /* Search for a node which is duplicated from the node ORG_NODE, and 1612 satisfies the constraint CONSTRAINT. */ 1613 1614 static Idx 1615 search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 1616 unsigned int constraint) 1617 { 1618 Idx idx; 1619 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1620 { 1621 if (org_node == dfa->org_indices[idx] 1622 && constraint == dfa->nodes[idx].constraint) 1623 return idx; /* Found. */ 1624 } 1625 return REG_MISSING; /* Not found. */ 1626 } 1627 1628 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1629 Return the index of the new node, or REG_MISSING if insufficient storage is 1630 available. */ 1631 1632 static Idx 1633 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) 1634 { 1635 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1636 if (BE (dup_idx != REG_MISSING, 1)) 1637 { 1638 dfa->nodes[dup_idx].constraint = constraint; 1639 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; 1640 dfa->nodes[dup_idx].duplicated = 1; 1641 1642 /* Store the index of the original node. */ 1643 dfa->org_indices[dup_idx] = org_idx; 1644 } 1645 return dup_idx; 1646 } 1647 1648 static reg_errcode_t 1649 calc_inveclosure (re_dfa_t *dfa) 1650 { 1651 Idx src, idx; 1652 bool ok; 1653 for (idx = 0; idx < dfa->nodes_len; ++idx) 1654 re_node_set_init_empty (dfa->inveclosures + idx); 1655 1656 for (src = 0; src < dfa->nodes_len; ++src) 1657 { 1658 Idx *elems = dfa->eclosures[src].elems; 1659 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1660 { 1661 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1662 if (BE (! ok, 0)) 1663 return REG_ESPACE; 1664 } 1665 } 1666 1667 return REG_NOERROR; 1668 } 1669 1670 /* Calculate "eclosure" for all the node in DFA. */ 1671 1672 static reg_errcode_t 1673 calc_eclosure (re_dfa_t *dfa) 1674 { 1675 Idx node_idx; 1676 bool incomplete; 1677 #ifdef DEBUG 1678 assert (dfa->nodes_len > 0); 1679 #endif 1680 incomplete = false; 1681 /* For each nodes, calculate epsilon closure. */ 1682 for (node_idx = 0; ; ++node_idx) 1683 { 1684 reg_errcode_t err; 1685 re_node_set eclosure_elem; 1686 if (node_idx == dfa->nodes_len) 1687 { 1688 if (!incomplete) 1689 break; 1690 incomplete = false; 1691 node_idx = 0; 1692 } 1693 1694 #ifdef DEBUG 1695 assert (dfa->eclosures[node_idx].nelem != REG_MISSING); 1696 #endif 1697 1698 /* If we have already calculated, skip it. */ 1699 if (dfa->eclosures[node_idx].nelem != 0) 1700 continue; 1701 /* Calculate epsilon closure of 'node_idx'. */ 1702 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); 1703 if (BE (err != REG_NOERROR, 0)) 1704 return err; 1705 1706 if (dfa->eclosures[node_idx].nelem == 0) 1707 { 1708 incomplete = true; 1709 re_node_set_free (&eclosure_elem); 1710 } 1711 } 1712 return REG_NOERROR; 1713 } 1714 1715 /* Calculate epsilon closure of NODE. */ 1716 1717 static reg_errcode_t 1718 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) 1719 { 1720 reg_errcode_t err; 1721 Idx i; 1722 re_node_set eclosure; 1723 bool ok; 1724 bool incomplete = false; 1725 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1726 if (BE (err != REG_NOERROR, 0)) 1727 return err; 1728 1729 /* This indicates that we are calculating this node now. 1730 We reference this value to avoid infinite loop. */ 1731 dfa->eclosures[node].nelem = REG_MISSING; 1732 1733 /* If the current node has constraints, duplicate all nodes 1734 since they must inherit the constraints. */ 1735 if (dfa->nodes[node].constraint 1736 && dfa->edests[node].nelem 1737 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1738 { 1739 err = duplicate_node_closure (dfa, node, node, node, 1740 dfa->nodes[node].constraint); 1741 if (BE (err != REG_NOERROR, 0)) 1742 return err; 1743 } 1744 1745 /* Expand each epsilon destination nodes. */ 1746 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1747 for (i = 0; i < dfa->edests[node].nelem; ++i) 1748 { 1749 re_node_set eclosure_elem; 1750 Idx edest = dfa->edests[node].elems[i]; 1751 /* If calculating the epsilon closure of 'edest' is in progress, 1752 return intermediate result. */ 1753 if (dfa->eclosures[edest].nelem == REG_MISSING) 1754 { 1755 incomplete = true; 1756 continue; 1757 } 1758 /* If we haven't calculated the epsilon closure of 'edest' yet, 1759 calculate now. Otherwise use calculated epsilon closure. */ 1760 if (dfa->eclosures[edest].nelem == 0) 1761 { 1762 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false); 1763 if (BE (err != REG_NOERROR, 0)) 1764 return err; 1765 } 1766 else 1767 eclosure_elem = dfa->eclosures[edest]; 1768 /* Merge the epsilon closure of 'edest'. */ 1769 err = re_node_set_merge (&eclosure, &eclosure_elem); 1770 if (BE (err != REG_NOERROR, 0)) 1771 return err; 1772 /* If the epsilon closure of 'edest' is incomplete, 1773 the epsilon closure of this node is also incomplete. */ 1774 if (dfa->eclosures[edest].nelem == 0) 1775 { 1776 incomplete = true; 1777 re_node_set_free (&eclosure_elem); 1778 } 1779 } 1780 1781 /* An epsilon closure includes itself. */ 1782 ok = re_node_set_insert (&eclosure, node); 1783 if (BE (! ok, 0)) 1784 return REG_ESPACE; 1785 if (incomplete && !root) 1786 dfa->eclosures[node].nelem = 0; 1787 else 1788 dfa->eclosures[node] = eclosure; 1789 *new_set = eclosure; 1790 return REG_NOERROR; 1791 } 1792 1793 /* Functions for token which are used in the parser. */ 1794 1795 /* Fetch a token from INPUT. 1796 We must not use this function inside bracket expressions. */ 1797 1798 static void 1799 internal_function 1800 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1801 { 1802 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1803 } 1804 1805 /* Peek a token from INPUT, and return the length of the token. 1806 We must not use this function inside bracket expressions. */ 1807 1808 static int 1809 internal_function 1810 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1811 { 1812 unsigned char c; 1813 1814 if (re_string_eoi (input)) 1815 { 1816 token->type = END_OF_RE; 1817 return 0; 1818 } 1819 1820 c = re_string_peek_byte (input, 0); 1821 token->opr.c = c; 1822 1823 token->word_char = 0; 1824 #ifdef RE_ENABLE_I18N 1825 token->mb_partial = 0; 1826 if (input->mb_cur_max > 1 && 1827 !re_string_first_byte (input, re_string_cur_idx (input))) 1828 { 1829 token->type = CHARACTER; 1830 token->mb_partial = 1; 1831 return 1; 1832 } 1833 #endif 1834 if (c == '\\') 1835 { 1836 unsigned char c2; 1837 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1838 { 1839 token->type = BACK_SLASH; 1840 return 1; 1841 } 1842 1843 c2 = re_string_peek_byte_case (input, 1); 1844 token->opr.c = c2; 1845 token->type = CHARACTER; 1846 #ifdef RE_ENABLE_I18N 1847 if (input->mb_cur_max > 1) 1848 { 1849 wint_t wc = re_string_wchar_at (input, 1850 re_string_cur_idx (input) + 1); 1851 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1852 } 1853 else 1854 #endif 1855 token->word_char = IS_WORD_CHAR (c2) != 0; 1856 1857 switch (c2) 1858 { 1859 case '|': 1860 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1861 token->type = OP_ALT; 1862 break; 1863 case '1': case '2': case '3': case '4': case '5': 1864 case '6': case '7': case '8': case '9': 1865 if (!(syntax & RE_NO_BK_REFS)) 1866 { 1867 token->type = OP_BACK_REF; 1868 token->opr.idx = c2 - '1'; 1869 } 1870 break; 1871 case '<': 1872 if (!(syntax & RE_NO_GNU_OPS)) 1873 { 1874 token->type = ANCHOR; 1875 token->opr.ctx_type = WORD_FIRST; 1876 } 1877 break; 1878 case '>': 1879 if (!(syntax & RE_NO_GNU_OPS)) 1880 { 1881 token->type = ANCHOR; 1882 token->opr.ctx_type = WORD_LAST; 1883 } 1884 break; 1885 case 'b': 1886 if (!(syntax & RE_NO_GNU_OPS)) 1887 { 1888 token->type = ANCHOR; 1889 token->opr.ctx_type = WORD_DELIM; 1890 } 1891 break; 1892 case 'B': 1893 if (!(syntax & RE_NO_GNU_OPS)) 1894 { 1895 token->type = ANCHOR; 1896 token->opr.ctx_type = NOT_WORD_DELIM; 1897 } 1898 break; 1899 case 'w': 1900 if (!(syntax & RE_NO_GNU_OPS)) 1901 token->type = OP_WORD; 1902 break; 1903 case 'W': 1904 if (!(syntax & RE_NO_GNU_OPS)) 1905 token->type = OP_NOTWORD; 1906 break; 1907 case 's': 1908 if (!(syntax & RE_NO_GNU_OPS)) 1909 token->type = OP_SPACE; 1910 break; 1911 case 'S': 1912 if (!(syntax & RE_NO_GNU_OPS)) 1913 token->type = OP_NOTSPACE; 1914 break; 1915 case '`': 1916 if (!(syntax & RE_NO_GNU_OPS)) 1917 { 1918 token->type = ANCHOR; 1919 token->opr.ctx_type = BUF_FIRST; 1920 } 1921 break; 1922 case '\'': 1923 if (!(syntax & RE_NO_GNU_OPS)) 1924 { 1925 token->type = ANCHOR; 1926 token->opr.ctx_type = BUF_LAST; 1927 } 1928 break; 1929 case '(': 1930 if (!(syntax & RE_NO_BK_PARENS)) 1931 token->type = OP_OPEN_SUBEXP; 1932 break; 1933 case ')': 1934 if (!(syntax & RE_NO_BK_PARENS)) 1935 token->type = OP_CLOSE_SUBEXP; 1936 break; 1937 case '+': 1938 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1939 token->type = OP_DUP_PLUS; 1940 break; 1941 case '?': 1942 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1943 token->type = OP_DUP_QUESTION; 1944 break; 1945 case '{': 1946 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1947 token->type = OP_OPEN_DUP_NUM; 1948 break; 1949 case '}': 1950 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1951 token->type = OP_CLOSE_DUP_NUM; 1952 break; 1953 default: 1954 break; 1955 } 1956 return 2; 1957 } 1958 1959 token->type = CHARACTER; 1960 #ifdef RE_ENABLE_I18N 1961 if (input->mb_cur_max > 1) 1962 { 1963 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1964 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1965 } 1966 else 1967 #endif 1968 token->word_char = IS_WORD_CHAR (token->opr.c); 1969 1970 switch (c) 1971 { 1972 case '\n': 1973 if (syntax & RE_NEWLINE_ALT) 1974 token->type = OP_ALT; 1975 break; 1976 case '|': 1977 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1978 token->type = OP_ALT; 1979 break; 1980 case '*': 1981 token->type = OP_DUP_ASTERISK; 1982 break; 1983 case '+': 1984 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1985 token->type = OP_DUP_PLUS; 1986 break; 1987 case '?': 1988 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1989 token->type = OP_DUP_QUESTION; 1990 break; 1991 case '{': 1992 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1993 token->type = OP_OPEN_DUP_NUM; 1994 break; 1995 case '}': 1996 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1997 token->type = OP_CLOSE_DUP_NUM; 1998 break; 1999 case '(': 2000 if (syntax & RE_NO_BK_PARENS) 2001 token->type = OP_OPEN_SUBEXP; 2002 break; 2003 case ')': 2004 if (syntax & RE_NO_BK_PARENS) 2005 token->type = OP_CLOSE_SUBEXP; 2006 break; 2007 case '[': 2008 token->type = OP_OPEN_BRACKET; 2009 break; 2010 case '.': 2011 token->type = OP_PERIOD; 2012 break; 2013 case '^': 2014 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 2015 re_string_cur_idx (input) != 0) 2016 { 2017 char prev = re_string_peek_byte (input, -1); 2018 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 2019 break; 2020 } 2021 token->type = ANCHOR; 2022 token->opr.ctx_type = LINE_FIRST; 2023 break; 2024 case '$': 2025 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 2026 re_string_cur_idx (input) + 1 != re_string_length (input)) 2027 { 2028 re_token_t next; 2029 re_string_skip_bytes (input, 1); 2030 peek_token (&next, input, syntax); 2031 re_string_skip_bytes (input, -1); 2032 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 2033 break; 2034 } 2035 token->type = ANCHOR; 2036 token->opr.ctx_type = LINE_LAST; 2037 break; 2038 default: 2039 break; 2040 } 2041 return 1; 2042 } 2043 2044 /* Peek a token from INPUT, and return the length of the token. 2045 We must not use this function out of bracket expressions. */ 2046 2047 static int 2048 internal_function 2049 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 2050 { 2051 unsigned char c; 2052 if (re_string_eoi (input)) 2053 { 2054 token->type = END_OF_RE; 2055 return 0; 2056 } 2057 c = re_string_peek_byte (input, 0); 2058 token->opr.c = c; 2059 2060 #ifdef RE_ENABLE_I18N 2061 if (input->mb_cur_max > 1 && 2062 !re_string_first_byte (input, re_string_cur_idx (input))) 2063 { 2064 token->type = CHARACTER; 2065 return 1; 2066 } 2067 #endif /* RE_ENABLE_I18N */ 2068 2069 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2070 && re_string_cur_idx (input) + 1 < re_string_length (input)) 2071 { 2072 /* In this case, '\' escape a character. */ 2073 unsigned char c2; 2074 re_string_skip_bytes (input, 1); 2075 c2 = re_string_peek_byte (input, 0); 2076 token->opr.c = c2; 2077 token->type = CHARACTER; 2078 return 1; 2079 } 2080 if (c == '[') /* '[' is a special char in a bracket exps. */ 2081 { 2082 unsigned char c2; 2083 int token_len; 2084 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2085 c2 = re_string_peek_byte (input, 1); 2086 else 2087 c2 = 0; 2088 token->opr.c = c2; 2089 token_len = 2; 2090 switch (c2) 2091 { 2092 case '.': 2093 token->type = OP_OPEN_COLL_ELEM; 2094 break; 2095 case '=': 2096 token->type = OP_OPEN_EQUIV_CLASS; 2097 break; 2098 case ':': 2099 if (syntax & RE_CHAR_CLASSES) 2100 { 2101 token->type = OP_OPEN_CHAR_CLASS; 2102 break; 2103 } 2104 /* else fall through. */ 2105 default: 2106 token->type = CHARACTER; 2107 token->opr.c = c; 2108 token_len = 1; 2109 break; 2110 } 2111 return token_len; 2112 } 2113 switch (c) 2114 { 2115 case '-': 2116 token->type = OP_CHARSET_RANGE; 2117 break; 2118 case ']': 2119 token->type = OP_CLOSE_BRACKET; 2120 break; 2121 case '^': 2122 token->type = OP_NON_MATCH_LIST; 2123 break; 2124 default: 2125 token->type = CHARACTER; 2126 } 2127 return 1; 2128 } 2129 2130 /* Functions for parser. */ 2131 2132 /* Entry point of the parser. 2133 Parse the regular expression REGEXP and return the structure tree. 2134 If an error occurs, ERR is set by error code, and return NULL. 2135 This function build the following tree, from regular expression <reg_exp>: 2136 CAT 2137 / \ 2138 / \ 2139 <reg_exp> EOR 2140 2141 CAT means concatenation. 2142 EOR means end of regular expression. */ 2143 2144 static bin_tree_t * 2145 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2146 reg_errcode_t *err) 2147 { 2148 re_dfa_t *dfa = preg->buffer; 2149 bin_tree_t *tree, *eor, *root; 2150 re_token_t current_token; 2151 dfa->syntax = syntax; 2152 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2153 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2154 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2155 return NULL; 2156 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2157 if (tree != NULL) 2158 root = create_tree (dfa, tree, eor, CONCAT); 2159 else 2160 root = eor; 2161 if (BE (eor == NULL || root == NULL, 0)) 2162 { 2163 *err = REG_ESPACE; 2164 return NULL; 2165 } 2166 return root; 2167 } 2168 2169 /* This function build the following tree, from regular expression 2170 <branch1>|<branch2>: 2171 ALT 2172 / \ 2173 / \ 2174 <branch1> <branch2> 2175 2176 ALT means alternative, which represents the operator '|'. */ 2177 2178 static bin_tree_t * 2179 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2180 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2181 { 2182 re_dfa_t *dfa = preg->buffer; 2183 bin_tree_t *tree, *branch = NULL; 2184 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2185 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2186 return NULL; 2187 2188 while (token->type == OP_ALT) 2189 { 2190 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2191 if (token->type != OP_ALT && token->type != END_OF_RE 2192 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2193 { 2194 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2195 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2196 return NULL; 2197 } 2198 else 2199 branch = NULL; 2200 tree = create_tree (dfa, tree, branch, OP_ALT); 2201 if (BE (tree == NULL, 0)) 2202 { 2203 *err = REG_ESPACE; 2204 return NULL; 2205 } 2206 } 2207 return tree; 2208 } 2209 2210 /* This function build the following tree, from regular expression 2211 <exp1><exp2>: 2212 CAT 2213 / \ 2214 / \ 2215 <exp1> <exp2> 2216 2217 CAT means concatenation. */ 2218 2219 static bin_tree_t * 2220 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2221 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2222 { 2223 bin_tree_t *tree, *expr; 2224 re_dfa_t *dfa = preg->buffer; 2225 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2226 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2227 return NULL; 2228 2229 while (token->type != OP_ALT && token->type != END_OF_RE 2230 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2231 { 2232 expr = parse_expression (regexp, preg, token, syntax, nest, err); 2233 if (BE (*err != REG_NOERROR && expr == NULL, 0)) 2234 { 2235 if (tree != NULL) 2236 postorder (tree, free_tree, NULL); 2237 return NULL; 2238 } 2239 if (tree != NULL && expr != NULL) 2240 { 2241 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT); 2242 if (newtree == NULL) 2243 { 2244 postorder (expr, free_tree, NULL); 2245 postorder (tree, free_tree, NULL); 2246 *err = REG_ESPACE; 2247 return NULL; 2248 } 2249 tree = newtree; 2250 } 2251 else if (tree == NULL) 2252 tree = expr; 2253 /* Otherwise expr == NULL, we don't need to create new tree. */ 2254 } 2255 return tree; 2256 } 2257 2258 /* This function build the following tree, from regular expression a*: 2259 * 2260 | 2261 a 2262 */ 2263 2264 static bin_tree_t * 2265 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2266 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2267 { 2268 re_dfa_t *dfa = preg->buffer; 2269 bin_tree_t *tree; 2270 switch (token->type) 2271 { 2272 case CHARACTER: 2273 tree = create_token_tree (dfa, NULL, NULL, token); 2274 if (BE (tree == NULL, 0)) 2275 { 2276 *err = REG_ESPACE; 2277 return NULL; 2278 } 2279 #ifdef RE_ENABLE_I18N 2280 if (dfa->mb_cur_max > 1) 2281 { 2282 while (!re_string_eoi (regexp) 2283 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2284 { 2285 bin_tree_t *mbc_remain; 2286 fetch_token (token, regexp, syntax); 2287 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2288 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2289 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2290 { 2291 *err = REG_ESPACE; 2292 return NULL; 2293 } 2294 } 2295 } 2296 #endif 2297 break; 2298 case OP_OPEN_SUBEXP: 2299 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2300 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2301 return NULL; 2302 break; 2303 case OP_OPEN_BRACKET: 2304 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2305 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2306 return NULL; 2307 break; 2308 case OP_BACK_REF: 2309 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2310 { 2311 *err = REG_ESUBREG; 2312 return NULL; 2313 } 2314 dfa->used_bkref_map |= 1 << token->opr.idx; 2315 tree = create_token_tree (dfa, NULL, NULL, token); 2316 if (BE (tree == NULL, 0)) 2317 { 2318 *err = REG_ESPACE; 2319 return NULL; 2320 } 2321 ++dfa->nbackref; 2322 dfa->has_mb_node = 1; 2323 break; 2324 case OP_OPEN_DUP_NUM: 2325 if (syntax & RE_CONTEXT_INVALID_DUP) 2326 { 2327 *err = REG_BADRPT; 2328 return NULL; 2329 } 2330 /* FALLTHROUGH */ 2331 case OP_DUP_ASTERISK: 2332 case OP_DUP_PLUS: 2333 case OP_DUP_QUESTION: 2334 if (syntax & RE_CONTEXT_INVALID_OPS) 2335 { 2336 *err = REG_BADRPT; 2337 return NULL; 2338 } 2339 else if (syntax & RE_CONTEXT_INDEP_OPS) 2340 { 2341 fetch_token (token, regexp, syntax); 2342 return parse_expression (regexp, preg, token, syntax, nest, err); 2343 } 2344 /* else fall through */ 2345 case OP_CLOSE_SUBEXP: 2346 if ((token->type == OP_CLOSE_SUBEXP) && 2347 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2348 { 2349 *err = REG_ERPAREN; 2350 return NULL; 2351 } 2352 /* else fall through */ 2353 case OP_CLOSE_DUP_NUM: 2354 /* We treat it as a normal character. */ 2355 2356 /* Then we can these characters as normal characters. */ 2357 token->type = CHARACTER; 2358 /* mb_partial and word_char bits should be initialized already 2359 by peek_token. */ 2360 tree = create_token_tree (dfa, NULL, NULL, token); 2361 if (BE (tree == NULL, 0)) 2362 { 2363 *err = REG_ESPACE; 2364 return NULL; 2365 } 2366 break; 2367 case ANCHOR: 2368 if ((token->opr.ctx_type 2369 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2370 && dfa->word_ops_used == 0) 2371 init_word_char (dfa); 2372 if (token->opr.ctx_type == WORD_DELIM 2373 || token->opr.ctx_type == NOT_WORD_DELIM) 2374 { 2375 bin_tree_t *tree_first, *tree_last; 2376 if (token->opr.ctx_type == WORD_DELIM) 2377 { 2378 token->opr.ctx_type = WORD_FIRST; 2379 tree_first = create_token_tree (dfa, NULL, NULL, token); 2380 token->opr.ctx_type = WORD_LAST; 2381 } 2382 else 2383 { 2384 token->opr.ctx_type = INSIDE_WORD; 2385 tree_first = create_token_tree (dfa, NULL, NULL, token); 2386 token->opr.ctx_type = INSIDE_NOTWORD; 2387 } 2388 tree_last = create_token_tree (dfa, NULL, NULL, token); 2389 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2390 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2391 { 2392 *err = REG_ESPACE; 2393 return NULL; 2394 } 2395 } 2396 else 2397 { 2398 tree = create_token_tree (dfa, NULL, NULL, token); 2399 if (BE (tree == NULL, 0)) 2400 { 2401 *err = REG_ESPACE; 2402 return NULL; 2403 } 2404 } 2405 /* We must return here, since ANCHORs can't be followed 2406 by repetition operators. 2407 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2408 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2409 fetch_token (token, regexp, syntax); 2410 return tree; 2411 case OP_PERIOD: 2412 tree = create_token_tree (dfa, NULL, NULL, token); 2413 if (BE (tree == NULL, 0)) 2414 { 2415 *err = REG_ESPACE; 2416 return NULL; 2417 } 2418 if (dfa->mb_cur_max > 1) 2419 dfa->has_mb_node = 1; 2420 break; 2421 case OP_WORD: 2422 case OP_NOTWORD: 2423 tree = build_charclass_op (dfa, regexp->trans, 2424 (const unsigned char *) "alnum", 2425 (const unsigned char *) "_", 2426 token->type == OP_NOTWORD, err); 2427 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2428 return NULL; 2429 break; 2430 case OP_SPACE: 2431 case OP_NOTSPACE: 2432 tree = build_charclass_op (dfa, regexp->trans, 2433 (const unsigned char *) "space", 2434 (const unsigned char *) "", 2435 token->type == OP_NOTSPACE, err); 2436 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2437 return NULL; 2438 break; 2439 case OP_ALT: 2440 case END_OF_RE: 2441 return NULL; 2442 case BACK_SLASH: 2443 *err = REG_EESCAPE; 2444 return NULL; 2445 default: 2446 /* Must not happen? */ 2447 #ifdef DEBUG 2448 assert (0); 2449 #endif 2450 return NULL; 2451 } 2452 fetch_token (token, regexp, syntax); 2453 2454 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2455 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2456 { 2457 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2458 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2459 return NULL; 2460 /* In BRE consecutive duplications are not allowed. */ 2461 if ((syntax & RE_CONTEXT_INVALID_DUP) 2462 && (token->type == OP_DUP_ASTERISK 2463 || token->type == OP_OPEN_DUP_NUM)) 2464 { 2465 *err = REG_BADRPT; 2466 return NULL; 2467 } 2468 } 2469 2470 return tree; 2471 } 2472 2473 /* This function build the following tree, from regular expression 2474 (<reg_exp>): 2475 SUBEXP 2476 | 2477 <reg_exp> 2478 */ 2479 2480 static bin_tree_t * 2481 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2482 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2483 { 2484 re_dfa_t *dfa = preg->buffer; 2485 bin_tree_t *tree; 2486 size_t cur_nsub; 2487 cur_nsub = preg->re_nsub++; 2488 2489 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2490 2491 /* The subexpression may be a null string. */ 2492 if (token->type == OP_CLOSE_SUBEXP) 2493 tree = NULL; 2494 else 2495 { 2496 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2497 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2498 { 2499 if (tree != NULL) 2500 postorder (tree, free_tree, NULL); 2501 *err = REG_EPAREN; 2502 } 2503 if (BE (*err != REG_NOERROR, 0)) 2504 return NULL; 2505 } 2506 2507 if (cur_nsub <= '9' - '1') 2508 dfa->completed_bkref_map |= 1 << cur_nsub; 2509 2510 tree = create_tree (dfa, tree, NULL, SUBEXP); 2511 if (BE (tree == NULL, 0)) 2512 { 2513 *err = REG_ESPACE; 2514 return NULL; 2515 } 2516 tree->token.opr.idx = cur_nsub; 2517 return tree; 2518 } 2519 2520 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2521 2522 static bin_tree_t * 2523 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2524 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2525 { 2526 bin_tree_t *tree = NULL, *old_tree = NULL; 2527 Idx i, start, end, start_idx = re_string_cur_idx (regexp); 2528 re_token_t start_token = *token; 2529 2530 if (token->type == OP_OPEN_DUP_NUM) 2531 { 2532 end = 0; 2533 start = fetch_number (regexp, token, syntax); 2534 if (start == REG_MISSING) 2535 { 2536 if (token->type == CHARACTER && token->opr.c == ',') 2537 start = 0; /* We treat "{,m}" as "{0,m}". */ 2538 else 2539 { 2540 *err = REG_BADBR; /* <re>{} is invalid. */ 2541 return NULL; 2542 } 2543 } 2544 if (BE (start != REG_ERROR, 1)) 2545 { 2546 /* We treat "{n}" as "{n,n}". */ 2547 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2548 : ((token->type == CHARACTER && token->opr.c == ',') 2549 ? fetch_number (regexp, token, syntax) : REG_ERROR)); 2550 } 2551 if (BE (start == REG_ERROR || end == REG_ERROR, 0)) 2552 { 2553 /* Invalid sequence. */ 2554 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2555 { 2556 if (token->type == END_OF_RE) 2557 *err = REG_EBRACE; 2558 else 2559 *err = REG_BADBR; 2560 2561 return NULL; 2562 } 2563 2564 /* If the syntax bit is set, rollback. */ 2565 re_string_set_index (regexp, start_idx); 2566 *token = start_token; 2567 token->type = CHARACTER; 2568 /* mb_partial and word_char bits should be already initialized by 2569 peek_token. */ 2570 return elem; 2571 } 2572 2573 if (BE ((end != REG_MISSING && start > end) 2574 || token->type != OP_CLOSE_DUP_NUM, 0)) 2575 { 2576 /* First number greater than second. */ 2577 *err = REG_BADBR; 2578 return NULL; 2579 } 2580 2581 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0)) 2582 { 2583 *err = REG_ESIZE; 2584 return NULL; 2585 } 2586 } 2587 else 2588 { 2589 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2590 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING; 2591 } 2592 2593 fetch_token (token, regexp, syntax); 2594 2595 if (BE (elem == NULL, 0)) 2596 return NULL; 2597 if (BE (start == 0 && end == 0, 0)) 2598 { 2599 postorder (elem, free_tree, NULL); 2600 return NULL; 2601 } 2602 2603 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2604 if (BE (start > 0, 0)) 2605 { 2606 tree = elem; 2607 for (i = 2; i <= start; ++i) 2608 { 2609 elem = duplicate_tree (elem, dfa); 2610 tree = create_tree (dfa, tree, elem, CONCAT); 2611 if (BE (elem == NULL || tree == NULL, 0)) 2612 goto parse_dup_op_espace; 2613 } 2614 2615 if (start == end) 2616 return tree; 2617 2618 /* Duplicate ELEM before it is marked optional. */ 2619 elem = duplicate_tree (elem, dfa); 2620 old_tree = tree; 2621 } 2622 else 2623 old_tree = NULL; 2624 2625 if (elem->token.type == SUBEXP) 2626 { 2627 uintptr_t subidx = elem->token.opr.idx; 2628 postorder (elem, mark_opt_subexp, (void *) subidx); 2629 } 2630 2631 tree = create_tree (dfa, elem, NULL, 2632 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); 2633 if (BE (tree == NULL, 0)) 2634 goto parse_dup_op_espace; 2635 2636 /* From gnulib's "intprops.h": 2637 True if the arithmetic type T is signed. */ 2638 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1)) 2639 2640 /* This loop is actually executed only when end != REG_MISSING, 2641 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2642 already created the start+1-th copy. */ 2643 if (TYPE_SIGNED (Idx) || end != REG_MISSING) 2644 for (i = start + 2; i <= end; ++i) 2645 { 2646 elem = duplicate_tree (elem, dfa); 2647 tree = create_tree (dfa, tree, elem, CONCAT); 2648 if (BE (elem == NULL || tree == NULL, 0)) 2649 goto parse_dup_op_espace; 2650 2651 tree = create_tree (dfa, tree, NULL, OP_ALT); 2652 if (BE (tree == NULL, 0)) 2653 goto parse_dup_op_espace; 2654 } 2655 2656 if (old_tree) 2657 tree = create_tree (dfa, old_tree, tree, CONCAT); 2658 2659 return tree; 2660 2661 parse_dup_op_espace: 2662 *err = REG_ESPACE; 2663 return NULL; 2664 } 2665 2666 /* Size of the names for collating symbol/equivalence_class/character_class. 2667 I'm not sure, but maybe enough. */ 2668 #define BRACKET_NAME_BUF_SIZE 32 2669 2670 #ifndef _LIBC 2671 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2672 Build the range expression which starts from START_ELEM, and ends 2673 at END_ELEM. The result are written to MBCSET and SBCSET. 2674 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2675 mbcset->range_ends, is a pointer argument since we may 2676 update it. */ 2677 2678 static reg_errcode_t 2679 internal_function 2680 # ifdef RE_ENABLE_I18N 2681 build_range_exp (const reg_syntax_t syntax, 2682 bitset_t sbcset, 2683 re_charset_t *mbcset, 2684 Idx *range_alloc, 2685 const bracket_elem_t *start_elem, 2686 const bracket_elem_t *end_elem) 2687 # else /* not RE_ENABLE_I18N */ 2688 build_range_exp (const reg_syntax_t syntax, 2689 bitset_t sbcset, 2690 const bracket_elem_t *start_elem, 2691 const bracket_elem_t *end_elem) 2692 # endif /* not RE_ENABLE_I18N */ 2693 { 2694 unsigned int start_ch, end_ch; 2695 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2696 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2697 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2698 0)) 2699 return REG_ERANGE; 2700 2701 /* We can handle no multi character collating elements without libc 2702 support. */ 2703 if (BE ((start_elem->type == COLL_SYM 2704 && strlen ((char *) start_elem->opr.name) > 1) 2705 || (end_elem->type == COLL_SYM 2706 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2707 return REG_ECOLLATE; 2708 2709 # ifdef RE_ENABLE_I18N 2710 { 2711 wchar_t wc; 2712 wint_t start_wc; 2713 wint_t end_wc; 2714 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2715 2716 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2717 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2718 : 0)); 2719 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2720 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2721 : 0)); 2722 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2723 ? __btowc (start_ch) : start_elem->opr.wch); 2724 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2725 ? __btowc (end_ch) : end_elem->opr.wch); 2726 if (start_wc == WEOF || end_wc == WEOF) 2727 return REG_ECOLLATE; 2728 cmp_buf[0] = start_wc; 2729 cmp_buf[4] = end_wc; 2730 2731 if (BE ((syntax & RE_NO_EMPTY_RANGES) 2732 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0)) 2733 return REG_ERANGE; 2734 2735 /* Got valid collation sequence values, add them as a new entry. 2736 However, for !_LIBC we have no collation elements: if the 2737 character set is single byte, the single byte character set 2738 that we build below suffices. parse_bracket_exp passes 2739 no MBCSET if dfa->mb_cur_max == 1. */ 2740 if (mbcset) 2741 { 2742 /* Check the space of the arrays. */ 2743 if (BE (*range_alloc == mbcset->nranges, 0)) 2744 { 2745 /* There is not enough space, need realloc. */ 2746 wchar_t *new_array_start, *new_array_end; 2747 Idx new_nranges; 2748 2749 /* +1 in case of mbcset->nranges is 0. */ 2750 new_nranges = 2 * mbcset->nranges + 1; 2751 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2752 are NULL if *range_alloc == 0. */ 2753 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2754 new_nranges); 2755 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2756 new_nranges); 2757 2758 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2759 return REG_ESPACE; 2760 2761 mbcset->range_starts = new_array_start; 2762 mbcset->range_ends = new_array_end; 2763 *range_alloc = new_nranges; 2764 } 2765 2766 mbcset->range_starts[mbcset->nranges] = start_wc; 2767 mbcset->range_ends[mbcset->nranges++] = end_wc; 2768 } 2769 2770 /* Build the table for single byte characters. */ 2771 for (wc = 0; wc < SBC_MAX; ++wc) 2772 { 2773 cmp_buf[2] = wc; 2774 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2775 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2776 bitset_set (sbcset, wc); 2777 } 2778 } 2779 # else /* not RE_ENABLE_I18N */ 2780 { 2781 unsigned int ch; 2782 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2783 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2784 : 0)); 2785 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2786 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2787 : 0)); 2788 if (start_ch > end_ch) 2789 return REG_ERANGE; 2790 /* Build the table for single byte characters. */ 2791 for (ch = 0; ch < SBC_MAX; ++ch) 2792 if (start_ch <= ch && ch <= end_ch) 2793 bitset_set (sbcset, ch); 2794 } 2795 # endif /* not RE_ENABLE_I18N */ 2796 return REG_NOERROR; 2797 } 2798 #endif /* not _LIBC */ 2799 2800 #ifndef _LIBC 2801 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2802 Build the collating element which is represented by NAME. 2803 The result are written to MBCSET and SBCSET. 2804 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2805 pointer argument since we may update it. */ 2806 2807 static reg_errcode_t 2808 internal_function 2809 # ifdef RE_ENABLE_I18N 2810 build_collating_symbol (bitset_t sbcset, 2811 re_charset_t *mbcset _UNUSED_PARAMETER_, 2812 Idx *coll_sym_alloc _UNUSED_PARAMETER_, 2813 const unsigned char *name) 2814 # else /* not RE_ENABLE_I18N */ 2815 build_collating_symbol (bitset_t sbcset, const unsigned char *name) 2816 # endif /* not RE_ENABLE_I18N */ 2817 { 2818 size_t name_len = strlen ((const char *) name); 2819 if (BE (name_len != 1, 0)) 2820 return REG_ECOLLATE; 2821 else 2822 { 2823 bitset_set (sbcset, name[0]); 2824 return REG_NOERROR; 2825 } 2826 } 2827 #endif /* not _LIBC */ 2828 2829 /* This function parse bracket expression like "[abc]", "[a-c]", 2830 "[[.a-a.]]" etc. */ 2831 2832 static bin_tree_t * 2833 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2834 reg_syntax_t syntax, reg_errcode_t *err) 2835 { 2836 #ifdef _LIBC 2837 const unsigned char *collseqmb; 2838 const char *collseqwc; 2839 uint32_t nrules; 2840 int32_t table_size; 2841 const int32_t *symb_table; 2842 const unsigned char *extra; 2843 2844 /* Local function for parse_bracket_exp used in _LIBC environment. 2845 Seek the collating symbol entry corresponding to NAME. 2846 Return the index of the symbol in the SYMB_TABLE. */ 2847 2848 auto inline int32_t 2849 __attribute ((always_inline)) 2850 seek_collating_symbol_entry (name, name_len) 2851 const unsigned char *name; 2852 size_t name_len; 2853 { 2854 int32_t hash = elem_hash ((const char *) name, name_len); 2855 int32_t elem = hash % table_size; 2856 if (symb_table[2 * elem] != 0) 2857 { 2858 int32_t second = hash % (table_size - 2) + 1; 2859 2860 do 2861 { 2862 /* First compare the hashing value. */ 2863 if (symb_table[2 * elem] == hash 2864 /* Compare the length of the name. */ 2865 && name_len == extra[symb_table[2 * elem + 1]] 2866 /* Compare the name. */ 2867 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2868 name_len) == 0) 2869 { 2870 /* Yep, this is the entry. */ 2871 break; 2872 } 2873 2874 /* Next entry. */ 2875 elem += second; 2876 } 2877 while (symb_table[2 * elem] != 0); 2878 } 2879 return elem; 2880 } 2881 2882 /* Local function for parse_bracket_exp used in _LIBC environment. 2883 Look up the collation sequence value of BR_ELEM. 2884 Return the value if succeeded, UINT_MAX otherwise. */ 2885 2886 auto inline unsigned int 2887 __attribute ((always_inline)) 2888 lookup_collation_sequence_value (br_elem) 2889 bracket_elem_t *br_elem; 2890 { 2891 if (br_elem->type == SB_CHAR) 2892 { 2893 /* 2894 if (MB_CUR_MAX == 1) 2895 */ 2896 if (nrules == 0) 2897 return collseqmb[br_elem->opr.ch]; 2898 else 2899 { 2900 wint_t wc = __btowc (br_elem->opr.ch); 2901 return __collseq_table_lookup (collseqwc, wc); 2902 } 2903 } 2904 else if (br_elem->type == MB_CHAR) 2905 { 2906 if (nrules != 0) 2907 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2908 } 2909 else if (br_elem->type == COLL_SYM) 2910 { 2911 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2912 if (nrules != 0) 2913 { 2914 int32_t elem, idx; 2915 elem = seek_collating_symbol_entry (br_elem->opr.name, 2916 sym_name_len); 2917 if (symb_table[2 * elem] != 0) 2918 { 2919 /* We found the entry. */ 2920 idx = symb_table[2 * elem + 1]; 2921 /* Skip the name of collating element name. */ 2922 idx += 1 + extra[idx]; 2923 /* Skip the byte sequence of the collating element. */ 2924 idx += 1 + extra[idx]; 2925 /* Adjust for the alignment. */ 2926 idx = (idx + 3) & ~3; 2927 /* Skip the multibyte collation sequence value. */ 2928 idx += sizeof (unsigned int); 2929 /* Skip the wide char sequence of the collating element. */ 2930 idx += sizeof (unsigned int) * 2931 (1 + *(unsigned int *) (extra + idx)); 2932 /* Return the collation sequence value. */ 2933 return *(unsigned int *) (extra + idx); 2934 } 2935 else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2936 { 2937 /* No valid character. Match it as a single byte 2938 character. */ 2939 return collseqmb[br_elem->opr.name[0]]; 2940 } 2941 } 2942 else if (sym_name_len == 1) 2943 return collseqmb[br_elem->opr.name[0]]; 2944 } 2945 return UINT_MAX; 2946 } 2947 2948 /* Local function for parse_bracket_exp used in _LIBC environment. 2949 Build the range expression which starts from START_ELEM, and ends 2950 at END_ELEM. The result are written to MBCSET and SBCSET. 2951 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2952 mbcset->range_ends, is a pointer argument since we may 2953 update it. */ 2954 2955 auto inline reg_errcode_t 2956 __attribute ((always_inline)) 2957 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2958 re_charset_t *mbcset; 2959 Idx *range_alloc; 2960 bitset_t sbcset; 2961 bracket_elem_t *start_elem, *end_elem; 2962 { 2963 unsigned int ch; 2964 uint32_t start_collseq; 2965 uint32_t end_collseq; 2966 2967 /* Equivalence Classes and Character Classes can't be a range 2968 start/end. */ 2969 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2970 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2971 0)) 2972 return REG_ERANGE; 2973 2974 start_collseq = lookup_collation_sequence_value (start_elem); 2975 end_collseq = lookup_collation_sequence_value (end_elem); 2976 /* Check start/end collation sequence values. */ 2977 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2978 return REG_ECOLLATE; 2979 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2980 return REG_ERANGE; 2981 2982 /* Got valid collation sequence values, add them as a new entry. 2983 However, if we have no collation elements, and the character set 2984 is single byte, the single byte character set that we 2985 build below suffices. */ 2986 if (nrules > 0 || dfa->mb_cur_max > 1) 2987 { 2988 /* Check the space of the arrays. */ 2989 if (BE (*range_alloc == mbcset->nranges, 0)) 2990 { 2991 /* There is not enough space, need realloc. */ 2992 uint32_t *new_array_start; 2993 uint32_t *new_array_end; 2994 Idx new_nranges; 2995 2996 /* +1 in case of mbcset->nranges is 0. */ 2997 new_nranges = 2 * mbcset->nranges + 1; 2998 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2999 new_nranges); 3000 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 3001 new_nranges); 3002 3003 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 3004 return REG_ESPACE; 3005 3006 mbcset->range_starts = new_array_start; 3007 mbcset->range_ends = new_array_end; 3008 *range_alloc = new_nranges; 3009 } 3010 3011 mbcset->range_starts[mbcset->nranges] = start_collseq; 3012 mbcset->range_ends[mbcset->nranges++] = end_collseq; 3013 } 3014 3015 /* Build the table for single byte characters. */ 3016 for (ch = 0; ch < SBC_MAX; ch++) 3017 { 3018 uint32_t ch_collseq; 3019 /* 3020 if (MB_CUR_MAX == 1) 3021 */ 3022 if (nrules == 0) 3023 ch_collseq = collseqmb[ch]; 3024 else 3025 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 3026 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 3027 bitset_set (sbcset, ch); 3028 } 3029 return REG_NOERROR; 3030 } 3031 3032 /* Local function for parse_bracket_exp used in _LIBC environment. 3033 Build the collating element which is represented by NAME. 3034 The result are written to MBCSET and SBCSET. 3035 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 3036 pointer argument since we may update it. */ 3037 3038 auto inline reg_errcode_t 3039 __attribute ((always_inline)) 3040 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 3041 re_charset_t *mbcset; 3042 Idx *coll_sym_alloc; 3043 bitset_t sbcset; 3044 const unsigned char *name; 3045 { 3046 int32_t elem, idx; 3047 size_t name_len = strlen ((const char *) name); 3048 if (nrules != 0) 3049 { 3050 elem = seek_collating_symbol_entry (name, name_len); 3051 if (symb_table[2 * elem] != 0) 3052 { 3053 /* We found the entry. */ 3054 idx = symb_table[2 * elem + 1]; 3055 /* Skip the name of collating element name. */ 3056 idx += 1 + extra[idx]; 3057 } 3058 else if (symb_table[2 * elem] == 0 && name_len == 1) 3059 { 3060 /* No valid character, treat it as a normal 3061 character. */ 3062 bitset_set (sbcset, name[0]); 3063 return REG_NOERROR; 3064 } 3065 else 3066 return REG_ECOLLATE; 3067 3068 /* Got valid collation sequence, add it as a new entry. */ 3069 /* Check the space of the arrays. */ 3070 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 3071 { 3072 /* Not enough, realloc it. */ 3073 /* +1 in case of mbcset->ncoll_syms is 0. */ 3074 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 3075 /* Use realloc since mbcset->coll_syms is NULL 3076 if *alloc == 0. */ 3077 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 3078 new_coll_sym_alloc); 3079 if (BE (new_coll_syms == NULL, 0)) 3080 return REG_ESPACE; 3081 mbcset->coll_syms = new_coll_syms; 3082 *coll_sym_alloc = new_coll_sym_alloc; 3083 } 3084 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3085 return REG_NOERROR; 3086 } 3087 else 3088 { 3089 if (BE (name_len != 1, 0)) 3090 return REG_ECOLLATE; 3091 else 3092 { 3093 bitset_set (sbcset, name[0]); 3094 return REG_NOERROR; 3095 } 3096 } 3097 } 3098 #endif 3099 3100 re_token_t br_token; 3101 re_bitset_ptr_t sbcset; 3102 #ifdef RE_ENABLE_I18N 3103 re_charset_t *mbcset; 3104 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3105 Idx equiv_class_alloc = 0, char_class_alloc = 0; 3106 #endif /* not RE_ENABLE_I18N */ 3107 bool non_match = false; 3108 bin_tree_t *work_tree; 3109 int token_len; 3110 bool first_round = true; 3111 #ifdef _LIBC 3112 collseqmb = (const unsigned char *) 3113 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3114 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3115 if (nrules) 3116 { 3117 /* 3118 if (MB_CUR_MAX > 1) 3119 */ 3120 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3121 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3122 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3123 _NL_COLLATE_SYMB_TABLEMB); 3124 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3125 _NL_COLLATE_SYMB_EXTRAMB); 3126 } 3127 #endif 3128 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3129 #ifdef RE_ENABLE_I18N 3130 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3131 #endif /* RE_ENABLE_I18N */ 3132 #ifdef RE_ENABLE_I18N 3133 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3134 #else 3135 if (BE (sbcset == NULL, 0)) 3136 #endif /* RE_ENABLE_I18N */ 3137 { 3138 re_free (sbcset); 3139 #ifdef RE_ENABLE_I18N 3140 re_free (mbcset); 3141 #endif 3142 *err = REG_ESPACE; 3143 return NULL; 3144 } 3145 3146 token_len = peek_token_bracket (token, regexp, syntax); 3147 if (BE (token->type == END_OF_RE, 0)) 3148 { 3149 *err = REG_BADPAT; 3150 goto parse_bracket_exp_free_return; 3151 } 3152 if (token->type == OP_NON_MATCH_LIST) 3153 { 3154 #ifdef RE_ENABLE_I18N 3155 mbcset->non_match = 1; 3156 #endif /* not RE_ENABLE_I18N */ 3157 non_match = true; 3158 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3159 bitset_set (sbcset, '\n'); 3160 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3161 token_len = peek_token_bracket (token, regexp, syntax); 3162 if (BE (token->type == END_OF_RE, 0)) 3163 { 3164 *err = REG_BADPAT; 3165 goto parse_bracket_exp_free_return; 3166 } 3167 } 3168 3169 /* We treat the first ']' as a normal character. */ 3170 if (token->type == OP_CLOSE_BRACKET) 3171 token->type = CHARACTER; 3172 3173 while (1) 3174 { 3175 bracket_elem_t start_elem, end_elem; 3176 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3177 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3178 reg_errcode_t ret; 3179 int token_len2 = 0; 3180 bool is_range_exp = false; 3181 re_token_t token2; 3182 3183 start_elem.opr.name = start_name_buf; 3184 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3185 syntax, first_round); 3186 if (BE (ret != REG_NOERROR, 0)) 3187 { 3188 *err = ret; 3189 goto parse_bracket_exp_free_return; 3190 } 3191 first_round = false; 3192 3193 /* Get information about the next token. We need it in any case. */ 3194 token_len = peek_token_bracket (token, regexp, syntax); 3195 3196 /* Do not check for ranges if we know they are not allowed. */ 3197 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3198 { 3199 if (BE (token->type == END_OF_RE, 0)) 3200 { 3201 *err = REG_EBRACK; 3202 goto parse_bracket_exp_free_return; 3203 } 3204 if (token->type == OP_CHARSET_RANGE) 3205 { 3206 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3207 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3208 if (BE (token2.type == END_OF_RE, 0)) 3209 { 3210 *err = REG_EBRACK; 3211 goto parse_bracket_exp_free_return; 3212 } 3213 if (token2.type == OP_CLOSE_BRACKET) 3214 { 3215 /* We treat the last '-' as a normal character. */ 3216 re_string_skip_bytes (regexp, -token_len); 3217 token->type = CHARACTER; 3218 } 3219 else 3220 is_range_exp = true; 3221 } 3222 } 3223 3224 if (is_range_exp == true) 3225 { 3226 end_elem.opr.name = end_name_buf; 3227 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3228 dfa, syntax, true); 3229 if (BE (ret != REG_NOERROR, 0)) 3230 { 3231 *err = ret; 3232 goto parse_bracket_exp_free_return; 3233 } 3234 3235 token_len = peek_token_bracket (token, regexp, syntax); 3236 3237 #ifdef _LIBC 3238 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3239 &start_elem, &end_elem); 3240 #else 3241 # ifdef RE_ENABLE_I18N 3242 *err = build_range_exp (syntax, sbcset, 3243 dfa->mb_cur_max > 1 ? mbcset : NULL, 3244 &range_alloc, &start_elem, &end_elem); 3245 # else 3246 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); 3247 # endif 3248 #endif /* RE_ENABLE_I18N */ 3249 if (BE (*err != REG_NOERROR, 0)) 3250 goto parse_bracket_exp_free_return; 3251 } 3252 else 3253 { 3254 switch (start_elem.type) 3255 { 3256 case SB_CHAR: 3257 bitset_set (sbcset, start_elem.opr.ch); 3258 break; 3259 #ifdef RE_ENABLE_I18N 3260 case MB_CHAR: 3261 /* Check whether the array has enough space. */ 3262 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3263 { 3264 wchar_t *new_mbchars; 3265 /* Not enough, realloc it. */ 3266 /* +1 in case of mbcset->nmbchars is 0. */ 3267 mbchar_alloc = 2 * mbcset->nmbchars + 1; 3268 /* Use realloc since array is NULL if *alloc == 0. */ 3269 new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3270 mbchar_alloc); 3271 if (BE (new_mbchars == NULL, 0)) 3272 goto parse_bracket_exp_espace; 3273 mbcset->mbchars = new_mbchars; 3274 } 3275 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3276 break; 3277 #endif /* RE_ENABLE_I18N */ 3278 case EQUIV_CLASS: 3279 *err = build_equiv_class (sbcset, 3280 #ifdef RE_ENABLE_I18N 3281 mbcset, &equiv_class_alloc, 3282 #endif /* RE_ENABLE_I18N */ 3283 start_elem.opr.name); 3284 if (BE (*err != REG_NOERROR, 0)) 3285 goto parse_bracket_exp_free_return; 3286 break; 3287 case COLL_SYM: 3288 *err = build_collating_symbol (sbcset, 3289 #ifdef RE_ENABLE_I18N 3290 mbcset, &coll_sym_alloc, 3291 #endif /* RE_ENABLE_I18N */ 3292 start_elem.opr.name); 3293 if (BE (*err != REG_NOERROR, 0)) 3294 goto parse_bracket_exp_free_return; 3295 break; 3296 case CHAR_CLASS: 3297 *err = build_charclass (regexp->trans, sbcset, 3298 #ifdef RE_ENABLE_I18N 3299 mbcset, &char_class_alloc, 3300 #endif /* RE_ENABLE_I18N */ 3301 start_elem.opr.name, syntax); 3302 if (BE (*err != REG_NOERROR, 0)) 3303 goto parse_bracket_exp_free_return; 3304 break; 3305 default: 3306 assert (0); 3307 break; 3308 } 3309 } 3310 if (BE (token->type == END_OF_RE, 0)) 3311 { 3312 *err = REG_EBRACK; 3313 goto parse_bracket_exp_free_return; 3314 } 3315 if (token->type == OP_CLOSE_BRACKET) 3316 break; 3317 } 3318 3319 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3320 3321 /* If it is non-matching list. */ 3322 if (non_match) 3323 bitset_not (sbcset); 3324 3325 #ifdef RE_ENABLE_I18N 3326 /* Ensure only single byte characters are set. */ 3327 if (dfa->mb_cur_max > 1) 3328 bitset_mask (sbcset, dfa->sb_char); 3329 3330 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3331 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3332 || mbcset->non_match))) 3333 { 3334 bin_tree_t *mbc_tree; 3335 int sbc_idx; 3336 /* Build a tree for complex bracket. */ 3337 dfa->has_mb_node = 1; 3338 br_token.type = COMPLEX_BRACKET; 3339 br_token.opr.mbcset = mbcset; 3340 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3341 if (BE (mbc_tree == NULL, 0)) 3342 goto parse_bracket_exp_espace; 3343 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3344 if (sbcset[sbc_idx]) 3345 break; 3346 /* If there are no bits set in sbcset, there is no point 3347 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3348 if (sbc_idx < BITSET_WORDS) 3349 { 3350 /* Build a tree for simple bracket. */ 3351 br_token.type = SIMPLE_BRACKET; 3352 br_token.opr.sbcset = sbcset; 3353 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3354 if (BE (work_tree == NULL, 0)) 3355 goto parse_bracket_exp_espace; 3356 3357 /* Then join them by ALT node. */ 3358 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3359 if (BE (work_tree == NULL, 0)) 3360 goto parse_bracket_exp_espace; 3361 } 3362 else 3363 { 3364 re_free (sbcset); 3365 work_tree = mbc_tree; 3366 } 3367 } 3368 else 3369 #endif /* not RE_ENABLE_I18N */ 3370 { 3371 #ifdef RE_ENABLE_I18N 3372 free_charset (mbcset); 3373 #endif 3374 /* Build a tree for simple bracket. */ 3375 br_token.type = SIMPLE_BRACKET; 3376 br_token.opr.sbcset = sbcset; 3377 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3378 if (BE (work_tree == NULL, 0)) 3379 goto parse_bracket_exp_espace; 3380 } 3381 return work_tree; 3382 3383 parse_bracket_exp_espace: 3384 *err = REG_ESPACE; 3385 parse_bracket_exp_free_return: 3386 re_free (sbcset); 3387 #ifdef RE_ENABLE_I18N 3388 free_charset (mbcset); 3389 #endif /* RE_ENABLE_I18N */ 3390 return NULL; 3391 } 3392 3393 /* Parse an element in the bracket expression. */ 3394 3395 static reg_errcode_t 3396 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3397 re_token_t *token, int token_len, 3398 re_dfa_t *dfa _UNUSED_PARAMETER_, 3399 reg_syntax_t syntax, bool accept_hyphen) 3400 { 3401 #ifdef RE_ENABLE_I18N 3402 int cur_char_size; 3403 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3404 if (cur_char_size > 1) 3405 { 3406 elem->type = MB_CHAR; 3407 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3408 re_string_skip_bytes (regexp, cur_char_size); 3409 return REG_NOERROR; 3410 } 3411 #endif /* RE_ENABLE_I18N */ 3412 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3413 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3414 || token->type == OP_OPEN_EQUIV_CLASS) 3415 return parse_bracket_symbol (elem, regexp, token); 3416 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3417 { 3418 /* A '-' must only appear as anything but a range indicator before 3419 the closing bracket. Everything else is an error. */ 3420 re_token_t token2; 3421 (void) peek_token_bracket (&token2, regexp, syntax); 3422 if (token2.type != OP_CLOSE_BRACKET) 3423 /* The actual error value is not standardized since this whole 3424 case is undefined. But ERANGE makes good sense. */ 3425 return REG_ERANGE; 3426 } 3427 elem->type = SB_CHAR; 3428 elem->opr.ch = token->opr.c; 3429 return REG_NOERROR; 3430 } 3431 3432 /* Parse a bracket symbol in the bracket expression. Bracket symbols are 3433 such as [:<character_class>:], [.<collating_element>.], and 3434 [=<equivalent_class>=]. */ 3435 3436 static reg_errcode_t 3437 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3438 re_token_t *token) 3439 { 3440 unsigned char ch, delim = token->opr.c; 3441 int i = 0; 3442 if (re_string_eoi(regexp)) 3443 return REG_EBRACK; 3444 for (;; ++i) 3445 { 3446 if (i >= BRACKET_NAME_BUF_SIZE) 3447 return REG_EBRACK; 3448 if (token->type == OP_OPEN_CHAR_CLASS) 3449 ch = re_string_fetch_byte_case (regexp); 3450 else 3451 ch = re_string_fetch_byte (regexp); 3452 if (re_string_eoi(regexp)) 3453 return REG_EBRACK; 3454 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3455 break; 3456 elem->opr.name[i] = ch; 3457 } 3458 re_string_skip_bytes (regexp, 1); 3459 elem->opr.name[i] = '\0'; 3460 switch (token->type) 3461 { 3462 case OP_OPEN_COLL_ELEM: 3463 elem->type = COLL_SYM; 3464 break; 3465 case OP_OPEN_EQUIV_CLASS: 3466 elem->type = EQUIV_CLASS; 3467 break; 3468 case OP_OPEN_CHAR_CLASS: 3469 elem->type = CHAR_CLASS; 3470 break; 3471 default: 3472 break; 3473 } 3474 return REG_NOERROR; 3475 } 3476 3477 /* Helper function for parse_bracket_exp. 3478 Build the equivalence class which is represented by NAME. 3479 The result are written to MBCSET and SBCSET. 3480 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3481 is a pointer argument since we may update it. */ 3482 3483 static reg_errcode_t 3484 #ifdef RE_ENABLE_I18N 3485 build_equiv_class (bitset_t sbcset, 3486 re_charset_t *mbcset _UNUSED_PARAMETER_, 3487 Idx *equiv_class_alloc _UNUSED_PARAMETER_, 3488 const unsigned char *name) 3489 #else /* not RE_ENABLE_I18N */ 3490 build_equiv_class (bitset_t sbcset, const unsigned char *name) 3491 #endif /* not RE_ENABLE_I18N */ 3492 { 3493 #ifdef _LIBC 3494 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3495 if (nrules != 0) 3496 { 3497 const int32_t *table, *indirect; 3498 const unsigned char *weights, *extra, *cp; 3499 unsigned char char_buf[2]; 3500 int32_t idx1, idx2; 3501 unsigned int ch; 3502 size_t len; 3503 /* This #include defines a local function! */ 3504 # include <locale/weight.h> 3505 /* Calculate the index for equivalence class. */ 3506 cp = name; 3507 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3508 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3509 _NL_COLLATE_WEIGHTMB); 3510 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3511 _NL_COLLATE_EXTRAMB); 3512 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3513 _NL_COLLATE_INDIRECTMB); 3514 idx1 = findidx (&cp, -1); 3515 if (BE (idx1 == 0 || *cp != '\0', 0)) 3516 /* This isn't a valid character. */ 3517 return REG_ECOLLATE; 3518 3519 /* Build single byte matching table for this equivalence class. */ 3520 len = weights[idx1 & 0xffffff]; 3521 for (ch = 0; ch < SBC_MAX; ++ch) 3522 { 3523 char_buf[0] = ch; 3524 cp = char_buf; 3525 idx2 = findidx (&cp, 1); 3526 /* 3527 idx2 = table[ch]; 3528 */ 3529 if (idx2 == 0) 3530 /* This isn't a valid character. */ 3531 continue; 3532 /* Compare only if the length matches and the collation rule 3533 index is the same. */ 3534 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) 3535 { 3536 int cnt = 0; 3537 3538 while (cnt <= len && 3539 weights[(idx1 & 0xffffff) + 1 + cnt] 3540 == weights[(idx2 & 0xffffff) + 1 + cnt]) 3541 ++cnt; 3542 3543 if (cnt > len) 3544 bitset_set (sbcset, ch); 3545 } 3546 } 3547 /* Check whether the array has enough space. */ 3548 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3549 { 3550 /* Not enough, realloc it. */ 3551 /* +1 in case of mbcset->nequiv_classes is 0. */ 3552 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3553 /* Use realloc since the array is NULL if *alloc == 0. */ 3554 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3555 int32_t, 3556 new_equiv_class_alloc); 3557 if (BE (new_equiv_classes == NULL, 0)) 3558 return REG_ESPACE; 3559 mbcset->equiv_classes = new_equiv_classes; 3560 *equiv_class_alloc = new_equiv_class_alloc; 3561 } 3562 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3563 } 3564 else 3565 #endif /* _LIBC */ 3566 { 3567 if (BE (strlen ((const char *) name) != 1, 0)) 3568 return REG_ECOLLATE; 3569 bitset_set (sbcset, *name); 3570 } 3571 return REG_NOERROR; 3572 } 3573 3574 /* Helper function for parse_bracket_exp. 3575 Build the character class which is represented by NAME. 3576 The result are written to MBCSET and SBCSET. 3577 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3578 is a pointer argument since we may update it. */ 3579 3580 static reg_errcode_t 3581 #ifdef RE_ENABLE_I18N 3582 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3583 re_charset_t *mbcset, Idx *char_class_alloc, 3584 const unsigned char *class_name, reg_syntax_t syntax) 3585 #else /* not RE_ENABLE_I18N */ 3586 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3587 const unsigned char *class_name, reg_syntax_t syntax) 3588 #endif /* not RE_ENABLE_I18N */ 3589 { 3590 int i; 3591 const char *name = (const char *) class_name; 3592 3593 /* In case of REG_ICASE "upper" and "lower" match the both of 3594 upper and lower cases. */ 3595 if ((syntax & RE_ICASE) 3596 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3597 name = "alpha"; 3598 3599 #ifdef RE_ENABLE_I18N 3600 /* Check the space of the arrays. */ 3601 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3602 { 3603 /* Not enough, realloc it. */ 3604 /* +1 in case of mbcset->nchar_classes is 0. */ 3605 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3606 /* Use realloc since array is NULL if *alloc == 0. */ 3607 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3608 new_char_class_alloc); 3609 if (BE (new_char_classes == NULL, 0)) 3610 return REG_ESPACE; 3611 mbcset->char_classes = new_char_classes; 3612 *char_class_alloc = new_char_class_alloc; 3613 } 3614 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3615 #endif /* RE_ENABLE_I18N */ 3616 3617 #define BUILD_CHARCLASS_LOOP(ctype_func) \ 3618 do { \ 3619 if (BE (trans != NULL, 0)) \ 3620 { \ 3621 for (i = 0; i < SBC_MAX; ++i) \ 3622 if (ctype_func (i)) \ 3623 bitset_set (sbcset, trans[i]); \ 3624 } \ 3625 else \ 3626 { \ 3627 for (i = 0; i < SBC_MAX; ++i) \ 3628 if (ctype_func (i)) \ 3629 bitset_set (sbcset, i); \ 3630 } \ 3631 } while (0) 3632 3633 if (strcmp (name, "alnum") == 0) 3634 BUILD_CHARCLASS_LOOP (isalnum); 3635 else if (strcmp (name, "cntrl") == 0) 3636 BUILD_CHARCLASS_LOOP (iscntrl); 3637 else if (strcmp (name, "lower") == 0) 3638 BUILD_CHARCLASS_LOOP (islower); 3639 else if (strcmp (name, "space") == 0) 3640 BUILD_CHARCLASS_LOOP (isspace); 3641 else if (strcmp (name, "alpha") == 0) 3642 BUILD_CHARCLASS_LOOP (isalpha); 3643 else if (strcmp (name, "digit") == 0) 3644 BUILD_CHARCLASS_LOOP (isdigit); 3645 else if (strcmp (name, "print") == 0) 3646 BUILD_CHARCLASS_LOOP (isprint); 3647 else if (strcmp (name, "upper") == 0) 3648 BUILD_CHARCLASS_LOOP (isupper); 3649 else if (strcmp (name, "blank") == 0) 3650 BUILD_CHARCLASS_LOOP (isblank); 3651 else if (strcmp (name, "graph") == 0) 3652 BUILD_CHARCLASS_LOOP (isgraph); 3653 else if (strcmp (name, "punct") == 0) 3654 BUILD_CHARCLASS_LOOP (ispunct); 3655 else if (strcmp (name, "xdigit") == 0) 3656 BUILD_CHARCLASS_LOOP (isxdigit); 3657 else 3658 return REG_ECTYPE; 3659 3660 return REG_NOERROR; 3661 } 3662 3663 static bin_tree_t * 3664 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3665 const unsigned char *class_name, 3666 const unsigned char *extra, bool non_match, 3667 reg_errcode_t *err) 3668 { 3669 re_bitset_ptr_t sbcset; 3670 #ifdef RE_ENABLE_I18N 3671 re_charset_t *mbcset; 3672 Idx alloc = 0; 3673 #endif /* not RE_ENABLE_I18N */ 3674 reg_errcode_t ret; 3675 re_token_t br_token; 3676 bin_tree_t *tree; 3677 3678 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3679 #ifdef RE_ENABLE_I18N 3680 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3681 #endif /* RE_ENABLE_I18N */ 3682 3683 #ifdef RE_ENABLE_I18N 3684 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3685 #else /* not RE_ENABLE_I18N */ 3686 if (BE (sbcset == NULL, 0)) 3687 #endif /* not RE_ENABLE_I18N */ 3688 { 3689 *err = REG_ESPACE; 3690 return NULL; 3691 } 3692 3693 if (non_match) 3694 { 3695 #ifdef RE_ENABLE_I18N 3696 mbcset->non_match = 1; 3697 #endif /* not RE_ENABLE_I18N */ 3698 } 3699 3700 /* We don't care the syntax in this case. */ 3701 ret = build_charclass (trans, sbcset, 3702 #ifdef RE_ENABLE_I18N 3703 mbcset, &alloc, 3704 #endif /* RE_ENABLE_I18N */ 3705 class_name, 0); 3706 3707 if (BE (ret != REG_NOERROR, 0)) 3708 { 3709 re_free (sbcset); 3710 #ifdef RE_ENABLE_I18N 3711 free_charset (mbcset); 3712 #endif /* RE_ENABLE_I18N */ 3713 *err = ret; 3714 return NULL; 3715 } 3716 /* \w match '_' also. */ 3717 for (; *extra; extra++) 3718 bitset_set (sbcset, *extra); 3719 3720 /* If it is non-matching list. */ 3721 if (non_match) 3722 bitset_not (sbcset); 3723 3724 #ifdef RE_ENABLE_I18N 3725 /* Ensure only single byte characters are set. */ 3726 if (dfa->mb_cur_max > 1) 3727 bitset_mask (sbcset, dfa->sb_char); 3728 #endif 3729 3730 /* Build a tree for simple bracket. */ 3731 br_token.type = SIMPLE_BRACKET; 3732 br_token.opr.sbcset = sbcset; 3733 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3734 if (BE (tree == NULL, 0)) 3735 goto build_word_op_espace; 3736 3737 #ifdef RE_ENABLE_I18N 3738 if (dfa->mb_cur_max > 1) 3739 { 3740 bin_tree_t *mbc_tree; 3741 /* Build a tree for complex bracket. */ 3742 br_token.type = COMPLEX_BRACKET; 3743 br_token.opr.mbcset = mbcset; 3744 dfa->has_mb_node = 1; 3745 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3746 if (BE (mbc_tree == NULL, 0)) 3747 goto build_word_op_espace; 3748 /* Then join them by ALT node. */ 3749 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3750 if (BE (mbc_tree != NULL, 1)) 3751 return tree; 3752 } 3753 else 3754 { 3755 free_charset (mbcset); 3756 return tree; 3757 } 3758 #else /* not RE_ENABLE_I18N */ 3759 return tree; 3760 #endif /* not RE_ENABLE_I18N */ 3761 3762 build_word_op_espace: 3763 re_free (sbcset); 3764 #ifdef RE_ENABLE_I18N 3765 free_charset (mbcset); 3766 #endif /* RE_ENABLE_I18N */ 3767 *err = REG_ESPACE; 3768 return NULL; 3769 } 3770 3771 /* This is intended for the expressions like "a{1,3}". 3772 Fetch a number from 'input', and return the number. 3773 Return REG_MISSING if the number field is empty like "{,1}". 3774 Return RE_DUP_MAX + 1 if the number field is too large. 3775 Return REG_ERROR if an error occurred. */ 3776 3777 static Idx 3778 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3779 { 3780 Idx num = REG_MISSING; 3781 unsigned char c; 3782 while (1) 3783 { 3784 fetch_token (token, input, syntax); 3785 c = token->opr.c; 3786 if (BE (token->type == END_OF_RE, 0)) 3787 return REG_ERROR; 3788 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3789 break; 3790 num = ((token->type != CHARACTER || c < '0' || '9' < c 3791 || num == REG_ERROR) 3792 ? REG_ERROR 3793 : num == REG_MISSING 3794 ? c - '0' 3795 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0')); 3796 } 3797 return num; 3798 } 3799 3800 #ifdef RE_ENABLE_I18N 3801 static void 3802 free_charset (re_charset_t *cset) 3803 { 3804 re_free (cset->mbchars); 3805 # ifdef _LIBC 3806 re_free (cset->coll_syms); 3807 re_free (cset->equiv_classes); 3808 re_free (cset->range_starts); 3809 re_free (cset->range_ends); 3810 # endif 3811 re_free (cset->char_classes); 3812 re_free (cset); 3813 } 3814 #endif /* RE_ENABLE_I18N */ 3815 3816 /* Functions for binary tree operation. */ 3817 3818 /* Create a tree node. */ 3819 3820 static bin_tree_t * 3821 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3822 re_token_type_t type) 3823 { 3824 re_token_t t; 3825 t.type = type; 3826 return create_token_tree (dfa, left, right, &t); 3827 } 3828 3829 static bin_tree_t * 3830 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3831 const re_token_t *token) 3832 { 3833 bin_tree_t *tree; 3834 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3835 { 3836 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3837 3838 if (storage == NULL) 3839 return NULL; 3840 storage->next = dfa->str_tree_storage; 3841 dfa->str_tree_storage = storage; 3842 dfa->str_tree_storage_idx = 0; 3843 } 3844 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3845 3846 tree->parent = NULL; 3847 tree->left = left; 3848 tree->right = right; 3849 tree->token = *token; 3850 tree->token.duplicated = 0; 3851 tree->token.opt_subexp = 0; 3852 tree->first = NULL; 3853 tree->next = NULL; 3854 tree->node_idx = REG_MISSING; 3855 3856 if (left != NULL) 3857 left->parent = tree; 3858 if (right != NULL) 3859 right->parent = tree; 3860 return tree; 3861 } 3862 3863 /* Mark the tree SRC as an optional subexpression. 3864 To be called from preorder or postorder. */ 3865 3866 static reg_errcode_t 3867 mark_opt_subexp (void *extra, bin_tree_t *node) 3868 { 3869 Idx idx = (uintptr_t) extra; 3870 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3871 node->token.opt_subexp = 1; 3872 3873 return REG_NOERROR; 3874 } 3875 3876 /* Free the allocated memory inside NODE. */ 3877 3878 static void 3879 free_token (re_token_t *node) 3880 { 3881 #ifdef RE_ENABLE_I18N 3882 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3883 free_charset (node->opr.mbcset); 3884 else 3885 #endif /* RE_ENABLE_I18N */ 3886 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3887 re_free (node->opr.sbcset); 3888 } 3889 3890 /* Worker function for tree walking. Free the allocated memory inside NODE 3891 and its children. */ 3892 3893 static reg_errcode_t 3894 free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node) 3895 { 3896 free_token (&node->token); 3897 return REG_NOERROR; 3898 } 3899 3900 3901 /* Duplicate the node SRC, and return new node. This is a preorder 3902 visit similar to the one implemented by the generic visitor, but 3903 we need more infrastructure to maintain two parallel trees --- so, 3904 it's easier to duplicate. */ 3905 3906 static bin_tree_t * 3907 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3908 { 3909 const bin_tree_t *node; 3910 bin_tree_t *dup_root; 3911 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3912 3913 for (node = root; ; ) 3914 { 3915 /* Create a new tree and link it back to the current parent. */ 3916 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3917 if (*p_new == NULL) 3918 return NULL; 3919 (*p_new)->parent = dup_node; 3920 (*p_new)->token.duplicated = 1; 3921 dup_node = *p_new; 3922 3923 /* Go to the left node, or up and to the right. */ 3924 if (node->left) 3925 { 3926 node = node->left; 3927 p_new = &dup_node->left; 3928 } 3929 else 3930 { 3931 const bin_tree_t *prev = NULL; 3932 while (node->right == prev || node->right == NULL) 3933 { 3934 prev = node; 3935 node = node->parent; 3936 dup_node = dup_node->parent; 3937 if (!node) 3938 return dup_root; 3939 } 3940 node = node->right; 3941 p_new = &dup_node->right; 3942 } 3943 } 3944 } 3945