1 /* -*- buffer-read-only: t -*- vi: set ro: */ 2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3 /* Extended regular expression matching and search library. 4 Copyright (C) 2002-2011 Free Software Foundation, Inc. 5 This file is part of the GNU C Library. 6 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 7 8 This program is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3, or (at your option) 11 any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License along 19 with this program; if not, write to the Free Software Foundation, 20 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 21 22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 23 size_t length, reg_syntax_t syntax); 24 static void re_compile_fastmap_iter (regex_t *bufp, 25 const re_dfastate_t *init_state, 26 char *fastmap); 27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); 28 #ifdef RE_ENABLE_I18N 29 static void free_charset (re_charset_t *cset); 30 #endif /* RE_ENABLE_I18N */ 31 static void free_workarea_compile (regex_t *preg); 32 static reg_errcode_t create_initial_state (re_dfa_t *dfa); 33 #ifdef RE_ENABLE_I18N 34 static void optimize_utf8 (re_dfa_t *dfa); 35 #endif 36 static reg_errcode_t analyze (regex_t *preg); 37 static reg_errcode_t preorder (bin_tree_t *root, 38 reg_errcode_t (fn (void *, bin_tree_t *)), 39 void *extra); 40 static reg_errcode_t postorder (bin_tree_t *root, 41 reg_errcode_t (fn (void *, bin_tree_t *)), 42 void *extra); 43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 46 bin_tree_t *node); 47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 50 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint); 51 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 52 unsigned int constraint); 53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 55 Idx node, bool root); 56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 57 static Idx fetch_number (re_string_t *input, re_token_t *token, 58 reg_syntax_t syntax); 59 static int peek_token (re_token_t *token, re_string_t *input, 60 reg_syntax_t syntax) internal_function; 61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 62 reg_syntax_t syntax, reg_errcode_t *err); 63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 64 re_token_t *token, reg_syntax_t syntax, 65 Idx nest, reg_errcode_t *err); 66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 67 re_token_t *token, reg_syntax_t syntax, 68 Idx nest, reg_errcode_t *err); 69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 70 re_token_t *token, reg_syntax_t syntax, 71 Idx nest, reg_errcode_t *err); 72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 73 re_token_t *token, reg_syntax_t syntax, 74 Idx nest, reg_errcode_t *err); 75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 76 re_dfa_t *dfa, re_token_t *token, 77 reg_syntax_t syntax, reg_errcode_t *err); 78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 79 re_token_t *token, reg_syntax_t syntax, 80 reg_errcode_t *err); 81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 82 re_string_t *regexp, 83 re_token_t *token, int token_len, 84 re_dfa_t *dfa, 85 reg_syntax_t syntax, 86 bool accept_hyphen); 87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 88 re_string_t *regexp, 89 re_token_t *token); 90 #ifdef RE_ENABLE_I18N 91 static reg_errcode_t build_equiv_class (bitset_t sbcset, 92 re_charset_t *mbcset, 93 Idx *equiv_class_alloc, 94 const unsigned char *name); 95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 96 bitset_t sbcset, 97 re_charset_t *mbcset, 98 Idx *char_class_alloc, 99 const unsigned char *class_name, 100 reg_syntax_t syntax); 101 #else /* not RE_ENABLE_I18N */ 102 static reg_errcode_t build_equiv_class (bitset_t sbcset, 103 const unsigned char *name); 104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 105 bitset_t sbcset, 106 const unsigned char *class_name, 107 reg_syntax_t syntax); 108 #endif /* not RE_ENABLE_I18N */ 109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 110 RE_TRANSLATE_TYPE trans, 111 const unsigned char *class_name, 112 const unsigned char *extra, 113 bool non_match, reg_errcode_t *err); 114 static bin_tree_t *create_tree (re_dfa_t *dfa, 115 bin_tree_t *left, bin_tree_t *right, 116 re_token_type_t type); 117 static bin_tree_t *create_token_tree (re_dfa_t *dfa, 118 bin_tree_t *left, bin_tree_t *right, 119 const re_token_t *token); 120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 121 static void free_token (re_token_t *node); 122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 124 125 /* This table gives an error message for each of the error codes listed 126 in regex.h. Obviously the order here has to be same as there. 127 POSIX doesn't require that we do anything for REG_NOERROR, 128 but why not be nice? */ 129 130 static const char __re_error_msgid[] = 131 { 132 #define REG_NOERROR_IDX 0 133 gettext_noop ("Success") /* REG_NOERROR */ 134 "\0" 135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 136 gettext_noop ("No match") /* REG_NOMATCH */ 137 "\0" 138 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 140 "\0" 141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 143 "\0" 144 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 146 "\0" 147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 149 "\0" 150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 152 "\0" 153 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 155 "\0" 156 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 158 "\0" 159 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 161 "\0" 162 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 164 "\0" 165 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 166 gettext_noop ("Invalid range end") /* REG_ERANGE */ 167 "\0" 168 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 169 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 170 "\0" 171 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 173 "\0" 174 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 175 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 176 "\0" 177 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 178 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 179 "\0" 180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 182 }; 183 184 static const size_t __re_error_msgid_idx[] = 185 { 186 REG_NOERROR_IDX, 187 REG_NOMATCH_IDX, 188 REG_BADPAT_IDX, 189 REG_ECOLLATE_IDX, 190 REG_ECTYPE_IDX, 191 REG_EESCAPE_IDX, 192 REG_ESUBREG_IDX, 193 REG_EBRACK_IDX, 194 REG_EPAREN_IDX, 195 REG_EBRACE_IDX, 196 REG_BADBR_IDX, 197 REG_ERANGE_IDX, 198 REG_ESPACE_IDX, 199 REG_BADRPT_IDX, 200 REG_EEND_IDX, 201 REG_ESIZE_IDX, 202 REG_ERPAREN_IDX 203 }; 204 205 /* Entry points for GNU code. */ 206 207 /* re_compile_pattern is the GNU regular expression compiler: it 208 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 209 Returns 0 if the pattern was valid, otherwise an error string. 210 211 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 212 are set in BUFP on entry. */ 213 214 #ifdef _LIBC 215 const char * 216 re_compile_pattern (pattern, length, bufp) 217 const char *pattern; 218 size_t length; 219 struct re_pattern_buffer *bufp; 220 #else /* size_t might promote */ 221 const char * 222 re_compile_pattern (const char *pattern, size_t length, 223 struct re_pattern_buffer *bufp) 224 #endif 225 { 226 reg_errcode_t ret; 227 228 /* And GNU code determines whether or not to get register information 229 by passing null for the REGS argument to re_match, etc., not by 230 setting no_sub, unless RE_NO_SUB is set. */ 231 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 232 233 /* Match anchors at newline. */ 234 bufp->newline_anchor = 1; 235 236 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 237 238 if (!ret) 239 return NULL; 240 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 241 } 242 #ifdef _LIBC 243 weak_alias (__re_compile_pattern, re_compile_pattern) 244 #endif 245 246 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 247 also be assigned to arbitrarily: each pattern buffer stores its own 248 syntax, so it can be changed between regex compilations. */ 249 /* This has no initializer because initialized variables in Emacs 250 become read-only after dumping. */ 251 reg_syntax_t re_syntax_options; 252 253 254 /* Specify the precise syntax of regexps for compilation. This provides 255 for compatibility for various utilities which historically have 256 different, incompatible syntaxes. 257 258 The argument SYNTAX is a bit mask comprised of the various bits 259 defined in regex.h. We return the old syntax. */ 260 261 reg_syntax_t 262 re_set_syntax (syntax) 263 reg_syntax_t syntax; 264 { 265 reg_syntax_t ret = re_syntax_options; 266 267 re_syntax_options = syntax; 268 return ret; 269 } 270 #ifdef _LIBC 271 weak_alias (__re_set_syntax, re_set_syntax) 272 #endif 273 274 int 275 re_compile_fastmap (bufp) 276 struct re_pattern_buffer *bufp; 277 { 278 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 279 char *fastmap = bufp->fastmap; 280 281 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 282 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 283 if (dfa->init_state != dfa->init_state_word) 284 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 285 if (dfa->init_state != dfa->init_state_nl) 286 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 287 if (dfa->init_state != dfa->init_state_begbuf) 288 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 289 bufp->fastmap_accurate = 1; 290 return 0; 291 } 292 #ifdef _LIBC 293 weak_alias (__re_compile_fastmap, re_compile_fastmap) 294 #endif 295 296 static inline void 297 __attribute ((always_inline)) 298 re_set_fastmap (char *fastmap, bool icase, int ch) 299 { 300 fastmap[ch] = 1; 301 if (icase) 302 fastmap[tolower (ch)] = 1; 303 } 304 305 /* Helper function for re_compile_fastmap. 306 Compile fastmap for the initial_state INIT_STATE. */ 307 308 static void 309 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 310 char *fastmap) 311 { 312 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 313 Idx node_cnt; 314 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 315 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 316 { 317 Idx node = init_state->nodes.elems[node_cnt]; 318 re_token_type_t type = dfa->nodes[node].type; 319 320 if (type == CHARACTER) 321 { 322 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 323 #ifdef RE_ENABLE_I18N 324 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 325 { 326 unsigned char buf[MB_LEN_MAX]; 327 unsigned char *p; 328 wchar_t wc; 329 mbstate_t state; 330 331 p = buf; 332 *p++ = dfa->nodes[node].opr.c; 333 while (++node < dfa->nodes_len 334 && dfa->nodes[node].type == CHARACTER 335 && dfa->nodes[node].mb_partial) 336 *p++ = dfa->nodes[node].opr.c; 337 memset (&state, '\0', sizeof (state)); 338 if (__mbrtowc (&wc, (const char *) buf, p - buf, 339 &state) == p - buf 340 && (__wcrtomb ((char *) buf, towlower (wc), &state) 341 != (size_t) -1)) 342 re_set_fastmap (fastmap, false, buf[0]); 343 } 344 #endif 345 } 346 else if (type == SIMPLE_BRACKET) 347 { 348 int i, ch; 349 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 350 { 351 int j; 352 bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; 353 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 354 if (w & ((bitset_word_t) 1 << j)) 355 re_set_fastmap (fastmap, icase, ch); 356 } 357 } 358 #ifdef RE_ENABLE_I18N 359 else if (type == COMPLEX_BRACKET) 360 { 361 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 362 Idx i; 363 364 # ifdef _LIBC 365 /* See if we have to try all bytes which start multiple collation 366 elements. 367 e.g. In da_DK, we want to catch 'a' since "aa" is a valid 368 collation element, and don't catch 'b' since 'b' is 369 the only collation element which starts from 'b' (and 370 it is caught by SIMPLE_BRACKET). */ 371 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 372 && (cset->ncoll_syms || cset->nranges)) 373 { 374 const int32_t *table = (const int32_t *) 375 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 376 for (i = 0; i < SBC_MAX; ++i) 377 if (table[i] < 0) 378 re_set_fastmap (fastmap, icase, i); 379 } 380 # endif /* _LIBC */ 381 382 /* See if we have to start the match at all multibyte characters, 383 i.e. where we would not find an invalid sequence. This only 384 applies to multibyte character sets; for single byte character 385 sets, the SIMPLE_BRACKET again suffices. */ 386 if (dfa->mb_cur_max > 1 387 && (cset->nchar_classes || cset->non_match || cset->nranges 388 # ifdef _LIBC 389 || cset->nequiv_classes 390 # endif /* _LIBC */ 391 )) 392 { 393 unsigned char c = 0; 394 do 395 { 396 mbstate_t mbs; 397 memset (&mbs, 0, sizeof (mbs)); 398 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) 399 re_set_fastmap (fastmap, false, (int) c); 400 } 401 while (++c != 0); 402 } 403 404 else 405 { 406 /* ... Else catch all bytes which can start the mbchars. */ 407 for (i = 0; i < cset->nmbchars; ++i) 408 { 409 char buf[256]; 410 mbstate_t state; 411 memset (&state, '\0', sizeof (state)); 412 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 413 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 414 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 415 { 416 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 417 != (size_t) -1) 418 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 419 } 420 } 421 } 422 } 423 #endif /* RE_ENABLE_I18N */ 424 else if (type == OP_PERIOD 425 #ifdef RE_ENABLE_I18N 426 || type == OP_UTF8_PERIOD 427 #endif /* RE_ENABLE_I18N */ 428 || type == END_OF_RE) 429 { 430 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 431 if (type == END_OF_RE) 432 bufp->can_be_null = 1; 433 return; 434 } 435 } 436 } 437 438 /* Entry point for POSIX code. */ 439 /* regcomp takes a regular expression as a string and compiles it. 440 441 PREG is a regex_t *. We do not expect any fields to be initialized, 442 since POSIX says we shouldn't. Thus, we set 443 444 `buffer' to the compiled pattern; 445 `used' to the length of the compiled pattern; 446 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 447 REG_EXTENDED bit in CFLAGS is set; otherwise, to 448 RE_SYNTAX_POSIX_BASIC; 449 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 450 `fastmap' to an allocated space for the fastmap; 451 `fastmap_accurate' to zero; 452 `re_nsub' to the number of subexpressions in PATTERN. 453 454 PATTERN is the address of the pattern string. 455 456 CFLAGS is a series of bits which affect compilation. 457 458 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 459 use POSIX basic syntax. 460 461 If REG_NEWLINE is set, then . and [^...] don't match newline. 462 Also, regexec will try a match beginning after every newline. 463 464 If REG_ICASE is set, then we considers upper- and lowercase 465 versions of letters to be equivalent when matching. 466 467 If REG_NOSUB is set, then when PREG is passed to regexec, that 468 routine will report only success or failure, and nothing about the 469 registers. 470 471 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 472 the return codes and their meanings.) */ 473 474 int 475 regcomp (preg, pattern, cflags) 476 regex_t *_Restrict_ preg; 477 const char *_Restrict_ pattern; 478 int cflags; 479 { 480 reg_errcode_t ret; 481 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 482 : RE_SYNTAX_POSIX_BASIC); 483 484 preg->buffer = NULL; 485 preg->allocated = 0; 486 preg->used = 0; 487 488 /* Try to allocate space for the fastmap. */ 489 preg->fastmap = re_malloc (char, SBC_MAX); 490 if (BE (preg->fastmap == NULL, 0)) 491 return REG_ESPACE; 492 493 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 494 495 /* If REG_NEWLINE is set, newlines are treated differently. */ 496 if (cflags & REG_NEWLINE) 497 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 498 syntax &= ~RE_DOT_NEWLINE; 499 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 500 /* It also changes the matching behavior. */ 501 preg->newline_anchor = 1; 502 } 503 else 504 preg->newline_anchor = 0; 505 preg->no_sub = !!(cflags & REG_NOSUB); 506 preg->translate = NULL; 507 508 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 509 510 /* POSIX doesn't distinguish between an unmatched open-group and an 511 unmatched close-group: both are REG_EPAREN. */ 512 if (ret == REG_ERPAREN) 513 ret = REG_EPAREN; 514 515 /* We have already checked preg->fastmap != NULL. */ 516 if (BE (ret == REG_NOERROR, 1)) 517 /* Compute the fastmap now, since regexec cannot modify the pattern 518 buffer. This function never fails in this implementation. */ 519 (void) re_compile_fastmap (preg); 520 else 521 { 522 /* Some error occurred while compiling the expression. */ 523 re_free (preg->fastmap); 524 preg->fastmap = NULL; 525 } 526 527 return (int) ret; 528 } 529 #ifdef _LIBC 530 weak_alias (__regcomp, regcomp) 531 #endif 532 533 /* Returns a message corresponding to an error code, ERRCODE, returned 534 from either regcomp or regexec. We don't use PREG here. */ 535 536 #ifdef _LIBC 537 size_t 538 regerror (errcode, preg, errbuf, errbuf_size) 539 int errcode; 540 const regex_t *_Restrict_ preg; 541 char *_Restrict_ errbuf; 542 size_t errbuf_size; 543 #else /* size_t might promote */ 544 size_t 545 regerror (int errcode, const regex_t *_Restrict_ preg _UNUSED_PARAMETER_, 546 char *_Restrict_ errbuf, size_t errbuf_size) 547 #endif 548 { 549 const char *msg; 550 size_t msg_size; 551 552 if (BE (errcode < 0 553 || errcode >= (int) (sizeof (__re_error_msgid_idx) 554 / sizeof (__re_error_msgid_idx[0])), 0)) 555 /* Only error codes returned by the rest of the code should be passed 556 to this routine. If we are given anything else, or if other regex 557 code generates an invalid error code, then the program has a bug. 558 Dump core so we can fix it. */ 559 abort (); 560 561 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 562 563 msg_size = strlen (msg) + 1; /* Includes the null. */ 564 565 if (BE (errbuf_size != 0, 1)) 566 { 567 size_t cpy_size = msg_size; 568 if (BE (msg_size > errbuf_size, 0)) 569 { 570 cpy_size = errbuf_size - 1; 571 errbuf[cpy_size] = '\0'; 572 } 573 memcpy (errbuf, msg, cpy_size); 574 } 575 576 return msg_size; 577 } 578 #ifdef _LIBC 579 weak_alias (__regerror, regerror) 580 #endif 581 582 583 #ifdef RE_ENABLE_I18N 584 /* This static array is used for the map to single-byte characters when 585 UTF-8 is used. Otherwise we would allocate memory just to initialize 586 it the same all the time. UTF-8 is the preferred encoding so this is 587 a worthwhile optimization. */ 588 static const bitset_t utf8_sb_map = 589 { 590 /* Set the first 128 bits. */ 591 # if 4 * BITSET_WORD_BITS < ASCII_CHARS 592 # error "bitset_word_t is narrower than 32 bits" 593 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS 594 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, 595 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS 596 BITSET_WORD_MAX, BITSET_WORD_MAX, 597 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS 598 BITSET_WORD_MAX, 599 # endif 600 (BITSET_WORD_MAX 601 >> (SBC_MAX % BITSET_WORD_BITS == 0 602 ? 0 603 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) 604 }; 605 #endif 606 607 608 static void 609 free_dfa_content (re_dfa_t *dfa) 610 { 611 Idx i, j; 612 613 if (dfa->nodes) 614 for (i = 0; i < dfa->nodes_len; ++i) 615 free_token (dfa->nodes + i); 616 re_free (dfa->nexts); 617 for (i = 0; i < dfa->nodes_len; ++i) 618 { 619 if (dfa->eclosures != NULL) 620 re_node_set_free (dfa->eclosures + i); 621 if (dfa->inveclosures != NULL) 622 re_node_set_free (dfa->inveclosures + i); 623 if (dfa->edests != NULL) 624 re_node_set_free (dfa->edests + i); 625 } 626 re_free (dfa->edests); 627 re_free (dfa->eclosures); 628 re_free (dfa->inveclosures); 629 re_free (dfa->nodes); 630 631 if (dfa->state_table) 632 for (i = 0; i <= dfa->state_hash_mask; ++i) 633 { 634 struct re_state_table_entry *entry = dfa->state_table + i; 635 for (j = 0; j < entry->num; ++j) 636 { 637 re_dfastate_t *state = entry->array[j]; 638 free_state (state); 639 } 640 re_free (entry->array); 641 } 642 re_free (dfa->state_table); 643 #ifdef RE_ENABLE_I18N 644 if (dfa->sb_char != utf8_sb_map) 645 re_free (dfa->sb_char); 646 #endif 647 re_free (dfa->subexp_map); 648 #ifdef DEBUG 649 re_free (dfa->re_str); 650 #endif 651 652 re_free (dfa); 653 } 654 655 656 /* Free dynamically allocated space used by PREG. */ 657 658 void 659 regfree (preg) 660 regex_t *preg; 661 { 662 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 663 if (BE (dfa != NULL, 1)) 664 free_dfa_content (dfa); 665 preg->buffer = NULL; 666 preg->allocated = 0; 667 668 re_free (preg->fastmap); 669 preg->fastmap = NULL; 670 671 re_free (preg->translate); 672 preg->translate = NULL; 673 } 674 #ifdef _LIBC 675 weak_alias (__regfree, regfree) 676 #endif 677 678 /* Entry points compatible with 4.2 BSD regex library. We don't define 679 them unless specifically requested. */ 680 681 #if defined _REGEX_RE_COMP || defined _LIBC 682 683 /* BSD has one and only one pattern buffer. */ 684 static struct re_pattern_buffer re_comp_buf; 685 686 char * 687 # ifdef _LIBC 688 /* Make these definitions weak in libc, so POSIX programs can redefine 689 these names if they don't use our functions, and still use 690 regcomp/regexec above without link errors. */ 691 weak_function 692 # endif 693 re_comp (s) 694 const char *s; 695 { 696 reg_errcode_t ret; 697 char *fastmap; 698 699 if (!s) 700 { 701 if (!re_comp_buf.buffer) 702 return gettext ("No previous regular expression"); 703 return 0; 704 } 705 706 if (re_comp_buf.buffer) 707 { 708 fastmap = re_comp_buf.fastmap; 709 re_comp_buf.fastmap = NULL; 710 __regfree (&re_comp_buf); 711 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 712 re_comp_buf.fastmap = fastmap; 713 } 714 715 if (re_comp_buf.fastmap == NULL) 716 { 717 re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 718 if (re_comp_buf.fastmap == NULL) 719 return (char *) gettext (__re_error_msgid 720 + __re_error_msgid_idx[(int) REG_ESPACE]); 721 } 722 723 /* Since `re_exec' always passes NULL for the `regs' argument, we 724 don't need to initialize the pattern buffer fields which affect it. */ 725 726 /* Match anchors at newlines. */ 727 re_comp_buf.newline_anchor = 1; 728 729 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 730 731 if (!ret) 732 return NULL; 733 734 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 735 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 736 } 737 738 #ifdef _LIBC 739 libc_freeres_fn (free_mem) 740 { 741 __regfree (&re_comp_buf); 742 } 743 #endif 744 745 #endif /* _REGEX_RE_COMP */ 746 747 /* Internal entry point. 748 Compile the regular expression PATTERN, whose length is LENGTH. 749 SYNTAX indicate regular expression's syntax. */ 750 751 static reg_errcode_t 752 re_compile_internal (regex_t *preg, const char * pattern, size_t length, 753 reg_syntax_t syntax) 754 { 755 reg_errcode_t err = REG_NOERROR; 756 re_dfa_t *dfa; 757 re_string_t regexp; 758 759 /* Initialize the pattern buffer. */ 760 preg->fastmap_accurate = 0; 761 preg->syntax = syntax; 762 preg->not_bol = preg->not_eol = 0; 763 preg->used = 0; 764 preg->re_nsub = 0; 765 preg->can_be_null = 0; 766 preg->regs_allocated = REGS_UNALLOCATED; 767 768 /* Initialize the dfa. */ 769 dfa = (re_dfa_t *) preg->buffer; 770 if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 771 { 772 /* If zero allocated, but buffer is non-null, try to realloc 773 enough space. This loses if buffer's address is bogus, but 774 that is the user's responsibility. If ->buffer is NULL this 775 is a simple allocation. */ 776 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 777 if (dfa == NULL) 778 return REG_ESPACE; 779 preg->allocated = sizeof (re_dfa_t); 780 preg->buffer = (unsigned char *) dfa; 781 } 782 preg->used = sizeof (re_dfa_t); 783 784 err = init_dfa (dfa, length); 785 if (BE (err != REG_NOERROR, 0)) 786 { 787 free_dfa_content (dfa); 788 preg->buffer = NULL; 789 preg->allocated = 0; 790 return err; 791 } 792 #ifdef DEBUG 793 /* Note: length+1 will not overflow since it is checked in init_dfa. */ 794 dfa->re_str = re_malloc (char, length + 1); 795 strncpy (dfa->re_str, pattern, length + 1); 796 #endif 797 798 __libc_lock_init (dfa->lock); 799 800 err = re_string_construct (®exp, pattern, length, preg->translate, 801 (syntax & RE_ICASE) != 0, dfa); 802 if (BE (err != REG_NOERROR, 0)) 803 { 804 re_compile_internal_free_return: 805 free_workarea_compile (preg); 806 re_string_destruct (®exp); 807 free_dfa_content (dfa); 808 preg->buffer = NULL; 809 preg->allocated = 0; 810 return err; 811 } 812 813 /* Parse the regular expression, and build a structure tree. */ 814 preg->re_nsub = 0; 815 dfa->str_tree = parse (®exp, preg, syntax, &err); 816 if (BE (dfa->str_tree == NULL, 0)) 817 goto re_compile_internal_free_return; 818 819 /* Analyze the tree and create the nfa. */ 820 err = analyze (preg); 821 if (BE (err != REG_NOERROR, 0)) 822 goto re_compile_internal_free_return; 823 824 #ifdef RE_ENABLE_I18N 825 /* If possible, do searching in single byte encoding to speed things up. */ 826 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 827 optimize_utf8 (dfa); 828 #endif 829 830 /* Then create the initial state of the dfa. */ 831 err = create_initial_state (dfa); 832 833 /* Release work areas. */ 834 free_workarea_compile (preg); 835 re_string_destruct (®exp); 836 837 if (BE (err != REG_NOERROR, 0)) 838 { 839 free_dfa_content (dfa); 840 preg->buffer = NULL; 841 preg->allocated = 0; 842 } 843 844 return err; 845 } 846 847 /* Initialize DFA. We use the length of the regular expression PAT_LEN 848 as the initial length of some arrays. */ 849 850 static reg_errcode_t 851 init_dfa (re_dfa_t *dfa, size_t pat_len) 852 { 853 __re_size_t table_size; 854 #ifndef _LIBC 855 char *codeset_name; 856 #endif 857 #ifdef RE_ENABLE_I18N 858 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); 859 #else 860 size_t max_i18n_object_size = 0; 861 #endif 862 size_t max_object_size = 863 MAX (sizeof (struct re_state_table_entry), 864 MAX (sizeof (re_token_t), 865 MAX (sizeof (re_node_set), 866 MAX (sizeof (regmatch_t), 867 max_i18n_object_size)))); 868 869 memset (dfa, '\0', sizeof (re_dfa_t)); 870 871 /* Force allocation of str_tree_storage the first time. */ 872 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 873 874 /* Avoid overflows. The extra "/ 2" is for the table_size doubling 875 calculation below, and for similar doubling calculations 876 elsewhere. And it's <= rather than <, because some of the 877 doubling calculations add 1 afterwards. */ 878 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0)) 879 return REG_ESPACE; 880 881 dfa->nodes_alloc = pat_len + 1; 882 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 883 884 /* table_size = 2 ^ ceil(log pat_len) */ 885 for (table_size = 1; ; table_size <<= 1) 886 if (table_size > pat_len) 887 break; 888 889 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 890 dfa->state_hash_mask = table_size - 1; 891 892 dfa->mb_cur_max = MB_CUR_MAX; 893 #ifdef _LIBC 894 if (dfa->mb_cur_max == 6 895 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 896 dfa->is_utf8 = 1; 897 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 898 != 0); 899 #else 900 codeset_name = nl_langinfo (CODESET); 901 if (strcasecmp (codeset_name, "UTF-8") == 0 902 || strcasecmp (codeset_name, "UTF8") == 0) 903 dfa->is_utf8 = 1; 904 905 /* We check exhaustively in the loop below if this charset is a 906 superset of ASCII. */ 907 dfa->map_notascii = 0; 908 #endif 909 910 #ifdef RE_ENABLE_I18N 911 if (dfa->mb_cur_max > 1) 912 { 913 if (dfa->is_utf8) 914 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 915 else 916 { 917 int i, j, ch; 918 919 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 920 if (BE (dfa->sb_char == NULL, 0)) 921 return REG_ESPACE; 922 923 /* Set the bits corresponding to single byte chars. */ 924 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 925 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 926 { 927 wint_t wch = __btowc (ch); 928 if (wch != WEOF) 929 dfa->sb_char[i] |= (bitset_word_t) 1 << j; 930 # ifndef _LIBC 931 if (isascii (ch) && wch != ch) 932 dfa->map_notascii = 1; 933 # endif 934 } 935 } 936 } 937 #endif 938 939 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 940 return REG_ESPACE; 941 return REG_NOERROR; 942 } 943 944 /* Initialize WORD_CHAR table, which indicate which character is 945 "word". In this case "word" means that it is the word construction 946 character used by some operators like "\<", "\>", etc. */ 947 948 static void 949 internal_function 950 init_word_char (re_dfa_t *dfa) 951 { 952 int i, j, ch; 953 dfa->word_ops_used = 1; 954 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 955 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 956 if (isalnum (ch) || ch == '_') 957 dfa->word_char[i] |= (bitset_word_t) 1 << j; 958 } 959 960 /* Free the work area which are only used while compiling. */ 961 962 static void 963 free_workarea_compile (regex_t *preg) 964 { 965 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 966 bin_tree_storage_t *storage, *next; 967 for (storage = dfa->str_tree_storage; storage; storage = next) 968 { 969 next = storage->next; 970 re_free (storage); 971 } 972 dfa->str_tree_storage = NULL; 973 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 974 dfa->str_tree = NULL; 975 re_free (dfa->org_indices); 976 dfa->org_indices = NULL; 977 } 978 979 /* Create initial states for all contexts. */ 980 981 static reg_errcode_t 982 create_initial_state (re_dfa_t *dfa) 983 { 984 Idx first, i; 985 reg_errcode_t err; 986 re_node_set init_nodes; 987 988 /* Initial states have the epsilon closure of the node which is 989 the first node of the regular expression. */ 990 first = dfa->str_tree->first->node_idx; 991 dfa->init_node = first; 992 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 993 if (BE (err != REG_NOERROR, 0)) 994 return err; 995 996 /* The back-references which are in initial states can epsilon transit, 997 since in this case all of the subexpressions can be null. 998 Then we add epsilon closures of the nodes which are the next nodes of 999 the back-references. */ 1000 if (dfa->nbackref > 0) 1001 for (i = 0; i < init_nodes.nelem; ++i) 1002 { 1003 Idx node_idx = init_nodes.elems[i]; 1004 re_token_type_t type = dfa->nodes[node_idx].type; 1005 1006 Idx clexp_idx; 1007 if (type != OP_BACK_REF) 1008 continue; 1009 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1010 { 1011 re_token_t *clexp_node; 1012 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1013 if (clexp_node->type == OP_CLOSE_SUBEXP 1014 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1015 break; 1016 } 1017 if (clexp_idx == init_nodes.nelem) 1018 continue; 1019 1020 if (type == OP_BACK_REF) 1021 { 1022 Idx dest_idx = dfa->edests[node_idx].elems[0]; 1023 if (!re_node_set_contains (&init_nodes, dest_idx)) 1024 { 1025 reg_errcode_t merge_err 1026 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 1027 if (merge_err != REG_NOERROR) 1028 return merge_err; 1029 i = 0; 1030 } 1031 } 1032 } 1033 1034 /* It must be the first time to invoke acquire_state. */ 1035 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1036 /* We don't check ERR here, since the initial state must not be NULL. */ 1037 if (BE (dfa->init_state == NULL, 0)) 1038 return err; 1039 if (dfa->init_state->has_constraint) 1040 { 1041 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1042 CONTEXT_WORD); 1043 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1044 CONTEXT_NEWLINE); 1045 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1046 &init_nodes, 1047 CONTEXT_NEWLINE 1048 | CONTEXT_BEGBUF); 1049 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1050 || dfa->init_state_begbuf == NULL, 0)) 1051 return err; 1052 } 1053 else 1054 dfa->init_state_word = dfa->init_state_nl 1055 = dfa->init_state_begbuf = dfa->init_state; 1056 1057 re_node_set_free (&init_nodes); 1058 return REG_NOERROR; 1059 } 1060 1061 #ifdef RE_ENABLE_I18N 1062 /* If it is possible to do searching in single byte encoding instead of UTF-8 1063 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1064 DFA nodes where needed. */ 1065 1066 static void 1067 optimize_utf8 (re_dfa_t *dfa) 1068 { 1069 Idx node; 1070 int i; 1071 bool mb_chars = false; 1072 bool has_period = false; 1073 1074 for (node = 0; node < dfa->nodes_len; ++node) 1075 switch (dfa->nodes[node].type) 1076 { 1077 case CHARACTER: 1078 if (dfa->nodes[node].opr.c >= ASCII_CHARS) 1079 mb_chars = true; 1080 break; 1081 case ANCHOR: 1082 switch (dfa->nodes[node].opr.ctx_type) 1083 { 1084 case LINE_FIRST: 1085 case LINE_LAST: 1086 case BUF_FIRST: 1087 case BUF_LAST: 1088 break; 1089 default: 1090 /* Word anchors etc. cannot be handled. It's okay to test 1091 opr.ctx_type since constraints (for all DFA nodes) are 1092 created by ORing one or more opr.ctx_type values. */ 1093 return; 1094 } 1095 break; 1096 case OP_PERIOD: 1097 has_period = true; 1098 break; 1099 case OP_BACK_REF: 1100 case OP_ALT: 1101 case END_OF_RE: 1102 case OP_DUP_ASTERISK: 1103 case OP_OPEN_SUBEXP: 1104 case OP_CLOSE_SUBEXP: 1105 break; 1106 case COMPLEX_BRACKET: 1107 return; 1108 case SIMPLE_BRACKET: 1109 /* Just double check. */ 1110 { 1111 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0 1112 ? 0 1113 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS); 1114 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1115 { 1116 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) 1117 return; 1118 rshift = 0; 1119 } 1120 } 1121 break; 1122 default: 1123 abort (); 1124 } 1125 1126 if (mb_chars || has_period) 1127 for (node = 0; node < dfa->nodes_len; ++node) 1128 { 1129 if (dfa->nodes[node].type == CHARACTER 1130 && dfa->nodes[node].opr.c >= ASCII_CHARS) 1131 dfa->nodes[node].mb_partial = 0; 1132 else if (dfa->nodes[node].type == OP_PERIOD) 1133 dfa->nodes[node].type = OP_UTF8_PERIOD; 1134 } 1135 1136 /* The search can be in single byte locale. */ 1137 dfa->mb_cur_max = 1; 1138 dfa->is_utf8 = 0; 1139 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1140 } 1141 #endif 1142 1143 /* Analyze the structure tree, and calculate "first", "next", "edest", 1144 "eclosure", and "inveclosure". */ 1145 1146 static reg_errcode_t 1147 analyze (regex_t *preg) 1148 { 1149 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1150 reg_errcode_t ret; 1151 1152 /* Allocate arrays. */ 1153 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc); 1154 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); 1155 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1156 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1157 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1158 || dfa->eclosures == NULL, 0)) 1159 return REG_ESPACE; 1160 1161 dfa->subexp_map = re_malloc (Idx, preg->re_nsub); 1162 if (dfa->subexp_map != NULL) 1163 { 1164 Idx i; 1165 for (i = 0; i < preg->re_nsub; i++) 1166 dfa->subexp_map[i] = i; 1167 preorder (dfa->str_tree, optimize_subexps, dfa); 1168 for (i = 0; i < preg->re_nsub; i++) 1169 if (dfa->subexp_map[i] != i) 1170 break; 1171 if (i == preg->re_nsub) 1172 { 1173 free (dfa->subexp_map); 1174 dfa->subexp_map = NULL; 1175 } 1176 } 1177 1178 ret = postorder (dfa->str_tree, lower_subexps, preg); 1179 if (BE (ret != REG_NOERROR, 0)) 1180 return ret; 1181 ret = postorder (dfa->str_tree, calc_first, dfa); 1182 if (BE (ret != REG_NOERROR, 0)) 1183 return ret; 1184 preorder (dfa->str_tree, calc_next, dfa); 1185 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1186 if (BE (ret != REG_NOERROR, 0)) 1187 return ret; 1188 ret = calc_eclosure (dfa); 1189 if (BE (ret != REG_NOERROR, 0)) 1190 return ret; 1191 1192 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1193 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1194 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1195 || dfa->nbackref) 1196 { 1197 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1198 if (BE (dfa->inveclosures == NULL, 0)) 1199 return REG_ESPACE; 1200 ret = calc_inveclosure (dfa); 1201 } 1202 1203 return ret; 1204 } 1205 1206 /* Our parse trees are very unbalanced, so we cannot use a stack to 1207 implement parse tree visits. Instead, we use parent pointers and 1208 some hairy code in these two functions. */ 1209 static reg_errcode_t 1210 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1211 void *extra) 1212 { 1213 bin_tree_t *node, *prev; 1214 1215 for (node = root; ; ) 1216 { 1217 /* Descend down the tree, preferably to the left (or to the right 1218 if that's the only child). */ 1219 while (node->left || node->right) 1220 if (node->left) 1221 node = node->left; 1222 else 1223 node = node->right; 1224 1225 do 1226 { 1227 reg_errcode_t err = fn (extra, node); 1228 if (BE (err != REG_NOERROR, 0)) 1229 return err; 1230 if (node->parent == NULL) 1231 return REG_NOERROR; 1232 prev = node; 1233 node = node->parent; 1234 } 1235 /* Go up while we have a node that is reached from the right. */ 1236 while (node->right == prev || node->right == NULL); 1237 node = node->right; 1238 } 1239 } 1240 1241 static reg_errcode_t 1242 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1243 void *extra) 1244 { 1245 bin_tree_t *node; 1246 1247 for (node = root; ; ) 1248 { 1249 reg_errcode_t err = fn (extra, node); 1250 if (BE (err != REG_NOERROR, 0)) 1251 return err; 1252 1253 /* Go to the left node, or up and to the right. */ 1254 if (node->left) 1255 node = node->left; 1256 else 1257 { 1258 bin_tree_t *prev = NULL; 1259 while (node->right == prev || node->right == NULL) 1260 { 1261 prev = node; 1262 node = node->parent; 1263 if (!node) 1264 return REG_NOERROR; 1265 } 1266 node = node->right; 1267 } 1268 } 1269 } 1270 1271 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1272 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1273 backreferences as well. Requires a preorder visit. */ 1274 static reg_errcode_t 1275 optimize_subexps (void *extra, bin_tree_t *node) 1276 { 1277 re_dfa_t *dfa = (re_dfa_t *) extra; 1278 1279 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1280 { 1281 int idx = node->token.opr.idx; 1282 node->token.opr.idx = dfa->subexp_map[idx]; 1283 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1284 } 1285 1286 else if (node->token.type == SUBEXP 1287 && node->left && node->left->token.type == SUBEXP) 1288 { 1289 Idx other_idx = node->left->token.opr.idx; 1290 1291 node->left = node->left->left; 1292 if (node->left) 1293 node->left->parent = node; 1294 1295 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1296 if (other_idx < BITSET_WORD_BITS) 1297 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); 1298 } 1299 1300 return REG_NOERROR; 1301 } 1302 1303 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1304 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1305 static reg_errcode_t 1306 lower_subexps (void *extra, bin_tree_t *node) 1307 { 1308 regex_t *preg = (regex_t *) extra; 1309 reg_errcode_t err = REG_NOERROR; 1310 1311 if (node->left && node->left->token.type == SUBEXP) 1312 { 1313 node->left = lower_subexp (&err, preg, node->left); 1314 if (node->left) 1315 node->left->parent = node; 1316 } 1317 if (node->right && node->right->token.type == SUBEXP) 1318 { 1319 node->right = lower_subexp (&err, preg, node->right); 1320 if (node->right) 1321 node->right->parent = node; 1322 } 1323 1324 return err; 1325 } 1326 1327 static bin_tree_t * 1328 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1329 { 1330 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1331 bin_tree_t *body = node->left; 1332 bin_tree_t *op, *cls, *tree1, *tree; 1333 1334 if (preg->no_sub 1335 /* We do not optimize empty subexpressions, because otherwise we may 1336 have bad CONCAT nodes with NULL children. This is obviously not 1337 very common, so we do not lose much. An example that triggers 1338 this case is the sed "script" /\(\)/x. */ 1339 && node->left != NULL 1340 && (node->token.opr.idx >= BITSET_WORD_BITS 1341 || !(dfa->used_bkref_map 1342 & ((bitset_word_t) 1 << node->token.opr.idx)))) 1343 return node->left; 1344 1345 /* Convert the SUBEXP node to the concatenation of an 1346 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1347 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1348 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1349 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1350 tree = create_tree (dfa, op, tree1, CONCAT); 1351 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1352 { 1353 *err = REG_ESPACE; 1354 return NULL; 1355 } 1356 1357 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1358 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1359 return tree; 1360 } 1361 1362 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1363 nodes. Requires a postorder visit. */ 1364 static reg_errcode_t 1365 calc_first (void *extra, bin_tree_t *node) 1366 { 1367 re_dfa_t *dfa = (re_dfa_t *) extra; 1368 if (node->token.type == CONCAT) 1369 { 1370 node->first = node->left->first; 1371 node->node_idx = node->left->node_idx; 1372 } 1373 else 1374 { 1375 node->first = node; 1376 node->node_idx = re_dfa_add_node (dfa, node->token); 1377 if (BE (node->node_idx == REG_MISSING, 0)) 1378 return REG_ESPACE; 1379 if (node->token.type == ANCHOR) 1380 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; 1381 } 1382 return REG_NOERROR; 1383 } 1384 1385 /* Pass 2: compute NEXT on the tree. Preorder visit. */ 1386 static reg_errcode_t 1387 calc_next (void *extra _UNUSED_PARAMETER_, bin_tree_t *node) 1388 { 1389 switch (node->token.type) 1390 { 1391 case OP_DUP_ASTERISK: 1392 node->left->next = node; 1393 break; 1394 case CONCAT: 1395 node->left->next = node->right->first; 1396 node->right->next = node->next; 1397 break; 1398 default: 1399 if (node->left) 1400 node->left->next = node->next; 1401 if (node->right) 1402 node->right->next = node->next; 1403 break; 1404 } 1405 return REG_NOERROR; 1406 } 1407 1408 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1409 static reg_errcode_t 1410 link_nfa_nodes (void *extra, bin_tree_t *node) 1411 { 1412 re_dfa_t *dfa = (re_dfa_t *) extra; 1413 Idx idx = node->node_idx; 1414 reg_errcode_t err = REG_NOERROR; 1415 1416 switch (node->token.type) 1417 { 1418 case CONCAT: 1419 break; 1420 1421 case END_OF_RE: 1422 assert (node->next == NULL); 1423 break; 1424 1425 case OP_DUP_ASTERISK: 1426 case OP_ALT: 1427 { 1428 Idx left, right; 1429 dfa->has_plural_match = 1; 1430 if (node->left != NULL) 1431 left = node->left->first->node_idx; 1432 else 1433 left = node->next->node_idx; 1434 if (node->right != NULL) 1435 right = node->right->first->node_idx; 1436 else 1437 right = node->next->node_idx; 1438 assert (REG_VALID_INDEX (left)); 1439 assert (REG_VALID_INDEX (right)); 1440 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1441 } 1442 break; 1443 1444 case ANCHOR: 1445 case OP_OPEN_SUBEXP: 1446 case OP_CLOSE_SUBEXP: 1447 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1448 break; 1449 1450 case OP_BACK_REF: 1451 dfa->nexts[idx] = node->next->node_idx; 1452 if (node->token.type == OP_BACK_REF) 1453 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1454 break; 1455 1456 default: 1457 assert (!IS_EPSILON_NODE (node->token.type)); 1458 dfa->nexts[idx] = node->next->node_idx; 1459 break; 1460 } 1461 1462 return err; 1463 } 1464 1465 /* Duplicate the epsilon closure of the node ROOT_NODE. 1466 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1467 to their own constraint. */ 1468 1469 static reg_errcode_t 1470 internal_function 1471 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, 1472 Idx root_node, unsigned int init_constraint) 1473 { 1474 Idx org_node, clone_node; 1475 bool ok; 1476 unsigned int constraint = init_constraint; 1477 for (org_node = top_org_node, clone_node = top_clone_node;;) 1478 { 1479 Idx org_dest, clone_dest; 1480 if (dfa->nodes[org_node].type == OP_BACK_REF) 1481 { 1482 /* If the back reference epsilon-transit, its destination must 1483 also have the constraint. Then duplicate the epsilon closure 1484 of the destination of the back reference, and store it in 1485 edests of the back reference. */ 1486 org_dest = dfa->nexts[org_node]; 1487 re_node_set_empty (dfa->edests + clone_node); 1488 clone_dest = duplicate_node (dfa, org_dest, constraint); 1489 if (BE (clone_dest == REG_MISSING, 0)) 1490 return REG_ESPACE; 1491 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1492 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1493 if (BE (! ok, 0)) 1494 return REG_ESPACE; 1495 } 1496 else if (dfa->edests[org_node].nelem == 0) 1497 { 1498 /* In case of the node can't epsilon-transit, don't duplicate the 1499 destination and store the original destination as the 1500 destination of the node. */ 1501 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1502 break; 1503 } 1504 else if (dfa->edests[org_node].nelem == 1) 1505 { 1506 /* In case of the node can epsilon-transit, and it has only one 1507 destination. */ 1508 org_dest = dfa->edests[org_node].elems[0]; 1509 re_node_set_empty (dfa->edests + clone_node); 1510 /* If the node is root_node itself, it means the epsilon closure 1511 has a loop. Then tie it to the destination of the root_node. */ 1512 if (org_node == root_node && clone_node != org_node) 1513 { 1514 ok = re_node_set_insert (dfa->edests + clone_node, org_dest); 1515 if (BE (! ok, 0)) 1516 return REG_ESPACE; 1517 break; 1518 } 1519 /* In case the node has another constraint, append it. */ 1520 constraint |= dfa->nodes[org_node].constraint; 1521 clone_dest = duplicate_node (dfa, org_dest, constraint); 1522 if (BE (clone_dest == REG_MISSING, 0)) 1523 return REG_ESPACE; 1524 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1525 if (BE (! ok, 0)) 1526 return REG_ESPACE; 1527 } 1528 else /* dfa->edests[org_node].nelem == 2 */ 1529 { 1530 /* In case of the node can epsilon-transit, and it has two 1531 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1532 org_dest = dfa->edests[org_node].elems[0]; 1533 re_node_set_empty (dfa->edests + clone_node); 1534 /* Search for a duplicated node which satisfies the constraint. */ 1535 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1536 if (clone_dest == REG_MISSING) 1537 { 1538 /* There is no such duplicated node, create a new one. */ 1539 reg_errcode_t err; 1540 clone_dest = duplicate_node (dfa, org_dest, constraint); 1541 if (BE (clone_dest == REG_MISSING, 0)) 1542 return REG_ESPACE; 1543 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1544 if (BE (! ok, 0)) 1545 return REG_ESPACE; 1546 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1547 root_node, constraint); 1548 if (BE (err != REG_NOERROR, 0)) 1549 return err; 1550 } 1551 else 1552 { 1553 /* There is a duplicated node which satisfies the constraint, 1554 use it to avoid infinite loop. */ 1555 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1556 if (BE (! ok, 0)) 1557 return REG_ESPACE; 1558 } 1559 1560 org_dest = dfa->edests[org_node].elems[1]; 1561 clone_dest = duplicate_node (dfa, org_dest, constraint); 1562 if (BE (clone_dest == REG_MISSING, 0)) 1563 return REG_ESPACE; 1564 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1565 if (BE (! ok, 0)) 1566 return REG_ESPACE; 1567 } 1568 org_node = org_dest; 1569 clone_node = clone_dest; 1570 } 1571 return REG_NOERROR; 1572 } 1573 1574 /* Search for a node which is duplicated from the node ORG_NODE, and 1575 satisfies the constraint CONSTRAINT. */ 1576 1577 static Idx 1578 search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 1579 unsigned int constraint) 1580 { 1581 Idx idx; 1582 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1583 { 1584 if (org_node == dfa->org_indices[idx] 1585 && constraint == dfa->nodes[idx].constraint) 1586 return idx; /* Found. */ 1587 } 1588 return REG_MISSING; /* Not found. */ 1589 } 1590 1591 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1592 Return the index of the new node, or REG_MISSING if insufficient storage is 1593 available. */ 1594 1595 static Idx 1596 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) 1597 { 1598 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1599 if (BE (dup_idx != REG_MISSING, 1)) 1600 { 1601 dfa->nodes[dup_idx].constraint = constraint; 1602 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; 1603 dfa->nodes[dup_idx].duplicated = 1; 1604 1605 /* Store the index of the original node. */ 1606 dfa->org_indices[dup_idx] = org_idx; 1607 } 1608 return dup_idx; 1609 } 1610 1611 static reg_errcode_t 1612 calc_inveclosure (re_dfa_t *dfa) 1613 { 1614 Idx src, idx; 1615 bool ok; 1616 for (idx = 0; idx < dfa->nodes_len; ++idx) 1617 re_node_set_init_empty (dfa->inveclosures + idx); 1618 1619 for (src = 0; src < dfa->nodes_len; ++src) 1620 { 1621 Idx *elems = dfa->eclosures[src].elems; 1622 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1623 { 1624 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1625 if (BE (! ok, 0)) 1626 return REG_ESPACE; 1627 } 1628 } 1629 1630 return REG_NOERROR; 1631 } 1632 1633 /* Calculate "eclosure" for all the node in DFA. */ 1634 1635 static reg_errcode_t 1636 calc_eclosure (re_dfa_t *dfa) 1637 { 1638 Idx node_idx; 1639 bool incomplete; 1640 #ifdef DEBUG 1641 assert (dfa->nodes_len > 0); 1642 #endif 1643 incomplete = false; 1644 /* For each nodes, calculate epsilon closure. */ 1645 for (node_idx = 0; ; ++node_idx) 1646 { 1647 reg_errcode_t err; 1648 re_node_set eclosure_elem; 1649 if (node_idx == dfa->nodes_len) 1650 { 1651 if (!incomplete) 1652 break; 1653 incomplete = false; 1654 node_idx = 0; 1655 } 1656 1657 #ifdef DEBUG 1658 assert (dfa->eclosures[node_idx].nelem != REG_MISSING); 1659 #endif 1660 1661 /* If we have already calculated, skip it. */ 1662 if (dfa->eclosures[node_idx].nelem != 0) 1663 continue; 1664 /* Calculate epsilon closure of `node_idx'. */ 1665 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); 1666 if (BE (err != REG_NOERROR, 0)) 1667 return err; 1668 1669 if (dfa->eclosures[node_idx].nelem == 0) 1670 { 1671 incomplete = true; 1672 re_node_set_free (&eclosure_elem); 1673 } 1674 } 1675 return REG_NOERROR; 1676 } 1677 1678 /* Calculate epsilon closure of NODE. */ 1679 1680 static reg_errcode_t 1681 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) 1682 { 1683 reg_errcode_t err; 1684 Idx i; 1685 re_node_set eclosure; 1686 bool ok; 1687 bool incomplete = false; 1688 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1689 if (BE (err != REG_NOERROR, 0)) 1690 return err; 1691 1692 /* This indicates that we are calculating this node now. 1693 We reference this value to avoid infinite loop. */ 1694 dfa->eclosures[node].nelem = REG_MISSING; 1695 1696 /* If the current node has constraints, duplicate all nodes 1697 since they must inherit the constraints. */ 1698 if (dfa->nodes[node].constraint 1699 && dfa->edests[node].nelem 1700 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1701 { 1702 err = duplicate_node_closure (dfa, node, node, node, 1703 dfa->nodes[node].constraint); 1704 if (BE (err != REG_NOERROR, 0)) 1705 return err; 1706 } 1707 1708 /* Expand each epsilon destination nodes. */ 1709 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1710 for (i = 0; i < dfa->edests[node].nelem; ++i) 1711 { 1712 re_node_set eclosure_elem; 1713 Idx edest = dfa->edests[node].elems[i]; 1714 /* If calculating the epsilon closure of `edest' is in progress, 1715 return intermediate result. */ 1716 if (dfa->eclosures[edest].nelem == REG_MISSING) 1717 { 1718 incomplete = true; 1719 continue; 1720 } 1721 /* If we haven't calculated the epsilon closure of `edest' yet, 1722 calculate now. Otherwise use calculated epsilon closure. */ 1723 if (dfa->eclosures[edest].nelem == 0) 1724 { 1725 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false); 1726 if (BE (err != REG_NOERROR, 0)) 1727 return err; 1728 } 1729 else 1730 eclosure_elem = dfa->eclosures[edest]; 1731 /* Merge the epsilon closure of `edest'. */ 1732 err = re_node_set_merge (&eclosure, &eclosure_elem); 1733 if (BE (err != REG_NOERROR, 0)) 1734 return err; 1735 /* If the epsilon closure of `edest' is incomplete, 1736 the epsilon closure of this node is also incomplete. */ 1737 if (dfa->eclosures[edest].nelem == 0) 1738 { 1739 incomplete = true; 1740 re_node_set_free (&eclosure_elem); 1741 } 1742 } 1743 1744 /* An epsilon closure includes itself. */ 1745 ok = re_node_set_insert (&eclosure, node); 1746 if (BE (! ok, 0)) 1747 return REG_ESPACE; 1748 if (incomplete && !root) 1749 dfa->eclosures[node].nelem = 0; 1750 else 1751 dfa->eclosures[node] = eclosure; 1752 *new_set = eclosure; 1753 return REG_NOERROR; 1754 } 1755 1756 /* Functions for token which are used in the parser. */ 1757 1758 /* Fetch a token from INPUT. 1759 We must not use this function inside bracket expressions. */ 1760 1761 static void 1762 internal_function 1763 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1764 { 1765 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1766 } 1767 1768 /* Peek a token from INPUT, and return the length of the token. 1769 We must not use this function inside bracket expressions. */ 1770 1771 static int 1772 internal_function 1773 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1774 { 1775 unsigned char c; 1776 1777 if (re_string_eoi (input)) 1778 { 1779 token->type = END_OF_RE; 1780 return 0; 1781 } 1782 1783 c = re_string_peek_byte (input, 0); 1784 token->opr.c = c; 1785 1786 token->word_char = 0; 1787 #ifdef RE_ENABLE_I18N 1788 token->mb_partial = 0; 1789 if (input->mb_cur_max > 1 && 1790 !re_string_first_byte (input, re_string_cur_idx (input))) 1791 { 1792 token->type = CHARACTER; 1793 token->mb_partial = 1; 1794 return 1; 1795 } 1796 #endif 1797 if (c == '\\') 1798 { 1799 unsigned char c2; 1800 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1801 { 1802 token->type = BACK_SLASH; 1803 return 1; 1804 } 1805 1806 c2 = re_string_peek_byte_case (input, 1); 1807 token->opr.c = c2; 1808 token->type = CHARACTER; 1809 #ifdef RE_ENABLE_I18N 1810 if (input->mb_cur_max > 1) 1811 { 1812 wint_t wc = re_string_wchar_at (input, 1813 re_string_cur_idx (input) + 1); 1814 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1815 } 1816 else 1817 #endif 1818 token->word_char = IS_WORD_CHAR (c2) != 0; 1819 1820 switch (c2) 1821 { 1822 case '|': 1823 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1824 token->type = OP_ALT; 1825 break; 1826 case '1': case '2': case '3': case '4': case '5': 1827 case '6': case '7': case '8': case '9': 1828 if (!(syntax & RE_NO_BK_REFS)) 1829 { 1830 token->type = OP_BACK_REF; 1831 token->opr.idx = c2 - '1'; 1832 } 1833 break; 1834 case '<': 1835 if (!(syntax & RE_NO_GNU_OPS)) 1836 { 1837 token->type = ANCHOR; 1838 token->opr.ctx_type = WORD_FIRST; 1839 } 1840 break; 1841 case '>': 1842 if (!(syntax & RE_NO_GNU_OPS)) 1843 { 1844 token->type = ANCHOR; 1845 token->opr.ctx_type = WORD_LAST; 1846 } 1847 break; 1848 case 'b': 1849 if (!(syntax & RE_NO_GNU_OPS)) 1850 { 1851 token->type = ANCHOR; 1852 token->opr.ctx_type = WORD_DELIM; 1853 } 1854 break; 1855 case 'B': 1856 if (!(syntax & RE_NO_GNU_OPS)) 1857 { 1858 token->type = ANCHOR; 1859 token->opr.ctx_type = NOT_WORD_DELIM; 1860 } 1861 break; 1862 case 'w': 1863 if (!(syntax & RE_NO_GNU_OPS)) 1864 token->type = OP_WORD; 1865 break; 1866 case 'W': 1867 if (!(syntax & RE_NO_GNU_OPS)) 1868 token->type = OP_NOTWORD; 1869 break; 1870 case 's': 1871 if (!(syntax & RE_NO_GNU_OPS)) 1872 token->type = OP_SPACE; 1873 break; 1874 case 'S': 1875 if (!(syntax & RE_NO_GNU_OPS)) 1876 token->type = OP_NOTSPACE; 1877 break; 1878 case '`': 1879 if (!(syntax & RE_NO_GNU_OPS)) 1880 { 1881 token->type = ANCHOR; 1882 token->opr.ctx_type = BUF_FIRST; 1883 } 1884 break; 1885 case '\'': 1886 if (!(syntax & RE_NO_GNU_OPS)) 1887 { 1888 token->type = ANCHOR; 1889 token->opr.ctx_type = BUF_LAST; 1890 } 1891 break; 1892 case '(': 1893 if (!(syntax & RE_NO_BK_PARENS)) 1894 token->type = OP_OPEN_SUBEXP; 1895 break; 1896 case ')': 1897 if (!(syntax & RE_NO_BK_PARENS)) 1898 token->type = OP_CLOSE_SUBEXP; 1899 break; 1900 case '+': 1901 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1902 token->type = OP_DUP_PLUS; 1903 break; 1904 case '?': 1905 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1906 token->type = OP_DUP_QUESTION; 1907 break; 1908 case '{': 1909 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1910 token->type = OP_OPEN_DUP_NUM; 1911 break; 1912 case '}': 1913 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1914 token->type = OP_CLOSE_DUP_NUM; 1915 break; 1916 default: 1917 break; 1918 } 1919 return 2; 1920 } 1921 1922 token->type = CHARACTER; 1923 #ifdef RE_ENABLE_I18N 1924 if (input->mb_cur_max > 1) 1925 { 1926 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1927 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1928 } 1929 else 1930 #endif 1931 token->word_char = IS_WORD_CHAR (token->opr.c); 1932 1933 switch (c) 1934 { 1935 case '\n': 1936 if (syntax & RE_NEWLINE_ALT) 1937 token->type = OP_ALT; 1938 break; 1939 case '|': 1940 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1941 token->type = OP_ALT; 1942 break; 1943 case '*': 1944 token->type = OP_DUP_ASTERISK; 1945 break; 1946 case '+': 1947 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1948 token->type = OP_DUP_PLUS; 1949 break; 1950 case '?': 1951 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1952 token->type = OP_DUP_QUESTION; 1953 break; 1954 case '{': 1955 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1956 token->type = OP_OPEN_DUP_NUM; 1957 break; 1958 case '}': 1959 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1960 token->type = OP_CLOSE_DUP_NUM; 1961 break; 1962 case '(': 1963 if (syntax & RE_NO_BK_PARENS) 1964 token->type = OP_OPEN_SUBEXP; 1965 break; 1966 case ')': 1967 if (syntax & RE_NO_BK_PARENS) 1968 token->type = OP_CLOSE_SUBEXP; 1969 break; 1970 case '[': 1971 token->type = OP_OPEN_BRACKET; 1972 break; 1973 case '.': 1974 token->type = OP_PERIOD; 1975 break; 1976 case '^': 1977 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 1978 re_string_cur_idx (input) != 0) 1979 { 1980 char prev = re_string_peek_byte (input, -1); 1981 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 1982 break; 1983 } 1984 token->type = ANCHOR; 1985 token->opr.ctx_type = LINE_FIRST; 1986 break; 1987 case '$': 1988 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 1989 re_string_cur_idx (input) + 1 != re_string_length (input)) 1990 { 1991 re_token_t next; 1992 re_string_skip_bytes (input, 1); 1993 peek_token (&next, input, syntax); 1994 re_string_skip_bytes (input, -1); 1995 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 1996 break; 1997 } 1998 token->type = ANCHOR; 1999 token->opr.ctx_type = LINE_LAST; 2000 break; 2001 default: 2002 break; 2003 } 2004 return 1; 2005 } 2006 2007 /* Peek a token from INPUT, and return the length of the token. 2008 We must not use this function out of bracket expressions. */ 2009 2010 static int 2011 internal_function 2012 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 2013 { 2014 unsigned char c; 2015 if (re_string_eoi (input)) 2016 { 2017 token->type = END_OF_RE; 2018 return 0; 2019 } 2020 c = re_string_peek_byte (input, 0); 2021 token->opr.c = c; 2022 2023 #ifdef RE_ENABLE_I18N 2024 if (input->mb_cur_max > 1 && 2025 !re_string_first_byte (input, re_string_cur_idx (input))) 2026 { 2027 token->type = CHARACTER; 2028 return 1; 2029 } 2030 #endif /* RE_ENABLE_I18N */ 2031 2032 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2033 && re_string_cur_idx (input) + 1 < re_string_length (input)) 2034 { 2035 /* In this case, '\' escape a character. */ 2036 unsigned char c2; 2037 re_string_skip_bytes (input, 1); 2038 c2 = re_string_peek_byte (input, 0); 2039 token->opr.c = c2; 2040 token->type = CHARACTER; 2041 return 1; 2042 } 2043 if (c == '[') /* '[' is a special char in a bracket exps. */ 2044 { 2045 unsigned char c2; 2046 int token_len; 2047 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2048 c2 = re_string_peek_byte (input, 1); 2049 else 2050 c2 = 0; 2051 token->opr.c = c2; 2052 token_len = 2; 2053 switch (c2) 2054 { 2055 case '.': 2056 token->type = OP_OPEN_COLL_ELEM; 2057 break; 2058 case '=': 2059 token->type = OP_OPEN_EQUIV_CLASS; 2060 break; 2061 case ':': 2062 if (syntax & RE_CHAR_CLASSES) 2063 { 2064 token->type = OP_OPEN_CHAR_CLASS; 2065 break; 2066 } 2067 /* else fall through. */ 2068 default: 2069 token->type = CHARACTER; 2070 token->opr.c = c; 2071 token_len = 1; 2072 break; 2073 } 2074 return token_len; 2075 } 2076 switch (c) 2077 { 2078 case '-': 2079 token->type = OP_CHARSET_RANGE; 2080 break; 2081 case ']': 2082 token->type = OP_CLOSE_BRACKET; 2083 break; 2084 case '^': 2085 token->type = OP_NON_MATCH_LIST; 2086 break; 2087 default: 2088 token->type = CHARACTER; 2089 } 2090 return 1; 2091 } 2092 2093 /* Functions for parser. */ 2094 2095 /* Entry point of the parser. 2096 Parse the regular expression REGEXP and return the structure tree. 2097 If an error is occured, ERR is set by error code, and return NULL. 2098 This function build the following tree, from regular expression <reg_exp>: 2099 CAT 2100 / \ 2101 / \ 2102 <reg_exp> EOR 2103 2104 CAT means concatenation. 2105 EOR means end of regular expression. */ 2106 2107 static bin_tree_t * 2108 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2109 reg_errcode_t *err) 2110 { 2111 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2112 bin_tree_t *tree, *eor, *root; 2113 re_token_t current_token; 2114 dfa->syntax = syntax; 2115 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2116 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2117 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2118 return NULL; 2119 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2120 if (tree != NULL) 2121 root = create_tree (dfa, tree, eor, CONCAT); 2122 else 2123 root = eor; 2124 if (BE (eor == NULL || root == NULL, 0)) 2125 { 2126 *err = REG_ESPACE; 2127 return NULL; 2128 } 2129 return root; 2130 } 2131 2132 /* This function build the following tree, from regular expression 2133 <branch1>|<branch2>: 2134 ALT 2135 / \ 2136 / \ 2137 <branch1> <branch2> 2138 2139 ALT means alternative, which represents the operator `|'. */ 2140 2141 static bin_tree_t * 2142 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2143 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2144 { 2145 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2146 bin_tree_t *tree, *branch = NULL; 2147 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2148 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2149 return NULL; 2150 2151 while (token->type == OP_ALT) 2152 { 2153 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2154 if (token->type != OP_ALT && token->type != END_OF_RE 2155 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2156 { 2157 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2158 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2159 return NULL; 2160 } 2161 else 2162 branch = NULL; 2163 tree = create_tree (dfa, tree, branch, OP_ALT); 2164 if (BE (tree == NULL, 0)) 2165 { 2166 *err = REG_ESPACE; 2167 return NULL; 2168 } 2169 } 2170 return tree; 2171 } 2172 2173 /* This function build the following tree, from regular expression 2174 <exp1><exp2>: 2175 CAT 2176 / \ 2177 / \ 2178 <exp1> <exp2> 2179 2180 CAT means concatenation. */ 2181 2182 static bin_tree_t * 2183 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2184 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2185 { 2186 bin_tree_t *tree, *expr; 2187 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2188 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2189 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2190 return NULL; 2191 2192 while (token->type != OP_ALT && token->type != END_OF_RE 2193 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2194 { 2195 expr = parse_expression (regexp, preg, token, syntax, nest, err); 2196 if (BE (*err != REG_NOERROR && expr == NULL, 0)) 2197 { 2198 return NULL; 2199 } 2200 if (tree != NULL && expr != NULL) 2201 { 2202 tree = create_tree (dfa, tree, expr, CONCAT); 2203 if (tree == NULL) 2204 { 2205 *err = REG_ESPACE; 2206 return NULL; 2207 } 2208 } 2209 else if (tree == NULL) 2210 tree = expr; 2211 /* Otherwise expr == NULL, we don't need to create new tree. */ 2212 } 2213 return tree; 2214 } 2215 2216 /* This function build the following tree, from regular expression a*: 2217 * 2218 | 2219 a 2220 */ 2221 2222 static bin_tree_t * 2223 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2224 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2225 { 2226 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2227 bin_tree_t *tree; 2228 switch (token->type) 2229 { 2230 case CHARACTER: 2231 tree = create_token_tree (dfa, NULL, NULL, token); 2232 if (BE (tree == NULL, 0)) 2233 { 2234 *err = REG_ESPACE; 2235 return NULL; 2236 } 2237 #ifdef RE_ENABLE_I18N 2238 if (dfa->mb_cur_max > 1) 2239 { 2240 while (!re_string_eoi (regexp) 2241 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2242 { 2243 bin_tree_t *mbc_remain; 2244 fetch_token (token, regexp, syntax); 2245 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2246 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2247 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2248 { 2249 *err = REG_ESPACE; 2250 return NULL; 2251 } 2252 } 2253 } 2254 #endif 2255 break; 2256 case OP_OPEN_SUBEXP: 2257 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2258 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2259 return NULL; 2260 break; 2261 case OP_OPEN_BRACKET: 2262 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2263 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2264 return NULL; 2265 break; 2266 case OP_BACK_REF: 2267 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2268 { 2269 *err = REG_ESUBREG; 2270 return NULL; 2271 } 2272 dfa->used_bkref_map |= 1 << token->opr.idx; 2273 tree = create_token_tree (dfa, NULL, NULL, token); 2274 if (BE (tree == NULL, 0)) 2275 { 2276 *err = REG_ESPACE; 2277 return NULL; 2278 } 2279 ++dfa->nbackref; 2280 dfa->has_mb_node = 1; 2281 break; 2282 case OP_OPEN_DUP_NUM: 2283 if (syntax & RE_CONTEXT_INVALID_DUP) 2284 { 2285 *err = REG_BADRPT; 2286 return NULL; 2287 } 2288 /* FALLTHROUGH */ 2289 case OP_DUP_ASTERISK: 2290 case OP_DUP_PLUS: 2291 case OP_DUP_QUESTION: 2292 if (syntax & RE_CONTEXT_INVALID_OPS) 2293 { 2294 *err = REG_BADRPT; 2295 return NULL; 2296 } 2297 else if (syntax & RE_CONTEXT_INDEP_OPS) 2298 { 2299 fetch_token (token, regexp, syntax); 2300 return parse_expression (regexp, preg, token, syntax, nest, err); 2301 } 2302 /* else fall through */ 2303 case OP_CLOSE_SUBEXP: 2304 if ((token->type == OP_CLOSE_SUBEXP) && 2305 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2306 { 2307 *err = REG_ERPAREN; 2308 return NULL; 2309 } 2310 /* else fall through */ 2311 case OP_CLOSE_DUP_NUM: 2312 /* We treat it as a normal character. */ 2313 2314 /* Then we can these characters as normal characters. */ 2315 token->type = CHARACTER; 2316 /* mb_partial and word_char bits should be initialized already 2317 by peek_token. */ 2318 tree = create_token_tree (dfa, NULL, NULL, token); 2319 if (BE (tree == NULL, 0)) 2320 { 2321 *err = REG_ESPACE; 2322 return NULL; 2323 } 2324 break; 2325 case ANCHOR: 2326 if ((token->opr.ctx_type 2327 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2328 && dfa->word_ops_used == 0) 2329 init_word_char (dfa); 2330 if (token->opr.ctx_type == WORD_DELIM 2331 || token->opr.ctx_type == NOT_WORD_DELIM) 2332 { 2333 bin_tree_t *tree_first, *tree_last; 2334 if (token->opr.ctx_type == WORD_DELIM) 2335 { 2336 token->opr.ctx_type = WORD_FIRST; 2337 tree_first = create_token_tree (dfa, NULL, NULL, token); 2338 token->opr.ctx_type = WORD_LAST; 2339 } 2340 else 2341 { 2342 token->opr.ctx_type = INSIDE_WORD; 2343 tree_first = create_token_tree (dfa, NULL, NULL, token); 2344 token->opr.ctx_type = INSIDE_NOTWORD; 2345 } 2346 tree_last = create_token_tree (dfa, NULL, NULL, token); 2347 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2348 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2349 { 2350 *err = REG_ESPACE; 2351 return NULL; 2352 } 2353 } 2354 else 2355 { 2356 tree = create_token_tree (dfa, NULL, NULL, token); 2357 if (BE (tree == NULL, 0)) 2358 { 2359 *err = REG_ESPACE; 2360 return NULL; 2361 } 2362 } 2363 /* We must return here, since ANCHORs can't be followed 2364 by repetition operators. 2365 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2366 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2367 fetch_token (token, regexp, syntax); 2368 return tree; 2369 case OP_PERIOD: 2370 tree = create_token_tree (dfa, NULL, NULL, token); 2371 if (BE (tree == NULL, 0)) 2372 { 2373 *err = REG_ESPACE; 2374 return NULL; 2375 } 2376 if (dfa->mb_cur_max > 1) 2377 dfa->has_mb_node = 1; 2378 break; 2379 case OP_WORD: 2380 case OP_NOTWORD: 2381 tree = build_charclass_op (dfa, regexp->trans, 2382 (const unsigned char *) "alnum", 2383 (const unsigned char *) "_", 2384 token->type == OP_NOTWORD, err); 2385 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2386 return NULL; 2387 break; 2388 case OP_SPACE: 2389 case OP_NOTSPACE: 2390 tree = build_charclass_op (dfa, regexp->trans, 2391 (const unsigned char *) "space", 2392 (const unsigned char *) "", 2393 token->type == OP_NOTSPACE, err); 2394 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2395 return NULL; 2396 break; 2397 case OP_ALT: 2398 case END_OF_RE: 2399 return NULL; 2400 case BACK_SLASH: 2401 *err = REG_EESCAPE; 2402 return NULL; 2403 default: 2404 /* Must not happen? */ 2405 #ifdef DEBUG 2406 assert (0); 2407 #endif 2408 return NULL; 2409 } 2410 fetch_token (token, regexp, syntax); 2411 2412 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2413 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2414 { 2415 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2416 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2417 return NULL; 2418 /* In BRE consecutive duplications are not allowed. */ 2419 if ((syntax & RE_CONTEXT_INVALID_DUP) 2420 && (token->type == OP_DUP_ASTERISK 2421 || token->type == OP_OPEN_DUP_NUM)) 2422 { 2423 *err = REG_BADRPT; 2424 return NULL; 2425 } 2426 } 2427 2428 return tree; 2429 } 2430 2431 /* This function build the following tree, from regular expression 2432 (<reg_exp>): 2433 SUBEXP 2434 | 2435 <reg_exp> 2436 */ 2437 2438 static bin_tree_t * 2439 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2440 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2441 { 2442 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2443 bin_tree_t *tree; 2444 size_t cur_nsub; 2445 cur_nsub = preg->re_nsub++; 2446 2447 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2448 2449 /* The subexpression may be a null string. */ 2450 if (token->type == OP_CLOSE_SUBEXP) 2451 tree = NULL; 2452 else 2453 { 2454 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2455 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2456 *err = REG_EPAREN; 2457 if (BE (*err != REG_NOERROR, 0)) 2458 return NULL; 2459 } 2460 2461 if (cur_nsub <= '9' - '1') 2462 dfa->completed_bkref_map |= 1 << cur_nsub; 2463 2464 tree = create_tree (dfa, tree, NULL, SUBEXP); 2465 if (BE (tree == NULL, 0)) 2466 { 2467 *err = REG_ESPACE; 2468 return NULL; 2469 } 2470 tree->token.opr.idx = cur_nsub; 2471 return tree; 2472 } 2473 2474 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2475 2476 static bin_tree_t * 2477 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2478 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2479 { 2480 bin_tree_t *tree = NULL, *old_tree = NULL; 2481 Idx i, start, end, start_idx = re_string_cur_idx (regexp); 2482 re_token_t start_token = *token; 2483 2484 if (token->type == OP_OPEN_DUP_NUM) 2485 { 2486 end = 0; 2487 start = fetch_number (regexp, token, syntax); 2488 if (start == REG_MISSING) 2489 { 2490 if (token->type == CHARACTER && token->opr.c == ',') 2491 start = 0; /* We treat "{,m}" as "{0,m}". */ 2492 else 2493 { 2494 *err = REG_BADBR; /* <re>{} is invalid. */ 2495 return NULL; 2496 } 2497 } 2498 if (BE (start != REG_ERROR, 1)) 2499 { 2500 /* We treat "{n}" as "{n,n}". */ 2501 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2502 : ((token->type == CHARACTER && token->opr.c == ',') 2503 ? fetch_number (regexp, token, syntax) : REG_ERROR)); 2504 } 2505 if (BE (start == REG_ERROR || end == REG_ERROR, 0)) 2506 { 2507 /* Invalid sequence. */ 2508 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2509 { 2510 if (token->type == END_OF_RE) 2511 *err = REG_EBRACE; 2512 else 2513 *err = REG_BADBR; 2514 2515 return NULL; 2516 } 2517 2518 /* If the syntax bit is set, rollback. */ 2519 re_string_set_index (regexp, start_idx); 2520 *token = start_token; 2521 token->type = CHARACTER; 2522 /* mb_partial and word_char bits should be already initialized by 2523 peek_token. */ 2524 return elem; 2525 } 2526 2527 if (BE ((end != REG_MISSING && start > end) 2528 || token->type != OP_CLOSE_DUP_NUM, 0)) 2529 { 2530 /* First number greater than second. */ 2531 *err = REG_BADBR; 2532 return NULL; 2533 } 2534 } 2535 else 2536 { 2537 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2538 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING; 2539 } 2540 2541 fetch_token (token, regexp, syntax); 2542 2543 if (BE (elem == NULL, 0)) 2544 return NULL; 2545 if (BE (start == 0 && end == 0, 0)) 2546 { 2547 postorder (elem, free_tree, NULL); 2548 return NULL; 2549 } 2550 2551 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2552 if (BE (start > 0, 0)) 2553 { 2554 tree = elem; 2555 for (i = 2; i <= start; ++i) 2556 { 2557 elem = duplicate_tree (elem, dfa); 2558 tree = create_tree (dfa, tree, elem, CONCAT); 2559 if (BE (elem == NULL || tree == NULL, 0)) 2560 goto parse_dup_op_espace; 2561 } 2562 2563 if (start == end) 2564 return tree; 2565 2566 /* Duplicate ELEM before it is marked optional. */ 2567 elem = duplicate_tree (elem, dfa); 2568 old_tree = tree; 2569 } 2570 else 2571 old_tree = NULL; 2572 2573 if (elem->token.type == SUBEXP) 2574 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); 2575 2576 tree = create_tree (dfa, elem, NULL, 2577 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); 2578 if (BE (tree == NULL, 0)) 2579 goto parse_dup_op_espace; 2580 2581 /* From gnulib's "intprops.h": 2582 True if the arithmetic type T is signed. */ 2583 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1)) 2584 2585 /* This loop is actually executed only when end != REG_MISSING, 2586 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2587 already created the start+1-th copy. */ 2588 if (TYPE_SIGNED (Idx) || end != REG_MISSING) 2589 for (i = start + 2; i <= end; ++i) 2590 { 2591 elem = duplicate_tree (elem, dfa); 2592 tree = create_tree (dfa, tree, elem, CONCAT); 2593 if (BE (elem == NULL || tree == NULL, 0)) 2594 goto parse_dup_op_espace; 2595 2596 tree = create_tree (dfa, tree, NULL, OP_ALT); 2597 if (BE (tree == NULL, 0)) 2598 goto parse_dup_op_espace; 2599 } 2600 2601 if (old_tree) 2602 tree = create_tree (dfa, old_tree, tree, CONCAT); 2603 2604 return tree; 2605 2606 parse_dup_op_espace: 2607 *err = REG_ESPACE; 2608 return NULL; 2609 } 2610 2611 /* Size of the names for collating symbol/equivalence_class/character_class. 2612 I'm not sure, but maybe enough. */ 2613 #define BRACKET_NAME_BUF_SIZE 32 2614 2615 #ifndef _LIBC 2616 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2617 Build the range expression which starts from START_ELEM, and ends 2618 at END_ELEM. The result are written to MBCSET and SBCSET. 2619 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2620 mbcset->range_ends, is a pointer argument sinse we may 2621 update it. */ 2622 2623 static reg_errcode_t 2624 internal_function 2625 # ifdef RE_ENABLE_I18N 2626 build_range_exp (const reg_syntax_t syntax, 2627 bitset_t sbcset, 2628 re_charset_t *mbcset, 2629 Idx *range_alloc, 2630 const bracket_elem_t *start_elem, 2631 const bracket_elem_t *end_elem) 2632 # else /* not RE_ENABLE_I18N */ 2633 build_range_exp (const reg_syntax_t syntax, 2634 bitset_t sbcset, 2635 const bracket_elem_t *start_elem, 2636 const bracket_elem_t *end_elem) 2637 # endif /* not RE_ENABLE_I18N */ 2638 { 2639 unsigned int start_ch, end_ch; 2640 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2641 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2642 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2643 0)) 2644 return REG_ERANGE; 2645 2646 /* We can handle no multi character collating elements without libc 2647 support. */ 2648 if (BE ((start_elem->type == COLL_SYM 2649 && strlen ((char *) start_elem->opr.name) > 1) 2650 || (end_elem->type == COLL_SYM 2651 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2652 return REG_ECOLLATE; 2653 2654 # ifdef RE_ENABLE_I18N 2655 { 2656 wchar_t wc; 2657 wint_t start_wc; 2658 wint_t end_wc; 2659 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2660 2661 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2662 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2663 : 0)); 2664 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2665 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2666 : 0)); 2667 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2668 ? __btowc (start_ch) : start_elem->opr.wch); 2669 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2670 ? __btowc (end_ch) : end_elem->opr.wch); 2671 if (start_wc == WEOF || end_wc == WEOF) 2672 return REG_ECOLLATE; 2673 cmp_buf[0] = start_wc; 2674 cmp_buf[4] = end_wc; 2675 2676 if (BE ((syntax & RE_NO_EMPTY_RANGES) 2677 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0)) 2678 return REG_ERANGE; 2679 2680 /* Got valid collation sequence values, add them as a new entry. 2681 However, for !_LIBC we have no collation elements: if the 2682 character set is single byte, the single byte character set 2683 that we build below suffices. parse_bracket_exp passes 2684 no MBCSET if dfa->mb_cur_max == 1. */ 2685 if (mbcset) 2686 { 2687 /* Check the space of the arrays. */ 2688 if (BE (*range_alloc == mbcset->nranges, 0)) 2689 { 2690 /* There is not enough space, need realloc. */ 2691 wchar_t *new_array_start, *new_array_end; 2692 Idx new_nranges; 2693 2694 /* +1 in case of mbcset->nranges is 0. */ 2695 new_nranges = 2 * mbcset->nranges + 1; 2696 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2697 are NULL if *range_alloc == 0. */ 2698 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2699 new_nranges); 2700 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2701 new_nranges); 2702 2703 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2704 return REG_ESPACE; 2705 2706 mbcset->range_starts = new_array_start; 2707 mbcset->range_ends = new_array_end; 2708 *range_alloc = new_nranges; 2709 } 2710 2711 mbcset->range_starts[mbcset->nranges] = start_wc; 2712 mbcset->range_ends[mbcset->nranges++] = end_wc; 2713 } 2714 2715 /* Build the table for single byte characters. */ 2716 for (wc = 0; wc < SBC_MAX; ++wc) 2717 { 2718 cmp_buf[2] = wc; 2719 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2720 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2721 bitset_set (sbcset, wc); 2722 } 2723 } 2724 # else /* not RE_ENABLE_I18N */ 2725 { 2726 unsigned int ch; 2727 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2728 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2729 : 0)); 2730 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2731 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2732 : 0)); 2733 if (start_ch > end_ch) 2734 return REG_ERANGE; 2735 /* Build the table for single byte characters. */ 2736 for (ch = 0; ch < SBC_MAX; ++ch) 2737 if (start_ch <= ch && ch <= end_ch) 2738 bitset_set (sbcset, ch); 2739 } 2740 # endif /* not RE_ENABLE_I18N */ 2741 return REG_NOERROR; 2742 } 2743 #endif /* not _LIBC */ 2744 2745 #ifndef _LIBC 2746 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2747 Build the collating element which is represented by NAME. 2748 The result are written to MBCSET and SBCSET. 2749 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2750 pointer argument since we may update it. */ 2751 2752 static reg_errcode_t 2753 internal_function 2754 build_collating_symbol (bitset_t sbcset, 2755 # ifdef RE_ENABLE_I18N 2756 re_charset_t *mbcset _UNUSED_PARAMETER_, 2757 Idx *coll_sym_alloc _UNUSED_PARAMETER_, 2758 # endif 2759 const unsigned char *name) 2760 { 2761 size_t name_len = strlen ((const char *) name); 2762 if (BE (name_len != 1, 0)) 2763 return REG_ECOLLATE; 2764 else 2765 { 2766 bitset_set (sbcset, name[0]); 2767 return REG_NOERROR; 2768 } 2769 } 2770 #endif /* not _LIBC */ 2771 2772 /* This function parse bracket expression like "[abc]", "[a-c]", 2773 "[[.a-a.]]" etc. */ 2774 2775 static bin_tree_t * 2776 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2777 reg_syntax_t syntax, reg_errcode_t *err) 2778 { 2779 #ifdef _LIBC 2780 const unsigned char *collseqmb; 2781 const char *collseqwc; 2782 uint32_t nrules; 2783 int32_t table_size; 2784 const int32_t *symb_table; 2785 const unsigned char *extra; 2786 2787 /* Local function for parse_bracket_exp used in _LIBC environement. 2788 Seek the collating symbol entry correspondings to NAME. 2789 Return the index of the symbol in the SYMB_TABLE. */ 2790 2791 auto inline int32_t 2792 __attribute ((always_inline)) 2793 seek_collating_symbol_entry (name, name_len) 2794 const unsigned char *name; 2795 size_t name_len; 2796 { 2797 int32_t hash = elem_hash ((const char *) name, name_len); 2798 int32_t elem = hash % table_size; 2799 if (symb_table[2 * elem] != 0) 2800 { 2801 int32_t second = hash % (table_size - 2) + 1; 2802 2803 do 2804 { 2805 /* First compare the hashing value. */ 2806 if (symb_table[2 * elem] == hash 2807 /* Compare the length of the name. */ 2808 && name_len == extra[symb_table[2 * elem + 1]] 2809 /* Compare the name. */ 2810 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2811 name_len) == 0) 2812 { 2813 /* Yep, this is the entry. */ 2814 break; 2815 } 2816 2817 /* Next entry. */ 2818 elem += second; 2819 } 2820 while (symb_table[2 * elem] != 0); 2821 } 2822 return elem; 2823 } 2824 2825 /* Local function for parse_bracket_exp used in _LIBC environment. 2826 Look up the collation sequence value of BR_ELEM. 2827 Return the value if succeeded, UINT_MAX otherwise. */ 2828 2829 auto inline unsigned int 2830 __attribute ((always_inline)) 2831 lookup_collation_sequence_value (br_elem) 2832 bracket_elem_t *br_elem; 2833 { 2834 if (br_elem->type == SB_CHAR) 2835 { 2836 /* 2837 if (MB_CUR_MAX == 1) 2838 */ 2839 if (nrules == 0) 2840 return collseqmb[br_elem->opr.ch]; 2841 else 2842 { 2843 wint_t wc = __btowc (br_elem->opr.ch); 2844 return __collseq_table_lookup (collseqwc, wc); 2845 } 2846 } 2847 else if (br_elem->type == MB_CHAR) 2848 { 2849 if (nrules != 0) 2850 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2851 } 2852 else if (br_elem->type == COLL_SYM) 2853 { 2854 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2855 if (nrules != 0) 2856 { 2857 int32_t elem, idx; 2858 elem = seek_collating_symbol_entry (br_elem->opr.name, 2859 sym_name_len); 2860 if (symb_table[2 * elem] != 0) 2861 { 2862 /* We found the entry. */ 2863 idx = symb_table[2 * elem + 1]; 2864 /* Skip the name of collating element name. */ 2865 idx += 1 + extra[idx]; 2866 /* Skip the byte sequence of the collating element. */ 2867 idx += 1 + extra[idx]; 2868 /* Adjust for the alignment. */ 2869 idx = (idx + 3) & ~3; 2870 /* Skip the multibyte collation sequence value. */ 2871 idx += sizeof (unsigned int); 2872 /* Skip the wide char sequence of the collating element. */ 2873 idx += sizeof (unsigned int) * 2874 (1 + *(unsigned int *) (extra + idx)); 2875 /* Return the collation sequence value. */ 2876 return *(unsigned int *) (extra + idx); 2877 } 2878 else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2879 { 2880 /* No valid character. Match it as a single byte 2881 character. */ 2882 return collseqmb[br_elem->opr.name[0]]; 2883 } 2884 } 2885 else if (sym_name_len == 1) 2886 return collseqmb[br_elem->opr.name[0]]; 2887 } 2888 return UINT_MAX; 2889 } 2890 2891 /* Local function for parse_bracket_exp used in _LIBC environement. 2892 Build the range expression which starts from START_ELEM, and ends 2893 at END_ELEM. The result are written to MBCSET and SBCSET. 2894 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2895 mbcset->range_ends, is a pointer argument sinse we may 2896 update it. */ 2897 2898 auto inline reg_errcode_t 2899 __attribute ((always_inline)) 2900 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2901 re_charset_t *mbcset; 2902 Idx *range_alloc; 2903 bitset_t sbcset; 2904 bracket_elem_t *start_elem, *end_elem; 2905 { 2906 unsigned int ch; 2907 uint32_t start_collseq; 2908 uint32_t end_collseq; 2909 2910 /* Equivalence Classes and Character Classes can't be a range 2911 start/end. */ 2912 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2913 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2914 0)) 2915 return REG_ERANGE; 2916 2917 start_collseq = lookup_collation_sequence_value (start_elem); 2918 end_collseq = lookup_collation_sequence_value (end_elem); 2919 /* Check start/end collation sequence values. */ 2920 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2921 return REG_ECOLLATE; 2922 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2923 return REG_ERANGE; 2924 2925 /* Got valid collation sequence values, add them as a new entry. 2926 However, if we have no collation elements, and the character set 2927 is single byte, the single byte character set that we 2928 build below suffices. */ 2929 if (nrules > 0 || dfa->mb_cur_max > 1) 2930 { 2931 /* Check the space of the arrays. */ 2932 if (BE (*range_alloc == mbcset->nranges, 0)) 2933 { 2934 /* There is not enough space, need realloc. */ 2935 uint32_t *new_array_start; 2936 uint32_t *new_array_end; 2937 Idx new_nranges; 2938 2939 /* +1 in case of mbcset->nranges is 0. */ 2940 new_nranges = 2 * mbcset->nranges + 1; 2941 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2942 new_nranges); 2943 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2944 new_nranges); 2945 2946 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2947 return REG_ESPACE; 2948 2949 mbcset->range_starts = new_array_start; 2950 mbcset->range_ends = new_array_end; 2951 *range_alloc = new_nranges; 2952 } 2953 2954 mbcset->range_starts[mbcset->nranges] = start_collseq; 2955 mbcset->range_ends[mbcset->nranges++] = end_collseq; 2956 } 2957 2958 /* Build the table for single byte characters. */ 2959 for (ch = 0; ch < SBC_MAX; ch++) 2960 { 2961 uint32_t ch_collseq; 2962 /* 2963 if (MB_CUR_MAX == 1) 2964 */ 2965 if (nrules == 0) 2966 ch_collseq = collseqmb[ch]; 2967 else 2968 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 2969 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2970 bitset_set (sbcset, ch); 2971 } 2972 return REG_NOERROR; 2973 } 2974 2975 /* Local function for parse_bracket_exp used in _LIBC environement. 2976 Build the collating element which is represented by NAME. 2977 The result are written to MBCSET and SBCSET. 2978 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2979 pointer argument sinse we may update it. */ 2980 2981 auto inline reg_errcode_t 2982 __attribute ((always_inline)) 2983 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 2984 re_charset_t *mbcset; 2985 Idx *coll_sym_alloc; 2986 bitset_t sbcset; 2987 const unsigned char *name; 2988 { 2989 int32_t elem, idx; 2990 size_t name_len = strlen ((const char *) name); 2991 if (nrules != 0) 2992 { 2993 elem = seek_collating_symbol_entry (name, name_len); 2994 if (symb_table[2 * elem] != 0) 2995 { 2996 /* We found the entry. */ 2997 idx = symb_table[2 * elem + 1]; 2998 /* Skip the name of collating element name. */ 2999 idx += 1 + extra[idx]; 3000 } 3001 else if (symb_table[2 * elem] == 0 && name_len == 1) 3002 { 3003 /* No valid character, treat it as a normal 3004 character. */ 3005 bitset_set (sbcset, name[0]); 3006 return REG_NOERROR; 3007 } 3008 else 3009 return REG_ECOLLATE; 3010 3011 /* Got valid collation sequence, add it as a new entry. */ 3012 /* Check the space of the arrays. */ 3013 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 3014 { 3015 /* Not enough, realloc it. */ 3016 /* +1 in case of mbcset->ncoll_syms is 0. */ 3017 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 3018 /* Use realloc since mbcset->coll_syms is NULL 3019 if *alloc == 0. */ 3020 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 3021 new_coll_sym_alloc); 3022 if (BE (new_coll_syms == NULL, 0)) 3023 return REG_ESPACE; 3024 mbcset->coll_syms = new_coll_syms; 3025 *coll_sym_alloc = new_coll_sym_alloc; 3026 } 3027 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3028 return REG_NOERROR; 3029 } 3030 else 3031 { 3032 if (BE (name_len != 1, 0)) 3033 return REG_ECOLLATE; 3034 else 3035 { 3036 bitset_set (sbcset, name[0]); 3037 return REG_NOERROR; 3038 } 3039 } 3040 } 3041 #endif 3042 3043 re_token_t br_token; 3044 re_bitset_ptr_t sbcset; 3045 #ifdef RE_ENABLE_I18N 3046 re_charset_t *mbcset; 3047 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3048 Idx equiv_class_alloc = 0, char_class_alloc = 0; 3049 #endif /* not RE_ENABLE_I18N */ 3050 bool non_match = false; 3051 bin_tree_t *work_tree; 3052 int token_len; 3053 bool first_round = true; 3054 #ifdef _LIBC 3055 collseqmb = (const unsigned char *) 3056 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3057 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3058 if (nrules) 3059 { 3060 /* 3061 if (MB_CUR_MAX > 1) 3062 */ 3063 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3064 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3065 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3066 _NL_COLLATE_SYMB_TABLEMB); 3067 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3068 _NL_COLLATE_SYMB_EXTRAMB); 3069 } 3070 #endif 3071 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3072 #ifdef RE_ENABLE_I18N 3073 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3074 #endif /* RE_ENABLE_I18N */ 3075 #ifdef RE_ENABLE_I18N 3076 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3077 #else 3078 if (BE (sbcset == NULL, 0)) 3079 #endif /* RE_ENABLE_I18N */ 3080 { 3081 *err = REG_ESPACE; 3082 return NULL; 3083 } 3084 3085 token_len = peek_token_bracket (token, regexp, syntax); 3086 if (BE (token->type == END_OF_RE, 0)) 3087 { 3088 *err = REG_BADPAT; 3089 goto parse_bracket_exp_free_return; 3090 } 3091 if (token->type == OP_NON_MATCH_LIST) 3092 { 3093 #ifdef RE_ENABLE_I18N 3094 mbcset->non_match = 1; 3095 #endif /* not RE_ENABLE_I18N */ 3096 non_match = true; 3097 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3098 bitset_set (sbcset, '\n'); 3099 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3100 token_len = peek_token_bracket (token, regexp, syntax); 3101 if (BE (token->type == END_OF_RE, 0)) 3102 { 3103 *err = REG_BADPAT; 3104 goto parse_bracket_exp_free_return; 3105 } 3106 } 3107 3108 /* We treat the first ']' as a normal character. */ 3109 if (token->type == OP_CLOSE_BRACKET) 3110 token->type = CHARACTER; 3111 3112 while (1) 3113 { 3114 bracket_elem_t start_elem, end_elem; 3115 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3116 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3117 reg_errcode_t ret; 3118 int token_len2 = 0; 3119 bool is_range_exp = false; 3120 re_token_t token2; 3121 3122 start_elem.opr.name = start_name_buf; 3123 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3124 syntax, first_round); 3125 if (BE (ret != REG_NOERROR, 0)) 3126 { 3127 *err = ret; 3128 goto parse_bracket_exp_free_return; 3129 } 3130 first_round = false; 3131 3132 /* Get information about the next token. We need it in any case. */ 3133 token_len = peek_token_bracket (token, regexp, syntax); 3134 3135 /* Do not check for ranges if we know they are not allowed. */ 3136 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3137 { 3138 if (BE (token->type == END_OF_RE, 0)) 3139 { 3140 *err = REG_EBRACK; 3141 goto parse_bracket_exp_free_return; 3142 } 3143 if (token->type == OP_CHARSET_RANGE) 3144 { 3145 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3146 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3147 if (BE (token2.type == END_OF_RE, 0)) 3148 { 3149 *err = REG_EBRACK; 3150 goto parse_bracket_exp_free_return; 3151 } 3152 if (token2.type == OP_CLOSE_BRACKET) 3153 { 3154 /* We treat the last '-' as a normal character. */ 3155 re_string_skip_bytes (regexp, -token_len); 3156 token->type = CHARACTER; 3157 } 3158 else 3159 is_range_exp = true; 3160 } 3161 } 3162 3163 if (is_range_exp == true) 3164 { 3165 end_elem.opr.name = end_name_buf; 3166 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3167 dfa, syntax, true); 3168 if (BE (ret != REG_NOERROR, 0)) 3169 { 3170 *err = ret; 3171 goto parse_bracket_exp_free_return; 3172 } 3173 3174 token_len = peek_token_bracket (token, regexp, syntax); 3175 3176 #ifdef _LIBC 3177 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3178 &start_elem, &end_elem); 3179 #else 3180 # ifdef RE_ENABLE_I18N 3181 *err = build_range_exp (syntax, sbcset, 3182 dfa->mb_cur_max > 1 ? mbcset : NULL, 3183 &range_alloc, &start_elem, &end_elem); 3184 # else 3185 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem); 3186 # endif 3187 #endif /* RE_ENABLE_I18N */ 3188 if (BE (*err != REG_NOERROR, 0)) 3189 goto parse_bracket_exp_free_return; 3190 } 3191 else 3192 { 3193 switch (start_elem.type) 3194 { 3195 case SB_CHAR: 3196 bitset_set (sbcset, start_elem.opr.ch); 3197 break; 3198 #ifdef RE_ENABLE_I18N 3199 case MB_CHAR: 3200 /* Check whether the array has enough space. */ 3201 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3202 { 3203 wchar_t *new_mbchars; 3204 /* Not enough, realloc it. */ 3205 /* +1 in case of mbcset->nmbchars is 0. */ 3206 mbchar_alloc = 2 * mbcset->nmbchars + 1; 3207 /* Use realloc since array is NULL if *alloc == 0. */ 3208 new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3209 mbchar_alloc); 3210 if (BE (new_mbchars == NULL, 0)) 3211 goto parse_bracket_exp_espace; 3212 mbcset->mbchars = new_mbchars; 3213 } 3214 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3215 break; 3216 #endif /* RE_ENABLE_I18N */ 3217 case EQUIV_CLASS: 3218 *err = build_equiv_class (sbcset, 3219 #ifdef RE_ENABLE_I18N 3220 mbcset, &equiv_class_alloc, 3221 #endif /* RE_ENABLE_I18N */ 3222 start_elem.opr.name); 3223 if (BE (*err != REG_NOERROR, 0)) 3224 goto parse_bracket_exp_free_return; 3225 break; 3226 case COLL_SYM: 3227 *err = build_collating_symbol (sbcset, 3228 #ifdef RE_ENABLE_I18N 3229 mbcset, &coll_sym_alloc, 3230 #endif /* RE_ENABLE_I18N */ 3231 start_elem.opr.name); 3232 if (BE (*err != REG_NOERROR, 0)) 3233 goto parse_bracket_exp_free_return; 3234 break; 3235 case CHAR_CLASS: 3236 *err = build_charclass (regexp->trans, sbcset, 3237 #ifdef RE_ENABLE_I18N 3238 mbcset, &char_class_alloc, 3239 #endif /* RE_ENABLE_I18N */ 3240 start_elem.opr.name, syntax); 3241 if (BE (*err != REG_NOERROR, 0)) 3242 goto parse_bracket_exp_free_return; 3243 break; 3244 default: 3245 assert (0); 3246 break; 3247 } 3248 } 3249 if (BE (token->type == END_OF_RE, 0)) 3250 { 3251 *err = REG_EBRACK; 3252 goto parse_bracket_exp_free_return; 3253 } 3254 if (token->type == OP_CLOSE_BRACKET) 3255 break; 3256 } 3257 3258 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3259 3260 /* If it is non-matching list. */ 3261 if (non_match) 3262 bitset_not (sbcset); 3263 3264 #ifdef RE_ENABLE_I18N 3265 /* Ensure only single byte characters are set. */ 3266 if (dfa->mb_cur_max > 1) 3267 bitset_mask (sbcset, dfa->sb_char); 3268 3269 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3270 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3271 || mbcset->non_match))) 3272 { 3273 bin_tree_t *mbc_tree; 3274 int sbc_idx; 3275 /* Build a tree for complex bracket. */ 3276 dfa->has_mb_node = 1; 3277 br_token.type = COMPLEX_BRACKET; 3278 br_token.opr.mbcset = mbcset; 3279 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3280 if (BE (mbc_tree == NULL, 0)) 3281 goto parse_bracket_exp_espace; 3282 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3283 if (sbcset[sbc_idx]) 3284 break; 3285 /* If there are no bits set in sbcset, there is no point 3286 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3287 if (sbc_idx < BITSET_WORDS) 3288 { 3289 /* Build a tree for simple bracket. */ 3290 br_token.type = SIMPLE_BRACKET; 3291 br_token.opr.sbcset = sbcset; 3292 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3293 if (BE (work_tree == NULL, 0)) 3294 goto parse_bracket_exp_espace; 3295 3296 /* Then join them by ALT node. */ 3297 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3298 if (BE (work_tree == NULL, 0)) 3299 goto parse_bracket_exp_espace; 3300 } 3301 else 3302 { 3303 re_free (sbcset); 3304 work_tree = mbc_tree; 3305 } 3306 } 3307 else 3308 #endif /* not RE_ENABLE_I18N */ 3309 { 3310 #ifdef RE_ENABLE_I18N 3311 free_charset (mbcset); 3312 #endif 3313 /* Build a tree for simple bracket. */ 3314 br_token.type = SIMPLE_BRACKET; 3315 br_token.opr.sbcset = sbcset; 3316 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3317 if (BE (work_tree == NULL, 0)) 3318 goto parse_bracket_exp_espace; 3319 } 3320 return work_tree; 3321 3322 parse_bracket_exp_espace: 3323 *err = REG_ESPACE; 3324 parse_bracket_exp_free_return: 3325 re_free (sbcset); 3326 #ifdef RE_ENABLE_I18N 3327 free_charset (mbcset); 3328 #endif /* RE_ENABLE_I18N */ 3329 return NULL; 3330 } 3331 3332 /* Parse an element in the bracket expression. */ 3333 3334 static reg_errcode_t 3335 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3336 re_token_t *token, int token_len, 3337 re_dfa_t *dfa _UNUSED_PARAMETER_, 3338 reg_syntax_t syntax, bool accept_hyphen) 3339 { 3340 #ifdef RE_ENABLE_I18N 3341 int cur_char_size; 3342 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3343 if (cur_char_size > 1) 3344 { 3345 elem->type = MB_CHAR; 3346 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3347 re_string_skip_bytes (regexp, cur_char_size); 3348 return REG_NOERROR; 3349 } 3350 #endif /* RE_ENABLE_I18N */ 3351 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3352 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3353 || token->type == OP_OPEN_EQUIV_CLASS) 3354 return parse_bracket_symbol (elem, regexp, token); 3355 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3356 { 3357 /* A '-' must only appear as anything but a range indicator before 3358 the closing bracket. Everything else is an error. */ 3359 re_token_t token2; 3360 (void) peek_token_bracket (&token2, regexp, syntax); 3361 if (token2.type != OP_CLOSE_BRACKET) 3362 /* The actual error value is not standardized since this whole 3363 case is undefined. But ERANGE makes good sense. */ 3364 return REG_ERANGE; 3365 } 3366 elem->type = SB_CHAR; 3367 elem->opr.ch = token->opr.c; 3368 return REG_NOERROR; 3369 } 3370 3371 /* Parse a bracket symbol in the bracket expression. Bracket symbols are 3372 such as [:<character_class>:], [.<collating_element>.], and 3373 [=<equivalent_class>=]. */ 3374 3375 static reg_errcode_t 3376 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3377 re_token_t *token) 3378 { 3379 unsigned char ch, delim = token->opr.c; 3380 int i = 0; 3381 if (re_string_eoi(regexp)) 3382 return REG_EBRACK; 3383 for (;; ++i) 3384 { 3385 if (i >= BRACKET_NAME_BUF_SIZE) 3386 return REG_EBRACK; 3387 if (token->type == OP_OPEN_CHAR_CLASS) 3388 ch = re_string_fetch_byte_case (regexp); 3389 else 3390 ch = re_string_fetch_byte (regexp); 3391 if (re_string_eoi(regexp)) 3392 return REG_EBRACK; 3393 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3394 break; 3395 elem->opr.name[i] = ch; 3396 } 3397 re_string_skip_bytes (regexp, 1); 3398 elem->opr.name[i] = '\0'; 3399 switch (token->type) 3400 { 3401 case OP_OPEN_COLL_ELEM: 3402 elem->type = COLL_SYM; 3403 break; 3404 case OP_OPEN_EQUIV_CLASS: 3405 elem->type = EQUIV_CLASS; 3406 break; 3407 case OP_OPEN_CHAR_CLASS: 3408 elem->type = CHAR_CLASS; 3409 break; 3410 default: 3411 break; 3412 } 3413 return REG_NOERROR; 3414 } 3415 3416 /* Helper function for parse_bracket_exp. 3417 Build the equivalence class which is represented by NAME. 3418 The result are written to MBCSET and SBCSET. 3419 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3420 is a pointer argument sinse we may update it. */ 3421 3422 static reg_errcode_t 3423 #ifdef RE_ENABLE_I18N 3424 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset _UNUSED_PARAMETER_, 3425 Idx *equiv_class_alloc _UNUSED_PARAMETER_, 3426 const unsigned char *name) 3427 #else /* not RE_ENABLE_I18N */ 3428 build_equiv_class (bitset_t sbcset, const unsigned char *name) 3429 #endif /* not RE_ENABLE_I18N */ 3430 { 3431 #ifdef _LIBC 3432 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3433 if (nrules != 0) 3434 { 3435 const int32_t *table, *indirect; 3436 const unsigned char *weights, *extra, *cp; 3437 unsigned char char_buf[2]; 3438 int32_t idx1, idx2; 3439 unsigned int ch; 3440 size_t len; 3441 /* This #include defines a local function! */ 3442 # include <locale/weight.h> 3443 /* Calculate the index for equivalence class. */ 3444 cp = name; 3445 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3446 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3447 _NL_COLLATE_WEIGHTMB); 3448 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3449 _NL_COLLATE_EXTRAMB); 3450 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3451 _NL_COLLATE_INDIRECTMB); 3452 idx1 = findidx (&cp); 3453 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) 3454 /* This isn't a valid character. */ 3455 return REG_ECOLLATE; 3456 3457 /* Build single byte matcing table for this equivalence class. */ 3458 char_buf[1] = (unsigned char) '\0'; 3459 len = weights[idx1 & 0xffffff]; 3460 for (ch = 0; ch < SBC_MAX; ++ch) 3461 { 3462 char_buf[0] = ch; 3463 cp = char_buf; 3464 idx2 = findidx (&cp); 3465 /* 3466 idx2 = table[ch]; 3467 */ 3468 if (idx2 == 0) 3469 /* This isn't a valid character. */ 3470 continue; 3471 /* Compare only if the length matches and the collation rule 3472 index is the same. */ 3473 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)) 3474 { 3475 int cnt = 0; 3476 3477 while (cnt <= len && 3478 weights[(idx1 & 0xffffff) + 1 + cnt] 3479 == weights[(idx2 & 0xffffff) + 1 + cnt]) 3480 ++cnt; 3481 3482 if (cnt > len) 3483 bitset_set (sbcset, ch); 3484 } 3485 } 3486 /* Check whether the array has enough space. */ 3487 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3488 { 3489 /* Not enough, realloc it. */ 3490 /* +1 in case of mbcset->nequiv_classes is 0. */ 3491 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3492 /* Use realloc since the array is NULL if *alloc == 0. */ 3493 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3494 int32_t, 3495 new_equiv_class_alloc); 3496 if (BE (new_equiv_classes == NULL, 0)) 3497 return REG_ESPACE; 3498 mbcset->equiv_classes = new_equiv_classes; 3499 *equiv_class_alloc = new_equiv_class_alloc; 3500 } 3501 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3502 } 3503 else 3504 #endif /* _LIBC */ 3505 { 3506 if (BE (strlen ((const char *) name) != 1, 0)) 3507 return REG_ECOLLATE; 3508 bitset_set (sbcset, *name); 3509 } 3510 return REG_NOERROR; 3511 } 3512 3513 /* Helper function for parse_bracket_exp. 3514 Build the character class which is represented by NAME. 3515 The result are written to MBCSET and SBCSET. 3516 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3517 is a pointer argument sinse we may update it. */ 3518 3519 static reg_errcode_t 3520 #ifdef RE_ENABLE_I18N 3521 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3522 re_charset_t *mbcset, Idx *char_class_alloc, 3523 const unsigned char *class_name, reg_syntax_t syntax) 3524 #else /* not RE_ENABLE_I18N */ 3525 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3526 const unsigned char *class_name, reg_syntax_t syntax) 3527 #endif /* not RE_ENABLE_I18N */ 3528 { 3529 int i; 3530 const char *name = (const char *) class_name; 3531 3532 /* In case of REG_ICASE "upper" and "lower" match the both of 3533 upper and lower cases. */ 3534 if ((syntax & RE_ICASE) 3535 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3536 name = "alpha"; 3537 3538 #ifdef RE_ENABLE_I18N 3539 /* Check the space of the arrays. */ 3540 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3541 { 3542 /* Not enough, realloc it. */ 3543 /* +1 in case of mbcset->nchar_classes is 0. */ 3544 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3545 /* Use realloc since array is NULL if *alloc == 0. */ 3546 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3547 new_char_class_alloc); 3548 if (BE (new_char_classes == NULL, 0)) 3549 return REG_ESPACE; 3550 mbcset->char_classes = new_char_classes; 3551 *char_class_alloc = new_char_class_alloc; 3552 } 3553 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3554 #endif /* RE_ENABLE_I18N */ 3555 3556 #define BUILD_CHARCLASS_LOOP(ctype_func) \ 3557 do { \ 3558 if (BE (trans != NULL, 0)) \ 3559 { \ 3560 for (i = 0; i < SBC_MAX; ++i) \ 3561 if (ctype_func (i)) \ 3562 bitset_set (sbcset, trans[i]); \ 3563 } \ 3564 else \ 3565 { \ 3566 for (i = 0; i < SBC_MAX; ++i) \ 3567 if (ctype_func (i)) \ 3568 bitset_set (sbcset, i); \ 3569 } \ 3570 } while (0) 3571 3572 if (strcmp (name, "alnum") == 0) 3573 BUILD_CHARCLASS_LOOP (isalnum); 3574 else if (strcmp (name, "cntrl") == 0) 3575 BUILD_CHARCLASS_LOOP (iscntrl); 3576 else if (strcmp (name, "lower") == 0) 3577 BUILD_CHARCLASS_LOOP (islower); 3578 else if (strcmp (name, "space") == 0) 3579 BUILD_CHARCLASS_LOOP (isspace); 3580 else if (strcmp (name, "alpha") == 0) 3581 BUILD_CHARCLASS_LOOP (isalpha); 3582 else if (strcmp (name, "digit") == 0) 3583 BUILD_CHARCLASS_LOOP (isdigit); 3584 else if (strcmp (name, "print") == 0) 3585 BUILD_CHARCLASS_LOOP (isprint); 3586 else if (strcmp (name, "upper") == 0) 3587 BUILD_CHARCLASS_LOOP (isupper); 3588 else if (strcmp (name, "blank") == 0) 3589 BUILD_CHARCLASS_LOOP (isblank); 3590 else if (strcmp (name, "graph") == 0) 3591 BUILD_CHARCLASS_LOOP (isgraph); 3592 else if (strcmp (name, "punct") == 0) 3593 BUILD_CHARCLASS_LOOP (ispunct); 3594 else if (strcmp (name, "xdigit") == 0) 3595 BUILD_CHARCLASS_LOOP (isxdigit); 3596 else 3597 return REG_ECTYPE; 3598 3599 return REG_NOERROR; 3600 } 3601 3602 static bin_tree_t * 3603 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3604 const unsigned char *class_name, 3605 const unsigned char *extra, bool non_match, 3606 reg_errcode_t *err) 3607 { 3608 re_bitset_ptr_t sbcset; 3609 #ifdef RE_ENABLE_I18N 3610 re_charset_t *mbcset; 3611 Idx alloc = 0; 3612 #endif /* not RE_ENABLE_I18N */ 3613 reg_errcode_t ret; 3614 re_token_t br_token; 3615 bin_tree_t *tree; 3616 3617 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3618 #ifdef RE_ENABLE_I18N 3619 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3620 #endif /* RE_ENABLE_I18N */ 3621 3622 #ifdef RE_ENABLE_I18N 3623 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3624 #else /* not RE_ENABLE_I18N */ 3625 if (BE (sbcset == NULL, 0)) 3626 #endif /* not RE_ENABLE_I18N */ 3627 { 3628 *err = REG_ESPACE; 3629 return NULL; 3630 } 3631 3632 if (non_match) 3633 { 3634 #ifdef RE_ENABLE_I18N 3635 mbcset->non_match = 1; 3636 #endif /* not RE_ENABLE_I18N */ 3637 } 3638 3639 /* We don't care the syntax in this case. */ 3640 ret = build_charclass (trans, sbcset, 3641 #ifdef RE_ENABLE_I18N 3642 mbcset, &alloc, 3643 #endif /* RE_ENABLE_I18N */ 3644 class_name, 0); 3645 3646 if (BE (ret != REG_NOERROR, 0)) 3647 { 3648 re_free (sbcset); 3649 #ifdef RE_ENABLE_I18N 3650 free_charset (mbcset); 3651 #endif /* RE_ENABLE_I18N */ 3652 *err = ret; 3653 return NULL; 3654 } 3655 /* \w match '_' also. */ 3656 for (; *extra; extra++) 3657 bitset_set (sbcset, *extra); 3658 3659 /* If it is non-matching list. */ 3660 if (non_match) 3661 bitset_not (sbcset); 3662 3663 #ifdef RE_ENABLE_I18N 3664 /* Ensure only single byte characters are set. */ 3665 if (dfa->mb_cur_max > 1) 3666 bitset_mask (sbcset, dfa->sb_char); 3667 #endif 3668 3669 /* Build a tree for simple bracket. */ 3670 br_token.type = SIMPLE_BRACKET; 3671 br_token.opr.sbcset = sbcset; 3672 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3673 if (BE (tree == NULL, 0)) 3674 goto build_word_op_espace; 3675 3676 #ifdef RE_ENABLE_I18N 3677 if (dfa->mb_cur_max > 1) 3678 { 3679 bin_tree_t *mbc_tree; 3680 /* Build a tree for complex bracket. */ 3681 br_token.type = COMPLEX_BRACKET; 3682 br_token.opr.mbcset = mbcset; 3683 dfa->has_mb_node = 1; 3684 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3685 if (BE (mbc_tree == NULL, 0)) 3686 goto build_word_op_espace; 3687 /* Then join them by ALT node. */ 3688 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3689 if (BE (mbc_tree != NULL, 1)) 3690 return tree; 3691 } 3692 else 3693 { 3694 free_charset (mbcset); 3695 return tree; 3696 } 3697 #else /* not RE_ENABLE_I18N */ 3698 return tree; 3699 #endif /* not RE_ENABLE_I18N */ 3700 3701 build_word_op_espace: 3702 re_free (sbcset); 3703 #ifdef RE_ENABLE_I18N 3704 free_charset (mbcset); 3705 #endif /* RE_ENABLE_I18N */ 3706 *err = REG_ESPACE; 3707 return NULL; 3708 } 3709 3710 /* This is intended for the expressions like "a{1,3}". 3711 Fetch a number from `input', and return the number. 3712 Return REG_MISSING if the number field is empty like "{,1}". 3713 Return REG_ERROR if an error occurred. */ 3714 3715 static Idx 3716 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3717 { 3718 Idx num = REG_MISSING; 3719 unsigned char c; 3720 while (1) 3721 { 3722 fetch_token (token, input, syntax); 3723 c = token->opr.c; 3724 if (BE (token->type == END_OF_RE, 0)) 3725 return REG_ERROR; 3726 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3727 break; 3728 num = ((token->type != CHARACTER || c < '0' || '9' < c 3729 || num == REG_ERROR) 3730 ? REG_ERROR 3731 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0')); 3732 num = (num > RE_DUP_MAX) ? REG_ERROR : num; 3733 } 3734 return num; 3735 } 3736 3737 #ifdef RE_ENABLE_I18N 3738 static void 3739 free_charset (re_charset_t *cset) 3740 { 3741 re_free (cset->mbchars); 3742 # ifdef _LIBC 3743 re_free (cset->coll_syms); 3744 re_free (cset->equiv_classes); 3745 re_free (cset->range_starts); 3746 re_free (cset->range_ends); 3747 # endif 3748 re_free (cset->char_classes); 3749 re_free (cset); 3750 } 3751 #endif /* RE_ENABLE_I18N */ 3752 3753 /* Functions for binary tree operation. */ 3754 3755 /* Create a tree node. */ 3756 3757 static bin_tree_t * 3758 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3759 re_token_type_t type) 3760 { 3761 re_token_t t; 3762 t.type = type; 3763 return create_token_tree (dfa, left, right, &t); 3764 } 3765 3766 static bin_tree_t * 3767 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3768 const re_token_t *token) 3769 { 3770 bin_tree_t *tree; 3771 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3772 { 3773 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3774 3775 if (storage == NULL) 3776 return NULL; 3777 storage->next = dfa->str_tree_storage; 3778 dfa->str_tree_storage = storage; 3779 dfa->str_tree_storage_idx = 0; 3780 } 3781 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3782 3783 tree->parent = NULL; 3784 tree->left = left; 3785 tree->right = right; 3786 tree->token = *token; 3787 tree->token.duplicated = 0; 3788 tree->token.opt_subexp = 0; 3789 tree->first = NULL; 3790 tree->next = NULL; 3791 tree->node_idx = REG_MISSING; 3792 3793 if (left != NULL) 3794 left->parent = tree; 3795 if (right != NULL) 3796 right->parent = tree; 3797 return tree; 3798 } 3799 3800 /* Mark the tree SRC as an optional subexpression. 3801 To be called from preorder or postorder. */ 3802 3803 static reg_errcode_t 3804 mark_opt_subexp (void *extra, bin_tree_t *node) 3805 { 3806 Idx idx = (Idx) (long) extra; 3807 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3808 node->token.opt_subexp = 1; 3809 3810 return REG_NOERROR; 3811 } 3812 3813 /* Free the allocated memory inside NODE. */ 3814 3815 static void 3816 free_token (re_token_t *node) 3817 { 3818 #ifdef RE_ENABLE_I18N 3819 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3820 free_charset (node->opr.mbcset); 3821 else 3822 #endif /* RE_ENABLE_I18N */ 3823 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3824 re_free (node->opr.sbcset); 3825 } 3826 3827 /* Worker function for tree walking. Free the allocated memory inside NODE 3828 and its children. */ 3829 3830 static reg_errcode_t 3831 free_tree (void *extra _UNUSED_PARAMETER_, bin_tree_t *node) 3832 { 3833 free_token (&node->token); 3834 return REG_NOERROR; 3835 } 3836 3837 3838 /* Duplicate the node SRC, and return new node. This is a preorder 3839 visit similar to the one implemented by the generic visitor, but 3840 we need more infrastructure to maintain two parallel trees --- so, 3841 it's easier to duplicate. */ 3842 3843 static bin_tree_t * 3844 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3845 { 3846 const bin_tree_t *node; 3847 bin_tree_t *dup_root; 3848 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3849 3850 for (node = root; ; ) 3851 { 3852 /* Create a new tree and link it back to the current parent. */ 3853 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3854 if (*p_new == NULL) 3855 return NULL; 3856 (*p_new)->parent = dup_node; 3857 (*p_new)->token.duplicated = 1; 3858 dup_node = *p_new; 3859 3860 /* Go to the left node, or up and to the right. */ 3861 if (node->left) 3862 { 3863 node = node->left; 3864 p_new = &dup_node->left; 3865 } 3866 else 3867 { 3868 const bin_tree_t *prev = NULL; 3869 while (node->right == prev || node->right == NULL) 3870 { 3871 prev = node; 3872 node = node->parent; 3873 dup_node = dup_node->parent; 3874 if (!node) 3875 return dup_root; 3876 } 3877 node = node->right; 3878 p_new = &dup_node->right; 3879 } 3880 } 3881 } 3882