1 /* Extended regular expression matching and search library. 2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License along 17 with this program; if not, write to the Free Software Foundation, 18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 19 20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 21 Idx length, reg_syntax_t syntax); 22 static void re_compile_fastmap_iter (regex_t *bufp, 23 const re_dfastate_t *init_state, 24 char *fastmap); 25 static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len); 26 #ifdef RE_ENABLE_I18N 27 static void free_charset (re_charset_t *cset); 28 #endif /* RE_ENABLE_I18N */ 29 static void free_workarea_compile (regex_t *preg); 30 static reg_errcode_t create_initial_state (re_dfa_t *dfa); 31 #ifdef RE_ENABLE_I18N 32 static void optimize_utf8 (re_dfa_t *dfa); 33 #endif 34 static reg_errcode_t analyze (regex_t *preg); 35 static reg_errcode_t preorder (bin_tree_t *root, 36 reg_errcode_t (fn (void *, bin_tree_t *)), 37 void *extra); 38 static reg_errcode_t postorder (bin_tree_t *root, 39 reg_errcode_t (fn (void *, bin_tree_t *)), 40 void *extra); 41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 44 bin_tree_t *node); 45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint); 49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 50 unsigned int constraint); 51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 53 Idx node, bool root); 54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 55 static Idx fetch_number (re_string_t *input, re_token_t *token, 56 reg_syntax_t syntax); 57 static int peek_token (re_token_t *token, re_string_t *input, 58 reg_syntax_t syntax); 59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 60 reg_syntax_t syntax, reg_errcode_t *err); 61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 62 re_token_t *token, reg_syntax_t syntax, 63 Idx nest, reg_errcode_t *err); 64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 65 re_token_t *token, reg_syntax_t syntax, 66 Idx nest, reg_errcode_t *err); 67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 68 re_token_t *token, reg_syntax_t syntax, 69 Idx nest, reg_errcode_t *err); 70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 71 re_token_t *token, reg_syntax_t syntax, 72 Idx nest, reg_errcode_t *err); 73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 74 re_dfa_t *dfa, re_token_t *token, 75 reg_syntax_t syntax, reg_errcode_t *err); 76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 77 re_token_t *token, reg_syntax_t syntax, 78 reg_errcode_t *err); 79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 80 re_string_t *regexp, 81 re_token_t *token, int token_len, 82 re_dfa_t *dfa, 83 reg_syntax_t syntax, 84 bool accept_hyphen); 85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 86 re_string_t *regexp, 87 re_token_t *token); 88 #ifdef RE_ENABLE_I18N 89 static reg_errcode_t build_equiv_class (bitset sbcset, 90 re_charset_t *mbcset, 91 Idx *equiv_class_alloc, 92 const unsigned char *name); 93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans, 94 bitset sbcset, 95 re_charset_t *mbcset, 96 Idx *char_class_alloc, 97 const unsigned char *class_name, 98 reg_syntax_t syntax); 99 #else /* not RE_ENABLE_I18N */ 100 static reg_errcode_t build_equiv_class (bitset sbcset, 101 const unsigned char *name); 102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans, 103 bitset sbcset, 104 const unsigned char *class_name, 105 reg_syntax_t syntax); 106 #endif /* not RE_ENABLE_I18N */ 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 108 unsigned REG_TRANSLATE_TYPE trans, 109 const unsigned char *class_name, 110 const unsigned char *extra, 111 bool non_match, reg_errcode_t *err); 112 static bin_tree_t *create_tree (re_dfa_t *dfa, 113 bin_tree_t *left, bin_tree_t *right, 114 re_token_type_t type); 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa, 116 bin_tree_t *left, bin_tree_t *right, 117 const re_token_t *token); 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 119 static void free_token (re_token_t *node); 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 122 123 /* This table gives an error message for each of the error codes listed 124 in regex.h. Obviously the order here has to be same as there. 125 POSIX doesn't require that we do anything for REG_NOERROR, 126 but why not be nice? */ 127 128 const char __re_error_msgid[] attribute_hidden = 129 { 130 #define REG_NOERROR_IDX 0 131 gettext_noop ("Success") /* REG_NOERROR */ 132 "\0" 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 134 gettext_noop ("No match") /* REG_NOMATCH */ 135 "\0" 136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 138 "\0" 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 141 "\0" 142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 144 "\0" 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 147 "\0" 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 150 "\0" 151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 153 "\0" 154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 156 "\0" 157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 159 "\0" 160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 162 "\0" 163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 164 gettext_noop ("Invalid range end") /* REG_ERANGE */ 165 "\0" 166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 167 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 168 "\0" 169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 171 "\0" 172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 173 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 174 "\0" 175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 176 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 177 "\0" 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 180 }; 181 182 const size_t __re_error_msgid_idx[] attribute_hidden = 183 { 184 REG_NOERROR_IDX, 185 REG_NOMATCH_IDX, 186 REG_BADPAT_IDX, 187 REG_ECOLLATE_IDX, 188 REG_ECTYPE_IDX, 189 REG_EESCAPE_IDX, 190 REG_ESUBREG_IDX, 191 REG_EBRACK_IDX, 192 REG_EPAREN_IDX, 193 REG_EBRACE_IDX, 194 REG_BADBR_IDX, 195 REG_ERANGE_IDX, 196 REG_ESPACE_IDX, 197 REG_BADRPT_IDX, 198 REG_EEND_IDX, 199 REG_ESIZE_IDX, 200 REG_ERPAREN_IDX 201 }; 202 203 /* Entry points for GNU code. */ 204 205 /* re_compile_pattern is the GNU regular expression compiler: it 206 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 207 Returns 0 if the pattern was valid, otherwise an error string. 208 209 Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields 210 are set in BUFP on entry. */ 211 212 const char * 213 re_compile_pattern (const char *pattern, size_t length, 214 struct re_pattern_buffer *bufp) 215 { 216 reg_errcode_t ret; 217 218 /* And GNU code determines whether or not to get register information 219 by passing null for the REGS argument to re_match, etc., not by 220 setting re_no_sub, unless REG_NO_SUB is set. */ 221 bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB); 222 223 /* Match anchors at newline. */ 224 bufp->re_newline_anchor = 1; 225 226 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 227 228 if (!ret) 229 return NULL; 230 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 231 } 232 #ifdef _LIBC 233 weak_alias (__re_compile_pattern, re_compile_pattern) 234 #endif 235 236 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 237 also be assigned to arbitrarily: each pattern buffer stores its own 238 syntax, so it can be changed between regex compilations. */ 239 /* This has no initializer because initialized variables in Emacs 240 become read-only after dumping. */ 241 reg_syntax_t re_syntax_options; 242 243 244 /* Specify the precise syntax of regexps for compilation. This provides 245 for compatibility for various utilities which historically have 246 different, incompatible syntaxes. 247 248 The argument SYNTAX is a bit mask comprised of the various bits 249 defined in regex.h. We return the old syntax. */ 250 251 reg_syntax_t 252 re_set_syntax (reg_syntax_t syntax) 253 { 254 reg_syntax_t ret = re_syntax_options; 255 256 re_syntax_options = syntax; 257 return ret; 258 } 259 #ifdef _LIBC 260 weak_alias (__re_set_syntax, re_set_syntax) 261 #endif 262 263 int 264 re_compile_fastmap (struct re_pattern_buffer *bufp) 265 { 266 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer; 267 char *fastmap = bufp->re_fastmap; 268 269 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 271 if (dfa->init_state != dfa->init_state_word) 272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 273 if (dfa->init_state != dfa->init_state_nl) 274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 275 if (dfa->init_state != dfa->init_state_begbuf) 276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 277 bufp->re_fastmap_accurate = 1; 278 return 0; 279 } 280 #ifdef _LIBC 281 weak_alias (__re_compile_fastmap, re_compile_fastmap) 282 #endif 283 284 static inline void 285 __attribute ((always_inline)) 286 re_set_fastmap (char *fastmap, bool icase, int ch) 287 { 288 fastmap[ch] = 1; 289 if (icase) 290 fastmap[tolower (ch)] = 1; 291 } 292 293 /* Helper function for re_compile_fastmap. 294 Compile fastmap for the initial_state INIT_STATE. */ 295 296 static void 297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 298 char *fastmap) 299 { 300 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer; 301 Idx node_cnt; 302 bool icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE)); 303 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 304 { 305 Idx node = init_state->nodes.elems[node_cnt]; 306 re_token_type_t type = dfa->nodes[node].type; 307 308 if (type == CHARACTER) 309 { 310 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 311 #ifdef RE_ENABLE_I18N 312 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1) 313 { 314 unsigned char buf[MB_LEN_MAX]; 315 unsigned char *p; 316 wchar_t wc; 317 mbstate_t state; 318 319 p = buf; 320 *p++ = dfa->nodes[node].opr.c; 321 while (++node < dfa->nodes_len 322 && dfa->nodes[node].type == CHARACTER 323 && dfa->nodes[node].mb_partial) 324 *p++ = dfa->nodes[node].opr.c; 325 memset (&state, 0, sizeof (state)); 326 if (mbrtowc (&wc, (const char *) buf, p - buf, 327 &state) == p - buf 328 && (__wcrtomb ((char *) buf, towlower (wc), &state) 329 != (size_t) -1)) 330 re_set_fastmap (fastmap, false, buf[0]); 331 } 332 #endif 333 } 334 else if (type == SIMPLE_BRACKET) 335 { 336 int i, j, ch; 337 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 338 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 339 if (dfa->nodes[node].opr.sbcset[i] & ((bitset_word) 1 << j)) 340 re_set_fastmap (fastmap, icase, ch); 341 } 342 #ifdef RE_ENABLE_I18N 343 else if (type == COMPLEX_BRACKET) 344 { 345 Idx i; 346 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 347 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes 348 || cset->nranges || cset->nchar_classes) 349 { 350 # ifdef _LIBC 351 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0) 352 { 353 /* In this case we want to catch the bytes which are 354 the first byte of any collation elements. 355 e.g. In da_DK, we want to catch 'a' since "aa" 356 is a valid collation element, and don't catch 357 'b' since 'b' is the only collation element 358 which starts from 'b'. */ 359 const int32_t *table = (const int32_t *) 360 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 361 for (i = 0; i < SBC_MAX; ++i) 362 if (table[i] < 0) 363 re_set_fastmap (fastmap, icase, i); 364 } 365 # else 366 if (dfa->mb_cur_max > 1) 367 for (i = 0; i < SBC_MAX; ++i) 368 if (__btowc (i) == WEOF) 369 re_set_fastmap (fastmap, icase, i); 370 # endif /* not _LIBC */ 371 } 372 for (i = 0; i < cset->nmbchars; ++i) 373 { 374 char buf[256]; 375 mbstate_t state; 376 memset (&state, '\0', sizeof (state)); 377 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 378 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 379 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1) 380 { 381 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 382 != (size_t) -1) 383 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 384 } 385 } 386 } 387 #endif /* RE_ENABLE_I18N */ 388 else if (type == OP_PERIOD 389 #ifdef RE_ENABLE_I18N 390 || type == OP_UTF8_PERIOD 391 #endif /* RE_ENABLE_I18N */ 392 || type == END_OF_RE) 393 { 394 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 395 if (type == END_OF_RE) 396 bufp->re_can_be_null = 1; 397 return; 398 } 399 } 400 } 401 402 /* Entry point for POSIX code. */ 403 /* regcomp takes a regular expression as a string and compiles it. 404 405 PREG is a regex_t *. We do not expect any fields to be initialized, 406 since POSIX says we shouldn't. Thus, we set 407 408 `re_buffer' to the compiled pattern; 409 `re_used' to the length of the compiled pattern; 410 `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the 411 REG_EXTENDED bit in CFLAGS is set; otherwise, to 412 REG_SYNTAX_POSIX_BASIC; 413 `re_newline_anchor' to REG_NEWLINE being set in CFLAGS; 414 `re_fastmap' to an allocated space for the fastmap; 415 `re_fastmap_accurate' to zero; 416 `re_nsub' to the number of subexpressions in PATTERN. 417 418 PATTERN is the address of the pattern string. 419 420 CFLAGS is a series of bits which affect compilation. 421 422 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 423 use POSIX basic syntax. 424 425 If REG_NEWLINE is set, then . and [^...] don't match newline. 426 Also, regexec will try a match beginning after every newline. 427 428 If REG_ICASE is set, then we considers upper- and lowercase 429 versions of letters to be equivalent when matching. 430 431 If REG_NOSUB is set, then when PREG is passed to regexec, that 432 routine will report only success or failure, and nothing about the 433 registers. 434 435 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 436 the return codes and their meanings.) */ 437 438 int 439 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags) 440 { 441 reg_errcode_t ret; 442 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED 443 : REG_SYNTAX_POSIX_BASIC); 444 445 preg->re_buffer = NULL; 446 preg->re_allocated = 0; 447 preg->re_used = 0; 448 449 /* Try to allocate space for the fastmap. */ 450 preg->re_fastmap = re_malloc (char, SBC_MAX); 451 if (BE (preg->re_fastmap == NULL, 0)) 452 return REG_ESPACE; 453 454 syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0; 455 456 /* If REG_NEWLINE is set, newlines are treated differently. */ 457 if (cflags & REG_NEWLINE) 458 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 459 syntax &= ~REG_DOT_NEWLINE; 460 syntax |= REG_HAT_LISTS_NOT_NEWLINE; 461 /* It also changes the matching behavior. */ 462 preg->re_newline_anchor = 1; 463 } 464 else 465 preg->re_newline_anchor = 0; 466 preg->re_no_sub = !!(cflags & REG_NOSUB); 467 preg->re_translate = NULL; 468 469 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 470 471 /* POSIX doesn't distinguish between an unmatched open-group and an 472 unmatched close-group: both are REG_EPAREN. */ 473 if (ret == REG_ERPAREN) 474 ret = REG_EPAREN; 475 476 /* We have already checked preg->re_fastmap != NULL. */ 477 if (BE (ret == REG_NOERROR, 1)) 478 /* Compute the fastmap now, since regexec cannot modify the pattern 479 buffer. This function never fails in this implementation. */ 480 (void) re_compile_fastmap (preg); 481 else 482 { 483 /* Some error occurred while compiling the expression. */ 484 re_free (preg->re_fastmap); 485 preg->re_fastmap = NULL; 486 } 487 488 return (int) ret; 489 } 490 #ifdef _LIBC 491 weak_alias (__regcomp, regcomp) 492 #endif 493 494 /* Returns a message corresponding to an error code, ERRCODE, returned 495 from either regcomp or regexec. We don't use PREG here. */ 496 497 size_t 498 regerror (int errcode, const regex_t *__restrict preg, 499 char *__restrict errbuf, size_t errbuf_size) 500 { 501 const char *msg; 502 size_t msg_size; 503 504 if (BE (errcode < 0 505 || errcode >= (int) (sizeof (__re_error_msgid_idx) 506 / sizeof (__re_error_msgid_idx[0])), 0)) 507 /* Only error codes returned by the rest of the code should be passed 508 to this routine. If we are given anything else, or if other regex 509 code generates an invalid error code, then the program has a bug. 510 Dump core so we can fix it. */ 511 abort (); 512 513 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 514 515 msg_size = strlen (msg) + 1; /* Includes the null. */ 516 517 if (BE (errbuf_size != 0, 1)) 518 { 519 if (BE (msg_size > errbuf_size, 0)) 520 { 521 #if defined HAVE_MEMPCPY || defined _LIBC 522 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; 523 #else 524 memcpy (errbuf, msg, errbuf_size - 1); 525 errbuf[errbuf_size - 1] = 0; 526 #endif 527 } 528 else 529 memcpy (errbuf, msg, msg_size); 530 } 531 532 return msg_size; 533 } 534 #ifdef _LIBC 535 weak_alias (__regerror, regerror) 536 #endif 537 538 539 #ifdef RE_ENABLE_I18N 540 /* This static array is used for the map to single-byte characters when 541 UTF-8 is used. Otherwise we would allocate memory just to initialize 542 it the same all the time. UTF-8 is the preferred encoding so this is 543 a worthwhile optimization. */ 544 static const bitset utf8_sb_map = 545 { 546 /* Set the first 128 bits. */ 547 # if 2 < BITSET_WORDS 548 BITSET_WORD_MAX, 549 # endif 550 # if 4 < BITSET_WORDS 551 BITSET_WORD_MAX, 552 # endif 553 # if 6 < BITSET_WORDS 554 BITSET_WORD_MAX, 555 # endif 556 # if 8 < BITSET_WORDS 557 # error "Invalid BITSET_WORDS" 558 # endif 559 (BITSET_WORD_MAX 560 >> (SBC_MAX % BITSET_WORD_BITS == 0 561 ? 0 562 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) 563 }; 564 #endif 565 566 567 static void 568 free_dfa_content (re_dfa_t *dfa) 569 { 570 Idx i, j; 571 572 if (dfa->nodes) 573 for (i = 0; i < dfa->nodes_len; ++i) 574 free_token (dfa->nodes + i); 575 re_free (dfa->nexts); 576 for (i = 0; i < dfa->nodes_len; ++i) 577 { 578 if (dfa->eclosures != NULL) 579 re_node_set_free (dfa->eclosures + i); 580 if (dfa->inveclosures != NULL) 581 re_node_set_free (dfa->inveclosures + i); 582 if (dfa->edests != NULL) 583 re_node_set_free (dfa->edests + i); 584 } 585 re_free (dfa->edests); 586 re_free (dfa->eclosures); 587 re_free (dfa->inveclosures); 588 re_free (dfa->nodes); 589 590 if (dfa->state_table) 591 for (i = 0; i <= dfa->state_hash_mask; ++i) 592 { 593 struct re_state_table_entry *entry = dfa->state_table + i; 594 for (j = 0; j < entry->num; ++j) 595 { 596 re_dfastate_t *state = entry->array[j]; 597 free_state (state); 598 } 599 re_free (entry->array); 600 } 601 re_free (dfa->state_table); 602 #ifdef RE_ENABLE_I18N 603 if (dfa->sb_char != utf8_sb_map) 604 re_free (dfa->sb_char); 605 #endif 606 re_free (dfa->subexp_map); 607 #ifdef DEBUG 608 re_free (dfa->re_str); 609 #endif 610 611 re_free (dfa); 612 } 613 614 615 /* Free dynamically allocated space used by PREG. */ 616 617 void 618 regfree (regex_t *preg) 619 { 620 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 621 if (BE (dfa != NULL, 1)) 622 free_dfa_content (dfa); 623 preg->re_buffer = NULL; 624 preg->re_allocated = 0; 625 626 re_free (preg->re_fastmap); 627 preg->re_fastmap = NULL; 628 629 re_free (preg->re_translate); 630 preg->re_translate = NULL; 631 } 632 #ifdef _LIBC 633 weak_alias (__regfree, regfree) 634 #endif 635 636 /* Entry points compatible with 4.2 BSD regex library. We don't define 637 them unless specifically requested. */ 638 639 #if defined _REGEX_RE_COMP || defined _LIBC 640 641 /* BSD has one and only one pattern buffer. */ 642 static struct re_pattern_buffer re_comp_buf; 643 644 char * 645 # ifdef _LIBC 646 /* Make these definitions weak in libc, so POSIX programs can redefine 647 these names if they don't use our functions, and still use 648 regcomp/regexec above without link errors. */ 649 weak_function 650 # endif 651 re_comp (const char *s) 652 { 653 reg_errcode_t ret; 654 char *fastmap; 655 656 if (!s) 657 { 658 if (!re_comp_buf.re_buffer) 659 return gettext ("No previous regular expression"); 660 return 0; 661 } 662 663 if (re_comp_buf.re_buffer) 664 { 665 fastmap = re_comp_buf.re_fastmap; 666 re_comp_buf.re_fastmap = NULL; 667 __regfree (&re_comp_buf); 668 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 669 re_comp_buf.re_fastmap = fastmap; 670 } 671 672 if (re_comp_buf.re_fastmap == NULL) 673 { 674 re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX); 675 if (re_comp_buf.re_fastmap == NULL) 676 return (char *) gettext (__re_error_msgid 677 + __re_error_msgid_idx[(int) REG_ESPACE]); 678 } 679 680 /* Since `re_exec' always passes NULL for the `regs' argument, we 681 don't need to initialize the pattern buffer fields which affect it. */ 682 683 /* Match anchors at newlines. */ 684 re_comp_buf.re_newline_anchor = 1; 685 686 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 687 688 if (!ret) 689 return NULL; 690 691 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 692 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 693 } 694 695 #ifdef _LIBC 696 libc_freeres_fn (free_mem) 697 { 698 __regfree (&re_comp_buf); 699 } 700 #endif 701 702 #endif /* _REGEX_RE_COMP */ 703 704 /* Internal entry point. 705 Compile the regular expression PATTERN, whose length is LENGTH. 706 SYNTAX indicate regular expression's syntax. */ 707 708 static reg_errcode_t 709 re_compile_internal (regex_t *preg, const char *pattern, Idx length, 710 reg_syntax_t syntax) 711 { 712 reg_errcode_t err = REG_NOERROR; 713 re_dfa_t *dfa; 714 re_string_t regexp; 715 716 /* Initialize the pattern buffer. */ 717 preg->re_fastmap_accurate = 0; 718 preg->re_syntax = syntax; 719 preg->re_not_bol = preg->re_not_eol = 0; 720 preg->re_used = 0; 721 preg->re_nsub = 0; 722 preg->re_can_be_null = 0; 723 preg->re_regs_allocated = REG_UNALLOCATED; 724 725 /* Initialize the dfa. */ 726 dfa = (re_dfa_t *) preg->re_buffer; 727 if (BE (preg->re_allocated < sizeof (re_dfa_t), 0)) 728 { 729 /* If zero allocated, but buffer is non-null, try to realloc 730 enough space. This loses if buffer's address is bogus, but 731 that is the user's responsibility. If buffer is null this 732 is a simple allocation. */ 733 dfa = re_realloc (preg->re_buffer, re_dfa_t, 1); 734 if (dfa == NULL) 735 return REG_ESPACE; 736 preg->re_allocated = sizeof (re_dfa_t); 737 preg->re_buffer = (unsigned char *) dfa; 738 } 739 preg->re_used = sizeof (re_dfa_t); 740 741 __libc_lock_init (dfa->lock); 742 743 err = init_dfa (dfa, length); 744 if (BE (err != REG_NOERROR, 0)) 745 { 746 free_dfa_content (dfa); 747 preg->re_buffer = NULL; 748 preg->re_allocated = 0; 749 return err; 750 } 751 #ifdef DEBUG 752 dfa->re_str = re_malloc (char, length + 1); 753 strncpy (dfa->re_str, pattern, length + 1); 754 #endif 755 756 err = re_string_construct (®exp, pattern, length, preg->re_translate, 757 syntax & REG_IGNORE_CASE, dfa); 758 if (BE (err != REG_NOERROR, 0)) 759 { 760 re_compile_internal_free_return: 761 free_workarea_compile (preg); 762 re_string_destruct (®exp); 763 free_dfa_content (dfa); 764 preg->re_buffer = NULL; 765 preg->re_allocated = 0; 766 return err; 767 } 768 769 /* Parse the regular expression, and build a structure tree. */ 770 preg->re_nsub = 0; 771 dfa->str_tree = parse (®exp, preg, syntax, &err); 772 if (BE (dfa->str_tree == NULL, 0)) 773 goto re_compile_internal_free_return; 774 775 /* Analyze the tree and create the nfa. */ 776 err = analyze (preg); 777 if (BE (err != REG_NOERROR, 0)) 778 goto re_compile_internal_free_return; 779 780 #ifdef RE_ENABLE_I18N 781 /* If possible, do searching in single byte encoding to speed things up. */ 782 if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL) 783 optimize_utf8 (dfa); 784 #endif 785 786 /* Then create the initial state of the dfa. */ 787 err = create_initial_state (dfa); 788 789 /* Release work areas. */ 790 free_workarea_compile (preg); 791 re_string_destruct (®exp); 792 793 if (BE (err != REG_NOERROR, 0)) 794 { 795 free_dfa_content (dfa); 796 preg->re_buffer = NULL; 797 preg->re_allocated = 0; 798 } 799 800 return err; 801 } 802 803 /* Initialize DFA. We use the length of the regular expression PAT_LEN 804 as the initial length of some arrays. */ 805 806 static reg_errcode_t 807 init_dfa (re_dfa_t *dfa, Idx pat_len) 808 { 809 __re_size_t table_size; 810 #ifndef _LIBC 811 char *codeset_name; 812 #endif 813 814 memset (dfa, '\0', sizeof (re_dfa_t)); 815 816 /* Force allocation of str_tree_storage the first time. */ 817 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 818 819 dfa->nodes_alloc = pat_len + 1; 820 dfa->nodes = re_xmalloc (re_token_t, dfa->nodes_alloc); 821 822 /* table_size = 2 ^ ceil(log pat_len) */ 823 for (table_size = 1; table_size <= pat_len; table_size <<= 1) 824 if (0 < (Idx) -1 && table_size == 0) 825 return REG_ESPACE; 826 827 dfa->state_table = re_calloc (struct re_state_table_entry, table_size); 828 dfa->state_hash_mask = table_size - 1; 829 830 dfa->mb_cur_max = MB_CUR_MAX; 831 #ifdef _LIBC 832 if (dfa->mb_cur_max == 6 833 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 834 dfa->is_utf8 = 1; 835 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 836 != 0); 837 #else 838 # ifdef HAVE_LANGINFO_CODESET 839 codeset_name = nl_langinfo (CODESET); 840 # else 841 codeset_name = getenv ("LC_ALL"); 842 if (codeset_name == NULL || codeset_name[0] == '\0') 843 codeset_name = getenv ("LC_CTYPE"); 844 if (codeset_name == NULL || codeset_name[0] == '\0') 845 codeset_name = getenv ("LANG"); 846 if (codeset_name == NULL) 847 codeset_name = ""; 848 else if (strchr (codeset_name, '.') != NULL) 849 codeset_name = strchr (codeset_name, '.') + 1; 850 # endif 851 852 if (strcasecmp (codeset_name, "UTF-8") == 0 853 || strcasecmp (codeset_name, "UTF8") == 0) 854 dfa->is_utf8 = 1; 855 856 /* We check exhaustively in the loop below if this charset is a 857 superset of ASCII. */ 858 dfa->map_notascii = 0; 859 #endif 860 861 #ifdef RE_ENABLE_I18N 862 if (dfa->mb_cur_max > 1) 863 { 864 if (dfa->is_utf8) 865 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 866 else 867 { 868 int i, j, ch; 869 870 dfa->sb_char = re_calloc (bitset_word, BITSET_WORDS); 871 if (BE (dfa->sb_char == NULL, 0)) 872 return REG_ESPACE; 873 874 /* Set the bits corresponding to single byte chars. */ 875 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 876 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 877 { 878 wint_t wch = __btowc (ch); 879 if (wch != WEOF) 880 dfa->sb_char[i] |= (bitset_word) 1 << j; 881 # ifndef _LIBC 882 if (isascii (ch) && wch != ch) 883 dfa->map_notascii = 1; 884 # endif 885 } 886 } 887 } 888 #endif 889 890 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 891 return REG_ESPACE; 892 return REG_NOERROR; 893 } 894 895 /* Initialize WORD_CHAR table, which indicate which character is 896 "word". In this case "word" means that it is the word construction 897 character used by some operators like "\<", "\>", etc. */ 898 899 static void 900 init_word_char (re_dfa_t *dfa) 901 { 902 int i, j, ch; 903 dfa->word_ops_used = 1; 904 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 905 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 906 if (isalnum (ch) || ch == '_') 907 dfa->word_char[i] |= (bitset_word) 1 << j; 908 } 909 910 /* Free the work area which are only used while compiling. */ 911 912 static void 913 free_workarea_compile (regex_t *preg) 914 { 915 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 916 bin_tree_storage_t *storage, *next; 917 for (storage = dfa->str_tree_storage; storage; storage = next) 918 { 919 next = storage->next; 920 re_free (storage); 921 } 922 dfa->str_tree_storage = NULL; 923 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 924 dfa->str_tree = NULL; 925 re_free (dfa->org_indices); 926 dfa->org_indices = NULL; 927 } 928 929 /* Create initial states for all contexts. */ 930 931 static reg_errcode_t 932 create_initial_state (re_dfa_t *dfa) 933 { 934 Idx first, i; 935 reg_errcode_t err; 936 re_node_set init_nodes; 937 938 /* Initial states have the epsilon closure of the node which is 939 the first node of the regular expression. */ 940 first = dfa->str_tree->first->node_idx; 941 dfa->init_node = first; 942 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 943 if (BE (err != REG_NOERROR, 0)) 944 return err; 945 946 /* The back-references which are in initial states can epsilon transit, 947 since in this case all of the subexpressions can be null. 948 Then we add epsilon closures of the nodes which are the next nodes of 949 the back-references. */ 950 if (dfa->nbackref > 0) 951 for (i = 0; i < init_nodes.nelem; ++i) 952 { 953 Idx node_idx = init_nodes.elems[i]; 954 re_token_type_t type = dfa->nodes[node_idx].type; 955 956 Idx clexp_idx; 957 if (type != OP_BACK_REF) 958 continue; 959 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 960 { 961 re_token_t *clexp_node; 962 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 963 if (clexp_node->type == OP_CLOSE_SUBEXP 964 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 965 break; 966 } 967 if (clexp_idx == init_nodes.nelem) 968 continue; 969 970 if (type == OP_BACK_REF) 971 { 972 Idx dest_idx = dfa->edests[node_idx].elems[0]; 973 if (!re_node_set_contains (&init_nodes, dest_idx)) 974 { 975 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 976 i = 0; 977 } 978 } 979 } 980 981 /* It must be the first time to invoke acquire_state. */ 982 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 983 /* We don't check ERR here, since the initial state must not be NULL. */ 984 if (BE (dfa->init_state == NULL, 0)) 985 return err; 986 if (dfa->init_state->has_constraint) 987 { 988 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 989 CONTEXT_WORD); 990 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 991 CONTEXT_NEWLINE); 992 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 993 &init_nodes, 994 CONTEXT_NEWLINE 995 | CONTEXT_BEGBUF); 996 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 997 || dfa->init_state_begbuf == NULL, 0)) 998 return err; 999 } 1000 else 1001 dfa->init_state_word = dfa->init_state_nl 1002 = dfa->init_state_begbuf = dfa->init_state; 1003 1004 re_node_set_free (&init_nodes); 1005 return REG_NOERROR; 1006 } 1007 1008 #ifdef RE_ENABLE_I18N 1009 /* If it is possible to do searching in single byte encoding instead of UTF-8 1010 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1011 DFA nodes where needed. */ 1012 1013 static void 1014 optimize_utf8 (re_dfa_t *dfa) 1015 { 1016 Idx node; 1017 int i; 1018 bool mb_chars = false; 1019 bool has_period = false; 1020 1021 for (node = 0; node < dfa->nodes_len; ++node) 1022 switch (dfa->nodes[node].type) 1023 { 1024 case CHARACTER: 1025 if (dfa->nodes[node].opr.c >= 0x80) 1026 mb_chars = true; 1027 break; 1028 case ANCHOR: 1029 switch (dfa->nodes[node].opr.idx) 1030 { 1031 case LINE_FIRST: 1032 case LINE_LAST: 1033 case BUF_FIRST: 1034 case BUF_LAST: 1035 break; 1036 default: 1037 /* Word anchors etc. cannot be handled. */ 1038 return; 1039 } 1040 break; 1041 case OP_PERIOD: 1042 has_period = true; 1043 break; 1044 case OP_BACK_REF: 1045 case OP_ALT: 1046 case END_OF_RE: 1047 case OP_DUP_ASTERISK: 1048 case OP_OPEN_SUBEXP: 1049 case OP_CLOSE_SUBEXP: 1050 break; 1051 case COMPLEX_BRACKET: 1052 return; 1053 case SIMPLE_BRACKET: 1054 /* Just double check. */ 1055 { 1056 int rshift = 1057 (SBC_MAX / 2 % BITSET_WORD_BITS == 0 1058 ? 0 1059 : BITSET_WORD_BITS - SBC_MAX / 2 % BITSET_WORD_BITS); 1060 for (i = SBC_MAX / 2 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1061 { 1062 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) 1063 return; 1064 rshift = 0; 1065 } 1066 } 1067 break; 1068 default: 1069 abort (); 1070 } 1071 1072 if (mb_chars || has_period) 1073 for (node = 0; node < dfa->nodes_len; ++node) 1074 { 1075 if (dfa->nodes[node].type == CHARACTER 1076 && dfa->nodes[node].opr.c >= 0x80) 1077 dfa->nodes[node].mb_partial = 0; 1078 else if (dfa->nodes[node].type == OP_PERIOD) 1079 dfa->nodes[node].type = OP_UTF8_PERIOD; 1080 } 1081 1082 /* The search can be in single byte locale. */ 1083 dfa->mb_cur_max = 1; 1084 dfa->is_utf8 = 0; 1085 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1086 } 1087 #endif 1088 1089 /* Analyze the structure tree, and calculate "first", "next", "edest", 1090 "eclosure", and "inveclosure". */ 1091 1092 static reg_errcode_t 1093 analyze (regex_t *preg) 1094 { 1095 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 1096 reg_errcode_t ret; 1097 1098 /* Allocate arrays. */ 1099 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc); 1100 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); 1101 dfa->edests = re_xmalloc (re_node_set, dfa->nodes_alloc); 1102 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1103 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1104 || dfa->eclosures == NULL, 0)) 1105 return REG_ESPACE; 1106 1107 dfa->subexp_map = re_xmalloc (Idx, preg->re_nsub); 1108 if (dfa->subexp_map != NULL) 1109 { 1110 Idx i; 1111 for (i = 0; i < preg->re_nsub; i++) 1112 dfa->subexp_map[i] = i; 1113 preorder (dfa->str_tree, optimize_subexps, dfa); 1114 for (i = 0; i < preg->re_nsub; i++) 1115 if (dfa->subexp_map[i] != i) 1116 break; 1117 if (i == preg->re_nsub) 1118 { 1119 free (dfa->subexp_map); 1120 dfa->subexp_map = NULL; 1121 } 1122 } 1123 1124 ret = postorder (dfa->str_tree, lower_subexps, preg); 1125 if (BE (ret != REG_NOERROR, 0)) 1126 return ret; 1127 ret = postorder (dfa->str_tree, calc_first, dfa); 1128 if (BE (ret != REG_NOERROR, 0)) 1129 return ret; 1130 preorder (dfa->str_tree, calc_next, dfa); 1131 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1132 if (BE (ret != REG_NOERROR, 0)) 1133 return ret; 1134 ret = calc_eclosure (dfa); 1135 if (BE (ret != REG_NOERROR, 0)) 1136 return ret; 1137 1138 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1139 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1140 if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1141 || dfa->nbackref) 1142 { 1143 dfa->inveclosures = re_xmalloc (re_node_set, dfa->nodes_len); 1144 if (BE (dfa->inveclosures == NULL, 0)) 1145 return REG_ESPACE; 1146 ret = calc_inveclosure (dfa); 1147 } 1148 1149 return ret; 1150 } 1151 1152 /* Our parse trees are very unbalanced, so we cannot use a stack to 1153 implement parse tree visits. Instead, we use parent pointers and 1154 some hairy code in these two functions. */ 1155 static reg_errcode_t 1156 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1157 void *extra) 1158 { 1159 bin_tree_t *node, *prev; 1160 1161 for (node = root; ; ) 1162 { 1163 /* Descend down the tree, preferably to the left (or to the right 1164 if that's the only child). */ 1165 while (node->left || node->right) 1166 if (node->left) 1167 node = node->left; 1168 else 1169 node = node->right; 1170 1171 do 1172 { 1173 reg_errcode_t err = fn (extra, node); 1174 if (BE (err != REG_NOERROR, 0)) 1175 return err; 1176 if (node->parent == NULL) 1177 return REG_NOERROR; 1178 prev = node; 1179 node = node->parent; 1180 } 1181 /* Go up while we have a node that is reached from the right. */ 1182 while (node->right == prev || node->right == NULL); 1183 node = node->right; 1184 } 1185 } 1186 1187 static reg_errcode_t 1188 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1189 void *extra) 1190 { 1191 bin_tree_t *node; 1192 1193 for (node = root; ; ) 1194 { 1195 reg_errcode_t err = fn (extra, node); 1196 if (BE (err != REG_NOERROR, 0)) 1197 return err; 1198 1199 /* Go to the left node, or up and to the right. */ 1200 if (node->left) 1201 node = node->left; 1202 else 1203 { 1204 bin_tree_t *prev = NULL; 1205 while (node->right == prev || node->right == NULL) 1206 { 1207 prev = node; 1208 node = node->parent; 1209 if (!node) 1210 return REG_NOERROR; 1211 } 1212 node = node->right; 1213 } 1214 } 1215 } 1216 1217 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1218 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1219 backreferences as well. Requires a preorder visit. */ 1220 static reg_errcode_t 1221 optimize_subexps (void *extra, bin_tree_t *node) 1222 { 1223 re_dfa_t *dfa = (re_dfa_t *) extra; 1224 1225 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1226 { 1227 int idx = node->token.opr.idx; 1228 node->token.opr.idx = dfa->subexp_map[idx]; 1229 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1230 } 1231 1232 else if (node->token.type == SUBEXP 1233 && node->left && node->left->token.type == SUBEXP) 1234 { 1235 Idx other_idx = node->left->token.opr.idx; 1236 1237 node->left = node->left->left; 1238 if (node->left) 1239 node->left->parent = node; 1240 1241 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1242 if (other_idx < BITSET_WORD_BITS) 1243 dfa->used_bkref_map &= ~ ((bitset_word) 1 << other_idx); 1244 } 1245 1246 return REG_NOERROR; 1247 } 1248 1249 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1250 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1251 static reg_errcode_t 1252 lower_subexps (void *extra, bin_tree_t *node) 1253 { 1254 regex_t *preg = (regex_t *) extra; 1255 reg_errcode_t err = REG_NOERROR; 1256 1257 if (node->left && node->left->token.type == SUBEXP) 1258 { 1259 node->left = lower_subexp (&err, preg, node->left); 1260 if (node->left) 1261 node->left->parent = node; 1262 } 1263 if (node->right && node->right->token.type == SUBEXP) 1264 { 1265 node->right = lower_subexp (&err, preg, node->right); 1266 if (node->right) 1267 node->right->parent = node; 1268 } 1269 1270 return err; 1271 } 1272 1273 static bin_tree_t * 1274 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1275 { 1276 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 1277 bin_tree_t *body = node->left; 1278 bin_tree_t *op, *cls, *tree1, *tree; 1279 1280 if (preg->re_no_sub 1281 /* We do not optimize empty subexpressions, because otherwise we may 1282 have bad CONCAT nodes with NULL children. This is obviously not 1283 very common, so we do not lose much. An example that triggers 1284 this case is the sed "script" /\(\)/x. */ 1285 && node->left != NULL 1286 && ! (node->token.opr.idx < BITSET_WORD_BITS 1287 && dfa->used_bkref_map & ((bitset_word) 1 << node->token.opr.idx))) 1288 return node->left; 1289 1290 /* Convert the SUBEXP node to the concatenation of an 1291 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1292 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1293 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1294 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1295 tree = create_tree (dfa, op, tree1, CONCAT); 1296 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1297 { 1298 *err = REG_ESPACE; 1299 return NULL; 1300 } 1301 1302 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1303 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1304 return tree; 1305 } 1306 1307 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1308 nodes. Requires a postorder visit. */ 1309 static reg_errcode_t 1310 calc_first (void *extra, bin_tree_t *node) 1311 { 1312 re_dfa_t *dfa = (re_dfa_t *) extra; 1313 if (node->token.type == CONCAT) 1314 { 1315 node->first = node->left->first; 1316 node->node_idx = node->left->node_idx; 1317 } 1318 else 1319 { 1320 node->first = node; 1321 node->node_idx = re_dfa_add_node (dfa, node->token); 1322 if (BE (node->node_idx == REG_MISSING, 0)) 1323 return REG_ESPACE; 1324 } 1325 return REG_NOERROR; 1326 } 1327 1328 /* Pass 2: compute NEXT on the tree. Preorder visit. */ 1329 static reg_errcode_t 1330 calc_next (void *extra, bin_tree_t *node) 1331 { 1332 switch (node->token.type) 1333 { 1334 case OP_DUP_ASTERISK: 1335 node->left->next = node; 1336 break; 1337 case CONCAT: 1338 node->left->next = node->right->first; 1339 node->right->next = node->next; 1340 break; 1341 default: 1342 if (node->left) 1343 node->left->next = node->next; 1344 if (node->right) 1345 node->right->next = node->next; 1346 break; 1347 } 1348 return REG_NOERROR; 1349 } 1350 1351 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1352 static reg_errcode_t 1353 link_nfa_nodes (void *extra, bin_tree_t *node) 1354 { 1355 re_dfa_t *dfa = (re_dfa_t *) extra; 1356 Idx idx = node->node_idx; 1357 reg_errcode_t err = REG_NOERROR; 1358 1359 switch (node->token.type) 1360 { 1361 case CONCAT: 1362 break; 1363 1364 case END_OF_RE: 1365 assert (node->next == NULL); 1366 break; 1367 1368 case OP_DUP_ASTERISK: 1369 case OP_ALT: 1370 { 1371 Idx left, right; 1372 dfa->has_plural_match = 1; 1373 if (node->left != NULL) 1374 left = node->left->first->node_idx; 1375 else 1376 left = node->next->node_idx; 1377 if (node->right != NULL) 1378 right = node->right->first->node_idx; 1379 else 1380 right = node->next->node_idx; 1381 assert (REG_VALID_INDEX (left)); 1382 assert (REG_VALID_INDEX (right)); 1383 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1384 } 1385 break; 1386 1387 case ANCHOR: 1388 case OP_OPEN_SUBEXP: 1389 case OP_CLOSE_SUBEXP: 1390 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1391 break; 1392 1393 case OP_BACK_REF: 1394 dfa->nexts[idx] = node->next->node_idx; 1395 if (node->token.type == OP_BACK_REF) 1396 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1397 break; 1398 1399 default: 1400 assert (!IS_EPSILON_NODE (node->token.type)); 1401 dfa->nexts[idx] = node->next->node_idx; 1402 break; 1403 } 1404 1405 return err; 1406 } 1407 1408 /* Duplicate the epsilon closure of the node ROOT_NODE. 1409 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1410 to their own constraint. */ 1411 1412 static reg_errcode_t 1413 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, 1414 Idx top_clone_node, Idx root_node, 1415 unsigned int init_constraint) 1416 { 1417 Idx org_node, clone_node; 1418 bool ok; 1419 unsigned int constraint = init_constraint; 1420 for (org_node = top_org_node, clone_node = top_clone_node;;) 1421 { 1422 Idx org_dest, clone_dest; 1423 if (dfa->nodes[org_node].type == OP_BACK_REF) 1424 { 1425 /* If the back reference epsilon-transit, its destination must 1426 also have the constraint. Then duplicate the epsilon closure 1427 of the destination of the back reference, and store it in 1428 edests of the back reference. */ 1429 org_dest = dfa->nexts[org_node]; 1430 re_node_set_empty (dfa->edests + clone_node); 1431 clone_dest = duplicate_node (dfa, org_dest, constraint); 1432 if (BE (clone_dest == REG_MISSING, 0)) 1433 return REG_ESPACE; 1434 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1435 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1436 if (BE (! ok, 0)) 1437 return REG_ESPACE; 1438 } 1439 else if (dfa->edests[org_node].nelem == 0) 1440 { 1441 /* In case of the node can't epsilon-transit, don't duplicate the 1442 destination and store the original destination as the 1443 destination of the node. */ 1444 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1445 break; 1446 } 1447 else if (dfa->edests[org_node].nelem == 1) 1448 { 1449 /* In case of the node can epsilon-transit, and it has only one 1450 destination. */ 1451 org_dest = dfa->edests[org_node].elems[0]; 1452 re_node_set_empty (dfa->edests + clone_node); 1453 if (dfa->nodes[org_node].type == ANCHOR) 1454 { 1455 /* In case of the node has another constraint, append it. */ 1456 if (org_node == root_node && clone_node != org_node) 1457 { 1458 /* ...but if the node is root_node itself, it means the 1459 epsilon closure have a loop, then tie it to the 1460 destination of the root_node. */ 1461 ok = re_node_set_insert (dfa->edests + clone_node, 1462 org_dest); 1463 if (BE (! ok, 0)) 1464 return REG_ESPACE; 1465 break; 1466 } 1467 constraint |= dfa->nodes[org_node].opr.ctx_type; 1468 } 1469 clone_dest = duplicate_node (dfa, org_dest, constraint); 1470 if (BE (clone_dest == REG_MISSING, 0)) 1471 return REG_ESPACE; 1472 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1473 if (BE (! ok, 0)) 1474 return REG_ESPACE; 1475 } 1476 else /* dfa->edests[org_node].nelem == 2 */ 1477 { 1478 /* In case of the node can epsilon-transit, and it has two 1479 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1480 org_dest = dfa->edests[org_node].elems[0]; 1481 re_node_set_empty (dfa->edests + clone_node); 1482 /* Search for a duplicated node which satisfies the constraint. */ 1483 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1484 if (clone_dest == REG_MISSING) 1485 { 1486 /* There are no such a duplicated node, create a new one. */ 1487 reg_errcode_t err; 1488 clone_dest = duplicate_node (dfa, org_dest, constraint); 1489 if (BE (clone_dest == REG_MISSING, 0)) 1490 return REG_ESPACE; 1491 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1492 if (BE (! ok, 0)) 1493 return REG_ESPACE; 1494 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1495 root_node, constraint); 1496 if (BE (err != REG_NOERROR, 0)) 1497 return err; 1498 } 1499 else 1500 { 1501 /* There are a duplicated node which satisfy the constraint, 1502 use it to avoid infinite loop. */ 1503 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1504 if (BE (! ok, 0)) 1505 return REG_ESPACE; 1506 } 1507 1508 org_dest = dfa->edests[org_node].elems[1]; 1509 clone_dest = duplicate_node (dfa, org_dest, constraint); 1510 if (BE (clone_dest == REG_MISSING, 0)) 1511 return REG_ESPACE; 1512 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1513 if (BE (! ok, 0)) 1514 return REG_ESPACE; 1515 } 1516 org_node = org_dest; 1517 clone_node = clone_dest; 1518 } 1519 return REG_NOERROR; 1520 } 1521 1522 /* Search for a node which is duplicated from the node ORG_NODE, and 1523 satisfies the constraint CONSTRAINT. */ 1524 1525 static Idx 1526 search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 1527 unsigned int constraint) 1528 { 1529 Idx idx; 1530 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1531 { 1532 if (org_node == dfa->org_indices[idx] 1533 && constraint == dfa->nodes[idx].constraint) 1534 return idx; /* Found. */ 1535 } 1536 return REG_MISSING; /* Not found. */ 1537 } 1538 1539 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1540 Return the index of the new node, or REG_MISSING if insufficient storage is 1541 available. */ 1542 1543 static Idx 1544 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) 1545 { 1546 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1547 if (BE (dup_idx != REG_MISSING, 1)) 1548 { 1549 dfa->nodes[dup_idx].constraint = constraint; 1550 if (dfa->nodes[org_idx].type == ANCHOR) 1551 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type; 1552 dfa->nodes[dup_idx].duplicated = 1; 1553 1554 /* Store the index of the original node. */ 1555 dfa->org_indices[dup_idx] = org_idx; 1556 } 1557 return dup_idx; 1558 } 1559 1560 static reg_errcode_t 1561 calc_inveclosure (re_dfa_t *dfa) 1562 { 1563 Idx src, idx; 1564 bool ok; 1565 for (idx = 0; idx < dfa->nodes_len; ++idx) 1566 re_node_set_init_empty (dfa->inveclosures + idx); 1567 1568 for (src = 0; src < dfa->nodes_len; ++src) 1569 { 1570 Idx *elems = dfa->eclosures[src].elems; 1571 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1572 { 1573 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1574 if (BE (! ok, 0)) 1575 return REG_ESPACE; 1576 } 1577 } 1578 1579 return REG_NOERROR; 1580 } 1581 1582 /* Calculate "eclosure" for all the node in DFA. */ 1583 1584 static reg_errcode_t 1585 calc_eclosure (re_dfa_t *dfa) 1586 { 1587 Idx node_idx; 1588 bool incomplete; 1589 #ifdef DEBUG 1590 assert (dfa->nodes_len > 0); 1591 #endif 1592 incomplete = false; 1593 /* For each nodes, calculate epsilon closure. */ 1594 for (node_idx = 0; ; ++node_idx) 1595 { 1596 reg_errcode_t err; 1597 re_node_set eclosure_elem; 1598 if (node_idx == dfa->nodes_len) 1599 { 1600 if (!incomplete) 1601 break; 1602 incomplete = false; 1603 node_idx = 0; 1604 } 1605 1606 #ifdef DEBUG 1607 assert (dfa->eclosures[node_idx].nelem != REG_MISSING); 1608 #endif 1609 1610 /* If we have already calculated, skip it. */ 1611 if (dfa->eclosures[node_idx].nelem != 0) 1612 continue; 1613 /* Calculate epsilon closure of `node_idx'. */ 1614 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); 1615 if (BE (err != REG_NOERROR, 0)) 1616 return err; 1617 1618 if (dfa->eclosures[node_idx].nelem == 0) 1619 { 1620 incomplete = true; 1621 re_node_set_free (&eclosure_elem); 1622 } 1623 } 1624 return REG_NOERROR; 1625 } 1626 1627 /* Calculate epsilon closure of NODE. */ 1628 1629 static reg_errcode_t 1630 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) 1631 { 1632 reg_errcode_t err; 1633 unsigned int constraint; 1634 Idx i; 1635 bool incomplete; 1636 bool ok; 1637 re_node_set eclosure; 1638 incomplete = false; 1639 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1640 if (BE (err != REG_NOERROR, 0)) 1641 return err; 1642 1643 /* This indicates that we are calculating this node now. 1644 We reference this value to avoid infinite loop. */ 1645 dfa->eclosures[node].nelem = REG_MISSING; 1646 1647 constraint = ((dfa->nodes[node].type == ANCHOR) 1648 ? dfa->nodes[node].opr.ctx_type : 0); 1649 /* If the current node has constraints, duplicate all nodes. 1650 Since they must inherit the constraints. */ 1651 if (constraint 1652 && dfa->edests[node].nelem 1653 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1654 { 1655 Idx org_node, cur_node; 1656 org_node = cur_node = node; 1657 err = duplicate_node_closure (dfa, node, node, node, constraint); 1658 if (BE (err != REG_NOERROR, 0)) 1659 return err; 1660 } 1661 1662 /* Expand each epsilon destination nodes. */ 1663 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1664 for (i = 0; i < dfa->edests[node].nelem; ++i) 1665 { 1666 re_node_set eclosure_elem; 1667 Idx edest = dfa->edests[node].elems[i]; 1668 /* If calculating the epsilon closure of `edest' is in progress, 1669 return intermediate result. */ 1670 if (dfa->eclosures[edest].nelem == REG_MISSING) 1671 { 1672 incomplete = true; 1673 continue; 1674 } 1675 /* If we haven't calculated the epsilon closure of `edest' yet, 1676 calculate now. Otherwise use calculated epsilon closure. */ 1677 if (dfa->eclosures[edest].nelem == 0) 1678 { 1679 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false); 1680 if (BE (err != REG_NOERROR, 0)) 1681 return err; 1682 } 1683 else 1684 eclosure_elem = dfa->eclosures[edest]; 1685 /* Merge the epsilon closure of `edest'. */ 1686 re_node_set_merge (&eclosure, &eclosure_elem); 1687 /* If the epsilon closure of `edest' is incomplete, 1688 the epsilon closure of this node is also incomplete. */ 1689 if (dfa->eclosures[edest].nelem == 0) 1690 { 1691 incomplete = true; 1692 re_node_set_free (&eclosure_elem); 1693 } 1694 } 1695 1696 /* Epsilon closures include itself. */ 1697 ok = re_node_set_insert (&eclosure, node); 1698 if (BE (! ok, 0)) 1699 return REG_ESPACE; 1700 if (incomplete && !root) 1701 dfa->eclosures[node].nelem = 0; 1702 else 1703 dfa->eclosures[node] = eclosure; 1704 *new_set = eclosure; 1705 return REG_NOERROR; 1706 } 1707 1708 /* Functions for token which are used in the parser. */ 1709 1710 /* Fetch a token from INPUT. 1711 We must not use this function inside bracket expressions. */ 1712 1713 static void 1714 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1715 { 1716 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1717 } 1718 1719 /* Peek a token from INPUT, and return the length of the token. 1720 We must not use this function inside bracket expressions. */ 1721 1722 static int 1723 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1724 { 1725 unsigned char c; 1726 1727 if (re_string_eoi (input)) 1728 { 1729 token->type = END_OF_RE; 1730 return 0; 1731 } 1732 1733 c = re_string_peek_byte (input, 0); 1734 token->opr.c = c; 1735 1736 token->word_char = 0; 1737 #ifdef RE_ENABLE_I18N 1738 token->mb_partial = 0; 1739 if (input->mb_cur_max > 1 && 1740 !re_string_first_byte (input, re_string_cur_idx (input))) 1741 { 1742 token->type = CHARACTER; 1743 token->mb_partial = 1; 1744 return 1; 1745 } 1746 #endif 1747 if (c == '\\') 1748 { 1749 unsigned char c2; 1750 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1751 { 1752 token->type = BACK_SLASH; 1753 return 1; 1754 } 1755 1756 c2 = re_string_peek_byte_case (input, 1); 1757 token->opr.c = c2; 1758 token->type = CHARACTER; 1759 #ifdef RE_ENABLE_I18N 1760 if (input->mb_cur_max > 1) 1761 { 1762 wint_t wc = re_string_wchar_at (input, 1763 re_string_cur_idx (input) + 1); 1764 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1765 } 1766 else 1767 #endif 1768 token->word_char = IS_WORD_CHAR (c2) != 0; 1769 1770 switch (c2) 1771 { 1772 case '|': 1773 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR)) 1774 token->type = OP_ALT; 1775 break; 1776 case '1': case '2': case '3': case '4': case '5': 1777 case '6': case '7': case '8': case '9': 1778 if (!(syntax & REG_NO_BK_REFS)) 1779 { 1780 token->type = OP_BACK_REF; 1781 token->opr.idx = c2 - '1'; 1782 } 1783 break; 1784 case '<': 1785 if (!(syntax & REG_NO_GNU_OPS)) 1786 { 1787 token->type = ANCHOR; 1788 token->opr.ctx_type = WORD_FIRST; 1789 } 1790 break; 1791 case '>': 1792 if (!(syntax & REG_NO_GNU_OPS)) 1793 { 1794 token->type = ANCHOR; 1795 token->opr.ctx_type = WORD_LAST; 1796 } 1797 break; 1798 case 'b': 1799 if (!(syntax & REG_NO_GNU_OPS)) 1800 { 1801 token->type = ANCHOR; 1802 token->opr.ctx_type = WORD_DELIM; 1803 } 1804 break; 1805 case 'B': 1806 if (!(syntax & REG_NO_GNU_OPS)) 1807 { 1808 token->type = ANCHOR; 1809 token->opr.ctx_type = NOT_WORD_DELIM; 1810 } 1811 break; 1812 case 'w': 1813 if (!(syntax & REG_NO_GNU_OPS)) 1814 token->type = OP_WORD; 1815 break; 1816 case 'W': 1817 if (!(syntax & REG_NO_GNU_OPS)) 1818 token->type = OP_NOTWORD; 1819 break; 1820 case 's': 1821 if (!(syntax & REG_NO_GNU_OPS)) 1822 token->type = OP_SPACE; 1823 break; 1824 case 'S': 1825 if (!(syntax & REG_NO_GNU_OPS)) 1826 token->type = OP_NOTSPACE; 1827 break; 1828 case '`': 1829 if (!(syntax & REG_NO_GNU_OPS)) 1830 { 1831 token->type = ANCHOR; 1832 token->opr.ctx_type = BUF_FIRST; 1833 } 1834 break; 1835 case '\'': 1836 if (!(syntax & REG_NO_GNU_OPS)) 1837 { 1838 token->type = ANCHOR; 1839 token->opr.ctx_type = BUF_LAST; 1840 } 1841 break; 1842 case '(': 1843 if (!(syntax & REG_NO_BK_PARENS)) 1844 token->type = OP_OPEN_SUBEXP; 1845 break; 1846 case ')': 1847 if (!(syntax & REG_NO_BK_PARENS)) 1848 token->type = OP_CLOSE_SUBEXP; 1849 break; 1850 case '+': 1851 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM)) 1852 token->type = OP_DUP_PLUS; 1853 break; 1854 case '?': 1855 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM)) 1856 token->type = OP_DUP_QUESTION; 1857 break; 1858 case '{': 1859 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES))) 1860 token->type = OP_OPEN_DUP_NUM; 1861 break; 1862 case '}': 1863 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES))) 1864 token->type = OP_CLOSE_DUP_NUM; 1865 break; 1866 default: 1867 break; 1868 } 1869 return 2; 1870 } 1871 1872 token->type = CHARACTER; 1873 #ifdef RE_ENABLE_I18N 1874 if (input->mb_cur_max > 1) 1875 { 1876 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1877 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1878 } 1879 else 1880 #endif 1881 token->word_char = IS_WORD_CHAR (token->opr.c); 1882 1883 switch (c) 1884 { 1885 case '\n': 1886 if (syntax & REG_NEWLINE_ALT) 1887 token->type = OP_ALT; 1888 break; 1889 case '|': 1890 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR)) 1891 token->type = OP_ALT; 1892 break; 1893 case '*': 1894 token->type = OP_DUP_ASTERISK; 1895 break; 1896 case '+': 1897 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM)) 1898 token->type = OP_DUP_PLUS; 1899 break; 1900 case '?': 1901 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM)) 1902 token->type = OP_DUP_QUESTION; 1903 break; 1904 case '{': 1905 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES)) 1906 token->type = OP_OPEN_DUP_NUM; 1907 break; 1908 case '}': 1909 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES)) 1910 token->type = OP_CLOSE_DUP_NUM; 1911 break; 1912 case '(': 1913 if (syntax & REG_NO_BK_PARENS) 1914 token->type = OP_OPEN_SUBEXP; 1915 break; 1916 case ')': 1917 if (syntax & REG_NO_BK_PARENS) 1918 token->type = OP_CLOSE_SUBEXP; 1919 break; 1920 case '[': 1921 token->type = OP_OPEN_BRACKET; 1922 break; 1923 case '.': 1924 token->type = OP_PERIOD; 1925 break; 1926 case '^': 1927 if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) && 1928 re_string_cur_idx (input) != 0) 1929 { 1930 char prev = re_string_peek_byte (input, -1); 1931 if (!(syntax & REG_NEWLINE_ALT) || prev != '\n') 1932 break; 1933 } 1934 token->type = ANCHOR; 1935 token->opr.ctx_type = LINE_FIRST; 1936 break; 1937 case '$': 1938 if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) && 1939 re_string_cur_idx (input) + 1 != re_string_length (input)) 1940 { 1941 re_token_t next; 1942 re_string_skip_bytes (input, 1); 1943 peek_token (&next, input, syntax); 1944 re_string_skip_bytes (input, -1); 1945 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 1946 break; 1947 } 1948 token->type = ANCHOR; 1949 token->opr.ctx_type = LINE_LAST; 1950 break; 1951 default: 1952 break; 1953 } 1954 return 1; 1955 } 1956 1957 /* Peek a token from INPUT, and return the length of the token. 1958 We must not use this function out of bracket expressions. */ 1959 1960 static int 1961 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1962 { 1963 unsigned char c; 1964 if (re_string_eoi (input)) 1965 { 1966 token->type = END_OF_RE; 1967 return 0; 1968 } 1969 c = re_string_peek_byte (input, 0); 1970 token->opr.c = c; 1971 1972 #ifdef RE_ENABLE_I18N 1973 if (input->mb_cur_max > 1 && 1974 !re_string_first_byte (input, re_string_cur_idx (input))) 1975 { 1976 token->type = CHARACTER; 1977 return 1; 1978 } 1979 #endif /* RE_ENABLE_I18N */ 1980 1981 if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS) 1982 && re_string_cur_idx (input) + 1 < re_string_length (input)) 1983 { 1984 /* In this case, '\' escape a character. */ 1985 unsigned char c2; 1986 re_string_skip_bytes (input, 1); 1987 c2 = re_string_peek_byte (input, 0); 1988 token->opr.c = c2; 1989 token->type = CHARACTER; 1990 return 1; 1991 } 1992 if (c == '[') /* '[' is a special char in a bracket exps. */ 1993 { 1994 unsigned char c2; 1995 int token_len; 1996 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 1997 c2 = re_string_peek_byte (input, 1); 1998 else 1999 c2 = 0; 2000 token->opr.c = c2; 2001 token_len = 2; 2002 switch (c2) 2003 { 2004 case '.': 2005 token->type = OP_OPEN_COLL_ELEM; 2006 break; 2007 case '=': 2008 token->type = OP_OPEN_EQUIV_CLASS; 2009 break; 2010 case ':': 2011 if (syntax & REG_CHAR_CLASSES) 2012 { 2013 token->type = OP_OPEN_CHAR_CLASS; 2014 break; 2015 } 2016 /* else fall through. */ 2017 default: 2018 token->type = CHARACTER; 2019 token->opr.c = c; 2020 token_len = 1; 2021 break; 2022 } 2023 return token_len; 2024 } 2025 switch (c) 2026 { 2027 case '-': 2028 token->type = OP_CHARSET_RANGE; 2029 break; 2030 case ']': 2031 token->type = OP_CLOSE_BRACKET; 2032 break; 2033 case '^': 2034 token->type = OP_NON_MATCH_LIST; 2035 break; 2036 default: 2037 token->type = CHARACTER; 2038 } 2039 return 1; 2040 } 2041 2042 /* Functions for parser. */ 2043 2044 /* Entry point of the parser. 2045 Parse the regular expression REGEXP and return the structure tree. 2046 If an error is occured, ERR is set by error code, and return NULL. 2047 This function build the following tree, from regular expression <reg_exp>: 2048 CAT 2049 / \ 2050 / \ 2051 <reg_exp> EOR 2052 2053 CAT means concatenation. 2054 EOR means end of regular expression. */ 2055 2056 static bin_tree_t * 2057 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2058 reg_errcode_t *err) 2059 { 2060 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 2061 bin_tree_t *tree, *eor, *root; 2062 re_token_t current_token; 2063 dfa->syntax = syntax; 2064 fetch_token (¤t_token, regexp, syntax | REG_CARET_ANCHORS_HERE); 2065 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2066 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2067 return NULL; 2068 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2069 if (tree != NULL) 2070 root = create_tree (dfa, tree, eor, CONCAT); 2071 else 2072 root = eor; 2073 if (BE (eor == NULL || root == NULL, 0)) 2074 { 2075 *err = REG_ESPACE; 2076 return NULL; 2077 } 2078 return root; 2079 } 2080 2081 /* This function build the following tree, from regular expression 2082 <branch1>|<branch2>: 2083 ALT 2084 / \ 2085 / \ 2086 <branch1> <branch2> 2087 2088 ALT means alternative, which represents the operator `|'. */ 2089 2090 static bin_tree_t * 2091 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2092 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2093 { 2094 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 2095 bin_tree_t *tree, *branch = NULL; 2096 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2097 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2098 return NULL; 2099 2100 while (token->type == OP_ALT) 2101 { 2102 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE); 2103 if (token->type != OP_ALT && token->type != END_OF_RE 2104 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2105 { 2106 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2107 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2108 return NULL; 2109 } 2110 else 2111 branch = NULL; 2112 tree = create_tree (dfa, tree, branch, OP_ALT); 2113 if (BE (tree == NULL, 0)) 2114 { 2115 *err = REG_ESPACE; 2116 return NULL; 2117 } 2118 } 2119 return tree; 2120 } 2121 2122 /* This function build the following tree, from regular expression 2123 <exp1><exp2>: 2124 CAT 2125 / \ 2126 / \ 2127 <exp1> <exp2> 2128 2129 CAT means concatenation. */ 2130 2131 static bin_tree_t * 2132 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2133 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2134 { 2135 bin_tree_t *tree, *exp; 2136 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 2137 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2138 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2139 return NULL; 2140 2141 while (token->type != OP_ALT && token->type != END_OF_RE 2142 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2143 { 2144 exp = parse_expression (regexp, preg, token, syntax, nest, err); 2145 if (BE (*err != REG_NOERROR && exp == NULL, 0)) 2146 { 2147 return NULL; 2148 } 2149 if (tree != NULL && exp != NULL) 2150 { 2151 tree = create_tree (dfa, tree, exp, CONCAT); 2152 if (tree == NULL) 2153 { 2154 *err = REG_ESPACE; 2155 return NULL; 2156 } 2157 } 2158 else if (tree == NULL) 2159 tree = exp; 2160 /* Otherwise exp == NULL, we don't need to create new tree. */ 2161 } 2162 return tree; 2163 } 2164 2165 /* This function build the following tree, from regular expression a*: 2166 * 2167 | 2168 a 2169 */ 2170 2171 static bin_tree_t * 2172 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2173 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2174 { 2175 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 2176 bin_tree_t *tree; 2177 switch (token->type) 2178 { 2179 case CHARACTER: 2180 tree = create_token_tree (dfa, NULL, NULL, token); 2181 if (BE (tree == NULL, 0)) 2182 { 2183 *err = REG_ESPACE; 2184 return NULL; 2185 } 2186 #ifdef RE_ENABLE_I18N 2187 if (dfa->mb_cur_max > 1) 2188 { 2189 while (!re_string_eoi (regexp) 2190 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2191 { 2192 bin_tree_t *mbc_remain; 2193 fetch_token (token, regexp, syntax); 2194 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2195 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2196 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2197 { 2198 *err = REG_ESPACE; 2199 return NULL; 2200 } 2201 } 2202 } 2203 #endif 2204 break; 2205 case OP_OPEN_SUBEXP: 2206 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2207 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2208 return NULL; 2209 break; 2210 case OP_OPEN_BRACKET: 2211 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2212 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2213 return NULL; 2214 break; 2215 case OP_BACK_REF: 2216 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2217 { 2218 *err = REG_ESUBREG; 2219 return NULL; 2220 } 2221 dfa->used_bkref_map |= 1 << token->opr.idx; 2222 tree = create_token_tree (dfa, NULL, NULL, token); 2223 if (BE (tree == NULL, 0)) 2224 { 2225 *err = REG_ESPACE; 2226 return NULL; 2227 } 2228 ++dfa->nbackref; 2229 dfa->has_mb_node = 1; 2230 break; 2231 case OP_OPEN_DUP_NUM: 2232 if (syntax & REG_CONTEXT_INVALID_DUP) 2233 { 2234 *err = REG_BADRPT; 2235 return NULL; 2236 } 2237 /* FALLTHROUGH */ 2238 case OP_DUP_ASTERISK: 2239 case OP_DUP_PLUS: 2240 case OP_DUP_QUESTION: 2241 if (syntax & REG_CONTEXT_INVALID_OPS) 2242 { 2243 *err = REG_BADRPT; 2244 return NULL; 2245 } 2246 else if (syntax & REG_CONTEXT_INDEP_OPS) 2247 { 2248 fetch_token (token, regexp, syntax); 2249 return parse_expression (regexp, preg, token, syntax, nest, err); 2250 } 2251 /* else fall through */ 2252 case OP_CLOSE_SUBEXP: 2253 if ((token->type == OP_CLOSE_SUBEXP) && 2254 !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD)) 2255 { 2256 *err = REG_ERPAREN; 2257 return NULL; 2258 } 2259 /* else fall through */ 2260 case OP_CLOSE_DUP_NUM: 2261 /* We treat it as a normal character. */ 2262 2263 /* Then we can these characters as normal characters. */ 2264 token->type = CHARACTER; 2265 /* mb_partial and word_char bits should be initialized already 2266 by peek_token. */ 2267 tree = create_token_tree (dfa, NULL, NULL, token); 2268 if (BE (tree == NULL, 0)) 2269 { 2270 *err = REG_ESPACE; 2271 return NULL; 2272 } 2273 break; 2274 case ANCHOR: 2275 if ((token->opr.ctx_type 2276 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2277 && dfa->word_ops_used == 0) 2278 init_word_char (dfa); 2279 if (token->opr.ctx_type == WORD_DELIM 2280 || token->opr.ctx_type == NOT_WORD_DELIM) 2281 { 2282 bin_tree_t *tree_first, *tree_last; 2283 if (token->opr.ctx_type == WORD_DELIM) 2284 { 2285 token->opr.ctx_type = WORD_FIRST; 2286 tree_first = create_token_tree (dfa, NULL, NULL, token); 2287 token->opr.ctx_type = WORD_LAST; 2288 } 2289 else 2290 { 2291 token->opr.ctx_type = INSIDE_WORD; 2292 tree_first = create_token_tree (dfa, NULL, NULL, token); 2293 token->opr.ctx_type = INSIDE_NOTWORD; 2294 } 2295 tree_last = create_token_tree (dfa, NULL, NULL, token); 2296 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2297 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2298 { 2299 *err = REG_ESPACE; 2300 return NULL; 2301 } 2302 } 2303 else 2304 { 2305 tree = create_token_tree (dfa, NULL, NULL, token); 2306 if (BE (tree == NULL, 0)) 2307 { 2308 *err = REG_ESPACE; 2309 return NULL; 2310 } 2311 } 2312 /* We must return here, since ANCHORs can't be followed 2313 by repetition operators. 2314 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2315 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2316 fetch_token (token, regexp, syntax); 2317 return tree; 2318 case OP_PERIOD: 2319 tree = create_token_tree (dfa, NULL, NULL, token); 2320 if (BE (tree == NULL, 0)) 2321 { 2322 *err = REG_ESPACE; 2323 return NULL; 2324 } 2325 if (dfa->mb_cur_max > 1) 2326 dfa->has_mb_node = 1; 2327 break; 2328 case OP_WORD: 2329 case OP_NOTWORD: 2330 tree = build_charclass_op (dfa, regexp->trans, 2331 (const unsigned char *) "alnum", 2332 (const unsigned char *) "_", 2333 token->type == OP_NOTWORD, err); 2334 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2335 return NULL; 2336 break; 2337 case OP_SPACE: 2338 case OP_NOTSPACE: 2339 tree = build_charclass_op (dfa, regexp->trans, 2340 (const unsigned char *) "space", 2341 (const unsigned char *) "", 2342 token->type == OP_NOTSPACE, err); 2343 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2344 return NULL; 2345 break; 2346 case OP_ALT: 2347 case END_OF_RE: 2348 return NULL; 2349 case BACK_SLASH: 2350 *err = REG_EESCAPE; 2351 return NULL; 2352 default: 2353 /* Must not happen? */ 2354 #ifdef DEBUG 2355 assert (0); 2356 #endif 2357 return NULL; 2358 } 2359 fetch_token (token, regexp, syntax); 2360 2361 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2362 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2363 { 2364 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2365 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2366 return NULL; 2367 /* In BRE consecutive duplications are not allowed. */ 2368 if ((syntax & REG_CONTEXT_INVALID_DUP) 2369 && (token->type == OP_DUP_ASTERISK 2370 || token->type == OP_OPEN_DUP_NUM)) 2371 { 2372 *err = REG_BADRPT; 2373 return NULL; 2374 } 2375 } 2376 2377 return tree; 2378 } 2379 2380 /* This function build the following tree, from regular expression 2381 (<reg_exp>): 2382 SUBEXP 2383 | 2384 <reg_exp> 2385 */ 2386 2387 static bin_tree_t * 2388 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2389 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2390 { 2391 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer; 2392 bin_tree_t *tree; 2393 size_t cur_nsub; 2394 cur_nsub = preg->re_nsub++; 2395 2396 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE); 2397 2398 /* The subexpression may be a null string. */ 2399 if (token->type == OP_CLOSE_SUBEXP) 2400 tree = NULL; 2401 else 2402 { 2403 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2404 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2405 *err = REG_EPAREN; 2406 if (BE (*err != REG_NOERROR, 0)) 2407 return NULL; 2408 } 2409 2410 if (cur_nsub <= '9' - '1') 2411 dfa->completed_bkref_map |= 1 << cur_nsub; 2412 2413 tree = create_tree (dfa, tree, NULL, SUBEXP); 2414 if (BE (tree == NULL, 0)) 2415 { 2416 *err = REG_ESPACE; 2417 return NULL; 2418 } 2419 tree->token.opr.idx = cur_nsub; 2420 return tree; 2421 } 2422 2423 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2424 2425 static bin_tree_t * 2426 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2427 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2428 { 2429 bin_tree_t *tree = NULL, *old_tree = NULL; 2430 Idx i, start, end, start_idx = re_string_cur_idx (regexp); 2431 re_token_t start_token = *token; 2432 2433 if (token->type == OP_OPEN_DUP_NUM) 2434 { 2435 end = 0; 2436 start = fetch_number (regexp, token, syntax); 2437 if (start == REG_MISSING) 2438 { 2439 if (token->type == CHARACTER && token->opr.c == ',') 2440 start = 0; /* We treat "{,m}" as "{0,m}". */ 2441 else 2442 { 2443 *err = REG_BADBR; /* <re>{} is invalid. */ 2444 return NULL; 2445 } 2446 } 2447 if (BE (start != REG_ERROR, 1)) 2448 { 2449 /* We treat "{n}" as "{n,n}". */ 2450 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2451 : ((token->type == CHARACTER && token->opr.c == ',') 2452 ? fetch_number (regexp, token, syntax) : REG_ERROR)); 2453 } 2454 if (BE (start == REG_ERROR || end == REG_ERROR, 0)) 2455 { 2456 /* Invalid sequence. */ 2457 if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0)) 2458 { 2459 if (token->type == END_OF_RE) 2460 *err = REG_EBRACE; 2461 else 2462 *err = REG_BADBR; 2463 2464 return NULL; 2465 } 2466 2467 /* If the syntax bit is set, rollback. */ 2468 re_string_set_index (regexp, start_idx); 2469 *token = start_token; 2470 token->type = CHARACTER; 2471 /* mb_partial and word_char bits should be already initialized by 2472 peek_token. */ 2473 return elem; 2474 } 2475 2476 if (BE (end != REG_MISSING && start > end, 0)) 2477 { 2478 /* First number greater than second. */ 2479 *err = REG_BADBR; 2480 return NULL; 2481 } 2482 } 2483 else 2484 { 2485 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2486 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING; 2487 } 2488 2489 fetch_token (token, regexp, syntax); 2490 2491 if (BE (elem == NULL, 0)) 2492 return NULL; 2493 if (BE (start == 0 && end == 0, 0)) 2494 { 2495 postorder (elem, free_tree, NULL); 2496 return NULL; 2497 } 2498 2499 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2500 if (BE (start > 0, 0)) 2501 { 2502 tree = elem; 2503 for (i = 2; i <= start; ++i) 2504 { 2505 elem = duplicate_tree (elem, dfa); 2506 tree = create_tree (dfa, tree, elem, CONCAT); 2507 if (BE (elem == NULL || tree == NULL, 0)) 2508 goto parse_dup_op_espace; 2509 } 2510 2511 if (start == end) 2512 return tree; 2513 2514 /* Duplicate ELEM before it is marked optional. */ 2515 elem = duplicate_tree (elem, dfa); 2516 old_tree = tree; 2517 } 2518 else 2519 old_tree = NULL; 2520 2521 if (elem->token.type == SUBEXP) 2522 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx); 2523 2524 tree = create_tree (dfa, elem, NULL, 2525 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); 2526 if (BE (tree == NULL, 0)) 2527 goto parse_dup_op_espace; 2528 2529 /* This loop is actually executed only when end != REG_MISSING, 2530 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2531 already created the start+1-th copy. */ 2532 if ((Idx) -1 < 0 || end != REG_MISSING) 2533 for (i = start + 2; i <= end; ++i) 2534 { 2535 elem = duplicate_tree (elem, dfa); 2536 tree = create_tree (dfa, tree, elem, CONCAT); 2537 if (BE (elem == NULL || tree == NULL, 0)) 2538 goto parse_dup_op_espace; 2539 2540 tree = create_tree (dfa, tree, NULL, OP_ALT); 2541 if (BE (tree == NULL, 0)) 2542 goto parse_dup_op_espace; 2543 } 2544 2545 if (old_tree) 2546 tree = create_tree (dfa, old_tree, tree, CONCAT); 2547 2548 return tree; 2549 2550 parse_dup_op_espace: 2551 *err = REG_ESPACE; 2552 return NULL; 2553 } 2554 2555 /* Size of the names for collating symbol/equivalence_class/character_class. 2556 I'm not sure, but maybe enough. */ 2557 #define BRACKET_NAME_BUF_SIZE 32 2558 2559 #ifndef _LIBC 2560 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2561 Build the range expression which starts from START_ELEM, and ends 2562 at END_ELEM. The result are written to MBCSET and SBCSET. 2563 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2564 mbcset->range_ends, is a pointer argument sinse we may 2565 update it. */ 2566 2567 static reg_errcode_t 2568 build_range_exp (bitset sbcset, 2569 # ifdef RE_ENABLE_I18N 2570 re_charset_t *mbcset, Idx *range_alloc, 2571 # endif 2572 bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2573 { 2574 unsigned int start_ch, end_ch; 2575 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2576 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2577 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2578 0)) 2579 return REG_ERANGE; 2580 2581 /* We can handle no multi character collating elements without libc 2582 support. */ 2583 if (BE ((start_elem->type == COLL_SYM 2584 && strlen ((char *) start_elem->opr.name) > 1) 2585 || (end_elem->type == COLL_SYM 2586 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2587 return REG_ECOLLATE; 2588 2589 # ifdef RE_ENABLE_I18N 2590 { 2591 wchar_t wc; 2592 wint_t start_wc, end_wc; 2593 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2594 2595 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2596 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2597 : 0)); 2598 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2599 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2600 : 0)); 2601 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2602 ? __btowc (start_ch) : start_elem->opr.wch); 2603 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2604 ? __btowc (end_ch) : end_elem->opr.wch); 2605 if (start_wc == WEOF || end_wc == WEOF) 2606 return REG_ECOLLATE; 2607 cmp_buf[0] = start_wc; 2608 cmp_buf[4] = end_wc; 2609 if (wcscoll (cmp_buf, cmp_buf + 4) > 0) 2610 return REG_ERANGE; 2611 2612 /* Got valid collation sequence values, add them as a new entry. 2613 However, for !_LIBC we have no collation elements: if the 2614 character set is single byte, the single byte character set 2615 that we build below suffices. parse_bracket_exp passes 2616 no MBCSET if dfa->mb_cur_max == 1. */ 2617 if (mbcset) 2618 { 2619 /* Check the space of the arrays. */ 2620 if (BE (*range_alloc == mbcset->nranges, 0)) 2621 { 2622 /* There is not enough space, need realloc. */ 2623 wchar_t *new_array_start, *new_array_end; 2624 Idx new_nranges; 2625 2626 new_nranges = mbcset->nranges; 2627 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2628 are NULL if *range_alloc == 0. */ 2629 new_array_start = re_x2realloc (mbcset->range_starts, wchar_t, 2630 &new_nranges); 2631 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2632 new_nranges); 2633 2634 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2635 return REG_ESPACE; 2636 2637 mbcset->range_starts = new_array_start; 2638 mbcset->range_ends = new_array_end; 2639 *range_alloc = new_nranges; 2640 } 2641 2642 mbcset->range_starts[mbcset->nranges] = start_wc; 2643 mbcset->range_ends[mbcset->nranges++] = end_wc; 2644 } 2645 2646 /* Build the table for single byte characters. */ 2647 for (wc = 0; wc < SBC_MAX; ++wc) 2648 { 2649 cmp_buf[2] = wc; 2650 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2651 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2652 bitset_set (sbcset, wc); 2653 } 2654 } 2655 # else /* not RE_ENABLE_I18N */ 2656 { 2657 unsigned int ch; 2658 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2659 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2660 : 0)); 2661 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2662 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2663 : 0)); 2664 if (start_ch > end_ch) 2665 return REG_ERANGE; 2666 /* Build the table for single byte characters. */ 2667 for (ch = 0; ch < SBC_MAX; ++ch) 2668 if (start_ch <= ch && ch <= end_ch) 2669 bitset_set (sbcset, ch); 2670 } 2671 # endif /* not RE_ENABLE_I18N */ 2672 return REG_NOERROR; 2673 } 2674 #endif /* not _LIBC */ 2675 2676 #ifndef _LIBC 2677 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2678 Build the collating element which is represented by NAME. 2679 The result are written to MBCSET and SBCSET. 2680 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2681 pointer argument since we may update it. */ 2682 2683 static reg_errcode_t 2684 build_collating_symbol (bitset sbcset, 2685 # ifdef RE_ENABLE_I18N 2686 re_charset_t *mbcset, Idx *coll_sym_alloc, 2687 # endif 2688 const unsigned char *name) 2689 { 2690 size_t name_len = strlen ((const char *) name); 2691 if (BE (name_len != 1, 0)) 2692 return REG_ECOLLATE; 2693 else 2694 { 2695 bitset_set (sbcset, name[0]); 2696 return REG_NOERROR; 2697 } 2698 } 2699 #endif /* not _LIBC */ 2700 2701 /* This function parse bracket expression like "[abc]", "[a-c]", 2702 "[[.a-a.]]" etc. */ 2703 2704 static bin_tree_t * 2705 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2706 reg_syntax_t syntax, reg_errcode_t *err) 2707 { 2708 #ifdef _LIBC 2709 const unsigned char *collseqmb; 2710 const char *collseqwc; 2711 uint32_t nrules; 2712 int32_t table_size; 2713 const int32_t *symb_table; 2714 const unsigned char *extra; 2715 2716 /* Local function for parse_bracket_exp used in _LIBC environement. 2717 Seek the collating symbol entry correspondings to NAME. 2718 Return the index of the symbol in the SYMB_TABLE. */ 2719 2720 auto inline int32_t 2721 __attribute ((always_inline)) 2722 seek_collating_symbol_entry (const unsigned char *name, size_t name_len) 2723 { 2724 int32_t hash = elem_hash ((const char *) name, name_len); 2725 int32_t elem = hash % table_size; 2726 int32_t second = hash % (table_size - 2); 2727 while (symb_table[2 * elem] != 0) 2728 { 2729 /* First compare the hashing value. */ 2730 if (symb_table[2 * elem] == hash 2731 /* Compare the length of the name. */ 2732 && name_len == extra[symb_table[2 * elem + 1]] 2733 /* Compare the name. */ 2734 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2735 name_len) == 0) 2736 { 2737 /* Yep, this is the entry. */ 2738 break; 2739 } 2740 2741 /* Next entry. */ 2742 elem += second; 2743 } 2744 return elem; 2745 } 2746 2747 /* Local function for parse_bracket_exp used in _LIBC environement. 2748 Look up the collation sequence value of BR_ELEM. 2749 Return the value if succeeded, UINT_MAX otherwise. */ 2750 2751 auto inline unsigned int 2752 __attribute ((always_inline)) 2753 lookup_collation_sequence_value (bracket_elem_t *br_elem) 2754 { 2755 if (br_elem->type == SB_CHAR) 2756 { 2757 /* 2758 if (MB_CUR_MAX == 1) 2759 */ 2760 if (nrules == 0) 2761 return collseqmb[br_elem->opr.ch]; 2762 else 2763 { 2764 wint_t wc = __btowc (br_elem->opr.ch); 2765 return __collseq_table_lookup (collseqwc, wc); 2766 } 2767 } 2768 else if (br_elem->type == MB_CHAR) 2769 { 2770 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2771 } 2772 else if (br_elem->type == COLL_SYM) 2773 { 2774 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2775 if (nrules != 0) 2776 { 2777 int32_t elem, idx; 2778 elem = seek_collating_symbol_entry (br_elem->opr.name, 2779 sym_name_len); 2780 if (symb_table[2 * elem] != 0) 2781 { 2782 /* We found the entry. */ 2783 idx = symb_table[2 * elem + 1]; 2784 /* Skip the name of collating element name. */ 2785 idx += 1 + extra[idx]; 2786 /* Skip the byte sequence of the collating element. */ 2787 idx += 1 + extra[idx]; 2788 /* Adjust for the alignment. */ 2789 idx = (idx + 3) & ~3; 2790 /* Skip the multibyte collation sequence value. */ 2791 idx += sizeof (unsigned int); 2792 /* Skip the wide char sequence of the collating element. */ 2793 idx += sizeof (unsigned int) * 2794 (1 + *(unsigned int *) (extra + idx)); 2795 /* Return the collation sequence value. */ 2796 return *(unsigned int *) (extra + idx); 2797 } 2798 else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2799 { 2800 /* No valid character. Match it as a single byte 2801 character. */ 2802 return collseqmb[br_elem->opr.name[0]]; 2803 } 2804 } 2805 else if (sym_name_len == 1) 2806 return collseqmb[br_elem->opr.name[0]]; 2807 } 2808 return UINT_MAX; 2809 } 2810 2811 /* Local function for parse_bracket_exp used in _LIBC environement. 2812 Build the range expression which starts from START_ELEM, and ends 2813 at END_ELEM. The result are written to MBCSET and SBCSET. 2814 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2815 mbcset->range_ends, is a pointer argument sinse we may 2816 update it. */ 2817 2818 auto inline reg_errcode_t 2819 __attribute ((always_inline)) 2820 build_range_exp (bitset sbcset, re_charset_t *mbcset, 2821 Idx *range_alloc, 2822 bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2823 { 2824 unsigned int ch; 2825 uint32_t start_collseq; 2826 uint32_t end_collseq; 2827 2828 /* Equivalence Classes and Character Classes can't be a range 2829 start/end. */ 2830 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2831 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2832 0)) 2833 return REG_ERANGE; 2834 2835 start_collseq = lookup_collation_sequence_value (start_elem); 2836 end_collseq = lookup_collation_sequence_value (end_elem); 2837 /* Check start/end collation sequence values. */ 2838 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2839 return REG_ECOLLATE; 2840 if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2841 return REG_ERANGE; 2842 2843 /* Got valid collation sequence values, add them as a new entry. 2844 However, if we have no collation elements, and the character set 2845 is single byte, the single byte character set that we 2846 build below suffices. */ 2847 if (nrules > 0 || dfa->mb_cur_max > 1) 2848 { 2849 /* Check the space of the arrays. */ 2850 if (BE (*range_alloc == mbcset->nranges, 0)) 2851 { 2852 /* There is not enough space, need realloc. */ 2853 uint32_t *new_array_start; 2854 uint32_t *new_array_end; 2855 Idx new_nranges; 2856 2857 new_nranges = mbcset->nranges; 2858 new_array_start = re_x2realloc (mbcset->range_starts, uint32_t, 2859 &new_nranges); 2860 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2861 new_nranges); 2862 2863 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2864 return REG_ESPACE; 2865 2866 mbcset->range_starts = new_array_start; 2867 mbcset->range_ends = new_array_end; 2868 *range_alloc = new_nranges; 2869 } 2870 2871 mbcset->range_starts[mbcset->nranges] = start_collseq; 2872 mbcset->range_ends[mbcset->nranges++] = end_collseq; 2873 } 2874 2875 /* Build the table for single byte characters. */ 2876 for (ch = 0; ch < SBC_MAX; ch++) 2877 { 2878 uint32_t ch_collseq; 2879 /* 2880 if (MB_CUR_MAX == 1) 2881 */ 2882 if (nrules == 0) 2883 ch_collseq = collseqmb[ch]; 2884 else 2885 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 2886 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2887 bitset_set (sbcset, ch); 2888 } 2889 return REG_NOERROR; 2890 } 2891 2892 /* Local function for parse_bracket_exp used in _LIBC environement. 2893 Build the collating element which is represented by NAME. 2894 The result are written to MBCSET and SBCSET. 2895 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2896 pointer argument sinse we may update it. */ 2897 2898 auto inline reg_errcode_t 2899 __attribute ((always_inline)) 2900 build_collating_symbol (bitset sbcset, re_charset_t *mbcset, 2901 Idx *coll_sym_alloc, const unsigned char *name) 2902 { 2903 int32_t elem, idx; 2904 size_t name_len = strlen ((const char *) name); 2905 if (nrules != 0) 2906 { 2907 elem = seek_collating_symbol_entry (name, name_len); 2908 if (symb_table[2 * elem] != 0) 2909 { 2910 /* We found the entry. */ 2911 idx = symb_table[2 * elem + 1]; 2912 /* Skip the name of collating element name. */ 2913 idx += 1 + extra[idx]; 2914 } 2915 else if (symb_table[2 * elem] == 0 && name_len == 1) 2916 { 2917 /* No valid character, treat it as a normal 2918 character. */ 2919 bitset_set (sbcset, name[0]); 2920 return REG_NOERROR; 2921 } 2922 else 2923 return REG_ECOLLATE; 2924 2925 /* Got valid collation sequence, add it as a new entry. */ 2926 /* Check the space of the arrays. */ 2927 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 2928 { 2929 /* Not enough, realloc it. */ 2930 Idx new_coll_sym_alloc = mbcset->ncoll_syms; 2931 /* Use realloc since mbcset->coll_syms is NULL 2932 if *alloc == 0. */ 2933 int32_t *new_coll_syms = re_x2realloc (mbcset->coll_syms, int32_t, 2934 &new_coll_sym_alloc); 2935 if (BE (new_coll_syms == NULL, 0)) 2936 return REG_ESPACE; 2937 mbcset->coll_syms = new_coll_syms; 2938 *coll_sym_alloc = new_coll_sym_alloc; 2939 } 2940 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 2941 return REG_NOERROR; 2942 } 2943 else 2944 { 2945 if (BE (name_len != 1, 0)) 2946 return REG_ECOLLATE; 2947 else 2948 { 2949 bitset_set (sbcset, name[0]); 2950 return REG_NOERROR; 2951 } 2952 } 2953 } 2954 #endif 2955 2956 re_token_t br_token; 2957 re_bitset_ptr_t sbcset; 2958 #ifdef RE_ENABLE_I18N 2959 re_charset_t *mbcset; 2960 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 2961 Idx equiv_class_alloc = 0, char_class_alloc = 0; 2962 #endif /* not RE_ENABLE_I18N */ 2963 bool non_match = false; 2964 bin_tree_t *work_tree; 2965 int token_len; 2966 bool first_round = true; 2967 #ifdef _LIBC 2968 collseqmb = (const unsigned char *) 2969 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 2970 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 2971 if (nrules) 2972 { 2973 /* 2974 if (MB_CUR_MAX > 1) 2975 */ 2976 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 2977 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 2978 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 2979 _NL_COLLATE_SYMB_TABLEMB); 2980 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 2981 _NL_COLLATE_SYMB_EXTRAMB); 2982 } 2983 #endif 2984 sbcset = re_calloc (bitset_word, BITSET_WORDS); 2985 #ifdef RE_ENABLE_I18N 2986 mbcset = re_calloc (re_charset_t, 1); 2987 #endif /* RE_ENABLE_I18N */ 2988 #ifdef RE_ENABLE_I18N 2989 if (BE (sbcset == NULL || mbcset == NULL, 0)) 2990 #else 2991 if (BE (sbcset == NULL, 0)) 2992 #endif /* RE_ENABLE_I18N */ 2993 { 2994 *err = REG_ESPACE; 2995 return NULL; 2996 } 2997 2998 token_len = peek_token_bracket (token, regexp, syntax); 2999 if (BE (token->type == END_OF_RE, 0)) 3000 { 3001 *err = REG_BADPAT; 3002 goto parse_bracket_exp_free_return; 3003 } 3004 if (token->type == OP_NON_MATCH_LIST) 3005 { 3006 #ifdef RE_ENABLE_I18N 3007 mbcset->non_match = 1; 3008 #endif /* not RE_ENABLE_I18N */ 3009 non_match = true; 3010 if (syntax & REG_HAT_LISTS_NOT_NEWLINE) 3011 bitset_set (sbcset, '\0'); 3012 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3013 token_len = peek_token_bracket (token, regexp, syntax); 3014 if (BE (token->type == END_OF_RE, 0)) 3015 { 3016 *err = REG_BADPAT; 3017 goto parse_bracket_exp_free_return; 3018 } 3019 } 3020 3021 /* We treat the first ']' as a normal character. */ 3022 if (token->type == OP_CLOSE_BRACKET) 3023 token->type = CHARACTER; 3024 3025 while (1) 3026 { 3027 bracket_elem_t start_elem, end_elem; 3028 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3029 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3030 reg_errcode_t ret; 3031 int token_len2 = 0; 3032 bool is_range_exp = false; 3033 re_token_t token2; 3034 3035 start_elem.opr.name = start_name_buf; 3036 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3037 syntax, first_round); 3038 if (BE (ret != REG_NOERROR, 0)) 3039 { 3040 *err = ret; 3041 goto parse_bracket_exp_free_return; 3042 } 3043 first_round = false; 3044 3045 /* Get information about the next token. We need it in any case. */ 3046 token_len = peek_token_bracket (token, regexp, syntax); 3047 3048 /* Do not check for ranges if we know they are not allowed. */ 3049 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3050 { 3051 if (BE (token->type == END_OF_RE, 0)) 3052 { 3053 *err = REG_EBRACK; 3054 goto parse_bracket_exp_free_return; 3055 } 3056 if (token->type == OP_CHARSET_RANGE) 3057 { 3058 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3059 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3060 if (BE (token2.type == END_OF_RE, 0)) 3061 { 3062 *err = REG_EBRACK; 3063 goto parse_bracket_exp_free_return; 3064 } 3065 if (token2.type == OP_CLOSE_BRACKET) 3066 { 3067 /* We treat the last '-' as a normal character. */ 3068 re_string_skip_bytes (regexp, -token_len); 3069 token->type = CHARACTER; 3070 } 3071 else 3072 is_range_exp = true; 3073 } 3074 } 3075 3076 if (is_range_exp == true) 3077 { 3078 end_elem.opr.name = end_name_buf; 3079 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3080 dfa, syntax, true); 3081 if (BE (ret != REG_NOERROR, 0)) 3082 { 3083 *err = ret; 3084 goto parse_bracket_exp_free_return; 3085 } 3086 3087 token_len = peek_token_bracket (token, regexp, syntax); 3088 3089 #ifdef _LIBC 3090 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3091 &start_elem, &end_elem); 3092 #else 3093 # ifdef RE_ENABLE_I18N 3094 *err = build_range_exp (sbcset, 3095 dfa->mb_cur_max > 1 ? mbcset : NULL, 3096 &range_alloc, &start_elem, &end_elem); 3097 # else 3098 *err = build_range_exp (sbcset, &start_elem, &end_elem); 3099 # endif 3100 #endif /* RE_ENABLE_I18N */ 3101 if (BE (*err != REG_NOERROR, 0)) 3102 goto parse_bracket_exp_free_return; 3103 } 3104 else 3105 { 3106 switch (start_elem.type) 3107 { 3108 case SB_CHAR: 3109 bitset_set (sbcset, start_elem.opr.ch); 3110 break; 3111 #ifdef RE_ENABLE_I18N 3112 case MB_CHAR: 3113 /* Check whether the array has enough space. */ 3114 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3115 { 3116 wchar_t *new_mbchars; 3117 /* Not enough, realloc it. */ 3118 mbchar_alloc = mbcset->nmbchars; 3119 /* Use realloc since array is NULL if *alloc == 0. */ 3120 new_mbchars = re_x2realloc (mbcset->mbchars, wchar_t, 3121 &mbchar_alloc); 3122 if (BE (new_mbchars == NULL, 0)) 3123 goto parse_bracket_exp_espace; 3124 mbcset->mbchars = new_mbchars; 3125 } 3126 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3127 break; 3128 #endif /* RE_ENABLE_I18N */ 3129 case EQUIV_CLASS: 3130 *err = build_equiv_class (sbcset, 3131 #ifdef RE_ENABLE_I18N 3132 mbcset, &equiv_class_alloc, 3133 #endif /* RE_ENABLE_I18N */ 3134 start_elem.opr.name); 3135 if (BE (*err != REG_NOERROR, 0)) 3136 goto parse_bracket_exp_free_return; 3137 break; 3138 case COLL_SYM: 3139 *err = build_collating_symbol (sbcset, 3140 #ifdef RE_ENABLE_I18N 3141 mbcset, &coll_sym_alloc, 3142 #endif /* RE_ENABLE_I18N */ 3143 start_elem.opr.name); 3144 if (BE (*err != REG_NOERROR, 0)) 3145 goto parse_bracket_exp_free_return; 3146 break; 3147 case CHAR_CLASS: 3148 *err = build_charclass (regexp->trans, sbcset, 3149 #ifdef RE_ENABLE_I18N 3150 mbcset, &char_class_alloc, 3151 #endif /* RE_ENABLE_I18N */ 3152 start_elem.opr.name, syntax); 3153 if (BE (*err != REG_NOERROR, 0)) 3154 goto parse_bracket_exp_free_return; 3155 break; 3156 default: 3157 assert (0); 3158 break; 3159 } 3160 } 3161 if (BE (token->type == END_OF_RE, 0)) 3162 { 3163 *err = REG_EBRACK; 3164 goto parse_bracket_exp_free_return; 3165 } 3166 if (token->type == OP_CLOSE_BRACKET) 3167 break; 3168 } 3169 3170 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3171 3172 /* If it is non-matching list. */ 3173 if (non_match) 3174 bitset_not (sbcset); 3175 3176 #ifdef RE_ENABLE_I18N 3177 /* Ensure only single byte characters are set. */ 3178 if (dfa->mb_cur_max > 1) 3179 bitset_mask (sbcset, dfa->sb_char); 3180 3181 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3182 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3183 || mbcset->non_match))) 3184 { 3185 bin_tree_t *mbc_tree; 3186 int sbc_idx; 3187 /* Build a tree for complex bracket. */ 3188 dfa->has_mb_node = 1; 3189 br_token.type = COMPLEX_BRACKET; 3190 br_token.opr.mbcset = mbcset; 3191 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3192 if (BE (mbc_tree == NULL, 0)) 3193 goto parse_bracket_exp_espace; 3194 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3195 if (sbcset[sbc_idx]) 3196 break; 3197 /* If there are no bits set in sbcset, there is no point 3198 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3199 if (sbc_idx < BITSET_WORDS) 3200 { 3201 /* Build a tree for simple bracket. */ 3202 br_token.type = SIMPLE_BRACKET; 3203 br_token.opr.sbcset = sbcset; 3204 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3205 if (BE (work_tree == NULL, 0)) 3206 goto parse_bracket_exp_espace; 3207 3208 /* Then join them by ALT node. */ 3209 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3210 if (BE (work_tree == NULL, 0)) 3211 goto parse_bracket_exp_espace; 3212 } 3213 else 3214 { 3215 re_free (sbcset); 3216 work_tree = mbc_tree; 3217 } 3218 } 3219 else 3220 #endif /* not RE_ENABLE_I18N */ 3221 { 3222 #ifdef RE_ENABLE_I18N 3223 free_charset (mbcset); 3224 #endif 3225 /* Build a tree for simple bracket. */ 3226 br_token.type = SIMPLE_BRACKET; 3227 br_token.opr.sbcset = sbcset; 3228 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3229 if (BE (work_tree == NULL, 0)) 3230 goto parse_bracket_exp_espace; 3231 } 3232 return work_tree; 3233 3234 parse_bracket_exp_espace: 3235 *err = REG_ESPACE; 3236 parse_bracket_exp_free_return: 3237 re_free (sbcset); 3238 #ifdef RE_ENABLE_I18N 3239 free_charset (mbcset); 3240 #endif /* RE_ENABLE_I18N */ 3241 return NULL; 3242 } 3243 3244 /* Parse an element in the bracket expression. */ 3245 3246 static reg_errcode_t 3247 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3248 re_token_t *token, int token_len, re_dfa_t *dfa, 3249 reg_syntax_t syntax, bool accept_hyphen) 3250 { 3251 #ifdef RE_ENABLE_I18N 3252 int cur_char_size; 3253 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3254 if (cur_char_size > 1) 3255 { 3256 elem->type = MB_CHAR; 3257 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3258 re_string_skip_bytes (regexp, cur_char_size); 3259 return REG_NOERROR; 3260 } 3261 #endif /* RE_ENABLE_I18N */ 3262 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3263 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3264 || token->type == OP_OPEN_EQUIV_CLASS) 3265 return parse_bracket_symbol (elem, regexp, token); 3266 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3267 { 3268 /* A '-' must only appear as anything but a range indicator before 3269 the closing bracket. Everything else is an error. */ 3270 re_token_t token2; 3271 (void) peek_token_bracket (&token2, regexp, syntax); 3272 if (token2.type != OP_CLOSE_BRACKET) 3273 /* The actual error value is not standardized since this whole 3274 case is undefined. But ERANGE makes good sense. */ 3275 return REG_ERANGE; 3276 } 3277 elem->type = SB_CHAR; 3278 elem->opr.ch = token->opr.c; 3279 return REG_NOERROR; 3280 } 3281 3282 /* Parse a bracket symbol in the bracket expression. Bracket symbols are 3283 such as [:<character_class>:], [.<collating_element>.], and 3284 [=<equivalent_class>=]. */ 3285 3286 static reg_errcode_t 3287 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3288 re_token_t *token) 3289 { 3290 unsigned char ch, delim = token->opr.c; 3291 int i = 0; 3292 if (re_string_eoi(regexp)) 3293 return REG_EBRACK; 3294 for (;; ++i) 3295 { 3296 if (i >= BRACKET_NAME_BUF_SIZE) 3297 return REG_EBRACK; 3298 if (token->type == OP_OPEN_CHAR_CLASS) 3299 ch = re_string_fetch_byte_case (regexp); 3300 else 3301 ch = re_string_fetch_byte (regexp); 3302 if (re_string_eoi(regexp)) 3303 return REG_EBRACK; 3304 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3305 break; 3306 elem->opr.name[i] = ch; 3307 } 3308 re_string_skip_bytes (regexp, 1); 3309 elem->opr.name[i] = '\0'; 3310 switch (token->type) 3311 { 3312 case OP_OPEN_COLL_ELEM: 3313 elem->type = COLL_SYM; 3314 break; 3315 case OP_OPEN_EQUIV_CLASS: 3316 elem->type = EQUIV_CLASS; 3317 break; 3318 case OP_OPEN_CHAR_CLASS: 3319 elem->type = CHAR_CLASS; 3320 break; 3321 default: 3322 break; 3323 } 3324 return REG_NOERROR; 3325 } 3326 3327 /* Helper function for parse_bracket_exp. 3328 Build the equivalence class which is represented by NAME. 3329 The result are written to MBCSET and SBCSET. 3330 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3331 is a pointer argument sinse we may update it. */ 3332 3333 static reg_errcode_t 3334 build_equiv_class (bitset sbcset, 3335 #ifdef RE_ENABLE_I18N 3336 re_charset_t *mbcset, Idx *equiv_class_alloc, 3337 #endif 3338 const unsigned char *name) 3339 { 3340 #if defined _LIBC 3341 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3342 if (nrules != 0) 3343 { 3344 const int32_t *table, *indirect; 3345 const unsigned char *weights, *extra, *cp; 3346 unsigned char char_buf[2]; 3347 int32_t idx1, idx2; 3348 unsigned int ch; 3349 size_t len; 3350 /* This #include defines a local function! */ 3351 # include <locale/weight.h> 3352 /* Calculate the index for equivalence class. */ 3353 cp = name; 3354 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3355 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3356 _NL_COLLATE_WEIGHTMB); 3357 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3358 _NL_COLLATE_EXTRAMB); 3359 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3360 _NL_COLLATE_INDIRECTMB); 3361 idx1 = findidx (&cp); 3362 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) 3363 /* This isn't a valid character. */ 3364 return REG_ECOLLATE; 3365 3366 /* Build single byte matcing table for this equivalence class. */ 3367 char_buf[1] = (unsigned char) '\0'; 3368 len = weights[idx1]; 3369 for (ch = 0; ch < SBC_MAX; ++ch) 3370 { 3371 char_buf[0] = ch; 3372 cp = char_buf; 3373 idx2 = findidx (&cp); 3374 /* 3375 idx2 = table[ch]; 3376 */ 3377 if (idx2 == 0) 3378 /* This isn't a valid character. */ 3379 continue; 3380 if (len == weights[idx2]) 3381 { 3382 int cnt = 0; 3383 while (cnt <= len && 3384 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) 3385 ++cnt; 3386 3387 if (cnt > len) 3388 bitset_set (sbcset, ch); 3389 } 3390 } 3391 /* Check whether the array has enough space. */ 3392 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3393 { 3394 /* Not enough, realloc it. */ 3395 Idx new_equiv_class_alloc = mbcset->nequiv_classes; 3396 /* Use realloc since the array is NULL if *alloc == 0. */ 3397 int32_t *new_equiv_classes = re_x2realloc (mbcset->equiv_classes, 3398 int32_t, 3399 &new_equiv_class_alloc); 3400 if (BE (new_equiv_classes == NULL, 0)) 3401 return REG_ESPACE; 3402 mbcset->equiv_classes = new_equiv_classes; 3403 *equiv_class_alloc = new_equiv_class_alloc; 3404 } 3405 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3406 } 3407 else 3408 #endif /* _LIBC */ 3409 { 3410 if (BE (strlen ((const char *) name) != 1, 0)) 3411 return REG_ECOLLATE; 3412 bitset_set (sbcset, *name); 3413 } 3414 return REG_NOERROR; 3415 } 3416 3417 /* Helper function for parse_bracket_exp. 3418 Build the character class which is represented by NAME. 3419 The result are written to MBCSET and SBCSET. 3420 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3421 is a pointer argument sinse we may update it. */ 3422 3423 static reg_errcode_t 3424 build_charclass (unsigned REG_TRANSLATE_TYPE trans, bitset sbcset, 3425 #ifdef RE_ENABLE_I18N 3426 re_charset_t *mbcset, Idx *char_class_alloc, 3427 #endif 3428 const unsigned char *class_name, reg_syntax_t syntax) 3429 { 3430 int i; 3431 const char *name = (const char *) class_name; 3432 3433 /* In case of REG_ICASE "upper" and "lower" match the both of 3434 upper and lower cases. */ 3435 if ((syntax & REG_IGNORE_CASE) 3436 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3437 name = "alpha"; 3438 3439 #ifdef RE_ENABLE_I18N 3440 /* Check the space of the arrays. */ 3441 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3442 { 3443 /* Not enough, realloc it. */ 3444 Idx new_char_class_alloc = mbcset->nchar_classes; 3445 /* Use realloc since array is NULL if *alloc == 0. */ 3446 wctype_t *new_char_classes = re_x2realloc (mbcset->char_classes, wctype_t, 3447 &new_char_class_alloc); 3448 if (BE (new_char_classes == NULL, 0)) 3449 return REG_ESPACE; 3450 mbcset->char_classes = new_char_classes; 3451 *char_class_alloc = new_char_class_alloc; 3452 } 3453 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3454 #endif /* RE_ENABLE_I18N */ 3455 3456 #define BUILD_CHARCLASS_LOOP(ctype_func) \ 3457 for (i = 0; i < SBC_MAX; ++i) \ 3458 { \ 3459 if (ctype_func (i)) \ 3460 { \ 3461 int ch = trans ? trans[i] : i; \ 3462 bitset_set (sbcset, ch); \ 3463 } \ 3464 } 3465 3466 if (strcmp (name, "alnum") == 0) 3467 BUILD_CHARCLASS_LOOP (isalnum) 3468 else if (strcmp (name, "cntrl") == 0) 3469 BUILD_CHARCLASS_LOOP (iscntrl) 3470 else if (strcmp (name, "lower") == 0) 3471 BUILD_CHARCLASS_LOOP (islower) 3472 else if (strcmp (name, "space") == 0) 3473 BUILD_CHARCLASS_LOOP (isspace) 3474 else if (strcmp (name, "alpha") == 0) 3475 BUILD_CHARCLASS_LOOP (isalpha) 3476 else if (strcmp (name, "digit") == 0) 3477 BUILD_CHARCLASS_LOOP (isdigit) 3478 else if (strcmp (name, "print") == 0) 3479 BUILD_CHARCLASS_LOOP (isprint) 3480 else if (strcmp (name, "upper") == 0) 3481 BUILD_CHARCLASS_LOOP (isupper) 3482 else if (strcmp (name, "blank") == 0) 3483 BUILD_CHARCLASS_LOOP (isblank) 3484 else if (strcmp (name, "graph") == 0) 3485 BUILD_CHARCLASS_LOOP (isgraph) 3486 else if (strcmp (name, "punct") == 0) 3487 BUILD_CHARCLASS_LOOP (ispunct) 3488 else if (strcmp (name, "xdigit") == 0) 3489 BUILD_CHARCLASS_LOOP (isxdigit) 3490 else 3491 return REG_ECTYPE; 3492 3493 return REG_NOERROR; 3494 } 3495 3496 static bin_tree_t * 3497 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans, 3498 const unsigned char *class_name, 3499 const unsigned char *extra, 3500 bool non_match, reg_errcode_t *err) 3501 { 3502 re_bitset_ptr_t sbcset; 3503 #ifdef RE_ENABLE_I18N 3504 re_charset_t *mbcset; 3505 Idx alloc = 0; 3506 #endif /* not RE_ENABLE_I18N */ 3507 reg_errcode_t ret; 3508 re_token_t br_token; 3509 bin_tree_t *tree; 3510 3511 sbcset = re_calloc (bitset_word, BITSET_WORDS); 3512 #ifdef RE_ENABLE_I18N 3513 mbcset = re_calloc (re_charset_t, 1); 3514 #endif /* RE_ENABLE_I18N */ 3515 3516 #ifdef RE_ENABLE_I18N 3517 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3518 #else /* not RE_ENABLE_I18N */ 3519 if (BE (sbcset == NULL, 0)) 3520 #endif /* not RE_ENABLE_I18N */ 3521 { 3522 *err = REG_ESPACE; 3523 return NULL; 3524 } 3525 3526 if (non_match) 3527 { 3528 #ifdef RE_ENABLE_I18N 3529 /* 3530 if (syntax & REG_HAT_LISTS_NOT_NEWLINE) 3531 bitset_set(cset->sbcset, '\0'); 3532 */ 3533 mbcset->non_match = 1; 3534 #endif /* not RE_ENABLE_I18N */ 3535 } 3536 3537 /* We don't care the syntax in this case. */ 3538 ret = build_charclass (trans, sbcset, 3539 #ifdef RE_ENABLE_I18N 3540 mbcset, &alloc, 3541 #endif /* RE_ENABLE_I18N */ 3542 class_name, 0); 3543 3544 if (BE (ret != REG_NOERROR, 0)) 3545 { 3546 re_free (sbcset); 3547 #ifdef RE_ENABLE_I18N 3548 free_charset (mbcset); 3549 #endif /* RE_ENABLE_I18N */ 3550 *err = ret; 3551 return NULL; 3552 } 3553 /* \w match '_' also. */ 3554 for (; *extra; extra++) 3555 bitset_set (sbcset, *extra); 3556 3557 /* If it is non-matching list. */ 3558 if (non_match) 3559 bitset_not (sbcset); 3560 3561 #ifdef RE_ENABLE_I18N 3562 /* Ensure only single byte characters are set. */ 3563 if (dfa->mb_cur_max > 1) 3564 bitset_mask (sbcset, dfa->sb_char); 3565 #endif 3566 3567 /* Build a tree for simple bracket. */ 3568 br_token.type = SIMPLE_BRACKET; 3569 br_token.opr.sbcset = sbcset; 3570 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3571 if (BE (tree == NULL, 0)) 3572 goto build_word_op_espace; 3573 3574 #ifdef RE_ENABLE_I18N 3575 if (dfa->mb_cur_max > 1) 3576 { 3577 bin_tree_t *mbc_tree; 3578 /* Build a tree for complex bracket. */ 3579 br_token.type = COMPLEX_BRACKET; 3580 br_token.opr.mbcset = mbcset; 3581 dfa->has_mb_node = 1; 3582 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3583 if (BE (mbc_tree == NULL, 0)) 3584 goto build_word_op_espace; 3585 /* Then join them by ALT node. */ 3586 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3587 if (BE (mbc_tree != NULL, 1)) 3588 return tree; 3589 } 3590 else 3591 { 3592 free_charset (mbcset); 3593 return tree; 3594 } 3595 #else /* not RE_ENABLE_I18N */ 3596 return tree; 3597 #endif /* not RE_ENABLE_I18N */ 3598 3599 build_word_op_espace: 3600 re_free (sbcset); 3601 #ifdef RE_ENABLE_I18N 3602 free_charset (mbcset); 3603 #endif /* RE_ENABLE_I18N */ 3604 *err = REG_ESPACE; 3605 return NULL; 3606 } 3607 3608 /* This is intended for the expressions like "a{1,3}". 3609 Fetch a number from `input', and return the number. 3610 Return REG_MISSING if the number field is empty like "{,1}". 3611 Return REG_ERROR if an error occurred. */ 3612 3613 static Idx 3614 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3615 { 3616 Idx num = REG_MISSING; 3617 unsigned char c; 3618 while (1) 3619 { 3620 fetch_token (token, input, syntax); 3621 c = token->opr.c; 3622 if (BE (token->type == END_OF_RE, 0)) 3623 return REG_ERROR; 3624 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3625 break; 3626 num = ((token->type != CHARACTER || c < '0' || '9' < c 3627 || num == REG_ERROR) 3628 ? REG_ERROR 3629 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0')); 3630 num = (num > REG_DUP_MAX) ? REG_ERROR : num; 3631 } 3632 return num; 3633 } 3634 3635 #ifdef RE_ENABLE_I18N 3636 static void 3637 free_charset (re_charset_t *cset) 3638 { 3639 re_free (cset->mbchars); 3640 # ifdef _LIBC 3641 re_free (cset->coll_syms); 3642 re_free (cset->equiv_classes); 3643 re_free (cset->range_starts); 3644 re_free (cset->range_ends); 3645 # endif 3646 re_free (cset->char_classes); 3647 re_free (cset); 3648 } 3649 #endif /* RE_ENABLE_I18N */ 3650 3651 /* Functions for binary tree operation. */ 3652 3653 /* Create a tree node. */ 3654 3655 static bin_tree_t * 3656 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3657 re_token_type_t type) 3658 { 3659 re_token_t t; 3660 t.type = type; 3661 return create_token_tree (dfa, left, right, &t); 3662 } 3663 3664 static bin_tree_t * 3665 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3666 const re_token_t *token) 3667 { 3668 bin_tree_t *tree; 3669 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3670 { 3671 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3672 3673 if (storage == NULL) 3674 return NULL; 3675 storage->next = dfa->str_tree_storage; 3676 dfa->str_tree_storage = storage; 3677 dfa->str_tree_storage_idx = 0; 3678 } 3679 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3680 3681 tree->parent = NULL; 3682 tree->left = left; 3683 tree->right = right; 3684 tree->token = *token; 3685 tree->token.duplicated = 0; 3686 tree->token.opt_subexp = 0; 3687 tree->first = NULL; 3688 tree->next = NULL; 3689 tree->node_idx = REG_MISSING; 3690 3691 if (left != NULL) 3692 left->parent = tree; 3693 if (right != NULL) 3694 right->parent = tree; 3695 return tree; 3696 } 3697 3698 /* Mark the tree SRC as an optional subexpression. 3699 To be called from preorder or postorder. */ 3700 3701 static reg_errcode_t 3702 mark_opt_subexp (void *extra, bin_tree_t *node) 3703 { 3704 Idx idx = (Idx) (long) extra; 3705 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3706 node->token.opt_subexp = 1; 3707 3708 return REG_NOERROR; 3709 } 3710 3711 /* Free the allocated memory inside NODE. */ 3712 3713 static void 3714 free_token (re_token_t *node) 3715 { 3716 #ifdef RE_ENABLE_I18N 3717 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3718 free_charset (node->opr.mbcset); 3719 else 3720 #endif /* RE_ENABLE_I18N */ 3721 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3722 re_free (node->opr.sbcset); 3723 } 3724 3725 /* Worker function for tree walking. Free the allocated memory inside NODE 3726 and its children. */ 3727 3728 static reg_errcode_t 3729 free_tree (void *extra, bin_tree_t *node) 3730 { 3731 free_token (&node->token); 3732 return REG_NOERROR; 3733 } 3734 3735 3736 /* Duplicate the node SRC, and return new node. This is a preorder 3737 visit similar to the one implemented by the generic visitor, but 3738 we need more infrastructure to maintain two parallel trees --- so, 3739 it's easier to duplicate. */ 3740 3741 static bin_tree_t * 3742 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3743 { 3744 const bin_tree_t *node; 3745 bin_tree_t *dup_root; 3746 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3747 3748 for (node = root; ; ) 3749 { 3750 /* Create a new tree and link it back to the current parent. */ 3751 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3752 if (*p_new == NULL) 3753 return NULL; 3754 (*p_new)->parent = dup_node; 3755 (*p_new)->token.duplicated = 1; 3756 dup_node = *p_new; 3757 3758 /* Go to the left node, or up and to the right. */ 3759 if (node->left) 3760 { 3761 node = node->left; 3762 p_new = &dup_node->left; 3763 } 3764 else 3765 { 3766 const bin_tree_t *prev = NULL; 3767 while (node->right == prev || node->right == NULL) 3768 { 3769 prev = node; 3770 node = node->parent; 3771 dup_node = dup_node->parent; 3772 if (!node) 3773 return dup_root; 3774 } 3775 node = node->right; 3776 p_new = &dup_node->right; 3777 } 3778 } 3779 } 3780