1 /* -*- buffer-read-only: t -*- vi: set ro: */ 2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3 /* Extended regular expression matching and search library. 4 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free 5 Software Foundation, Inc. 6 This file is part of the GNU C Library. 7 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 8 9 This program is free software; you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation; either version 3, or (at your option) 12 any later version. 13 14 This program is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License along 20 with this program; if not, write to the Free Software Foundation, 21 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 22 23 #include "verify.h" 24 #include "intprops.h" 25 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags, 26 Idx n) internal_function; 27 static void match_ctx_clean (re_match_context_t *mctx) internal_function; 28 static void match_ctx_free (re_match_context_t *cache) internal_function; 29 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node, 30 Idx str_idx, Idx from, Idx to) 31 internal_function; 32 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx) 33 internal_function; 34 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node, 35 Idx str_idx) internal_function; 36 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop, 37 Idx node, Idx str_idx) 38 internal_function; 39 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 40 re_dfastate_t **limited_sts, Idx last_node, 41 Idx last_str_idx) 42 internal_function; 43 static reg_errcode_t re_search_internal (const regex_t *preg, 44 const char *string, Idx length, 45 Idx start, Idx last_start, Idx stop, 46 size_t nmatch, regmatch_t pmatch[], 47 int eflags) internal_function; 48 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp, 49 const char *string1, Idx length1, 50 const char *string2, Idx length2, 51 Idx start, regoff_t range, 52 struct re_registers *regs, 53 Idx stop, bool ret_len) internal_function; 54 static regoff_t re_search_stub (struct re_pattern_buffer *bufp, 55 const char *string, Idx length, Idx start, 56 regoff_t range, Idx stop, 57 struct re_registers *regs, 58 bool ret_len) internal_function; 59 static unsigned int re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, 60 Idx nregs, int regs_allocated) 61 internal_function; 62 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx) 63 internal_function; 64 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match, 65 Idx *p_match_first) internal_function; 66 static Idx check_halt_state_context (const re_match_context_t *mctx, 67 const re_dfastate_t *state, Idx idx) 68 internal_function; 69 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, 70 regmatch_t *prev_idx_match, Idx cur_node, 71 Idx cur_idx, Idx nmatch) internal_function; 72 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, 73 Idx str_idx, Idx dest_node, Idx nregs, 74 regmatch_t *regs, 75 re_node_set *eps_via_nodes) 76 internal_function; 77 static reg_errcode_t set_regs (const regex_t *preg, 78 const re_match_context_t *mctx, 79 size_t nmatch, regmatch_t *pmatch, 80 bool fl_backtrack) internal_function; 81 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) 82 internal_function; 83 84 #ifdef RE_ENABLE_I18N 85 static int sift_states_iter_mb (const re_match_context_t *mctx, 86 re_sift_context_t *sctx, 87 Idx node_idx, Idx str_idx, Idx max_str_idx) 88 internal_function; 89 #endif /* RE_ENABLE_I18N */ 90 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx, 91 re_sift_context_t *sctx) 92 internal_function; 93 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx, 94 re_sift_context_t *sctx, Idx str_idx, 95 re_node_set *cur_dest) 96 internal_function; 97 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx, 98 re_sift_context_t *sctx, 99 Idx str_idx, 100 re_node_set *dest_nodes) 101 internal_function; 102 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa, 103 re_node_set *dest_nodes, 104 const re_node_set *candidates) 105 internal_function; 106 static bool check_dst_limits (const re_match_context_t *mctx, 107 const re_node_set *limits, 108 Idx dst_node, Idx dst_idx, Idx src_node, 109 Idx src_idx) internal_function; 110 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, 111 int boundaries, Idx subexp_idx, 112 Idx from_node, Idx bkref_idx) 113 internal_function; 114 static int check_dst_limits_calc_pos (const re_match_context_t *mctx, 115 Idx limit, Idx subexp_idx, 116 Idx node, Idx str_idx, 117 Idx bkref_idx) internal_function; 118 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa, 119 re_node_set *dest_nodes, 120 const re_node_set *candidates, 121 re_node_set *limits, 122 struct re_backref_cache_entry *bkref_ents, 123 Idx str_idx) internal_function; 124 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx, 125 re_sift_context_t *sctx, 126 Idx str_idx, const re_node_set *candidates) 127 internal_function; 128 static reg_errcode_t merge_state_array (const re_dfa_t *dfa, 129 re_dfastate_t **dst, 130 re_dfastate_t **src, Idx num) 131 internal_function; 132 static re_dfastate_t *find_recover_state (reg_errcode_t *err, 133 re_match_context_t *mctx) internal_function; 134 static re_dfastate_t *transit_state (reg_errcode_t *err, 135 re_match_context_t *mctx, 136 re_dfastate_t *state) internal_function; 137 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err, 138 re_match_context_t *mctx, 139 re_dfastate_t *next_state) 140 internal_function; 141 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx, 142 re_node_set *cur_nodes, 143 Idx str_idx) internal_function; 144 #if 0 145 static re_dfastate_t *transit_state_sb (reg_errcode_t *err, 146 re_match_context_t *mctx, 147 re_dfastate_t *pstate) 148 internal_function; 149 #endif 150 #ifdef RE_ENABLE_I18N 151 static reg_errcode_t transit_state_mb (re_match_context_t *mctx, 152 re_dfastate_t *pstate) 153 internal_function; 154 #endif /* RE_ENABLE_I18N */ 155 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx, 156 const re_node_set *nodes) 157 internal_function; 158 static reg_errcode_t get_subexp (re_match_context_t *mctx, 159 Idx bkref_node, Idx bkref_str_idx) 160 internal_function; 161 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx, 162 const re_sub_match_top_t *sub_top, 163 re_sub_match_last_t *sub_last, 164 Idx bkref_node, Idx bkref_str) 165 internal_function; 166 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 167 Idx subexp_idx, int type) internal_function; 168 static reg_errcode_t check_arrival (re_match_context_t *mctx, 169 state_array_t *path, Idx top_node, 170 Idx top_str, Idx last_node, Idx last_str, 171 int type) internal_function; 172 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx, 173 Idx str_idx, 174 re_node_set *cur_nodes, 175 re_node_set *next_nodes) 176 internal_function; 177 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa, 178 re_node_set *cur_nodes, 179 Idx ex_subexp, int type) 180 internal_function; 181 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa, 182 re_node_set *dst_nodes, 183 Idx target, Idx ex_subexp, 184 int type) internal_function; 185 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx, 186 re_node_set *cur_nodes, Idx cur_str, 187 Idx subexp_num, int type) 188 internal_function; 189 static bool build_trtable (const re_dfa_t *dfa, 190 re_dfastate_t *state) internal_function; 191 #ifdef RE_ENABLE_I18N 192 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, 193 const re_string_t *input, Idx idx) 194 internal_function; 195 # ifdef _LIBC 196 static unsigned int find_collation_sequence_value (const unsigned char *mbs, 197 size_t name_len) 198 internal_function; 199 # endif /* _LIBC */ 200 #endif /* RE_ENABLE_I18N */ 201 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa, 202 const re_dfastate_t *state, 203 re_node_set *states_node, 204 bitset_t *states_ch) internal_function; 205 static bool check_node_accept (const re_match_context_t *mctx, 206 const re_token_t *node, Idx idx) 207 internal_function; 208 static reg_errcode_t extend_buffers (re_match_context_t *mctx) 209 internal_function; 210 211 /* Entry point for POSIX code. */ 212 213 /* regexec searches for a given pattern, specified by PREG, in the 214 string STRING. 215 216 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 217 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 218 least NMATCH elements, and we set them to the offsets of the 219 corresponding matched substrings. 220 221 EFLAGS specifies `execution flags' which affect matching: if 222 REG_NOTBOL is set, then ^ does not match at the beginning of the 223 string; if REG_NOTEOL is set, then $ does not match at the end. 224 225 We return 0 if we find a match and REG_NOMATCH if not. */ 226 227 int 228 regexec (preg, string, nmatch, pmatch, eflags) 229 const regex_t *_Restrict_ preg; 230 const char *_Restrict_ string; 231 size_t nmatch; 232 regmatch_t pmatch[_Restrict_arr_]; 233 int eflags; 234 { 235 reg_errcode_t err; 236 Idx start, length; 237 #ifdef _LIBC 238 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 239 #endif 240 241 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) 242 return REG_BADPAT; 243 244 if (eflags & REG_STARTEND) 245 { 246 start = pmatch[0].rm_so; 247 length = pmatch[0].rm_eo; 248 } 249 else 250 { 251 start = 0; 252 length = strlen (string); 253 } 254 255 __libc_lock_lock (dfa->lock); 256 if (preg->no_sub) 257 err = re_search_internal (preg, string, length, start, length, 258 length, 0, NULL, eflags); 259 else 260 err = re_search_internal (preg, string, length, start, length, 261 length, nmatch, pmatch, eflags); 262 __libc_lock_unlock (dfa->lock); 263 return err != REG_NOERROR; 264 } 265 266 #ifdef _LIBC 267 # include <shlib-compat.h> 268 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4); 269 270 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) 271 __typeof__ (__regexec) __compat_regexec; 272 273 int 274 attribute_compat_text_section 275 __compat_regexec (const regex_t *_Restrict_ preg, 276 const char *_Restrict_ string, size_t nmatch, 277 regmatch_t pmatch[], int eflags) 278 { 279 return regexec (preg, string, nmatch, pmatch, 280 eflags & (REG_NOTBOL | REG_NOTEOL)); 281 } 282 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0); 283 # endif 284 #endif 285 286 /* Entry points for GNU code. */ 287 288 /* re_match, re_search, re_match_2, re_search_2 289 290 The former two functions operate on STRING with length LENGTH, 291 while the later two operate on concatenation of STRING1 and STRING2 292 with lengths LENGTH1 and LENGTH2, respectively. 293 294 re_match() matches the compiled pattern in BUFP against the string, 295 starting at index START. 296 297 re_search() first tries matching at index START, then it tries to match 298 starting from index START + 1, and so on. The last start position tried 299 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same 300 way as re_match().) 301 302 The parameter STOP of re_{match,search}_2 specifies that no match exceeding 303 the first STOP characters of the concatenation of the strings should be 304 concerned. 305 306 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match 307 and all groups is stored in REGS. (For the "_2" variants, the offsets are 308 computed relative to the concatenation, not relative to the individual 309 strings.) 310 311 On success, re_match* functions return the length of the match, re_search* 312 return the position of the start of the match. Return value -1 means no 313 match was found and -2 indicates an internal error. */ 314 315 regoff_t 316 re_match (bufp, string, length, start, regs) 317 struct re_pattern_buffer *bufp; 318 const char *string; 319 Idx length, start; 320 struct re_registers *regs; 321 { 322 return re_search_stub (bufp, string, length, start, 0, length, regs, true); 323 } 324 #ifdef _LIBC 325 weak_alias (__re_match, re_match) 326 #endif 327 328 regoff_t 329 re_search (bufp, string, length, start, range, regs) 330 struct re_pattern_buffer *bufp; 331 const char *string; 332 Idx length, start; 333 regoff_t range; 334 struct re_registers *regs; 335 { 336 return re_search_stub (bufp, string, length, start, range, length, regs, 337 false); 338 } 339 #ifdef _LIBC 340 weak_alias (__re_search, re_search) 341 #endif 342 343 regoff_t 344 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop) 345 struct re_pattern_buffer *bufp; 346 const char *string1, *string2; 347 Idx length1, length2, start, stop; 348 struct re_registers *regs; 349 { 350 return re_search_2_stub (bufp, string1, length1, string2, length2, 351 start, 0, regs, stop, true); 352 } 353 #ifdef _LIBC 354 weak_alias (__re_match_2, re_match_2) 355 #endif 356 357 regoff_t 358 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop) 359 struct re_pattern_buffer *bufp; 360 const char *string1, *string2; 361 Idx length1, length2, start, stop; 362 regoff_t range; 363 struct re_registers *regs; 364 { 365 return re_search_2_stub (bufp, string1, length1, string2, length2, 366 start, range, regs, stop, false); 367 } 368 #ifdef _LIBC 369 weak_alias (__re_search_2, re_search_2) 370 #endif 371 372 static regoff_t 373 internal_function 374 re_search_2_stub (struct re_pattern_buffer *bufp, 375 const char *string1, Idx length1, 376 const char *string2, Idx length2, 377 Idx start, regoff_t range, struct re_registers *regs, 378 Idx stop, bool ret_len) 379 { 380 const char *str; 381 regoff_t rval; 382 Idx len = length1 + length2; 383 char *s = NULL; 384 385 verify (! TYPE_SIGNED (Idx)); 386 if (BE (len < length1, 0)) 387 return -2; 388 /* if (BE (length1 < 0 || length2 < 0 || stop < 0, 0)) 389 return -2; */ 390 391 /* Concatenate the strings. */ 392 if (length2 > 0) 393 if (length1 > 0) 394 { 395 s = re_malloc (char, len); 396 397 if (BE (s == NULL, 0)) 398 return -2; 399 #ifdef _LIBC 400 memcpy (__mempcpy (s, string1, length1), string2, length2); 401 #else 402 memcpy (s, string1, length1); 403 memcpy (s + length1, string2, length2); 404 #endif 405 str = s; 406 } 407 else 408 str = string2; 409 else 410 str = string1; 411 412 rval = re_search_stub (bufp, str, len, start, range, stop, regs, 413 ret_len); 414 re_free (s); 415 return rval; 416 } 417 418 /* The parameters have the same meaning as those of re_search. 419 Additional parameters: 420 If RET_LEN is true the length of the match is returned (re_match style); 421 otherwise the position of the match is returned. */ 422 423 static regoff_t 424 internal_function 425 re_search_stub (struct re_pattern_buffer *bufp, 426 const char *string, Idx length, 427 Idx start, regoff_t range, Idx stop, struct re_registers *regs, 428 bool ret_len) 429 { 430 reg_errcode_t result; 431 regmatch_t *pmatch; 432 Idx nregs; 433 regoff_t rval; 434 int eflags = 0; 435 #ifdef _LIBC 436 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 437 #endif 438 Idx last_start = start + range; 439 440 /* Check for out-of-range. */ 441 verify (! TYPE_SIGNED (Idx)); 442 /* if (BE (start < 0, 0)) 443 return -1; */ 444 if (BE (start > length, 0)) 445 return -1; 446 if (BE (length < last_start || (0 <= range && last_start < start), 0)) 447 last_start = length; 448 else if (BE (/* last_start < 0 || */ (range < 0 && start <= last_start), 0)) 449 last_start = 0; 450 451 __libc_lock_lock (dfa->lock); 452 453 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; 454 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; 455 456 /* Compile fastmap if we haven't yet. */ 457 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate) 458 re_compile_fastmap (bufp); 459 460 if (BE (bufp->no_sub, 0)) 461 regs = NULL; 462 463 /* We need at least 1 register. */ 464 if (regs == NULL) 465 nregs = 1; 466 else if (BE (bufp->regs_allocated == REGS_FIXED 467 && regs->num_regs <= bufp->re_nsub, 0)) 468 { 469 nregs = regs->num_regs; 470 if (BE (nregs < 1, 0)) 471 { 472 /* Nothing can be copied to regs. */ 473 regs = NULL; 474 nregs = 1; 475 } 476 } 477 else 478 nregs = bufp->re_nsub + 1; 479 pmatch = re_malloc (regmatch_t, nregs); 480 if (BE (pmatch == NULL, 0)) 481 { 482 rval = -2; 483 goto out; 484 } 485 486 result = re_search_internal (bufp, string, length, start, last_start, stop, 487 nregs, pmatch, eflags); 488 489 rval = 0; 490 491 /* I hope we needn't fill ther regs with -1's when no match was found. */ 492 if (result != REG_NOERROR) 493 rval = -1; 494 else if (regs != NULL) 495 { 496 /* If caller wants register contents data back, copy them. */ 497 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs, 498 bufp->regs_allocated); 499 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0)) 500 rval = -2; 501 } 502 503 if (BE (rval == 0, 1)) 504 { 505 if (ret_len) 506 { 507 assert (pmatch[0].rm_so == start); 508 rval = pmatch[0].rm_eo - start; 509 } 510 else 511 rval = pmatch[0].rm_so; 512 } 513 re_free (pmatch); 514 out: 515 __libc_lock_unlock (dfa->lock); 516 return rval; 517 } 518 519 static unsigned int 520 internal_function 521 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs, 522 int regs_allocated) 523 { 524 int rval = REGS_REALLOCATE; 525 Idx i; 526 Idx need_regs = nregs + 1; 527 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code 528 uses. */ 529 530 /* Have the register data arrays been allocated? */ 531 if (regs_allocated == REGS_UNALLOCATED) 532 { /* No. So allocate them with malloc. */ 533 regs->start = re_malloc (regoff_t, need_regs); 534 if (BE (regs->start == NULL, 0)) 535 return REGS_UNALLOCATED; 536 regs->end = re_malloc (regoff_t, need_regs); 537 if (BE (regs->end == NULL, 0)) 538 { 539 re_free (regs->start); 540 return REGS_UNALLOCATED; 541 } 542 regs->num_regs = need_regs; 543 } 544 else if (regs_allocated == REGS_REALLOCATE) 545 { /* Yes. If we need more elements than were already 546 allocated, reallocate them. If we need fewer, just 547 leave it alone. */ 548 if (BE (need_regs > regs->num_regs, 0)) 549 { 550 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs); 551 regoff_t *new_end; 552 if (BE (new_start == NULL, 0)) 553 return REGS_UNALLOCATED; 554 new_end = re_realloc (regs->end, regoff_t, need_regs); 555 if (BE (new_end == NULL, 0)) 556 { 557 re_free (new_start); 558 return REGS_UNALLOCATED; 559 } 560 regs->start = new_start; 561 regs->end = new_end; 562 regs->num_regs = need_regs; 563 } 564 } 565 else 566 { 567 assert (regs_allocated == REGS_FIXED); 568 /* This function may not be called with REGS_FIXED and nregs too big. */ 569 assert (regs->num_regs >= nregs); 570 rval = REGS_FIXED; 571 } 572 573 /* Copy the regs. */ 574 for (i = 0; i < nregs; ++i) 575 { 576 regs->start[i] = pmatch[i].rm_so; 577 regs->end[i] = pmatch[i].rm_eo; 578 } 579 for ( ; i < regs->num_regs; ++i) 580 regs->start[i] = regs->end[i] = -1; 581 582 return rval; 583 } 584 585 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and 586 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 587 this memory for recording register information. STARTS and ENDS 588 must be allocated using the malloc library routine, and must each 589 be at least NUM_REGS * sizeof (regoff_t) bytes long. 590 591 If NUM_REGS == 0, then subsequent matches should allocate their own 592 register data. 593 594 Unless this function is called, the first search or match using 595 PATTERN_BUFFER will allocate its own register data, without 596 freeing the old data. */ 597 598 void 599 re_set_registers (bufp, regs, num_regs, starts, ends) 600 struct re_pattern_buffer *bufp; 601 struct re_registers *regs; 602 __re_size_t num_regs; 603 regoff_t *starts, *ends; 604 { 605 if (num_regs) 606 { 607 bufp->regs_allocated = REGS_REALLOCATE; 608 regs->num_regs = num_regs; 609 regs->start = starts; 610 regs->end = ends; 611 } 612 else 613 { 614 bufp->regs_allocated = REGS_UNALLOCATED; 615 regs->num_regs = 0; 616 regs->start = regs->end = NULL; 617 } 618 } 619 #ifdef _LIBC 620 weak_alias (__re_set_registers, re_set_registers) 621 #endif 622 623 /* Entry points compatible with 4.2 BSD regex library. We don't define 624 them unless specifically requested. */ 625 626 #if defined _REGEX_RE_COMP || defined _LIBC 627 int 628 # ifdef _LIBC 629 weak_function 630 # endif 631 re_exec (s) 632 const char *s; 633 { 634 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0); 635 } 636 #endif /* _REGEX_RE_COMP */ 637 638 /* Internal entry point. */ 639 640 /* Searches for a compiled pattern PREG in the string STRING, whose 641 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same 642 meaning as with regexec. LAST_START is START + RANGE, where 643 START and RANGE have the same meaning as with re_search. 644 Return REG_NOERROR if we find a match, and REG_NOMATCH if not, 645 otherwise return the error code. 646 Note: We assume front end functions already check ranges. 647 (0 <= LAST_START && LAST_START <= LENGTH) */ 648 649 static reg_errcode_t 650 internal_function __attribute_warn_unused_result__ 651 re_search_internal (const regex_t *preg, 652 const char *string, Idx length, 653 Idx start, Idx last_start, Idx stop, 654 size_t nmatch, regmatch_t pmatch[], 655 int eflags) 656 { 657 reg_errcode_t err; 658 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer; 659 Idx left_lim, right_lim; 660 int incr; 661 bool fl_longest_match; 662 int match_kind; 663 Idx match_first; 664 Idx match_last = REG_MISSING; 665 Idx extra_nmatch; 666 bool sb; 667 int ch; 668 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L) 669 re_match_context_t mctx = { .dfa = dfa }; 670 #else 671 re_match_context_t mctx; 672 #endif 673 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate 674 && start != last_start && !preg->can_be_null) 675 ? preg->fastmap : NULL); 676 RE_TRANSLATE_TYPE t = preg->translate; 677 678 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)) 679 memset (&mctx, '\0', sizeof (re_match_context_t)); 680 mctx.dfa = dfa; 681 #endif 682 683 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0; 684 nmatch -= extra_nmatch; 685 686 /* Check if the DFA haven't been compiled. */ 687 if (BE (preg->used == 0 || dfa->init_state == NULL 688 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL 689 || dfa->init_state_begbuf == NULL, 0)) 690 return REG_NOMATCH; 691 692 #ifdef DEBUG 693 /* We assume front-end functions already check them. */ 694 assert (0 <= last_start && last_start <= length); 695 #endif 696 697 /* If initial states with non-begbuf contexts have no elements, 698 the regex must be anchored. If preg->newline_anchor is set, 699 we'll never use init_state_nl, so do not check it. */ 700 if (dfa->init_state->nodes.nelem == 0 701 && dfa->init_state_word->nodes.nelem == 0 702 && (dfa->init_state_nl->nodes.nelem == 0 703 || !preg->newline_anchor)) 704 { 705 if (start != 0 && last_start != 0) 706 return REG_NOMATCH; 707 start = last_start = 0; 708 } 709 710 /* We must check the longest matching, if nmatch > 0. */ 711 fl_longest_match = (nmatch != 0 || dfa->nbackref); 712 713 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1, 714 preg->translate, (preg->syntax & RE_ICASE) != 0, 715 dfa); 716 if (BE (err != REG_NOERROR, 0)) 717 goto free_return; 718 mctx.input.stop = stop; 719 mctx.input.raw_stop = stop; 720 mctx.input.newline_anchor = preg->newline_anchor; 721 722 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2); 723 if (BE (err != REG_NOERROR, 0)) 724 goto free_return; 725 726 /* We will log all the DFA states through which the dfa pass, 727 if nmatch > 1, or this dfa has "multibyte node", which is a 728 back-reference or a node which can accept multibyte character or 729 multi character collating element. */ 730 if (nmatch > 1 || dfa->has_mb_node) 731 { 732 /* Avoid overflow. */ 733 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0)) 734 { 735 err = REG_ESPACE; 736 goto free_return; 737 } 738 739 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1); 740 if (BE (mctx.state_log == NULL, 0)) 741 { 742 err = REG_ESPACE; 743 goto free_return; 744 } 745 } 746 else 747 mctx.state_log = NULL; 748 749 match_first = start; 750 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 751 : CONTEXT_NEWLINE | CONTEXT_BEGBUF; 752 753 /* Check incrementally whether of not the input string match. */ 754 incr = (last_start < start) ? -1 : 1; 755 left_lim = (last_start < start) ? last_start : start; 756 right_lim = (last_start < start) ? start : last_start; 757 sb = dfa->mb_cur_max == 1; 758 match_kind = 759 (fastmap 760 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0) 761 | (start <= last_start ? 2 : 0) 762 | (t != NULL ? 1 : 0)) 763 : 8); 764 765 for (;; match_first += incr) 766 { 767 err = REG_NOMATCH; 768 if (match_first < left_lim || right_lim < match_first) 769 goto free_return; 770 771 /* Advance as rapidly as possible through the string, until we 772 find a plausible place to start matching. This may be done 773 with varying efficiency, so there are various possibilities: 774 only the most common of them are specialized, in order to 775 save on code size. We use a switch statement for speed. */ 776 switch (match_kind) 777 { 778 case 8: 779 /* No fastmap. */ 780 break; 781 782 case 7: 783 /* Fastmap with single-byte translation, match forward. */ 784 while (BE (match_first < right_lim, 1) 785 && !fastmap[t[(unsigned char) string[match_first]]]) 786 ++match_first; 787 goto forward_match_found_start_or_reached_end; 788 789 case 6: 790 /* Fastmap without translation, match forward. */ 791 while (BE (match_first < right_lim, 1) 792 && !fastmap[(unsigned char) string[match_first]]) 793 ++match_first; 794 795 forward_match_found_start_or_reached_end: 796 if (BE (match_first == right_lim, 0)) 797 { 798 ch = match_first >= length 799 ? 0 : (unsigned char) string[match_first]; 800 if (!fastmap[t ? t[ch] : ch]) 801 goto free_return; 802 } 803 break; 804 805 case 4: 806 case 5: 807 /* Fastmap without multi-byte translation, match backwards. */ 808 while (match_first >= left_lim) 809 { 810 ch = match_first >= length 811 ? 0 : (unsigned char) string[match_first]; 812 if (fastmap[t ? t[ch] : ch]) 813 break; 814 --match_first; 815 } 816 if (match_first < left_lim) 817 goto free_return; 818 break; 819 820 default: 821 /* In this case, we can't determine easily the current byte, 822 since it might be a component byte of a multibyte 823 character. Then we use the constructed buffer instead. */ 824 for (;;) 825 { 826 /* If MATCH_FIRST is out of the valid range, reconstruct the 827 buffers. */ 828 __re_size_t offset = match_first - mctx.input.raw_mbs_idx; 829 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0)) 830 { 831 err = re_string_reconstruct (&mctx.input, match_first, 832 eflags); 833 if (BE (err != REG_NOERROR, 0)) 834 goto free_return; 835 836 offset = match_first - mctx.input.raw_mbs_idx; 837 } 838 /* If MATCH_FIRST is out of the buffer, leave it as '\0'. 839 Note that MATCH_FIRST must not be smaller than 0. */ 840 ch = (match_first >= length 841 ? 0 : re_string_byte_at (&mctx.input, offset)); 842 if (fastmap[ch]) 843 break; 844 match_first += incr; 845 if (match_first < left_lim || match_first > right_lim) 846 { 847 err = REG_NOMATCH; 848 goto free_return; 849 } 850 } 851 break; 852 } 853 854 /* Reconstruct the buffers so that the matcher can assume that 855 the matching starts from the beginning of the buffer. */ 856 err = re_string_reconstruct (&mctx.input, match_first, eflags); 857 if (BE (err != REG_NOERROR, 0)) 858 goto free_return; 859 860 #ifdef RE_ENABLE_I18N 861 /* Don't consider this char as a possible match start if it part, 862 yet isn't the head, of a multibyte character. */ 863 if (!sb && !re_string_first_byte (&mctx.input, 0)) 864 continue; 865 #endif 866 867 /* It seems to be appropriate one, then use the matcher. */ 868 /* We assume that the matching starts from 0. */ 869 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0; 870 match_last = check_matching (&mctx, fl_longest_match, 871 start <= last_start ? &match_first : NULL); 872 if (match_last != REG_MISSING) 873 { 874 if (BE (match_last == REG_ERROR, 0)) 875 { 876 err = REG_ESPACE; 877 goto free_return; 878 } 879 else 880 { 881 mctx.match_last = match_last; 882 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref) 883 { 884 re_dfastate_t *pstate = mctx.state_log[match_last]; 885 mctx.last_node = check_halt_state_context (&mctx, pstate, 886 match_last); 887 } 888 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match) 889 || dfa->nbackref) 890 { 891 err = prune_impossible_nodes (&mctx); 892 if (err == REG_NOERROR) 893 break; 894 if (BE (err != REG_NOMATCH, 0)) 895 goto free_return; 896 match_last = REG_MISSING; 897 } 898 else 899 break; /* We found a match. */ 900 } 901 } 902 903 match_ctx_clean (&mctx); 904 } 905 906 #ifdef DEBUG 907 assert (match_last != REG_MISSING); 908 assert (err == REG_NOERROR); 909 #endif 910 911 /* Set pmatch[] if we need. */ 912 if (nmatch > 0) 913 { 914 Idx reg_idx; 915 916 /* Initialize registers. */ 917 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx) 918 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; 919 920 /* Set the points where matching start/end. */ 921 pmatch[0].rm_so = 0; 922 pmatch[0].rm_eo = mctx.match_last; 923 /* FIXME: This function should fail if mctx.match_last exceeds 924 the maximum possible regoff_t value. We need a new error 925 code REG_OVERFLOW. */ 926 927 if (!preg->no_sub && nmatch > 1) 928 { 929 err = set_regs (preg, &mctx, nmatch, pmatch, 930 dfa->has_plural_match && dfa->nbackref > 0); 931 if (BE (err != REG_NOERROR, 0)) 932 goto free_return; 933 } 934 935 /* At last, add the offset to the each registers, since we slided 936 the buffers so that we could assume that the matching starts 937 from 0. */ 938 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 939 if (pmatch[reg_idx].rm_so != -1) 940 { 941 #ifdef RE_ENABLE_I18N 942 if (BE (mctx.input.offsets_needed != 0, 0)) 943 { 944 pmatch[reg_idx].rm_so = 945 (pmatch[reg_idx].rm_so == mctx.input.valid_len 946 ? mctx.input.valid_raw_len 947 : mctx.input.offsets[pmatch[reg_idx].rm_so]); 948 pmatch[reg_idx].rm_eo = 949 (pmatch[reg_idx].rm_eo == mctx.input.valid_len 950 ? mctx.input.valid_raw_len 951 : mctx.input.offsets[pmatch[reg_idx].rm_eo]); 952 } 953 #else 954 assert (mctx.input.offsets_needed == 0); 955 #endif 956 pmatch[reg_idx].rm_so += match_first; 957 pmatch[reg_idx].rm_eo += match_first; 958 } 959 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx) 960 { 961 pmatch[nmatch + reg_idx].rm_so = -1; 962 pmatch[nmatch + reg_idx].rm_eo = -1; 963 } 964 965 if (dfa->subexp_map) 966 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++) 967 if (dfa->subexp_map[reg_idx] != reg_idx) 968 { 969 pmatch[reg_idx + 1].rm_so 970 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; 971 pmatch[reg_idx + 1].rm_eo 972 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; 973 } 974 } 975 976 free_return: 977 re_free (mctx.state_log); 978 if (dfa->nbackref) 979 match_ctx_free (&mctx); 980 re_string_destruct (&mctx.input); 981 return err; 982 } 983 984 static reg_errcode_t 985 internal_function __attribute_warn_unused_result__ 986 prune_impossible_nodes (re_match_context_t *mctx) 987 { 988 const re_dfa_t *const dfa = mctx->dfa; 989 Idx halt_node, match_last; 990 reg_errcode_t ret; 991 re_dfastate_t **sifted_states; 992 re_dfastate_t **lim_states = NULL; 993 re_sift_context_t sctx; 994 #ifdef DEBUG 995 assert (mctx->state_log != NULL); 996 #endif 997 match_last = mctx->match_last; 998 halt_node = mctx->last_node; 999 1000 /* Avoid overflow. */ 1001 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0)) 1002 return REG_ESPACE; 1003 1004 sifted_states = re_malloc (re_dfastate_t *, match_last + 1); 1005 if (BE (sifted_states == NULL, 0)) 1006 { 1007 ret = REG_ESPACE; 1008 goto free_return; 1009 } 1010 if (dfa->nbackref) 1011 { 1012 lim_states = re_malloc (re_dfastate_t *, match_last + 1); 1013 if (BE (lim_states == NULL, 0)) 1014 { 1015 ret = REG_ESPACE; 1016 goto free_return; 1017 } 1018 while (1) 1019 { 1020 memset (lim_states, '\0', 1021 sizeof (re_dfastate_t *) * (match_last + 1)); 1022 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, 1023 match_last); 1024 ret = sift_states_backward (mctx, &sctx); 1025 re_node_set_free (&sctx.limits); 1026 if (BE (ret != REG_NOERROR, 0)) 1027 goto free_return; 1028 if (sifted_states[0] != NULL || lim_states[0] != NULL) 1029 break; 1030 do 1031 { 1032 --match_last; 1033 if (! REG_VALID_INDEX (match_last)) 1034 { 1035 ret = REG_NOMATCH; 1036 goto free_return; 1037 } 1038 } while (mctx->state_log[match_last] == NULL 1039 || !mctx->state_log[match_last]->halt); 1040 halt_node = check_halt_state_context (mctx, 1041 mctx->state_log[match_last], 1042 match_last); 1043 } 1044 ret = merge_state_array (dfa, sifted_states, lim_states, 1045 match_last + 1); 1046 re_free (lim_states); 1047 lim_states = NULL; 1048 if (BE (ret != REG_NOERROR, 0)) 1049 goto free_return; 1050 } 1051 else 1052 { 1053 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last); 1054 ret = sift_states_backward (mctx, &sctx); 1055 re_node_set_free (&sctx.limits); 1056 if (BE (ret != REG_NOERROR, 0)) 1057 goto free_return; 1058 if (sifted_states[0] == NULL) 1059 { 1060 ret = REG_NOMATCH; 1061 goto free_return; 1062 } 1063 } 1064 re_free (mctx->state_log); 1065 mctx->state_log = sifted_states; 1066 sifted_states = NULL; 1067 mctx->last_node = halt_node; 1068 mctx->match_last = match_last; 1069 ret = REG_NOERROR; 1070 free_return: 1071 re_free (sifted_states); 1072 re_free (lim_states); 1073 return ret; 1074 } 1075 1076 /* Acquire an initial state and return it. 1077 We must select appropriate initial state depending on the context, 1078 since initial states may have constraints like "\<", "^", etc.. */ 1079 1080 static inline re_dfastate_t * 1081 __attribute ((always_inline)) internal_function 1082 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx, 1083 Idx idx) 1084 { 1085 const re_dfa_t *const dfa = mctx->dfa; 1086 if (dfa->init_state->has_constraint) 1087 { 1088 unsigned int context; 1089 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags); 1090 if (IS_WORD_CONTEXT (context)) 1091 return dfa->init_state_word; 1092 else if (IS_ORDINARY_CONTEXT (context)) 1093 return dfa->init_state; 1094 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context)) 1095 return dfa->init_state_begbuf; 1096 else if (IS_NEWLINE_CONTEXT (context)) 1097 return dfa->init_state_nl; 1098 else if (IS_BEGBUF_CONTEXT (context)) 1099 { 1100 /* It is relatively rare case, then calculate on demand. */ 1101 return re_acquire_state_context (err, dfa, 1102 dfa->init_state->entrance_nodes, 1103 context); 1104 } 1105 else 1106 /* Must not happen? */ 1107 return dfa->init_state; 1108 } 1109 else 1110 return dfa->init_state; 1111 } 1112 1113 /* Check whether the regular expression match input string INPUT or not, 1114 and return the index where the matching end. Return REG_MISSING if 1115 there is no match, and return REG_ERROR in case of an error. 1116 FL_LONGEST_MATCH means we want the POSIX longest matching. 1117 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the 1118 next place where we may want to try matching. 1119 Note that the matcher assume that the maching starts from the current 1120 index of the buffer. */ 1121 1122 static Idx 1123 internal_function __attribute_warn_unused_result__ 1124 check_matching (re_match_context_t *mctx, bool fl_longest_match, 1125 Idx *p_match_first) 1126 { 1127 const re_dfa_t *const dfa = mctx->dfa; 1128 reg_errcode_t err; 1129 Idx match = 0; 1130 Idx match_last = REG_MISSING; 1131 Idx cur_str_idx = re_string_cur_idx (&mctx->input); 1132 re_dfastate_t *cur_state; 1133 bool at_init_state = p_match_first != NULL; 1134 Idx next_start_idx = cur_str_idx; 1135 1136 err = REG_NOERROR; 1137 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx); 1138 /* An initial state must not be NULL (invalid). */ 1139 if (BE (cur_state == NULL, 0)) 1140 { 1141 assert (err == REG_ESPACE); 1142 return REG_ERROR; 1143 } 1144 1145 if (mctx->state_log != NULL) 1146 { 1147 mctx->state_log[cur_str_idx] = cur_state; 1148 1149 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them 1150 later. E.g. Processing back references. */ 1151 if (BE (dfa->nbackref, 0)) 1152 { 1153 at_init_state = false; 1154 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0); 1155 if (BE (err != REG_NOERROR, 0)) 1156 return err; 1157 1158 if (cur_state->has_backref) 1159 { 1160 err = transit_state_bkref (mctx, &cur_state->nodes); 1161 if (BE (err != REG_NOERROR, 0)) 1162 return err; 1163 } 1164 } 1165 } 1166 1167 /* If the RE accepts NULL string. */ 1168 if (BE (cur_state->halt, 0)) 1169 { 1170 if (!cur_state->has_constraint 1171 || check_halt_state_context (mctx, cur_state, cur_str_idx)) 1172 { 1173 if (!fl_longest_match) 1174 return cur_str_idx; 1175 else 1176 { 1177 match_last = cur_str_idx; 1178 match = 1; 1179 } 1180 } 1181 } 1182 1183 while (!re_string_eoi (&mctx->input)) 1184 { 1185 re_dfastate_t *old_state = cur_state; 1186 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1; 1187 1188 if (BE (next_char_idx >= mctx->input.bufs_len, 0) 1189 || (BE (next_char_idx >= mctx->input.valid_len, 0) 1190 && mctx->input.valid_len < mctx->input.len)) 1191 { 1192 err = extend_buffers (mctx); 1193 if (BE (err != REG_NOERROR, 0)) 1194 { 1195 assert (err == REG_ESPACE); 1196 return REG_ERROR; 1197 } 1198 } 1199 1200 cur_state = transit_state (&err, mctx, cur_state); 1201 if (mctx->state_log != NULL) 1202 cur_state = merge_state_with_log (&err, mctx, cur_state); 1203 1204 if (cur_state == NULL) 1205 { 1206 /* Reached the invalid state or an error. Try to recover a valid 1207 state using the state log, if available and if we have not 1208 already found a valid (even if not the longest) match. */ 1209 if (BE (err != REG_NOERROR, 0)) 1210 return REG_ERROR; 1211 1212 if (mctx->state_log == NULL 1213 || (match && !fl_longest_match) 1214 || (cur_state = find_recover_state (&err, mctx)) == NULL) 1215 break; 1216 } 1217 1218 if (BE (at_init_state, 0)) 1219 { 1220 if (old_state == cur_state) 1221 next_start_idx = next_char_idx; 1222 else 1223 at_init_state = false; 1224 } 1225 1226 if (cur_state->halt) 1227 { 1228 /* Reached a halt state. 1229 Check the halt state can satisfy the current context. */ 1230 if (!cur_state->has_constraint 1231 || check_halt_state_context (mctx, cur_state, 1232 re_string_cur_idx (&mctx->input))) 1233 { 1234 /* We found an appropriate halt state. */ 1235 match_last = re_string_cur_idx (&mctx->input); 1236 match = 1; 1237 1238 /* We found a match, do not modify match_first below. */ 1239 p_match_first = NULL; 1240 if (!fl_longest_match) 1241 break; 1242 } 1243 } 1244 } 1245 1246 if (p_match_first) 1247 *p_match_first += next_start_idx; 1248 1249 return match_last; 1250 } 1251 1252 /* Check NODE match the current context. */ 1253 1254 static bool 1255 internal_function 1256 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context) 1257 { 1258 re_token_type_t type = dfa->nodes[node].type; 1259 unsigned int constraint = dfa->nodes[node].constraint; 1260 if (type != END_OF_RE) 1261 return false; 1262 if (!constraint) 1263 return true; 1264 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context)) 1265 return false; 1266 return true; 1267 } 1268 1269 /* Check the halt state STATE match the current context. 1270 Return 0 if not match, if the node, STATE has, is a halt node and 1271 match the context, return the node. */ 1272 1273 static Idx 1274 internal_function 1275 check_halt_state_context (const re_match_context_t *mctx, 1276 const re_dfastate_t *state, Idx idx) 1277 { 1278 Idx i; 1279 unsigned int context; 1280 #ifdef DEBUG 1281 assert (state->halt); 1282 #endif 1283 context = re_string_context_at (&mctx->input, idx, mctx->eflags); 1284 for (i = 0; i < state->nodes.nelem; ++i) 1285 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context)) 1286 return state->nodes.elems[i]; 1287 return 0; 1288 } 1289 1290 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA 1291 corresponding to the DFA). 1292 Return the destination node, and update EPS_VIA_NODES; 1293 return REG_MISSING in case of errors. */ 1294 1295 static Idx 1296 internal_function 1297 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs, 1298 Idx *pidx, Idx node, re_node_set *eps_via_nodes, 1299 struct re_fail_stack_t *fs) 1300 { 1301 const re_dfa_t *const dfa = mctx->dfa; 1302 Idx i; 1303 bool ok; 1304 if (IS_EPSILON_NODE (dfa->nodes[node].type)) 1305 { 1306 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes; 1307 re_node_set *edests = &dfa->edests[node]; 1308 Idx dest_node; 1309 ok = re_node_set_insert (eps_via_nodes, node); 1310 if (BE (! ok, 0)) 1311 return REG_ERROR; 1312 /* Pick up a valid destination, or return REG_MISSING if none 1313 is found. */ 1314 for (dest_node = REG_MISSING, i = 0; i < edests->nelem; ++i) 1315 { 1316 Idx candidate = edests->elems[i]; 1317 if (!re_node_set_contains (cur_nodes, candidate)) 1318 continue; 1319 if (dest_node == REG_MISSING) 1320 dest_node = candidate; 1321 1322 else 1323 { 1324 /* In order to avoid infinite loop like "(a*)*", return the second 1325 epsilon-transition if the first was already considered. */ 1326 if (re_node_set_contains (eps_via_nodes, dest_node)) 1327 return candidate; 1328 1329 /* Otherwise, push the second epsilon-transition on the fail stack. */ 1330 else if (fs != NULL 1331 && push_fail_stack (fs, *pidx, candidate, nregs, regs, 1332 eps_via_nodes)) 1333 return REG_ERROR; 1334 1335 /* We know we are going to exit. */ 1336 break; 1337 } 1338 } 1339 return dest_node; 1340 } 1341 else 1342 { 1343 Idx naccepted = 0; 1344 re_token_type_t type = dfa->nodes[node].type; 1345 1346 #ifdef RE_ENABLE_I18N 1347 if (dfa->nodes[node].accept_mb) 1348 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx); 1349 else 1350 #endif /* RE_ENABLE_I18N */ 1351 if (type == OP_BACK_REF) 1352 { 1353 Idx subexp_idx = dfa->nodes[node].opr.idx + 1; 1354 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so; 1355 if (fs != NULL) 1356 { 1357 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1) 1358 return REG_MISSING; 1359 else if (naccepted) 1360 { 1361 char *buf = (char *) re_string_get_buffer (&mctx->input); 1362 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx, 1363 naccepted) != 0) 1364 return REG_MISSING; 1365 } 1366 } 1367 1368 if (naccepted == 0) 1369 { 1370 Idx dest_node; 1371 ok = re_node_set_insert (eps_via_nodes, node); 1372 if (BE (! ok, 0)) 1373 return REG_ERROR; 1374 dest_node = dfa->edests[node].elems[0]; 1375 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1376 dest_node)) 1377 return dest_node; 1378 } 1379 } 1380 1381 if (naccepted != 0 1382 || check_node_accept (mctx, dfa->nodes + node, *pidx)) 1383 { 1384 Idx dest_node = dfa->nexts[node]; 1385 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; 1386 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL 1387 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1388 dest_node))) 1389 return REG_MISSING; 1390 re_node_set_empty (eps_via_nodes); 1391 return dest_node; 1392 } 1393 } 1394 return REG_MISSING; 1395 } 1396 1397 static reg_errcode_t 1398 internal_function __attribute_warn_unused_result__ 1399 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node, 1400 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes) 1401 { 1402 reg_errcode_t err; 1403 Idx num = fs->num++; 1404 if (fs->num == fs->alloc) 1405 { 1406 struct re_fail_stack_ent_t *new_array; 1407 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t) 1408 * fs->alloc * 2)); 1409 if (new_array == NULL) 1410 return REG_ESPACE; 1411 fs->alloc *= 2; 1412 fs->stack = new_array; 1413 } 1414 fs->stack[num].idx = str_idx; 1415 fs->stack[num].node = dest_node; 1416 fs->stack[num].regs = re_malloc (regmatch_t, nregs); 1417 if (fs->stack[num].regs == NULL) 1418 return REG_ESPACE; 1419 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs); 1420 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes); 1421 return err; 1422 } 1423 1424 static Idx 1425 internal_function 1426 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs, 1427 regmatch_t *regs, re_node_set *eps_via_nodes) 1428 { 1429 Idx num = --fs->num; 1430 assert (REG_VALID_INDEX (num)); 1431 *pidx = fs->stack[num].idx; 1432 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs); 1433 re_node_set_free (eps_via_nodes); 1434 re_free (fs->stack[num].regs); 1435 *eps_via_nodes = fs->stack[num].eps_via_nodes; 1436 return fs->stack[num].node; 1437 } 1438 1439 /* Set the positions where the subexpressions are starts/ends to registers 1440 PMATCH. 1441 Note: We assume that pmatch[0] is already set, and 1442 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */ 1443 1444 static reg_errcode_t 1445 internal_function __attribute_warn_unused_result__ 1446 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch, 1447 regmatch_t *pmatch, bool fl_backtrack) 1448 { 1449 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer; 1450 Idx idx, cur_node; 1451 re_node_set eps_via_nodes; 1452 struct re_fail_stack_t *fs; 1453 struct re_fail_stack_t fs_body = { 0, 2, NULL }; 1454 regmatch_t *prev_idx_match; 1455 bool prev_idx_match_malloced = false; 1456 1457 #ifdef DEBUG 1458 assert (nmatch > 1); 1459 assert (mctx->state_log != NULL); 1460 #endif 1461 if (fl_backtrack) 1462 { 1463 fs = &fs_body; 1464 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc); 1465 if (fs->stack == NULL) 1466 return REG_ESPACE; 1467 } 1468 else 1469 fs = NULL; 1470 1471 cur_node = dfa->init_node; 1472 re_node_set_init_empty (&eps_via_nodes); 1473 1474 if (__libc_use_alloca (nmatch * sizeof (regmatch_t))) 1475 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t)); 1476 else 1477 { 1478 prev_idx_match = re_malloc (regmatch_t, nmatch); 1479 if (prev_idx_match == NULL) 1480 { 1481 free_fail_stack_return (fs); 1482 return REG_ESPACE; 1483 } 1484 prev_idx_match_malloced = true; 1485 } 1486 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1487 1488 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) 1489 { 1490 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch); 1491 1492 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node) 1493 { 1494 Idx reg_idx; 1495 if (fs) 1496 { 1497 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 1498 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1) 1499 break; 1500 if (reg_idx == nmatch) 1501 { 1502 re_node_set_free (&eps_via_nodes); 1503 if (prev_idx_match_malloced) 1504 re_free (prev_idx_match); 1505 return free_fail_stack_return (fs); 1506 } 1507 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1508 &eps_via_nodes); 1509 } 1510 else 1511 { 1512 re_node_set_free (&eps_via_nodes); 1513 if (prev_idx_match_malloced) 1514 re_free (prev_idx_match); 1515 return REG_NOERROR; 1516 } 1517 } 1518 1519 /* Proceed to next node. */ 1520 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node, 1521 &eps_via_nodes, fs); 1522 1523 if (BE (! REG_VALID_INDEX (cur_node), 0)) 1524 { 1525 if (BE (cur_node == REG_ERROR, 0)) 1526 { 1527 re_node_set_free (&eps_via_nodes); 1528 if (prev_idx_match_malloced) 1529 re_free (prev_idx_match); 1530 free_fail_stack_return (fs); 1531 return REG_ESPACE; 1532 } 1533 if (fs) 1534 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1535 &eps_via_nodes); 1536 else 1537 { 1538 re_node_set_free (&eps_via_nodes); 1539 if (prev_idx_match_malloced) 1540 re_free (prev_idx_match); 1541 return REG_NOMATCH; 1542 } 1543 } 1544 } 1545 re_node_set_free (&eps_via_nodes); 1546 if (prev_idx_match_malloced) 1547 re_free (prev_idx_match); 1548 return free_fail_stack_return (fs); 1549 } 1550 1551 static reg_errcode_t 1552 internal_function 1553 free_fail_stack_return (struct re_fail_stack_t *fs) 1554 { 1555 if (fs) 1556 { 1557 Idx fs_idx; 1558 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx) 1559 { 1560 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes); 1561 re_free (fs->stack[fs_idx].regs); 1562 } 1563 re_free (fs->stack); 1564 } 1565 return REG_NOERROR; 1566 } 1567 1568 static void 1569 internal_function 1570 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch, 1571 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch) 1572 { 1573 int type = dfa->nodes[cur_node].type; 1574 if (type == OP_OPEN_SUBEXP) 1575 { 1576 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1; 1577 1578 /* We are at the first node of this sub expression. */ 1579 if (reg_num < nmatch) 1580 { 1581 pmatch[reg_num].rm_so = cur_idx; 1582 pmatch[reg_num].rm_eo = -1; 1583 } 1584 } 1585 else if (type == OP_CLOSE_SUBEXP) 1586 { 1587 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1; 1588 if (reg_num < nmatch) 1589 { 1590 /* We are at the last node of this sub expression. */ 1591 if (pmatch[reg_num].rm_so < cur_idx) 1592 { 1593 pmatch[reg_num].rm_eo = cur_idx; 1594 /* This is a non-empty match or we are not inside an optional 1595 subexpression. Accept this right away. */ 1596 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch); 1597 } 1598 else 1599 { 1600 if (dfa->nodes[cur_node].opt_subexp 1601 && prev_idx_match[reg_num].rm_so != -1) 1602 /* We transited through an empty match for an optional 1603 subexpression, like (a?)*, and this is not the subexp's 1604 first match. Copy back the old content of the registers 1605 so that matches of an inner subexpression are undone as 1606 well, like in ((a?))*. */ 1607 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch); 1608 else 1609 /* We completed a subexpression, but it may be part of 1610 an optional one, so do not update PREV_IDX_MATCH. */ 1611 pmatch[reg_num].rm_eo = cur_idx; 1612 } 1613 } 1614 } 1615 } 1616 1617 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0 1618 and sift the nodes in each states according to the following rules. 1619 Updated state_log will be wrote to STATE_LOG. 1620 1621 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if... 1622 1. When STR_IDX == MATCH_LAST(the last index in the state_log): 1623 If `a' isn't the LAST_NODE and `a' can't epsilon transit to 1624 the LAST_NODE, we throw away the node `a'. 1625 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts 1626 string `s' and transit to `b': 1627 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw 1628 away the node `a'. 1629 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is 1630 thrown away, we throw away the node `a'. 1631 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b': 1632 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the 1633 node `a'. 1634 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away, 1635 we throw away the node `a'. */ 1636 1637 #define STATE_NODE_CONTAINS(state,node) \ 1638 ((state) != NULL && re_node_set_contains (&(state)->nodes, node)) 1639 1640 static reg_errcode_t 1641 internal_function 1642 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx) 1643 { 1644 reg_errcode_t err; 1645 int null_cnt = 0; 1646 Idx str_idx = sctx->last_str_idx; 1647 re_node_set cur_dest; 1648 1649 #ifdef DEBUG 1650 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL); 1651 #endif 1652 1653 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon 1654 transit to the last_node and the last_node itself. */ 1655 err = re_node_set_init_1 (&cur_dest, sctx->last_node); 1656 if (BE (err != REG_NOERROR, 0)) 1657 return err; 1658 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1659 if (BE (err != REG_NOERROR, 0)) 1660 goto free_return; 1661 1662 /* Then check each states in the state_log. */ 1663 while (str_idx > 0) 1664 { 1665 /* Update counters. */ 1666 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0; 1667 if (null_cnt > mctx->max_mb_elem_len) 1668 { 1669 memset (sctx->sifted_states, '\0', 1670 sizeof (re_dfastate_t *) * str_idx); 1671 re_node_set_free (&cur_dest); 1672 return REG_NOERROR; 1673 } 1674 re_node_set_empty (&cur_dest); 1675 --str_idx; 1676 1677 if (mctx->state_log[str_idx]) 1678 { 1679 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest); 1680 if (BE (err != REG_NOERROR, 0)) 1681 goto free_return; 1682 } 1683 1684 /* Add all the nodes which satisfy the following conditions: 1685 - It can epsilon transit to a node in CUR_DEST. 1686 - It is in CUR_SRC. 1687 And update state_log. */ 1688 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest); 1689 if (BE (err != REG_NOERROR, 0)) 1690 goto free_return; 1691 } 1692 err = REG_NOERROR; 1693 free_return: 1694 re_node_set_free (&cur_dest); 1695 return err; 1696 } 1697 1698 static reg_errcode_t 1699 internal_function __attribute_warn_unused_result__ 1700 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx, 1701 Idx str_idx, re_node_set *cur_dest) 1702 { 1703 const re_dfa_t *const dfa = mctx->dfa; 1704 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes; 1705 Idx i; 1706 1707 /* Then build the next sifted state. 1708 We build the next sifted state on `cur_dest', and update 1709 `sifted_states[str_idx]' with `cur_dest'. 1710 Note: 1711 `cur_dest' is the sifted state from `state_log[str_idx + 1]'. 1712 `cur_src' points the node_set of the old `state_log[str_idx]' 1713 (with the epsilon nodes pre-filtered out). */ 1714 for (i = 0; i < cur_src->nelem; i++) 1715 { 1716 Idx prev_node = cur_src->elems[i]; 1717 int naccepted = 0; 1718 bool ok; 1719 1720 #ifdef DEBUG 1721 re_token_type_t type = dfa->nodes[prev_node].type; 1722 assert (!IS_EPSILON_NODE (type)); 1723 #endif 1724 #ifdef RE_ENABLE_I18N 1725 /* If the node may accept `multi byte'. */ 1726 if (dfa->nodes[prev_node].accept_mb) 1727 naccepted = sift_states_iter_mb (mctx, sctx, prev_node, 1728 str_idx, sctx->last_str_idx); 1729 #endif /* RE_ENABLE_I18N */ 1730 1731 /* We don't check backreferences here. 1732 See update_cur_sifted_state(). */ 1733 if (!naccepted 1734 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx) 1735 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1], 1736 dfa->nexts[prev_node])) 1737 naccepted = 1; 1738 1739 if (naccepted == 0) 1740 continue; 1741 1742 if (sctx->limits.nelem) 1743 { 1744 Idx to_idx = str_idx + naccepted; 1745 if (check_dst_limits (mctx, &sctx->limits, 1746 dfa->nexts[prev_node], to_idx, 1747 prev_node, str_idx)) 1748 continue; 1749 } 1750 ok = re_node_set_insert (cur_dest, prev_node); 1751 if (BE (! ok, 0)) 1752 return REG_ESPACE; 1753 } 1754 1755 return REG_NOERROR; 1756 } 1757 1758 /* Helper functions. */ 1759 1760 static reg_errcode_t 1761 internal_function 1762 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx) 1763 { 1764 Idx top = mctx->state_log_top; 1765 1766 if (next_state_log_idx >= mctx->input.bufs_len 1767 || (next_state_log_idx >= mctx->input.valid_len 1768 && mctx->input.valid_len < mctx->input.len)) 1769 { 1770 reg_errcode_t err; 1771 err = extend_buffers (mctx); 1772 if (BE (err != REG_NOERROR, 0)) 1773 return err; 1774 } 1775 1776 if (top < next_state_log_idx) 1777 { 1778 memset (mctx->state_log + top + 1, '\0', 1779 sizeof (re_dfastate_t *) * (next_state_log_idx - top)); 1780 mctx->state_log_top = next_state_log_idx; 1781 } 1782 return REG_NOERROR; 1783 } 1784 1785 static reg_errcode_t 1786 internal_function 1787 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst, 1788 re_dfastate_t **src, Idx num) 1789 { 1790 Idx st_idx; 1791 reg_errcode_t err; 1792 for (st_idx = 0; st_idx < num; ++st_idx) 1793 { 1794 if (dst[st_idx] == NULL) 1795 dst[st_idx] = src[st_idx]; 1796 else if (src[st_idx] != NULL) 1797 { 1798 re_node_set merged_set; 1799 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes, 1800 &src[st_idx]->nodes); 1801 if (BE (err != REG_NOERROR, 0)) 1802 return err; 1803 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set); 1804 re_node_set_free (&merged_set); 1805 if (BE (err != REG_NOERROR, 0)) 1806 return err; 1807 } 1808 } 1809 return REG_NOERROR; 1810 } 1811 1812 static reg_errcode_t 1813 internal_function 1814 update_cur_sifted_state (const re_match_context_t *mctx, 1815 re_sift_context_t *sctx, Idx str_idx, 1816 re_node_set *dest_nodes) 1817 { 1818 const re_dfa_t *const dfa = mctx->dfa; 1819 reg_errcode_t err = REG_NOERROR; 1820 const re_node_set *candidates; 1821 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL 1822 : &mctx->state_log[str_idx]->nodes); 1823 1824 if (dest_nodes->nelem == 0) 1825 sctx->sifted_states[str_idx] = NULL; 1826 else 1827 { 1828 if (candidates) 1829 { 1830 /* At first, add the nodes which can epsilon transit to a node in 1831 DEST_NODE. */ 1832 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates); 1833 if (BE (err != REG_NOERROR, 0)) 1834 return err; 1835 1836 /* Then, check the limitations in the current sift_context. */ 1837 if (sctx->limits.nelem) 1838 { 1839 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits, 1840 mctx->bkref_ents, str_idx); 1841 if (BE (err != REG_NOERROR, 0)) 1842 return err; 1843 } 1844 } 1845 1846 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes); 1847 if (BE (err != REG_NOERROR, 0)) 1848 return err; 1849 } 1850 1851 if (candidates && mctx->state_log[str_idx]->has_backref) 1852 { 1853 err = sift_states_bkref (mctx, sctx, str_idx, candidates); 1854 if (BE (err != REG_NOERROR, 0)) 1855 return err; 1856 } 1857 return REG_NOERROR; 1858 } 1859 1860 static reg_errcode_t 1861 internal_function __attribute_warn_unused_result__ 1862 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes, 1863 const re_node_set *candidates) 1864 { 1865 reg_errcode_t err = REG_NOERROR; 1866 Idx i; 1867 1868 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes); 1869 if (BE (err != REG_NOERROR, 0)) 1870 return err; 1871 1872 if (!state->inveclosure.alloc) 1873 { 1874 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem); 1875 if (BE (err != REG_NOERROR, 0)) 1876 return REG_ESPACE; 1877 for (i = 0; i < dest_nodes->nelem; i++) 1878 { 1879 err = re_node_set_merge (&state->inveclosure, 1880 dfa->inveclosures + dest_nodes->elems[i]); 1881 if (BE (err != REG_NOERROR, 0)) 1882 return REG_ESPACE; 1883 } 1884 } 1885 return re_node_set_add_intersect (dest_nodes, candidates, 1886 &state->inveclosure); 1887 } 1888 1889 static reg_errcode_t 1890 internal_function 1891 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes, 1892 const re_node_set *candidates) 1893 { 1894 Idx ecl_idx; 1895 reg_errcode_t err; 1896 re_node_set *inv_eclosure = dfa->inveclosures + node; 1897 re_node_set except_nodes; 1898 re_node_set_init_empty (&except_nodes); 1899 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1900 { 1901 Idx cur_node = inv_eclosure->elems[ecl_idx]; 1902 if (cur_node == node) 1903 continue; 1904 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type)) 1905 { 1906 Idx edst1 = dfa->edests[cur_node].elems[0]; 1907 Idx edst2 = ((dfa->edests[cur_node].nelem > 1) 1908 ? dfa->edests[cur_node].elems[1] : REG_MISSING); 1909 if ((!re_node_set_contains (inv_eclosure, edst1) 1910 && re_node_set_contains (dest_nodes, edst1)) 1911 || (REG_VALID_NONZERO_INDEX (edst2) 1912 && !re_node_set_contains (inv_eclosure, edst2) 1913 && re_node_set_contains (dest_nodes, edst2))) 1914 { 1915 err = re_node_set_add_intersect (&except_nodes, candidates, 1916 dfa->inveclosures + cur_node); 1917 if (BE (err != REG_NOERROR, 0)) 1918 { 1919 re_node_set_free (&except_nodes); 1920 return err; 1921 } 1922 } 1923 } 1924 } 1925 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1926 { 1927 Idx cur_node = inv_eclosure->elems[ecl_idx]; 1928 if (!re_node_set_contains (&except_nodes, cur_node)) 1929 { 1930 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1; 1931 re_node_set_remove_at (dest_nodes, idx); 1932 } 1933 } 1934 re_node_set_free (&except_nodes); 1935 return REG_NOERROR; 1936 } 1937 1938 static bool 1939 internal_function 1940 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits, 1941 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx) 1942 { 1943 const re_dfa_t *const dfa = mctx->dfa; 1944 Idx lim_idx, src_pos, dst_pos; 1945 1946 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx); 1947 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx); 1948 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 1949 { 1950 Idx subexp_idx; 1951 struct re_backref_cache_entry *ent; 1952 ent = mctx->bkref_ents + limits->elems[lim_idx]; 1953 subexp_idx = dfa->nodes[ent->node].opr.idx; 1954 1955 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1956 subexp_idx, dst_node, dst_idx, 1957 dst_bkref_idx); 1958 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx], 1959 subexp_idx, src_node, src_idx, 1960 src_bkref_idx); 1961 1962 /* In case of: 1963 <src> <dst> ( <subexp> ) 1964 ( <subexp> ) <src> <dst> 1965 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */ 1966 if (src_pos == dst_pos) 1967 continue; /* This is unrelated limitation. */ 1968 else 1969 return true; 1970 } 1971 return false; 1972 } 1973 1974 static int 1975 internal_function 1976 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries, 1977 Idx subexp_idx, Idx from_node, Idx bkref_idx) 1978 { 1979 const re_dfa_t *const dfa = mctx->dfa; 1980 const re_node_set *eclosures = dfa->eclosures + from_node; 1981 Idx node_idx; 1982 1983 /* Else, we are on the boundary: examine the nodes on the epsilon 1984 closure. */ 1985 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx) 1986 { 1987 Idx node = eclosures->elems[node_idx]; 1988 switch (dfa->nodes[node].type) 1989 { 1990 case OP_BACK_REF: 1991 if (bkref_idx != REG_MISSING) 1992 { 1993 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx; 1994 do 1995 { 1996 Idx dst; 1997 int cpos; 1998 1999 if (ent->node != node) 2000 continue; 2001 2002 if (subexp_idx < BITSET_WORD_BITS 2003 && !(ent->eps_reachable_subexps_map 2004 & ((bitset_word_t) 1 << subexp_idx))) 2005 continue; 2006 2007 /* Recurse trying to reach the OP_OPEN_SUBEXP and 2008 OP_CLOSE_SUBEXP cases below. But, if the 2009 destination node is the same node as the source 2010 node, don't recurse because it would cause an 2011 infinite loop: a regex that exhibits this behavior 2012 is ()\1*\1* */ 2013 dst = dfa->edests[node].elems[0]; 2014 if (dst == from_node) 2015 { 2016 if (boundaries & 1) 2017 return -1; 2018 else /* if (boundaries & 2) */ 2019 return 0; 2020 } 2021 2022 cpos = 2023 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 2024 dst, bkref_idx); 2025 if (cpos == -1 /* && (boundaries & 1) */) 2026 return -1; 2027 if (cpos == 0 && (boundaries & 2)) 2028 return 0; 2029 2030 if (subexp_idx < BITSET_WORD_BITS) 2031 ent->eps_reachable_subexps_map 2032 &= ~((bitset_word_t) 1 << subexp_idx); 2033 } 2034 while (ent++->more); 2035 } 2036 break; 2037 2038 case OP_OPEN_SUBEXP: 2039 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx) 2040 return -1; 2041 break; 2042 2043 case OP_CLOSE_SUBEXP: 2044 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx) 2045 return 0; 2046 break; 2047 2048 default: 2049 break; 2050 } 2051 } 2052 2053 return (boundaries & 2) ? 1 : 0; 2054 } 2055 2056 static int 2057 internal_function 2058 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit, 2059 Idx subexp_idx, Idx from_node, Idx str_idx, 2060 Idx bkref_idx) 2061 { 2062 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit; 2063 int boundaries; 2064 2065 /* If we are outside the range of the subexpression, return -1 or 1. */ 2066 if (str_idx < lim->subexp_from) 2067 return -1; 2068 2069 if (lim->subexp_to < str_idx) 2070 return 1; 2071 2072 /* If we are within the subexpression, return 0. */ 2073 boundaries = (str_idx == lim->subexp_from); 2074 boundaries |= (str_idx == lim->subexp_to) << 1; 2075 if (boundaries == 0) 2076 return 0; 2077 2078 /* Else, examine epsilon closure. */ 2079 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, 2080 from_node, bkref_idx); 2081 } 2082 2083 /* Check the limitations of sub expressions LIMITS, and remove the nodes 2084 which are against limitations from DEST_NODES. */ 2085 2086 static reg_errcode_t 2087 internal_function 2088 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes, 2089 const re_node_set *candidates, re_node_set *limits, 2090 struct re_backref_cache_entry *bkref_ents, Idx str_idx) 2091 { 2092 reg_errcode_t err; 2093 Idx node_idx, lim_idx; 2094 2095 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 2096 { 2097 Idx subexp_idx; 2098 struct re_backref_cache_entry *ent; 2099 ent = bkref_ents + limits->elems[lim_idx]; 2100 2101 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx) 2102 continue; /* This is unrelated limitation. */ 2103 2104 subexp_idx = dfa->nodes[ent->node].opr.idx; 2105 if (ent->subexp_to == str_idx) 2106 { 2107 Idx ops_node = REG_MISSING; 2108 Idx cls_node = REG_MISSING; 2109 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2110 { 2111 Idx node = dest_nodes->elems[node_idx]; 2112 re_token_type_t type = dfa->nodes[node].type; 2113 if (type == OP_OPEN_SUBEXP 2114 && subexp_idx == dfa->nodes[node].opr.idx) 2115 ops_node = node; 2116 else if (type == OP_CLOSE_SUBEXP 2117 && subexp_idx == dfa->nodes[node].opr.idx) 2118 cls_node = node; 2119 } 2120 2121 /* Check the limitation of the open subexpression. */ 2122 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */ 2123 if (REG_VALID_INDEX (ops_node)) 2124 { 2125 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes, 2126 candidates); 2127 if (BE (err != REG_NOERROR, 0)) 2128 return err; 2129 } 2130 2131 /* Check the limitation of the close subexpression. */ 2132 if (REG_VALID_INDEX (cls_node)) 2133 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2134 { 2135 Idx node = dest_nodes->elems[node_idx]; 2136 if (!re_node_set_contains (dfa->inveclosures + node, 2137 cls_node) 2138 && !re_node_set_contains (dfa->eclosures + node, 2139 cls_node)) 2140 { 2141 /* It is against this limitation. 2142 Remove it form the current sifted state. */ 2143 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2144 candidates); 2145 if (BE (err != REG_NOERROR, 0)) 2146 return err; 2147 --node_idx; 2148 } 2149 } 2150 } 2151 else /* (ent->subexp_to != str_idx) */ 2152 { 2153 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 2154 { 2155 Idx node = dest_nodes->elems[node_idx]; 2156 re_token_type_t type = dfa->nodes[node].type; 2157 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP) 2158 { 2159 if (subexp_idx != dfa->nodes[node].opr.idx) 2160 continue; 2161 /* It is against this limitation. 2162 Remove it form the current sifted state. */ 2163 err = sub_epsilon_src_nodes (dfa, node, dest_nodes, 2164 candidates); 2165 if (BE (err != REG_NOERROR, 0)) 2166 return err; 2167 } 2168 } 2169 } 2170 } 2171 return REG_NOERROR; 2172 } 2173 2174 static reg_errcode_t 2175 internal_function __attribute_warn_unused_result__ 2176 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx, 2177 Idx str_idx, const re_node_set *candidates) 2178 { 2179 const re_dfa_t *const dfa = mctx->dfa; 2180 reg_errcode_t err; 2181 Idx node_idx, node; 2182 re_sift_context_t local_sctx; 2183 Idx first_idx = search_cur_bkref_entry (mctx, str_idx); 2184 2185 if (first_idx == REG_MISSING) 2186 return REG_NOERROR; 2187 2188 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */ 2189 2190 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx) 2191 { 2192 Idx enabled_idx; 2193 re_token_type_t type; 2194 struct re_backref_cache_entry *entry; 2195 node = candidates->elems[node_idx]; 2196 type = dfa->nodes[node].type; 2197 /* Avoid infinite loop for the REs like "()\1+". */ 2198 if (node == sctx->last_node && str_idx == sctx->last_str_idx) 2199 continue; 2200 if (type != OP_BACK_REF) 2201 continue; 2202 2203 entry = mctx->bkref_ents + first_idx; 2204 enabled_idx = first_idx; 2205 do 2206 { 2207 Idx subexp_len; 2208 Idx to_idx; 2209 Idx dst_node; 2210 bool ok; 2211 re_dfastate_t *cur_state; 2212 2213 if (entry->node != node) 2214 continue; 2215 subexp_len = entry->subexp_to - entry->subexp_from; 2216 to_idx = str_idx + subexp_len; 2217 dst_node = (subexp_len ? dfa->nexts[node] 2218 : dfa->edests[node].elems[0]); 2219 2220 if (to_idx > sctx->last_str_idx 2221 || sctx->sifted_states[to_idx] == NULL 2222 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node) 2223 || check_dst_limits (mctx, &sctx->limits, node, 2224 str_idx, dst_node, to_idx)) 2225 continue; 2226 2227 if (local_sctx.sifted_states == NULL) 2228 { 2229 local_sctx = *sctx; 2230 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits); 2231 if (BE (err != REG_NOERROR, 0)) 2232 goto free_return; 2233 } 2234 local_sctx.last_node = node; 2235 local_sctx.last_str_idx = str_idx; 2236 ok = re_node_set_insert (&local_sctx.limits, enabled_idx); 2237 if (BE (! ok, 0)) 2238 { 2239 err = REG_ESPACE; 2240 goto free_return; 2241 } 2242 cur_state = local_sctx.sifted_states[str_idx]; 2243 err = sift_states_backward (mctx, &local_sctx); 2244 if (BE (err != REG_NOERROR, 0)) 2245 goto free_return; 2246 if (sctx->limited_states != NULL) 2247 { 2248 err = merge_state_array (dfa, sctx->limited_states, 2249 local_sctx.sifted_states, 2250 str_idx + 1); 2251 if (BE (err != REG_NOERROR, 0)) 2252 goto free_return; 2253 } 2254 local_sctx.sifted_states[str_idx] = cur_state; 2255 re_node_set_remove (&local_sctx.limits, enabled_idx); 2256 2257 /* mctx->bkref_ents may have changed, reload the pointer. */ 2258 entry = mctx->bkref_ents + enabled_idx; 2259 } 2260 while (enabled_idx++, entry++->more); 2261 } 2262 err = REG_NOERROR; 2263 free_return: 2264 if (local_sctx.sifted_states != NULL) 2265 { 2266 re_node_set_free (&local_sctx.limits); 2267 } 2268 2269 return err; 2270 } 2271 2272 2273 #ifdef RE_ENABLE_I18N 2274 static int 2275 internal_function 2276 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx, 2277 Idx node_idx, Idx str_idx, Idx max_str_idx) 2278 { 2279 const re_dfa_t *const dfa = mctx->dfa; 2280 int naccepted; 2281 /* Check the node can accept `multi byte'. */ 2282 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx); 2283 if (naccepted > 0 && str_idx + naccepted <= max_str_idx && 2284 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted], 2285 dfa->nexts[node_idx])) 2286 /* The node can't accept the `multi byte', or the 2287 destination was already thrown away, then the node 2288 could't accept the current input `multi byte'. */ 2289 naccepted = 0; 2290 /* Otherwise, it is sure that the node could accept 2291 `naccepted' bytes input. */ 2292 return naccepted; 2293 } 2294 #endif /* RE_ENABLE_I18N */ 2295 2296 2297 /* Functions for state transition. */ 2298 2299 /* Return the next state to which the current state STATE will transit by 2300 accepting the current input byte, and update STATE_LOG if necessary. 2301 If STATE can accept a multibyte char/collating element/back reference 2302 update the destination of STATE_LOG. */ 2303 2304 static re_dfastate_t * 2305 internal_function __attribute_warn_unused_result__ 2306 transit_state (reg_errcode_t *err, re_match_context_t *mctx, 2307 re_dfastate_t *state) 2308 { 2309 re_dfastate_t **trtable; 2310 unsigned char ch; 2311 2312 #ifdef RE_ENABLE_I18N 2313 /* If the current state can accept multibyte. */ 2314 if (BE (state->accept_mb, 0)) 2315 { 2316 *err = transit_state_mb (mctx, state); 2317 if (BE (*err != REG_NOERROR, 0)) 2318 return NULL; 2319 } 2320 #endif /* RE_ENABLE_I18N */ 2321 2322 /* Then decide the next state with the single byte. */ 2323 #if 0 2324 if (0) 2325 /* don't use transition table */ 2326 return transit_state_sb (err, mctx, state); 2327 #endif 2328 2329 /* Use transition table */ 2330 ch = re_string_fetch_byte (&mctx->input); 2331 for (;;) 2332 { 2333 trtable = state->trtable; 2334 if (BE (trtable != NULL, 1)) 2335 return trtable[ch]; 2336 2337 trtable = state->word_trtable; 2338 if (BE (trtable != NULL, 1)) 2339 { 2340 unsigned int context; 2341 context 2342 = re_string_context_at (&mctx->input, 2343 re_string_cur_idx (&mctx->input) - 1, 2344 mctx->eflags); 2345 if (IS_WORD_CONTEXT (context)) 2346 return trtable[ch + SBC_MAX]; 2347 else 2348 return trtable[ch]; 2349 } 2350 2351 if (!build_trtable (mctx->dfa, state)) 2352 { 2353 *err = REG_ESPACE; 2354 return NULL; 2355 } 2356 2357 /* Retry, we now have a transition table. */ 2358 } 2359 } 2360 2361 /* Update the state_log if we need */ 2362 static re_dfastate_t * 2363 internal_function 2364 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx, 2365 re_dfastate_t *next_state) 2366 { 2367 const re_dfa_t *const dfa = mctx->dfa; 2368 Idx cur_idx = re_string_cur_idx (&mctx->input); 2369 2370 if (cur_idx > mctx->state_log_top) 2371 { 2372 mctx->state_log[cur_idx] = next_state; 2373 mctx->state_log_top = cur_idx; 2374 } 2375 else if (mctx->state_log[cur_idx] == 0) 2376 { 2377 mctx->state_log[cur_idx] = next_state; 2378 } 2379 else 2380 { 2381 re_dfastate_t *pstate; 2382 unsigned int context; 2383 re_node_set next_nodes, *log_nodes, *table_nodes = NULL; 2384 /* If (state_log[cur_idx] != 0), it implies that cur_idx is 2385 the destination of a multibyte char/collating element/ 2386 back reference. Then the next state is the union set of 2387 these destinations and the results of the transition table. */ 2388 pstate = mctx->state_log[cur_idx]; 2389 log_nodes = pstate->entrance_nodes; 2390 if (next_state != NULL) 2391 { 2392 table_nodes = next_state->entrance_nodes; 2393 *err = re_node_set_init_union (&next_nodes, table_nodes, 2394 log_nodes); 2395 if (BE (*err != REG_NOERROR, 0)) 2396 return NULL; 2397 } 2398 else 2399 next_nodes = *log_nodes; 2400 /* Note: We already add the nodes of the initial state, 2401 then we don't need to add them here. */ 2402 2403 context = re_string_context_at (&mctx->input, 2404 re_string_cur_idx (&mctx->input) - 1, 2405 mctx->eflags); 2406 next_state = mctx->state_log[cur_idx] 2407 = re_acquire_state_context (err, dfa, &next_nodes, context); 2408 /* We don't need to check errors here, since the return value of 2409 this function is next_state and ERR is already set. */ 2410 2411 if (table_nodes != NULL) 2412 re_node_set_free (&next_nodes); 2413 } 2414 2415 if (BE (dfa->nbackref, 0) && next_state != NULL) 2416 { 2417 /* Check OP_OPEN_SUBEXP in the current state in case that we use them 2418 later. We must check them here, since the back references in the 2419 next state might use them. */ 2420 *err = check_subexp_matching_top (mctx, &next_state->nodes, 2421 cur_idx); 2422 if (BE (*err != REG_NOERROR, 0)) 2423 return NULL; 2424 2425 /* If the next state has back references. */ 2426 if (next_state->has_backref) 2427 { 2428 *err = transit_state_bkref (mctx, &next_state->nodes); 2429 if (BE (*err != REG_NOERROR, 0)) 2430 return NULL; 2431 next_state = mctx->state_log[cur_idx]; 2432 } 2433 } 2434 2435 return next_state; 2436 } 2437 2438 /* Skip bytes in the input that correspond to part of a 2439 multi-byte match, then look in the log for a state 2440 from which to restart matching. */ 2441 static re_dfastate_t * 2442 internal_function 2443 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx) 2444 { 2445 re_dfastate_t *cur_state; 2446 do 2447 { 2448 Idx max = mctx->state_log_top; 2449 Idx cur_str_idx = re_string_cur_idx (&mctx->input); 2450 2451 do 2452 { 2453 if (++cur_str_idx > max) 2454 return NULL; 2455 re_string_skip_bytes (&mctx->input, 1); 2456 } 2457 while (mctx->state_log[cur_str_idx] == NULL); 2458 2459 cur_state = merge_state_with_log (err, mctx, NULL); 2460 } 2461 while (*err == REG_NOERROR && cur_state == NULL); 2462 return cur_state; 2463 } 2464 2465 /* Helper functions for transit_state. */ 2466 2467 /* From the node set CUR_NODES, pick up the nodes whose types are 2468 OP_OPEN_SUBEXP and which have corresponding back references in the regular 2469 expression. And register them to use them later for evaluating the 2470 correspoding back references. */ 2471 2472 static reg_errcode_t 2473 internal_function 2474 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes, 2475 Idx str_idx) 2476 { 2477 const re_dfa_t *const dfa = mctx->dfa; 2478 Idx node_idx; 2479 reg_errcode_t err; 2480 2481 /* TODO: This isn't efficient. 2482 Because there might be more than one nodes whose types are 2483 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2484 nodes. 2485 E.g. RE: (a){2} */ 2486 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx) 2487 { 2488 Idx node = cur_nodes->elems[node_idx]; 2489 if (dfa->nodes[node].type == OP_OPEN_SUBEXP 2490 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS 2491 && (dfa->used_bkref_map 2492 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx))) 2493 { 2494 err = match_ctx_add_subtop (mctx, node, str_idx); 2495 if (BE (err != REG_NOERROR, 0)) 2496 return err; 2497 } 2498 } 2499 return REG_NOERROR; 2500 } 2501 2502 #if 0 2503 /* Return the next state to which the current state STATE will transit by 2504 accepting the current input byte. */ 2505 2506 static re_dfastate_t * 2507 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx, 2508 re_dfastate_t *state) 2509 { 2510 const re_dfa_t *const dfa = mctx->dfa; 2511 re_node_set next_nodes; 2512 re_dfastate_t *next_state; 2513 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input); 2514 unsigned int context; 2515 2516 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1); 2517 if (BE (*err != REG_NOERROR, 0)) 2518 return NULL; 2519 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt) 2520 { 2521 Idx cur_node = state->nodes.elems[node_cnt]; 2522 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx)) 2523 { 2524 *err = re_node_set_merge (&next_nodes, 2525 dfa->eclosures + dfa->nexts[cur_node]); 2526 if (BE (*err != REG_NOERROR, 0)) 2527 { 2528 re_node_set_free (&next_nodes); 2529 return NULL; 2530 } 2531 } 2532 } 2533 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags); 2534 next_state = re_acquire_state_context (err, dfa, &next_nodes, context); 2535 /* We don't need to check errors here, since the return value of 2536 this function is next_state and ERR is already set. */ 2537 2538 re_node_set_free (&next_nodes); 2539 re_string_skip_bytes (&mctx->input, 1); 2540 return next_state; 2541 } 2542 #endif 2543 2544 #ifdef RE_ENABLE_I18N 2545 static reg_errcode_t 2546 internal_function 2547 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate) 2548 { 2549 const re_dfa_t *const dfa = mctx->dfa; 2550 reg_errcode_t err; 2551 Idx i; 2552 2553 for (i = 0; i < pstate->nodes.nelem; ++i) 2554 { 2555 re_node_set dest_nodes, *new_nodes; 2556 Idx cur_node_idx = pstate->nodes.elems[i]; 2557 int naccepted; 2558 Idx dest_idx; 2559 unsigned int context; 2560 re_dfastate_t *dest_state; 2561 2562 if (!dfa->nodes[cur_node_idx].accept_mb) 2563 continue; 2564 2565 if (dfa->nodes[cur_node_idx].constraint) 2566 { 2567 context = re_string_context_at (&mctx->input, 2568 re_string_cur_idx (&mctx->input), 2569 mctx->eflags); 2570 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint, 2571 context)) 2572 continue; 2573 } 2574 2575 /* How many bytes the node can accept? */ 2576 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input, 2577 re_string_cur_idx (&mctx->input)); 2578 if (naccepted == 0) 2579 continue; 2580 2581 /* The node can accepts `naccepted' bytes. */ 2582 dest_idx = re_string_cur_idx (&mctx->input) + naccepted; 2583 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted 2584 : mctx->max_mb_elem_len); 2585 err = clean_state_log_if_needed (mctx, dest_idx); 2586 if (BE (err != REG_NOERROR, 0)) 2587 return err; 2588 #ifdef DEBUG 2589 assert (dfa->nexts[cur_node_idx] != REG_MISSING); 2590 #endif 2591 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx]; 2592 2593 dest_state = mctx->state_log[dest_idx]; 2594 if (dest_state == NULL) 2595 dest_nodes = *new_nodes; 2596 else 2597 { 2598 err = re_node_set_init_union (&dest_nodes, 2599 dest_state->entrance_nodes, new_nodes); 2600 if (BE (err != REG_NOERROR, 0)) 2601 return err; 2602 } 2603 context = re_string_context_at (&mctx->input, dest_idx - 1, 2604 mctx->eflags); 2605 mctx->state_log[dest_idx] 2606 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2607 if (dest_state != NULL) 2608 re_node_set_free (&dest_nodes); 2609 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0)) 2610 return err; 2611 } 2612 return REG_NOERROR; 2613 } 2614 #endif /* RE_ENABLE_I18N */ 2615 2616 static reg_errcode_t 2617 internal_function 2618 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes) 2619 { 2620 const re_dfa_t *const dfa = mctx->dfa; 2621 reg_errcode_t err; 2622 Idx i; 2623 Idx cur_str_idx = re_string_cur_idx (&mctx->input); 2624 2625 for (i = 0; i < nodes->nelem; ++i) 2626 { 2627 Idx dest_str_idx, prev_nelem, bkc_idx; 2628 Idx node_idx = nodes->elems[i]; 2629 unsigned int context; 2630 const re_token_t *node = dfa->nodes + node_idx; 2631 re_node_set *new_dest_nodes; 2632 2633 /* Check whether `node' is a backreference or not. */ 2634 if (node->type != OP_BACK_REF) 2635 continue; 2636 2637 if (node->constraint) 2638 { 2639 context = re_string_context_at (&mctx->input, cur_str_idx, 2640 mctx->eflags); 2641 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 2642 continue; 2643 } 2644 2645 /* `node' is a backreference. 2646 Check the substring which the substring matched. */ 2647 bkc_idx = mctx->nbkref_ents; 2648 err = get_subexp (mctx, node_idx, cur_str_idx); 2649 if (BE (err != REG_NOERROR, 0)) 2650 goto free_return; 2651 2652 /* And add the epsilon closures (which is `new_dest_nodes') of 2653 the backreference to appropriate state_log. */ 2654 #ifdef DEBUG 2655 assert (dfa->nexts[node_idx] != REG_MISSING); 2656 #endif 2657 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx) 2658 { 2659 Idx subexp_len; 2660 re_dfastate_t *dest_state; 2661 struct re_backref_cache_entry *bkref_ent; 2662 bkref_ent = mctx->bkref_ents + bkc_idx; 2663 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx) 2664 continue; 2665 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from; 2666 new_dest_nodes = (subexp_len == 0 2667 ? dfa->eclosures + dfa->edests[node_idx].elems[0] 2668 : dfa->eclosures + dfa->nexts[node_idx]); 2669 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to 2670 - bkref_ent->subexp_from); 2671 context = re_string_context_at (&mctx->input, dest_str_idx - 1, 2672 mctx->eflags); 2673 dest_state = mctx->state_log[dest_str_idx]; 2674 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0 2675 : mctx->state_log[cur_str_idx]->nodes.nelem); 2676 /* Add `new_dest_node' to state_log. */ 2677 if (dest_state == NULL) 2678 { 2679 mctx->state_log[dest_str_idx] 2680 = re_acquire_state_context (&err, dfa, new_dest_nodes, 2681 context); 2682 if (BE (mctx->state_log[dest_str_idx] == NULL 2683 && err != REG_NOERROR, 0)) 2684 goto free_return; 2685 } 2686 else 2687 { 2688 re_node_set dest_nodes; 2689 err = re_node_set_init_union (&dest_nodes, 2690 dest_state->entrance_nodes, 2691 new_dest_nodes); 2692 if (BE (err != REG_NOERROR, 0)) 2693 { 2694 re_node_set_free (&dest_nodes); 2695 goto free_return; 2696 } 2697 mctx->state_log[dest_str_idx] 2698 = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2699 re_node_set_free (&dest_nodes); 2700 if (BE (mctx->state_log[dest_str_idx] == NULL 2701 && err != REG_NOERROR, 0)) 2702 goto free_return; 2703 } 2704 /* We need to check recursively if the backreference can epsilon 2705 transit. */ 2706 if (subexp_len == 0 2707 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem) 2708 { 2709 err = check_subexp_matching_top (mctx, new_dest_nodes, 2710 cur_str_idx); 2711 if (BE (err != REG_NOERROR, 0)) 2712 goto free_return; 2713 err = transit_state_bkref (mctx, new_dest_nodes); 2714 if (BE (err != REG_NOERROR, 0)) 2715 goto free_return; 2716 } 2717 } 2718 } 2719 err = REG_NOERROR; 2720 free_return: 2721 return err; 2722 } 2723 2724 /* Enumerate all the candidates which the backreference BKREF_NODE can match 2725 at BKREF_STR_IDX, and register them by match_ctx_add_entry(). 2726 Note that we might collect inappropriate candidates here. 2727 However, the cost of checking them strictly here is too high, then we 2728 delay these checking for prune_impossible_nodes(). */ 2729 2730 static reg_errcode_t 2731 internal_function __attribute_warn_unused_result__ 2732 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx) 2733 { 2734 const re_dfa_t *const dfa = mctx->dfa; 2735 Idx subexp_num, sub_top_idx; 2736 const char *buf = (const char *) re_string_get_buffer (&mctx->input); 2737 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */ 2738 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx); 2739 if (cache_idx != REG_MISSING) 2740 { 2741 const struct re_backref_cache_entry *entry 2742 = mctx->bkref_ents + cache_idx; 2743 do 2744 if (entry->node == bkref_node) 2745 return REG_NOERROR; /* We already checked it. */ 2746 while (entry++->more); 2747 } 2748 2749 subexp_num = dfa->nodes[bkref_node].opr.idx; 2750 2751 /* For each sub expression */ 2752 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx) 2753 { 2754 reg_errcode_t err; 2755 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx]; 2756 re_sub_match_last_t *sub_last; 2757 Idx sub_last_idx, sl_str, bkref_str_off; 2758 2759 if (dfa->nodes[sub_top->node].opr.idx != subexp_num) 2760 continue; /* It isn't related. */ 2761 2762 sl_str = sub_top->str_idx; 2763 bkref_str_off = bkref_str_idx; 2764 /* At first, check the last node of sub expressions we already 2765 evaluated. */ 2766 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx) 2767 { 2768 regoff_t sl_str_diff; 2769 sub_last = sub_top->lasts[sub_last_idx]; 2770 sl_str_diff = sub_last->str_idx - sl_str; 2771 /* The matched string by the sub expression match with the substring 2772 at the back reference? */ 2773 if (sl_str_diff > 0) 2774 { 2775 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0)) 2776 { 2777 /* Not enough chars for a successful match. */ 2778 if (bkref_str_off + sl_str_diff > mctx->input.len) 2779 break; 2780 2781 err = clean_state_log_if_needed (mctx, 2782 bkref_str_off 2783 + sl_str_diff); 2784 if (BE (err != REG_NOERROR, 0)) 2785 return err; 2786 buf = (const char *) re_string_get_buffer (&mctx->input); 2787 } 2788 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0) 2789 /* We don't need to search this sub expression any more. */ 2790 break; 2791 } 2792 bkref_str_off += sl_str_diff; 2793 sl_str += sl_str_diff; 2794 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2795 bkref_str_idx); 2796 2797 /* Reload buf, since the preceding call might have reallocated 2798 the buffer. */ 2799 buf = (const char *) re_string_get_buffer (&mctx->input); 2800 2801 if (err == REG_NOMATCH) 2802 continue; 2803 if (BE (err != REG_NOERROR, 0)) 2804 return err; 2805 } 2806 2807 if (sub_last_idx < sub_top->nlasts) 2808 continue; 2809 if (sub_last_idx > 0) 2810 ++sl_str; 2811 /* Then, search for the other last nodes of the sub expression. */ 2812 for (; sl_str <= bkref_str_idx; ++sl_str) 2813 { 2814 Idx cls_node; 2815 regoff_t sl_str_off; 2816 const re_node_set *nodes; 2817 sl_str_off = sl_str - sub_top->str_idx; 2818 /* The matched string by the sub expression match with the substring 2819 at the back reference? */ 2820 if (sl_str_off > 0) 2821 { 2822 if (BE (bkref_str_off >= mctx->input.valid_len, 0)) 2823 { 2824 /* If we are at the end of the input, we cannot match. */ 2825 if (bkref_str_off >= mctx->input.len) 2826 break; 2827 2828 err = extend_buffers (mctx); 2829 if (BE (err != REG_NOERROR, 0)) 2830 return err; 2831 2832 buf = (const char *) re_string_get_buffer (&mctx->input); 2833 } 2834 if (buf [bkref_str_off++] != buf[sl_str - 1]) 2835 break; /* We don't need to search this sub expression 2836 any more. */ 2837 } 2838 if (mctx->state_log[sl_str] == NULL) 2839 continue; 2840 /* Does this state have a ')' of the sub expression? */ 2841 nodes = &mctx->state_log[sl_str]->nodes; 2842 cls_node = find_subexp_node (dfa, nodes, subexp_num, 2843 OP_CLOSE_SUBEXP); 2844 if (cls_node == REG_MISSING) 2845 continue; /* No. */ 2846 if (sub_top->path == NULL) 2847 { 2848 sub_top->path = calloc (sizeof (state_array_t), 2849 sl_str - sub_top->str_idx + 1); 2850 if (sub_top->path == NULL) 2851 return REG_ESPACE; 2852 } 2853 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node 2854 in the current context? */ 2855 err = check_arrival (mctx, sub_top->path, sub_top->node, 2856 sub_top->str_idx, cls_node, sl_str, 2857 OP_CLOSE_SUBEXP); 2858 if (err == REG_NOMATCH) 2859 continue; 2860 if (BE (err != REG_NOERROR, 0)) 2861 return err; 2862 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str); 2863 if (BE (sub_last == NULL, 0)) 2864 return REG_ESPACE; 2865 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node, 2866 bkref_str_idx); 2867 if (err == REG_NOMATCH) 2868 continue; 2869 } 2870 } 2871 return REG_NOERROR; 2872 } 2873 2874 /* Helper functions for get_subexp(). */ 2875 2876 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR. 2877 If it can arrive, register the sub expression expressed with SUB_TOP 2878 and SUB_LAST. */ 2879 2880 static reg_errcode_t 2881 internal_function 2882 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top, 2883 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str) 2884 { 2885 reg_errcode_t err; 2886 Idx to_idx; 2887 /* Can the subexpression arrive the back reference? */ 2888 err = check_arrival (mctx, &sub_last->path, sub_last->node, 2889 sub_last->str_idx, bkref_node, bkref_str, 2890 OP_OPEN_SUBEXP); 2891 if (err != REG_NOERROR) 2892 return err; 2893 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx, 2894 sub_last->str_idx); 2895 if (BE (err != REG_NOERROR, 0)) 2896 return err; 2897 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx; 2898 return clean_state_log_if_needed (mctx, to_idx); 2899 } 2900 2901 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX. 2902 Search '(' if FL_OPEN, or search ')' otherwise. 2903 TODO: This function isn't efficient... 2904 Because there might be more than one nodes whose types are 2905 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2906 nodes. 2907 E.g. RE: (a){2} */ 2908 2909 static Idx 2910 internal_function 2911 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes, 2912 Idx subexp_idx, int type) 2913 { 2914 Idx cls_idx; 2915 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx) 2916 { 2917 Idx cls_node = nodes->elems[cls_idx]; 2918 const re_token_t *node = dfa->nodes + cls_node; 2919 if (node->type == type 2920 && node->opr.idx == subexp_idx) 2921 return cls_node; 2922 } 2923 return REG_MISSING; 2924 } 2925 2926 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node 2927 LAST_NODE at LAST_STR. We record the path onto PATH since it will be 2928 heavily reused. 2929 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */ 2930 2931 static reg_errcode_t 2932 internal_function __attribute_warn_unused_result__ 2933 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node, 2934 Idx top_str, Idx last_node, Idx last_str, int type) 2935 { 2936 const re_dfa_t *const dfa = mctx->dfa; 2937 reg_errcode_t err = REG_NOERROR; 2938 Idx subexp_num, backup_cur_idx, str_idx, null_cnt; 2939 re_dfastate_t *cur_state = NULL; 2940 re_node_set *cur_nodes, next_nodes; 2941 re_dfastate_t **backup_state_log; 2942 unsigned int context; 2943 2944 subexp_num = dfa->nodes[top_node].opr.idx; 2945 /* Extend the buffer if we need. */ 2946 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0)) 2947 { 2948 re_dfastate_t **new_array; 2949 Idx old_alloc = path->alloc; 2950 Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1; 2951 if (BE (new_alloc < old_alloc, 0) 2952 || BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0)) 2953 return REG_ESPACE; 2954 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc); 2955 if (BE (new_array == NULL, 0)) 2956 return REG_ESPACE; 2957 path->array = new_array; 2958 path->alloc = new_alloc; 2959 memset (new_array + old_alloc, '\0', 2960 sizeof (re_dfastate_t *) * (path->alloc - old_alloc)); 2961 } 2962 2963 str_idx = path->next_idx ? path->next_idx : top_str; 2964 2965 /* Temporary modify MCTX. */ 2966 backup_state_log = mctx->state_log; 2967 backup_cur_idx = mctx->input.cur_idx; 2968 mctx->state_log = path->array; 2969 mctx->input.cur_idx = str_idx; 2970 2971 /* Setup initial node set. */ 2972 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 2973 if (str_idx == top_str) 2974 { 2975 err = re_node_set_init_1 (&next_nodes, top_node); 2976 if (BE (err != REG_NOERROR, 0)) 2977 return err; 2978 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 2979 if (BE (err != REG_NOERROR, 0)) 2980 { 2981 re_node_set_free (&next_nodes); 2982 return err; 2983 } 2984 } 2985 else 2986 { 2987 cur_state = mctx->state_log[str_idx]; 2988 if (cur_state && cur_state->has_backref) 2989 { 2990 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes); 2991 if (BE (err != REG_NOERROR, 0)) 2992 return err; 2993 } 2994 else 2995 re_node_set_init_empty (&next_nodes); 2996 } 2997 if (str_idx == top_str || (cur_state && cur_state->has_backref)) 2998 { 2999 if (next_nodes.nelem) 3000 { 3001 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 3002 subexp_num, type); 3003 if (BE (err != REG_NOERROR, 0)) 3004 { 3005 re_node_set_free (&next_nodes); 3006 return err; 3007 } 3008 } 3009 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 3010 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 3011 { 3012 re_node_set_free (&next_nodes); 3013 return err; 3014 } 3015 mctx->state_log[str_idx] = cur_state; 3016 } 3017 3018 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;) 3019 { 3020 re_node_set_empty (&next_nodes); 3021 if (mctx->state_log[str_idx + 1]) 3022 { 3023 err = re_node_set_merge (&next_nodes, 3024 &mctx->state_log[str_idx + 1]->nodes); 3025 if (BE (err != REG_NOERROR, 0)) 3026 { 3027 re_node_set_free (&next_nodes); 3028 return err; 3029 } 3030 } 3031 if (cur_state) 3032 { 3033 err = check_arrival_add_next_nodes (mctx, str_idx, 3034 &cur_state->non_eps_nodes, 3035 &next_nodes); 3036 if (BE (err != REG_NOERROR, 0)) 3037 { 3038 re_node_set_free (&next_nodes); 3039 return err; 3040 } 3041 } 3042 ++str_idx; 3043 if (next_nodes.nelem) 3044 { 3045 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type); 3046 if (BE (err != REG_NOERROR, 0)) 3047 { 3048 re_node_set_free (&next_nodes); 3049 return err; 3050 } 3051 err = expand_bkref_cache (mctx, &next_nodes, str_idx, 3052 subexp_num, type); 3053 if (BE (err != REG_NOERROR, 0)) 3054 { 3055 re_node_set_free (&next_nodes); 3056 return err; 3057 } 3058 } 3059 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags); 3060 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 3061 if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 3062 { 3063 re_node_set_free (&next_nodes); 3064 return err; 3065 } 3066 mctx->state_log[str_idx] = cur_state; 3067 null_cnt = cur_state == NULL ? null_cnt + 1 : 0; 3068 } 3069 re_node_set_free (&next_nodes); 3070 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL 3071 : &mctx->state_log[last_str]->nodes); 3072 path->next_idx = str_idx; 3073 3074 /* Fix MCTX. */ 3075 mctx->state_log = backup_state_log; 3076 mctx->input.cur_idx = backup_cur_idx; 3077 3078 /* Then check the current node set has the node LAST_NODE. */ 3079 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node)) 3080 return REG_NOERROR; 3081 3082 return REG_NOMATCH; 3083 } 3084 3085 /* Helper functions for check_arrival. */ 3086 3087 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them 3088 to NEXT_NODES. 3089 TODO: This function is similar to the functions transit_state*(), 3090 however this function has many additional works. 3091 Can't we unify them? */ 3092 3093 static reg_errcode_t 3094 internal_function __attribute_warn_unused_result__ 3095 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx, 3096 re_node_set *cur_nodes, re_node_set *next_nodes) 3097 { 3098 const re_dfa_t *const dfa = mctx->dfa; 3099 bool ok; 3100 Idx cur_idx; 3101 #ifdef RE_ENABLE_I18N 3102 reg_errcode_t err = REG_NOERROR; 3103 #endif 3104 re_node_set union_set; 3105 re_node_set_init_empty (&union_set); 3106 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx) 3107 { 3108 int naccepted = 0; 3109 Idx cur_node = cur_nodes->elems[cur_idx]; 3110 #ifdef DEBUG 3111 re_token_type_t type = dfa->nodes[cur_node].type; 3112 assert (!IS_EPSILON_NODE (type)); 3113 #endif 3114 #ifdef RE_ENABLE_I18N 3115 /* If the node may accept `multi byte'. */ 3116 if (dfa->nodes[cur_node].accept_mb) 3117 { 3118 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input, 3119 str_idx); 3120 if (naccepted > 1) 3121 { 3122 re_dfastate_t *dest_state; 3123 Idx next_node = dfa->nexts[cur_node]; 3124 Idx next_idx = str_idx + naccepted; 3125 dest_state = mctx->state_log[next_idx]; 3126 re_node_set_empty (&union_set); 3127 if (dest_state) 3128 { 3129 err = re_node_set_merge (&union_set, &dest_state->nodes); 3130 if (BE (err != REG_NOERROR, 0)) 3131 { 3132 re_node_set_free (&union_set); 3133 return err; 3134 } 3135 } 3136 ok = re_node_set_insert (&union_set, next_node); 3137 if (BE (! ok, 0)) 3138 { 3139 re_node_set_free (&union_set); 3140 return REG_ESPACE; 3141 } 3142 mctx->state_log[next_idx] = re_acquire_state (&err, dfa, 3143 &union_set); 3144 if (BE (mctx->state_log[next_idx] == NULL 3145 && err != REG_NOERROR, 0)) 3146 { 3147 re_node_set_free (&union_set); 3148 return err; 3149 } 3150 } 3151 } 3152 #endif /* RE_ENABLE_I18N */ 3153 if (naccepted 3154 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx)) 3155 { 3156 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]); 3157 if (BE (! ok, 0)) 3158 { 3159 re_node_set_free (&union_set); 3160 return REG_ESPACE; 3161 } 3162 } 3163 } 3164 re_node_set_free (&union_set); 3165 return REG_NOERROR; 3166 } 3167 3168 /* For all the nodes in CUR_NODES, add the epsilon closures of them to 3169 CUR_NODES, however exclude the nodes which are: 3170 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN. 3171 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN. 3172 */ 3173 3174 static reg_errcode_t 3175 internal_function 3176 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes, 3177 Idx ex_subexp, int type) 3178 { 3179 reg_errcode_t err; 3180 Idx idx, outside_node; 3181 re_node_set new_nodes; 3182 #ifdef DEBUG 3183 assert (cur_nodes->nelem); 3184 #endif 3185 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem); 3186 if (BE (err != REG_NOERROR, 0)) 3187 return err; 3188 /* Create a new node set NEW_NODES with the nodes which are epsilon 3189 closures of the node in CUR_NODES. */ 3190 3191 for (idx = 0; idx < cur_nodes->nelem; ++idx) 3192 { 3193 Idx cur_node = cur_nodes->elems[idx]; 3194 const re_node_set *eclosure = dfa->eclosures + cur_node; 3195 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type); 3196 if (outside_node == REG_MISSING) 3197 { 3198 /* There are no problematic nodes, just merge them. */ 3199 err = re_node_set_merge (&new_nodes, eclosure); 3200 if (BE (err != REG_NOERROR, 0)) 3201 { 3202 re_node_set_free (&new_nodes); 3203 return err; 3204 } 3205 } 3206 else 3207 { 3208 /* There are problematic nodes, re-calculate incrementally. */ 3209 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node, 3210 ex_subexp, type); 3211 if (BE (err != REG_NOERROR, 0)) 3212 { 3213 re_node_set_free (&new_nodes); 3214 return err; 3215 } 3216 } 3217 } 3218 re_node_set_free (cur_nodes); 3219 *cur_nodes = new_nodes; 3220 return REG_NOERROR; 3221 } 3222 3223 /* Helper function for check_arrival_expand_ecl. 3224 Check incrementally the epsilon closure of TARGET, and if it isn't 3225 problematic append it to DST_NODES. */ 3226 3227 static reg_errcode_t 3228 internal_function __attribute_warn_unused_result__ 3229 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes, 3230 Idx target, Idx ex_subexp, int type) 3231 { 3232 Idx cur_node; 3233 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);) 3234 { 3235 bool ok; 3236 3237 if (dfa->nodes[cur_node].type == type 3238 && dfa->nodes[cur_node].opr.idx == ex_subexp) 3239 { 3240 if (type == OP_CLOSE_SUBEXP) 3241 { 3242 ok = re_node_set_insert (dst_nodes, cur_node); 3243 if (BE (! ok, 0)) 3244 return REG_ESPACE; 3245 } 3246 break; 3247 } 3248 ok = re_node_set_insert (dst_nodes, cur_node); 3249 if (BE (! ok, 0)) 3250 return REG_ESPACE; 3251 if (dfa->edests[cur_node].nelem == 0) 3252 break; 3253 if (dfa->edests[cur_node].nelem == 2) 3254 { 3255 reg_errcode_t err; 3256 err = check_arrival_expand_ecl_sub (dfa, dst_nodes, 3257 dfa->edests[cur_node].elems[1], 3258 ex_subexp, type); 3259 if (BE (err != REG_NOERROR, 0)) 3260 return err; 3261 } 3262 cur_node = dfa->edests[cur_node].elems[0]; 3263 } 3264 return REG_NOERROR; 3265 } 3266 3267 3268 /* For all the back references in the current state, calculate the 3269 destination of the back references by the appropriate entry 3270 in MCTX->BKREF_ENTS. */ 3271 3272 static reg_errcode_t 3273 internal_function __attribute_warn_unused_result__ 3274 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes, 3275 Idx cur_str, Idx subexp_num, int type) 3276 { 3277 const re_dfa_t *const dfa = mctx->dfa; 3278 reg_errcode_t err; 3279 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str); 3280 struct re_backref_cache_entry *ent; 3281 3282 if (cache_idx_start == REG_MISSING) 3283 return REG_NOERROR; 3284 3285 restart: 3286 ent = mctx->bkref_ents + cache_idx_start; 3287 do 3288 { 3289 Idx to_idx, next_node; 3290 3291 /* Is this entry ENT is appropriate? */ 3292 if (!re_node_set_contains (cur_nodes, ent->node)) 3293 continue; /* No. */ 3294 3295 to_idx = cur_str + ent->subexp_to - ent->subexp_from; 3296 /* Calculate the destination of the back reference, and append it 3297 to MCTX->STATE_LOG. */ 3298 if (to_idx == cur_str) 3299 { 3300 /* The backreference did epsilon transit, we must re-check all the 3301 node in the current state. */ 3302 re_node_set new_dests; 3303 reg_errcode_t err2, err3; 3304 next_node = dfa->edests[ent->node].elems[0]; 3305 if (re_node_set_contains (cur_nodes, next_node)) 3306 continue; 3307 err = re_node_set_init_1 (&new_dests, next_node); 3308 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type); 3309 err3 = re_node_set_merge (cur_nodes, &new_dests); 3310 re_node_set_free (&new_dests); 3311 if (BE (err != REG_NOERROR || err2 != REG_NOERROR 3312 || err3 != REG_NOERROR, 0)) 3313 { 3314 err = (err != REG_NOERROR ? err 3315 : (err2 != REG_NOERROR ? err2 : err3)); 3316 return err; 3317 } 3318 /* TODO: It is still inefficient... */ 3319 goto restart; 3320 } 3321 else 3322 { 3323 re_node_set union_set; 3324 next_node = dfa->nexts[ent->node]; 3325 if (mctx->state_log[to_idx]) 3326 { 3327 bool ok; 3328 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes, 3329 next_node)) 3330 continue; 3331 err = re_node_set_init_copy (&union_set, 3332 &mctx->state_log[to_idx]->nodes); 3333 ok = re_node_set_insert (&union_set, next_node); 3334 if (BE (err != REG_NOERROR || ! ok, 0)) 3335 { 3336 re_node_set_free (&union_set); 3337 err = err != REG_NOERROR ? err : REG_ESPACE; 3338 return err; 3339 } 3340 } 3341 else 3342 { 3343 err = re_node_set_init_1 (&union_set, next_node); 3344 if (BE (err != REG_NOERROR, 0)) 3345 return err; 3346 } 3347 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set); 3348 re_node_set_free (&union_set); 3349 if (BE (mctx->state_log[to_idx] == NULL 3350 && err != REG_NOERROR, 0)) 3351 return err; 3352 } 3353 } 3354 while (ent++->more); 3355 return REG_NOERROR; 3356 } 3357 3358 /* Build transition table for the state. 3359 Return true if successful. */ 3360 3361 static bool 3362 internal_function 3363 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state) 3364 { 3365 reg_errcode_t err; 3366 Idx i, j; 3367 int ch; 3368 bool need_word_trtable = false; 3369 bitset_word_t elem, mask; 3370 bool dests_node_malloced = false; 3371 bool dest_states_malloced = false; 3372 Idx ndests; /* Number of the destination states from `state'. */ 3373 re_dfastate_t **trtable; 3374 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl; 3375 re_node_set follows, *dests_node; 3376 bitset_t *dests_ch; 3377 bitset_t acceptable; 3378 3379 struct dests_alloc 3380 { 3381 re_node_set dests_node[SBC_MAX]; 3382 bitset_t dests_ch[SBC_MAX]; 3383 } *dests_alloc; 3384 3385 /* We build DFA states which corresponds to the destination nodes 3386 from `state'. `dests_node[i]' represents the nodes which i-th 3387 destination state contains, and `dests_ch[i]' represents the 3388 characters which i-th destination state accepts. */ 3389 if (__libc_use_alloca (sizeof (struct dests_alloc))) 3390 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc)); 3391 else 3392 { 3393 dests_alloc = re_malloc (struct dests_alloc, 1); 3394 if (BE (dests_alloc == NULL, 0)) 3395 return false; 3396 dests_node_malloced = true; 3397 } 3398 dests_node = dests_alloc->dests_node; 3399 dests_ch = dests_alloc->dests_ch; 3400 3401 /* Initialize transiton table. */ 3402 state->word_trtable = state->trtable = NULL; 3403 3404 /* At first, group all nodes belonging to `state' into several 3405 destinations. */ 3406 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch); 3407 if (BE (! REG_VALID_NONZERO_INDEX (ndests), 0)) 3408 { 3409 if (dests_node_malloced) 3410 free (dests_alloc); 3411 if (ndests == 0) 3412 { 3413 state->trtable = (re_dfastate_t **) 3414 calloc (sizeof (re_dfastate_t *), SBC_MAX); 3415 return true; 3416 } 3417 return false; 3418 } 3419 3420 err = re_node_set_alloc (&follows, ndests + 1); 3421 if (BE (err != REG_NOERROR, 0)) 3422 goto out_free; 3423 3424 /* Avoid arithmetic overflow in size calculation. */ 3425 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX) 3426 / (3 * sizeof (re_dfastate_t *))) 3427 < ndests), 3428 0)) 3429 goto out_free; 3430 3431 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX 3432 + ndests * 3 * sizeof (re_dfastate_t *))) 3433 dest_states = (re_dfastate_t **) 3434 alloca (ndests * 3 * sizeof (re_dfastate_t *)); 3435 else 3436 { 3437 dest_states = (re_dfastate_t **) 3438 malloc (ndests * 3 * sizeof (re_dfastate_t *)); 3439 if (BE (dest_states == NULL, 0)) 3440 { 3441 out_free: 3442 if (dest_states_malloced) 3443 free (dest_states); 3444 re_node_set_free (&follows); 3445 for (i = 0; i < ndests; ++i) 3446 re_node_set_free (dests_node + i); 3447 if (dests_node_malloced) 3448 free (dests_alloc); 3449 return false; 3450 } 3451 dest_states_malloced = true; 3452 } 3453 dest_states_word = dest_states + ndests; 3454 dest_states_nl = dest_states_word + ndests; 3455 bitset_empty (acceptable); 3456 3457 /* Then build the states for all destinations. */ 3458 for (i = 0; i < ndests; ++i) 3459 { 3460 Idx next_node; 3461 re_node_set_empty (&follows); 3462 /* Merge the follows of this destination states. */ 3463 for (j = 0; j < dests_node[i].nelem; ++j) 3464 { 3465 next_node = dfa->nexts[dests_node[i].elems[j]]; 3466 if (next_node != REG_MISSING) 3467 { 3468 err = re_node_set_merge (&follows, dfa->eclosures + next_node); 3469 if (BE (err != REG_NOERROR, 0)) 3470 goto out_free; 3471 } 3472 } 3473 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0); 3474 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0)) 3475 goto out_free; 3476 /* If the new state has context constraint, 3477 build appropriate states for these contexts. */ 3478 if (dest_states[i]->has_constraint) 3479 { 3480 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows, 3481 CONTEXT_WORD); 3482 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0)) 3483 goto out_free; 3484 3485 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1) 3486 need_word_trtable = true; 3487 3488 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows, 3489 CONTEXT_NEWLINE); 3490 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0)) 3491 goto out_free; 3492 } 3493 else 3494 { 3495 dest_states_word[i] = dest_states[i]; 3496 dest_states_nl[i] = dest_states[i]; 3497 } 3498 bitset_merge (acceptable, dests_ch[i]); 3499 } 3500 3501 if (!BE (need_word_trtable, 0)) 3502 { 3503 /* We don't care about whether the following character is a word 3504 character, or we are in a single-byte character set so we can 3505 discern by looking at the character code: allocate a 3506 256-entry transition table. */ 3507 trtable = state->trtable = 3508 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); 3509 if (BE (trtable == NULL, 0)) 3510 goto out_free; 3511 3512 /* For all characters ch...: */ 3513 for (i = 0; i < BITSET_WORDS; ++i) 3514 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; 3515 elem; 3516 mask <<= 1, elem >>= 1, ++ch) 3517 if (BE (elem & 1, 0)) 3518 { 3519 /* There must be exactly one destination which accepts 3520 character ch. See group_nodes_into_DFAstates. */ 3521 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3522 ; 3523 3524 /* j-th destination accepts the word character ch. */ 3525 if (dfa->word_char[i] & mask) 3526 trtable[ch] = dest_states_word[j]; 3527 else 3528 trtable[ch] = dest_states[j]; 3529 } 3530 } 3531 else 3532 { 3533 /* We care about whether the following character is a word 3534 character, and we are in a multi-byte character set: discern 3535 by looking at the character code: build two 256-entry 3536 transition tables, one starting at trtable[0] and one 3537 starting at trtable[SBC_MAX]. */ 3538 trtable = state->word_trtable = 3539 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX); 3540 if (BE (trtable == NULL, 0)) 3541 goto out_free; 3542 3543 /* For all characters ch...: */ 3544 for (i = 0; i < BITSET_WORDS; ++i) 3545 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1; 3546 elem; 3547 mask <<= 1, elem >>= 1, ++ch) 3548 if (BE (elem & 1, 0)) 3549 { 3550 /* There must be exactly one destination which accepts 3551 character ch. See group_nodes_into_DFAstates. */ 3552 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j) 3553 ; 3554 3555 /* j-th destination accepts the word character ch. */ 3556 trtable[ch] = dest_states[j]; 3557 trtable[ch + SBC_MAX] = dest_states_word[j]; 3558 } 3559 } 3560 3561 /* new line */ 3562 if (bitset_contain (acceptable, NEWLINE_CHAR)) 3563 { 3564 /* The current state accepts newline character. */ 3565 for (j = 0; j < ndests; ++j) 3566 if (bitset_contain (dests_ch[j], NEWLINE_CHAR)) 3567 { 3568 /* k-th destination accepts newline character. */ 3569 trtable[NEWLINE_CHAR] = dest_states_nl[j]; 3570 if (need_word_trtable) 3571 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j]; 3572 /* There must be only one destination which accepts 3573 newline. See group_nodes_into_DFAstates. */ 3574 break; 3575 } 3576 } 3577 3578 if (dest_states_malloced) 3579 free (dest_states); 3580 3581 re_node_set_free (&follows); 3582 for (i = 0; i < ndests; ++i) 3583 re_node_set_free (dests_node + i); 3584 3585 if (dests_node_malloced) 3586 free (dests_alloc); 3587 3588 return true; 3589 } 3590 3591 /* Group all nodes belonging to STATE into several destinations. 3592 Then for all destinations, set the nodes belonging to the destination 3593 to DESTS_NODE[i] and set the characters accepted by the destination 3594 to DEST_CH[i]. This function return the number of destinations. */ 3595 3596 static Idx 3597 internal_function 3598 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, 3599 re_node_set *dests_node, bitset_t *dests_ch) 3600 { 3601 reg_errcode_t err; 3602 bool ok; 3603 Idx i, j, k; 3604 Idx ndests; /* Number of the destinations from `state'. */ 3605 bitset_t accepts; /* Characters a node can accept. */ 3606 const re_node_set *cur_nodes = &state->nodes; 3607 bitset_empty (accepts); 3608 ndests = 0; 3609 3610 /* For all the nodes belonging to `state', */ 3611 for (i = 0; i < cur_nodes->nelem; ++i) 3612 { 3613 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]]; 3614 re_token_type_t type = node->type; 3615 unsigned int constraint = node->constraint; 3616 3617 /* Enumerate all single byte character this node can accept. */ 3618 if (type == CHARACTER) 3619 bitset_set (accepts, node->opr.c); 3620 else if (type == SIMPLE_BRACKET) 3621 { 3622 bitset_merge (accepts, node->opr.sbcset); 3623 } 3624 else if (type == OP_PERIOD) 3625 { 3626 #ifdef RE_ENABLE_I18N 3627 if (dfa->mb_cur_max > 1) 3628 bitset_merge (accepts, dfa->sb_char); 3629 else 3630 #endif 3631 bitset_set_all (accepts); 3632 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3633 bitset_clear (accepts, '\n'); 3634 if (dfa->syntax & RE_DOT_NOT_NULL) 3635 bitset_clear (accepts, '\0'); 3636 } 3637 #ifdef RE_ENABLE_I18N 3638 else if (type == OP_UTF8_PERIOD) 3639 { 3640 if (ASCII_CHARS % BITSET_WORD_BITS == 0) 3641 memset (accepts, -1, ASCII_CHARS / CHAR_BIT); 3642 else 3643 bitset_merge (accepts, utf8_sb_map); 3644 if (!(dfa->syntax & RE_DOT_NEWLINE)) 3645 bitset_clear (accepts, '\n'); 3646 if (dfa->syntax & RE_DOT_NOT_NULL) 3647 bitset_clear (accepts, '\0'); 3648 } 3649 #endif 3650 else 3651 continue; 3652 3653 /* Check the `accepts' and sift the characters which are not 3654 match it the context. */ 3655 if (constraint) 3656 { 3657 if (constraint & NEXT_NEWLINE_CONSTRAINT) 3658 { 3659 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); 3660 bitset_empty (accepts); 3661 if (accepts_newline) 3662 bitset_set (accepts, NEWLINE_CHAR); 3663 else 3664 continue; 3665 } 3666 if (constraint & NEXT_ENDBUF_CONSTRAINT) 3667 { 3668 bitset_empty (accepts); 3669 continue; 3670 } 3671 3672 if (constraint & NEXT_WORD_CONSTRAINT) 3673 { 3674 bitset_word_t any_set = 0; 3675 if (type == CHARACTER && !node->word_char) 3676 { 3677 bitset_empty (accepts); 3678 continue; 3679 } 3680 #ifdef RE_ENABLE_I18N 3681 if (dfa->mb_cur_max > 1) 3682 for (j = 0; j < BITSET_WORDS; ++j) 3683 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j])); 3684 else 3685 #endif 3686 for (j = 0; j < BITSET_WORDS; ++j) 3687 any_set |= (accepts[j] &= dfa->word_char[j]); 3688 if (!any_set) 3689 continue; 3690 } 3691 if (constraint & NEXT_NOTWORD_CONSTRAINT) 3692 { 3693 bitset_word_t any_set = 0; 3694 if (type == CHARACTER && node->word_char) 3695 { 3696 bitset_empty (accepts); 3697 continue; 3698 } 3699 #ifdef RE_ENABLE_I18N 3700 if (dfa->mb_cur_max > 1) 3701 for (j = 0; j < BITSET_WORDS; ++j) 3702 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j])); 3703 else 3704 #endif 3705 for (j = 0; j < BITSET_WORDS; ++j) 3706 any_set |= (accepts[j] &= ~dfa->word_char[j]); 3707 if (!any_set) 3708 continue; 3709 } 3710 } 3711 3712 /* Then divide `accepts' into DFA states, or create a new 3713 state. Above, we make sure that accepts is not empty. */ 3714 for (j = 0; j < ndests; ++j) 3715 { 3716 bitset_t intersec; /* Intersection sets, see below. */ 3717 bitset_t remains; 3718 /* Flags, see below. */ 3719 bitset_word_t has_intersec, not_subset, not_consumed; 3720 3721 /* Optimization, skip if this state doesn't accept the character. */ 3722 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c)) 3723 continue; 3724 3725 /* Enumerate the intersection set of this state and `accepts'. */ 3726 has_intersec = 0; 3727 for (k = 0; k < BITSET_WORDS; ++k) 3728 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k]; 3729 /* And skip if the intersection set is empty. */ 3730 if (!has_intersec) 3731 continue; 3732 3733 /* Then check if this state is a subset of `accepts'. */ 3734 not_subset = not_consumed = 0; 3735 for (k = 0; k < BITSET_WORDS; ++k) 3736 { 3737 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k]; 3738 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k]; 3739 } 3740 3741 /* If this state isn't a subset of `accepts', create a 3742 new group state, which has the `remains'. */ 3743 if (not_subset) 3744 { 3745 bitset_copy (dests_ch[ndests], remains); 3746 bitset_copy (dests_ch[j], intersec); 3747 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]); 3748 if (BE (err != REG_NOERROR, 0)) 3749 goto error_return; 3750 ++ndests; 3751 } 3752 3753 /* Put the position in the current group. */ 3754 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]); 3755 if (BE (! ok, 0)) 3756 goto error_return; 3757 3758 /* If all characters are consumed, go to next node. */ 3759 if (!not_consumed) 3760 break; 3761 } 3762 /* Some characters remain, create a new group. */ 3763 if (j == ndests) 3764 { 3765 bitset_copy (dests_ch[ndests], accepts); 3766 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]); 3767 if (BE (err != REG_NOERROR, 0)) 3768 goto error_return; 3769 ++ndests; 3770 bitset_empty (accepts); 3771 } 3772 } 3773 return ndests; 3774 error_return: 3775 for (j = 0; j < ndests; ++j) 3776 re_node_set_free (dests_node + j); 3777 return REG_MISSING; 3778 } 3779 3780 #ifdef RE_ENABLE_I18N 3781 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts. 3782 Return the number of the bytes the node accepts. 3783 STR_IDX is the current index of the input string. 3784 3785 This function handles the nodes which can accept one character, or 3786 one collating element like '.', '[a-z]', opposite to the other nodes 3787 can only accept one byte. */ 3788 3789 static int 3790 internal_function 3791 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx, 3792 const re_string_t *input, Idx str_idx) 3793 { 3794 const re_token_t *node = dfa->nodes + node_idx; 3795 int char_len, elem_len; 3796 Idx i; 3797 3798 if (BE (node->type == OP_UTF8_PERIOD, 0)) 3799 { 3800 unsigned char c = re_string_byte_at (input, str_idx), d; 3801 if (BE (c < 0xc2, 1)) 3802 return 0; 3803 3804 if (str_idx + 2 > input->len) 3805 return 0; 3806 3807 d = re_string_byte_at (input, str_idx + 1); 3808 if (c < 0xe0) 3809 return (d < 0x80 || d > 0xbf) ? 0 : 2; 3810 else if (c < 0xf0) 3811 { 3812 char_len = 3; 3813 if (c == 0xe0 && d < 0xa0) 3814 return 0; 3815 } 3816 else if (c < 0xf8) 3817 { 3818 char_len = 4; 3819 if (c == 0xf0 && d < 0x90) 3820 return 0; 3821 } 3822 else if (c < 0xfc) 3823 { 3824 char_len = 5; 3825 if (c == 0xf8 && d < 0x88) 3826 return 0; 3827 } 3828 else if (c < 0xfe) 3829 { 3830 char_len = 6; 3831 if (c == 0xfc && d < 0x84) 3832 return 0; 3833 } 3834 else 3835 return 0; 3836 3837 if (str_idx + char_len > input->len) 3838 return 0; 3839 3840 for (i = 1; i < char_len; ++i) 3841 { 3842 d = re_string_byte_at (input, str_idx + i); 3843 if (d < 0x80 || d > 0xbf) 3844 return 0; 3845 } 3846 return char_len; 3847 } 3848 3849 char_len = re_string_char_size_at (input, str_idx); 3850 if (node->type == OP_PERIOD) 3851 { 3852 if (char_len <= 1) 3853 return 0; 3854 /* FIXME: I don't think this if is needed, as both '\n' 3855 and '\0' are char_len == 1. */ 3856 /* '.' accepts any one character except the following two cases. */ 3857 if ((!(dfa->syntax & RE_DOT_NEWLINE) && 3858 re_string_byte_at (input, str_idx) == '\n') || 3859 ((dfa->syntax & RE_DOT_NOT_NULL) && 3860 re_string_byte_at (input, str_idx) == '\0')) 3861 return 0; 3862 return char_len; 3863 } 3864 3865 elem_len = re_string_elem_size_at (input, str_idx); 3866 if ((elem_len <= 1 && char_len <= 1) || char_len == 0) 3867 return 0; 3868 3869 if (node->type == COMPLEX_BRACKET) 3870 { 3871 const re_charset_t *cset = node->opr.mbcset; 3872 # ifdef _LIBC 3873 const unsigned char *pin 3874 = ((const unsigned char *) re_string_get_buffer (input) + str_idx); 3875 Idx j; 3876 uint32_t nrules; 3877 # endif /* _LIBC */ 3878 int match_len = 0; 3879 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars) 3880 ? re_string_wchar_at (input, str_idx) : 0); 3881 3882 /* match with multibyte character? */ 3883 for (i = 0; i < cset->nmbchars; ++i) 3884 if (wc == cset->mbchars[i]) 3885 { 3886 match_len = char_len; 3887 goto check_node_accept_bytes_match; 3888 } 3889 /* match with character_class? */ 3890 for (i = 0; i < cset->nchar_classes; ++i) 3891 { 3892 wctype_t wt = cset->char_classes[i]; 3893 if (__iswctype (wc, wt)) 3894 { 3895 match_len = char_len; 3896 goto check_node_accept_bytes_match; 3897 } 3898 } 3899 3900 # ifdef _LIBC 3901 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3902 if (nrules != 0) 3903 { 3904 unsigned int in_collseq = 0; 3905 const int32_t *table, *indirect; 3906 const unsigned char *weights, *extra; 3907 const char *collseqwc; 3908 int32_t idx; 3909 /* This #include defines a local function! */ 3910 # include <locale/weight.h> 3911 3912 /* match with collating_symbol? */ 3913 if (cset->ncoll_syms) 3914 extra = (const unsigned char *) 3915 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3916 for (i = 0; i < cset->ncoll_syms; ++i) 3917 { 3918 const unsigned char *coll_sym = extra + cset->coll_syms[i]; 3919 /* Compare the length of input collating element and 3920 the length of current collating element. */ 3921 if (*coll_sym != elem_len) 3922 continue; 3923 /* Compare each bytes. */ 3924 for (j = 0; j < *coll_sym; j++) 3925 if (pin[j] != coll_sym[1 + j]) 3926 break; 3927 if (j == *coll_sym) 3928 { 3929 /* Match if every bytes is equal. */ 3930 match_len = j; 3931 goto check_node_accept_bytes_match; 3932 } 3933 } 3934 3935 if (cset->nranges) 3936 { 3937 if (elem_len <= char_len) 3938 { 3939 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3940 in_collseq = __collseq_table_lookup (collseqwc, wc); 3941 } 3942 else 3943 in_collseq = find_collation_sequence_value (pin, elem_len); 3944 } 3945 /* match with range expression? */ 3946 for (i = 0; i < cset->nranges; ++i) 3947 if (cset->range_starts[i] <= in_collseq 3948 && in_collseq <= cset->range_ends[i]) 3949 { 3950 match_len = elem_len; 3951 goto check_node_accept_bytes_match; 3952 } 3953 3954 /* match with equivalence_class? */ 3955 if (cset->nequiv_classes) 3956 { 3957 const unsigned char *cp = pin; 3958 table = (const int32_t *) 3959 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3960 weights = (const unsigned char *) 3961 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); 3962 extra = (const unsigned char *) 3963 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 3964 indirect = (const int32_t *) 3965 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); 3966 int32_t idx = findidx (&cp); 3967 if (idx > 0) 3968 for (i = 0; i < cset->nequiv_classes; ++i) 3969 { 3970 int32_t equiv_class_idx = cset->equiv_classes[i]; 3971 size_t weight_len = weights[idx & 0xffffff]; 3972 if (weight_len == weights[equiv_class_idx & 0xffffff] 3973 && (idx >> 24) == (equiv_class_idx >> 24)) 3974 { 3975 Idx cnt = 0; 3976 3977 idx &= 0xffffff; 3978 equiv_class_idx &= 0xffffff; 3979 3980 while (cnt <= weight_len 3981 && (weights[equiv_class_idx + 1 + cnt] 3982 == weights[idx + 1 + cnt])) 3983 ++cnt; 3984 if (cnt > weight_len) 3985 { 3986 match_len = elem_len; 3987 goto check_node_accept_bytes_match; 3988 } 3989 } 3990 } 3991 } 3992 } 3993 else 3994 # endif /* _LIBC */ 3995 { 3996 /* match with range expression? */ 3997 #if __GNUC__ >= 2 && ! (__STDC_VERSION__ < 199901L && __STRICT_ANSI__) 3998 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'}; 3999 #else 4000 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 4001 cmp_buf[2] = wc; 4002 #endif 4003 for (i = 0; i < cset->nranges; ++i) 4004 { 4005 cmp_buf[0] = cset->range_starts[i]; 4006 cmp_buf[4] = cset->range_ends[i]; 4007 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 4008 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 4009 { 4010 match_len = char_len; 4011 goto check_node_accept_bytes_match; 4012 } 4013 } 4014 } 4015 check_node_accept_bytes_match: 4016 if (!cset->non_match) 4017 return match_len; 4018 else 4019 { 4020 if (match_len > 0) 4021 return 0; 4022 else 4023 return (elem_len > char_len) ? elem_len : char_len; 4024 } 4025 } 4026 return 0; 4027 } 4028 4029 # ifdef _LIBC 4030 static unsigned int 4031 internal_function 4032 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len) 4033 { 4034 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 4035 if (nrules == 0) 4036 { 4037 if (mbs_len == 1) 4038 { 4039 /* No valid character. Match it as a single byte character. */ 4040 const unsigned char *collseq = (const unsigned char *) 4041 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 4042 return collseq[mbs[0]]; 4043 } 4044 return UINT_MAX; 4045 } 4046 else 4047 { 4048 int32_t idx; 4049 const unsigned char *extra = (const unsigned char *) 4050 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 4051 int32_t extrasize = (const unsigned char *) 4052 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra; 4053 4054 for (idx = 0; idx < extrasize;) 4055 { 4056 int mbs_cnt; 4057 bool found = false; 4058 int32_t elem_mbs_len; 4059 /* Skip the name of collating element name. */ 4060 idx = idx + extra[idx] + 1; 4061 elem_mbs_len = extra[idx++]; 4062 if (mbs_len == elem_mbs_len) 4063 { 4064 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt) 4065 if (extra[idx + mbs_cnt] != mbs[mbs_cnt]) 4066 break; 4067 if (mbs_cnt == elem_mbs_len) 4068 /* Found the entry. */ 4069 found = true; 4070 } 4071 /* Skip the byte sequence of the collating element. */ 4072 idx += elem_mbs_len; 4073 /* Adjust for the alignment. */ 4074 idx = (idx + 3) & ~3; 4075 /* Skip the collation sequence value. */ 4076 idx += sizeof (uint32_t); 4077 /* Skip the wide char sequence of the collating element. */ 4078 idx = idx + sizeof (uint32_t) * (extra[idx] + 1); 4079 /* If we found the entry, return the sequence value. */ 4080 if (found) 4081 return *(uint32_t *) (extra + idx); 4082 /* Skip the collation sequence value. */ 4083 idx += sizeof (uint32_t); 4084 } 4085 return UINT_MAX; 4086 } 4087 } 4088 # endif /* _LIBC */ 4089 #endif /* RE_ENABLE_I18N */ 4090 4091 /* Check whether the node accepts the byte which is IDX-th 4092 byte of the INPUT. */ 4093 4094 static bool 4095 internal_function 4096 check_node_accept (const re_match_context_t *mctx, const re_token_t *node, 4097 Idx idx) 4098 { 4099 unsigned char ch; 4100 ch = re_string_byte_at (&mctx->input, idx); 4101 switch (node->type) 4102 { 4103 case CHARACTER: 4104 if (node->opr.c != ch) 4105 return false; 4106 break; 4107 4108 case SIMPLE_BRACKET: 4109 if (!bitset_contain (node->opr.sbcset, ch)) 4110 return false; 4111 break; 4112 4113 #ifdef RE_ENABLE_I18N 4114 case OP_UTF8_PERIOD: 4115 if (ch >= ASCII_CHARS) 4116 return false; 4117 /* FALLTHROUGH */ 4118 #endif 4119 case OP_PERIOD: 4120 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE)) 4121 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL))) 4122 return false; 4123 break; 4124 4125 default: 4126 return false; 4127 } 4128 4129 if (node->constraint) 4130 { 4131 /* The node has constraints. Check whether the current context 4132 satisfies the constraints. */ 4133 unsigned int context = re_string_context_at (&mctx->input, idx, 4134 mctx->eflags); 4135 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 4136 return false; 4137 } 4138 4139 return true; 4140 } 4141 4142 /* Extend the buffers, if the buffers have run out. */ 4143 4144 static reg_errcode_t 4145 internal_function __attribute_warn_unused_result__ 4146 extend_buffers (re_match_context_t *mctx) 4147 { 4148 reg_errcode_t ret; 4149 re_string_t *pstr = &mctx->input; 4150 4151 /* Avoid overflow. */ 4152 if (BE (SIZE_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0)) 4153 return REG_ESPACE; 4154 4155 /* Double the lengthes of the buffers. */ 4156 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 4157 if (BE (ret != REG_NOERROR, 0)) 4158 return ret; 4159 4160 if (mctx->state_log != NULL) 4161 { 4162 /* And double the length of state_log. */ 4163 /* XXX We have no indication of the size of this buffer. If this 4164 allocation fail we have no indication that the state_log array 4165 does not have the right size. */ 4166 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *, 4167 pstr->bufs_len + 1); 4168 if (BE (new_array == NULL, 0)) 4169 return REG_ESPACE; 4170 mctx->state_log = new_array; 4171 } 4172 4173 /* Then reconstruct the buffers. */ 4174 if (pstr->icase) 4175 { 4176 #ifdef RE_ENABLE_I18N 4177 if (pstr->mb_cur_max > 1) 4178 { 4179 ret = build_wcs_upper_buffer (pstr); 4180 if (BE (ret != REG_NOERROR, 0)) 4181 return ret; 4182 } 4183 else 4184 #endif /* RE_ENABLE_I18N */ 4185 build_upper_buffer (pstr); 4186 } 4187 else 4188 { 4189 #ifdef RE_ENABLE_I18N 4190 if (pstr->mb_cur_max > 1) 4191 build_wcs_buffer (pstr); 4192 else 4193 #endif /* RE_ENABLE_I18N */ 4194 { 4195 if (pstr->trans != NULL) 4196 re_string_translate_buffer (pstr); 4197 } 4198 } 4199 return REG_NOERROR; 4200 } 4201 4202 4203 /* Functions for matching context. */ 4204 4205 /* Initialize MCTX. */ 4206 4207 static reg_errcode_t 4208 internal_function __attribute_warn_unused_result__ 4209 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n) 4210 { 4211 mctx->eflags = eflags; 4212 mctx->match_last = REG_MISSING; 4213 if (n > 0) 4214 { 4215 /* Avoid overflow. */ 4216 size_t max_object_size = 4217 MAX (sizeof (struct re_backref_cache_entry), 4218 sizeof (re_sub_match_top_t *)); 4219 if (BE (SIZE_MAX / max_object_size < n, 0)) 4220 return REG_ESPACE; 4221 4222 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n); 4223 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n); 4224 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0)) 4225 return REG_ESPACE; 4226 } 4227 /* Already zero-ed by the caller. 4228 else 4229 mctx->bkref_ents = NULL; 4230 mctx->nbkref_ents = 0; 4231 mctx->nsub_tops = 0; */ 4232 mctx->abkref_ents = n; 4233 mctx->max_mb_elem_len = 1; 4234 mctx->asub_tops = n; 4235 return REG_NOERROR; 4236 } 4237 4238 /* Clean the entries which depend on the current input in MCTX. 4239 This function must be invoked when the matcher changes the start index 4240 of the input, or changes the input string. */ 4241 4242 static void 4243 internal_function 4244 match_ctx_clean (re_match_context_t *mctx) 4245 { 4246 Idx st_idx; 4247 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx) 4248 { 4249 Idx sl_idx; 4250 re_sub_match_top_t *top = mctx->sub_tops[st_idx]; 4251 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx) 4252 { 4253 re_sub_match_last_t *last = top->lasts[sl_idx]; 4254 re_free (last->path.array); 4255 re_free (last); 4256 } 4257 re_free (top->lasts); 4258 if (top->path) 4259 { 4260 re_free (top->path->array); 4261 re_free (top->path); 4262 } 4263 free (top); 4264 } 4265 4266 mctx->nsub_tops = 0; 4267 mctx->nbkref_ents = 0; 4268 } 4269 4270 /* Free all the memory associated with MCTX. */ 4271 4272 static void 4273 internal_function 4274 match_ctx_free (re_match_context_t *mctx) 4275 { 4276 /* First, free all the memory associated with MCTX->SUB_TOPS. */ 4277 match_ctx_clean (mctx); 4278 re_free (mctx->sub_tops); 4279 re_free (mctx->bkref_ents); 4280 } 4281 4282 /* Add a new backreference entry to MCTX. 4283 Note that we assume that caller never call this function with duplicate 4284 entry, and call with STR_IDX which isn't smaller than any existing entry. 4285 */ 4286 4287 static reg_errcode_t 4288 internal_function __attribute_warn_unused_result__ 4289 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from, 4290 Idx to) 4291 { 4292 if (mctx->nbkref_ents >= mctx->abkref_ents) 4293 { 4294 struct re_backref_cache_entry* new_entry; 4295 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry, 4296 mctx->abkref_ents * 2); 4297 if (BE (new_entry == NULL, 0)) 4298 { 4299 re_free (mctx->bkref_ents); 4300 return REG_ESPACE; 4301 } 4302 mctx->bkref_ents = new_entry; 4303 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0', 4304 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents); 4305 mctx->abkref_ents *= 2; 4306 } 4307 if (mctx->nbkref_ents > 0 4308 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx) 4309 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1; 4310 4311 mctx->bkref_ents[mctx->nbkref_ents].node = node; 4312 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx; 4313 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from; 4314 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to; 4315 4316 /* This is a cache that saves negative results of check_dst_limits_calc_pos. 4317 If bit N is clear, means that this entry won't epsilon-transition to 4318 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If 4319 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one 4320 such node. 4321 4322 A backreference does not epsilon-transition unless it is empty, so set 4323 to all zeros if FROM != TO. */ 4324 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map 4325 = (from == to ? -1 : 0); 4326 4327 mctx->bkref_ents[mctx->nbkref_ents++].more = 0; 4328 if (mctx->max_mb_elem_len < to - from) 4329 mctx->max_mb_elem_len = to - from; 4330 return REG_NOERROR; 4331 } 4332 4333 /* Return the first entry with the same str_idx, or REG_MISSING if none is 4334 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */ 4335 4336 static Idx 4337 internal_function 4338 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx) 4339 { 4340 Idx left, right, mid, last; 4341 last = right = mctx->nbkref_ents; 4342 for (left = 0; left < right;) 4343 { 4344 mid = (left + right) / 2; 4345 if (mctx->bkref_ents[mid].str_idx < str_idx) 4346 left = mid + 1; 4347 else 4348 right = mid; 4349 } 4350 if (left < last && mctx->bkref_ents[left].str_idx == str_idx) 4351 return left; 4352 else 4353 return REG_MISSING; 4354 } 4355 4356 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches 4357 at STR_IDX. */ 4358 4359 static reg_errcode_t 4360 internal_function __attribute_warn_unused_result__ 4361 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx) 4362 { 4363 #ifdef DEBUG 4364 assert (mctx->sub_tops != NULL); 4365 assert (mctx->asub_tops > 0); 4366 #endif 4367 if (BE (mctx->nsub_tops == mctx->asub_tops, 0)) 4368 { 4369 Idx new_asub_tops = mctx->asub_tops * 2; 4370 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops, 4371 re_sub_match_top_t *, 4372 new_asub_tops); 4373 if (BE (new_array == NULL, 0)) 4374 return REG_ESPACE; 4375 mctx->sub_tops = new_array; 4376 mctx->asub_tops = new_asub_tops; 4377 } 4378 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t)); 4379 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0)) 4380 return REG_ESPACE; 4381 mctx->sub_tops[mctx->nsub_tops]->node = node; 4382 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx; 4383 return REG_NOERROR; 4384 } 4385 4386 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches 4387 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */ 4388 4389 static re_sub_match_last_t * 4390 internal_function 4391 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx) 4392 { 4393 re_sub_match_last_t *new_entry; 4394 if (BE (subtop->nlasts == subtop->alasts, 0)) 4395 { 4396 Idx new_alasts = 2 * subtop->alasts + 1; 4397 re_sub_match_last_t **new_array = re_realloc (subtop->lasts, 4398 re_sub_match_last_t *, 4399 new_alasts); 4400 if (BE (new_array == NULL, 0)) 4401 return NULL; 4402 subtop->lasts = new_array; 4403 subtop->alasts = new_alasts; 4404 } 4405 new_entry = calloc (1, sizeof (re_sub_match_last_t)); 4406 if (BE (new_entry != NULL, 1)) 4407 { 4408 subtop->lasts[subtop->nlasts] = new_entry; 4409 new_entry->node = node; 4410 new_entry->str_idx = str_idx; 4411 ++subtop->nlasts; 4412 } 4413 return new_entry; 4414 } 4415 4416 static void 4417 internal_function 4418 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 4419 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx) 4420 { 4421 sctx->sifted_states = sifted_sts; 4422 sctx->limited_states = limited_sts; 4423 sctx->last_node = last_node; 4424 sctx->last_str_idx = last_str_idx; 4425 re_node_set_init_empty (&sctx->limits); 4426 } 4427