1 /* kwset.c - search for any of a set of keywords. 2 Copyright (C) 1989, 1998, 2000, 2005, 2007, 2009-2020 Free Software 3 Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software 17 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 18 02110-1301, USA. */ 19 20 /* Written August 1989 by Mike Haertel. */ 21 22 /* For the Aho-Corasick algorithm, see: 23 Aho AV, Corasick MJ. Efficient string matching: an aid to 24 bibliographic search. CACM 18, 6 (1975), 333-40 25 <https://dx.doi.org/10.1145/360825.360855>, which describes the 26 failure function used below. 27 28 For the Boyer-Moore algorithm, see: Boyer RS, Moore JS. 29 A fast string searching algorithm. CACM 20, 10 (1977), 762-72 30 <https://dx.doi.org/10.1145/359842.359859>. 31 32 For a survey of more-recent string matching algorithms that might 33 help improve performance, see: Faro S, Lecroq T. The exact online 34 string matching problem: a review of the most recent results. 35 ACM Computing Surveys 45, 2 (2013), 13 36 <https://dx.doi.org/10.1145/2431211.2431212>. */ 37 38 #include <config.h> 39 40 #include "kwset.h" 41 42 #include <stdint.h> 43 #include <sys/types.h> 44 #include "system.h" 45 #include "intprops.h" 46 #include "memchr2.h" 47 #include "obstack.h" 48 #include "xalloc.h" 49 #include "verify.h" 50 51 #define obstack_chunk_alloc xmalloc 52 #define obstack_chunk_free free 53 54 static unsigned char 55 U (char ch) 56 { 57 return to_uchar (ch); 58 } 59 60 /* Balanced tree of edges and labels leaving a given trie node. */ 61 struct tree 62 { 63 struct tree *llink; /* Left link; MUST be first field. */ 64 struct tree *rlink; /* Right link (to larger labels). */ 65 struct trie *trie; /* Trie node pointed to by this edge. */ 66 unsigned char label; /* Label on this edge. */ 67 char balance; /* Difference in depths of subtrees. */ 68 }; 69 70 /* Node of a trie representing a set of keywords. */ 71 struct trie 72 { 73 /* If an accepting node, this is either 2*W + 1 where W is the word 74 index, or is SIZE_MAX if Aho-Corasick is in use and FAIL 75 specifies where to look for more info. If not an accepting node, 76 this is zero. */ 77 size_t accepting; 78 79 struct tree *links; /* Tree of edges leaving this node. */ 80 struct trie *parent; /* Parent of this node. */ 81 struct trie *next; /* List of all trie nodes in level order. */ 82 struct trie *fail; /* Aho-Corasick failure function. */ 83 ptrdiff_t depth; /* Depth of this node from the root. */ 84 ptrdiff_t shift; /* Shift function for search failures. */ 85 ptrdiff_t maxshift; /* Max shift of self and descendants. */ 86 }; 87 88 /* Structure returned opaquely to the caller, containing everything. */ 89 struct kwset 90 { 91 struct obstack obstack; /* Obstack for node allocation. */ 92 ptrdiff_t words; /* Number of words in the trie. */ 93 struct trie *trie; /* The trie itself. */ 94 ptrdiff_t mind; /* Minimum depth of an accepting node. */ 95 ptrdiff_t maxd; /* Maximum depth of any node. */ 96 unsigned char delta[NCHAR]; /* Delta table for rapid search. */ 97 struct trie *next[NCHAR]; /* Table of children of the root. */ 98 char *target; /* Target string if there's only one. */ 99 ptrdiff_t *shift; /* Used in Boyer-Moore search for one 100 string. */ 101 char const *trans; /* Character translation table. */ 102 103 /* This helps to match a terminal byte, which is the first byte 104 for Aho-Corasick, and the last byte for Boyer-More. If all the 105 patterns have the same terminal byte (after translation via TRANS 106 if TRANS is nonnull), then this is that byte as an unsigned char. 107 Otherwise this is -1 if there is disagreement among the strings 108 about terminal bytes, and -2 if there are no terminal bytes and 109 no disagreement because all the patterns are empty. */ 110 int gc1; 111 112 /* This helps to match a terminal byte. If 0 <= GC1HELP, B is 113 terminal when B == GC1 || B == GC1HELP (note that GC1 == GCHELP 114 is common here). This is typically faster than evaluating 115 to_uchar (TRANS[B]) == GC1. */ 116 int gc1help; 117 118 /* If the string has two or more bytes, this is the penultimate byte, 119 after translation via TRANS if TRANS is nonnull. This variable 120 is used only by Boyer-Moore. */ 121 char gc2; 122 123 /* kwsexec implementation. */ 124 ptrdiff_t (*kwsexec) (kwset_t, char const *, ptrdiff_t, 125 struct kwsmatch *, bool); 126 }; 127 128 /* Use TRANS to transliterate C. A null TRANS does no transliteration. */ 129 static inline char 130 tr (char const *trans, char c) 131 { 132 return trans ? trans[U(c)] : c; 133 } 134 135 static ptrdiff_t acexec (kwset_t, char const *, ptrdiff_t, 136 struct kwsmatch *, bool); 137 static ptrdiff_t bmexec (kwset_t, char const *, ptrdiff_t, 138 struct kwsmatch *, bool); 139 140 /* Return a newly allocated keyword set. A nonnull TRANS specifies a 141 table of character translations to be applied to all pattern and 142 search text. */ 143 kwset_t 144 kwsalloc (char const *trans) 145 { 146 struct kwset *kwset = xmalloc (sizeof *kwset); 147 148 obstack_init (&kwset->obstack); 149 kwset->words = 0; 150 kwset->trie = obstack_alloc (&kwset->obstack, sizeof *kwset->trie); 151 kwset->trie->accepting = 0; 152 kwset->trie->links = NULL; 153 kwset->trie->parent = NULL; 154 kwset->trie->next = NULL; 155 kwset->trie->fail = NULL; 156 kwset->trie->depth = 0; 157 kwset->trie->shift = 0; 158 kwset->mind = PTRDIFF_MAX; 159 kwset->maxd = -1; 160 kwset->target = NULL; 161 kwset->trans = trans; 162 kwset->kwsexec = acexec; 163 164 return kwset; 165 } 166 167 /* This upper bound is valid for CHAR_BIT >= 4 and 168 exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */ 169 enum { DEPTH_SIZE = CHAR_BIT + CHAR_BIT / 2 }; 170 171 /* Add the given string to the contents of the keyword set. */ 172 void 173 kwsincr (kwset_t kwset, char const *text, ptrdiff_t len) 174 { 175 assume (0 <= len); 176 struct trie *trie = kwset->trie; 177 char const *trans = kwset->trans; 178 bool reverse = kwset->kwsexec == bmexec; 179 180 if (reverse) 181 text += len; 182 183 /* Descend the trie (built of keywords) character-by-character, 184 installing new nodes when necessary. */ 185 while (len--) 186 { 187 unsigned char uc = reverse ? *--text : *text++; 188 unsigned char label = trans ? trans[uc] : uc; 189 190 /* Descend the tree of outgoing links for this trie node, 191 looking for the current character and keeping track 192 of the path followed. */ 193 struct tree *cur = trie->links; 194 struct tree *links[DEPTH_SIZE]; 195 enum { L, R } dirs[DEPTH_SIZE]; 196 links[0] = (struct tree *) &trie->links; 197 dirs[0] = L; 198 ptrdiff_t depth = 1; 199 200 while (cur && label != cur->label) 201 { 202 links[depth] = cur; 203 if (label < cur->label) 204 dirs[depth++] = L, cur = cur->llink; 205 else 206 dirs[depth++] = R, cur = cur->rlink; 207 } 208 209 /* The current character doesn't have an outgoing link at 210 this trie node, so build a new trie node and install 211 a link in the current trie node's tree. */ 212 if (!cur) 213 { 214 cur = obstack_alloc (&kwset->obstack, sizeof *cur); 215 cur->llink = NULL; 216 cur->rlink = NULL; 217 cur->trie = obstack_alloc (&kwset->obstack, sizeof *cur->trie); 218 cur->trie->accepting = 0; 219 cur->trie->links = NULL; 220 cur->trie->parent = trie; 221 cur->trie->next = NULL; 222 cur->trie->fail = NULL; 223 cur->trie->depth = trie->depth + 1; 224 cur->trie->shift = 0; 225 cur->label = label; 226 cur->balance = 0; 227 228 /* Install the new tree node in its parent. */ 229 if (dirs[--depth] == L) 230 links[depth]->llink = cur; 231 else 232 links[depth]->rlink = cur; 233 234 /* Back up the tree fixing the balance flags. */ 235 while (depth && !links[depth]->balance) 236 { 237 if (dirs[depth] == L) 238 --links[depth]->balance; 239 else 240 ++links[depth]->balance; 241 --depth; 242 } 243 244 /* Rebalance the tree by pointer rotations if necessary. */ 245 if (depth && ((dirs[depth] == L && --links[depth]->balance) 246 || (dirs[depth] == R && ++links[depth]->balance))) 247 { 248 struct tree *t, *r, *l, *rl, *lr; 249 250 switch (links[depth]->balance) 251 { 252 case (char) -2: 253 switch (dirs[depth + 1]) 254 { 255 case L: 256 r = links[depth], t = r->llink, rl = t->rlink; 257 t->rlink = r, r->llink = rl; 258 t->balance = r->balance = 0; 259 break; 260 case R: 261 r = links[depth], l = r->llink, t = l->rlink; 262 rl = t->rlink, lr = t->llink; 263 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 264 l->balance = t->balance != 1 ? 0 : -1; 265 r->balance = t->balance != (char) -1 ? 0 : 1; 266 t->balance = 0; 267 break; 268 default: 269 abort (); 270 } 271 break; 272 case 2: 273 switch (dirs[depth + 1]) 274 { 275 case R: 276 l = links[depth], t = l->rlink, lr = t->llink; 277 t->llink = l, l->rlink = lr; 278 t->balance = l->balance = 0; 279 break; 280 case L: 281 l = links[depth], r = l->rlink, t = r->llink; 282 lr = t->llink, rl = t->rlink; 283 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 284 l->balance = t->balance != 1 ? 0 : -1; 285 r->balance = t->balance != (char) -1 ? 0 : 1; 286 t->balance = 0; 287 break; 288 default: 289 abort (); 290 } 291 break; 292 default: 293 abort (); 294 } 295 296 if (dirs[depth - 1] == L) 297 links[depth - 1]->llink = t; 298 else 299 links[depth - 1]->rlink = t; 300 } 301 } 302 303 trie = cur->trie; 304 } 305 306 /* Mark the node finally reached as accepting, encoding the 307 index number of this word in the keyword set so far. */ 308 if (!trie->accepting) 309 { 310 size_t words = kwset->words; 311 trie->accepting = 2 * words + 1; 312 } 313 ++kwset->words; 314 315 /* Keep track of the longest and shortest string of the keyword set. */ 316 if (trie->depth < kwset->mind) 317 kwset->mind = trie->depth; 318 if (trie->depth > kwset->maxd) 319 kwset->maxd = trie->depth; 320 } 321 322 ptrdiff_t 323 kwswords (kwset_t kwset) 324 { 325 return kwset->words; 326 } 327 328 /* Enqueue the trie nodes referenced from the given tree in the 329 given queue. */ 330 static void 331 enqueue (struct tree *tree, struct trie **last) 332 { 333 if (!tree) 334 return; 335 enqueue (tree->llink, last); 336 enqueue (tree->rlink, last); 337 (*last) = (*last)->next = tree->trie; 338 } 339 340 /* Compute the Aho-Corasick failure function for the trie nodes referenced 341 from the given tree, given the failure function for their parent as 342 well as a last resort failure node. */ 343 static void 344 treefails (struct tree const *tree, struct trie const *fail, 345 struct trie *recourse, bool reverse) 346 { 347 struct tree *cur; 348 349 if (!tree) 350 return; 351 352 treefails (tree->llink, fail, recourse, reverse); 353 treefails (tree->rlink, fail, recourse, reverse); 354 355 /* Find, in the chain of fails going back to the root, the first 356 node that has a descendant on the current label. */ 357 while (fail) 358 { 359 cur = fail->links; 360 while (cur && tree->label != cur->label) 361 if (tree->label < cur->label) 362 cur = cur->llink; 363 else 364 cur = cur->rlink; 365 if (cur) 366 { 367 tree->trie->fail = cur->trie; 368 if (!reverse && cur->trie->accepting && !tree->trie->accepting) 369 tree->trie->accepting = SIZE_MAX; 370 return; 371 } 372 fail = fail->fail; 373 } 374 375 tree->trie->fail = recourse; 376 } 377 378 /* Set delta entries for the links of the given tree such that 379 the preexisting delta value is larger than the current depth. */ 380 static void 381 treedelta (struct tree const *tree, ptrdiff_t depth, unsigned char delta[]) 382 { 383 if (!tree) 384 return; 385 treedelta (tree->llink, depth, delta); 386 treedelta (tree->rlink, depth, delta); 387 if (depth < delta[tree->label]) 388 delta[tree->label] = depth; 389 } 390 391 /* Return true if A has every label in B. */ 392 static bool _GL_ATTRIBUTE_PURE 393 hasevery (struct tree const *a, struct tree const *b) 394 { 395 if (!b) 396 return true; 397 if (!hasevery (a, b->llink)) 398 return false; 399 if (!hasevery (a, b->rlink)) 400 return false; 401 while (a && b->label != a->label) 402 if (b->label < a->label) 403 a = a->llink; 404 else 405 a = a->rlink; 406 return !!a; 407 } 408 409 /* Compute a vector, indexed by character code, of the trie nodes 410 referenced from the given tree. */ 411 static void 412 treenext (struct tree const *tree, struct trie *next[]) 413 { 414 if (!tree) 415 return; 416 treenext (tree->llink, next); 417 treenext (tree->rlink, next); 418 next[tree->label] = tree->trie; 419 } 420 421 /* Prepare a built keyword set for use. */ 422 void 423 kwsprep (kwset_t kwset) 424 { 425 char const *trans = kwset->trans; 426 ptrdiff_t i; 427 unsigned char deltabuf[NCHAR]; 428 unsigned char *delta = trans ? deltabuf : kwset->delta; 429 struct trie *curr, *last; 430 431 /* Use Boyer-Moore if just one pattern, Aho-Corasick otherwise. */ 432 bool reverse = kwset->words == 1; 433 434 if (reverse) 435 { 436 kwset_t new_kwset; 437 438 /* Enqueue the immediate descendants in the level order queue. */ 439 for (curr = last = kwset->trie; curr; curr = curr->next) 440 enqueue (curr->links, &last); 441 442 /* Looking for just one string. Extract it from the trie. */ 443 kwset->target = obstack_alloc (&kwset->obstack, kwset->mind); 444 for (i = 0, curr = kwset->trie; i < kwset->mind; ++i) 445 { 446 kwset->target[i] = curr->links->label; 447 curr = curr->next; 448 } 449 450 new_kwset = kwsalloc (kwset->trans); 451 new_kwset->kwsexec = bmexec; 452 kwsincr (new_kwset, kwset->target, kwset->mind); 453 obstack_free (&kwset->obstack, NULL); 454 *kwset = *new_kwset; 455 free (new_kwset); 456 } 457 458 /* Initial values for the delta table; will be changed later. The 459 delta entry for a given character is the smallest depth of any 460 node at which an outgoing edge is labeled by that character. */ 461 memset (delta, MIN (kwset->mind, UCHAR_MAX), sizeof deltabuf); 462 463 /* Traverse the nodes of the trie in level order, simultaneously 464 computing the delta table, failure function, and shift function. */ 465 for (curr = last = kwset->trie; curr; curr = curr->next) 466 { 467 /* Enqueue the immediate descendants in the level order queue. */ 468 enqueue (curr->links, &last); 469 470 /* Update the delta table for the descendants of this node. */ 471 treedelta (curr->links, curr->depth, delta); 472 473 /* Compute the failure function for the descendants of this node. */ 474 treefails (curr->links, curr->fail, kwset->trie, reverse); 475 476 if (reverse) 477 { 478 curr->shift = kwset->mind; 479 curr->maxshift = kwset->mind; 480 481 /* Update the shifts at each node in the current node's chain 482 of fails back to the root. */ 483 struct trie *fail; 484 for (fail = curr->fail; fail; fail = fail->fail) 485 { 486 /* If the current node has some outgoing edge that the fail 487 doesn't, then the shift at the fail should be no larger 488 than the difference of their depths. */ 489 if (!hasevery (fail->links, curr->links)) 490 if (curr->depth - fail->depth < fail->shift) 491 fail->shift = curr->depth - fail->depth; 492 493 /* If the current node is accepting then the shift at the 494 fail and its descendants should be no larger than the 495 difference of their depths. */ 496 if (curr->accepting && fail->maxshift > curr->depth - fail->depth) 497 fail->maxshift = curr->depth - fail->depth; 498 } 499 } 500 } 501 502 if (reverse) 503 { 504 /* Traverse the trie in level order again, fixing up all nodes whose 505 shift exceeds their inherited maxshift. */ 506 for (curr = kwset->trie->next; curr; curr = curr->next) 507 { 508 if (curr->maxshift > curr->parent->maxshift) 509 curr->maxshift = curr->parent->maxshift; 510 if (curr->shift > curr->maxshift) 511 curr->shift = curr->maxshift; 512 } 513 } 514 515 /* Create a vector, indexed by character code, of the outgoing links 516 from the root node. Accumulate GC1 and GC1HELP. */ 517 struct trie *nextbuf[NCHAR]; 518 struct trie **next = trans ? nextbuf : kwset->next; 519 memset (next, 0, sizeof nextbuf); 520 treenext (kwset->trie->links, next); 521 int gc1 = -2; 522 int gc1help = -1; 523 for (i = 0; i < NCHAR; i++) 524 { 525 int ti = i; 526 if (trans) 527 { 528 ti = U(trans[i]); 529 kwset->next[i] = next[ti]; 530 } 531 if (kwset->next[i]) 532 { 533 if (gc1 < -1) 534 { 535 gc1 = ti; 536 gc1help = i; 537 } 538 else if (gc1 == ti) 539 gc1help = gc1help == ti ? i : -1; 540 else if (i == ti && gc1 == gc1help) 541 gc1help = i; 542 else 543 gc1 = -1; 544 } 545 } 546 kwset->gc1 = gc1; 547 kwset->gc1help = gc1help; 548 549 if (reverse) 550 { 551 /* Looking for just one string. Extract it from the trie. */ 552 kwset->target = obstack_alloc (&kwset->obstack, kwset->mind); 553 for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i) 554 { 555 kwset->target[i] = curr->links->label; 556 curr = curr->next; 557 } 558 559 if (kwset->mind > 1) 560 { 561 /* Looking for the delta2 shift that might be made after a 562 backwards match has failed. Extract it from the trie. */ 563 kwset->shift 564 = obstack_alloc (&kwset->obstack, 565 sizeof *kwset->shift * (kwset->mind - 1)); 566 for (i = 0, curr = kwset->trie->next; i < kwset->mind - 1; ++i) 567 { 568 kwset->shift[i] = curr->shift; 569 curr = curr->next; 570 } 571 572 /* The penultimate byte. */ 573 kwset->gc2 = tr (trans, kwset->target[kwset->mind - 2]); 574 } 575 } 576 577 /* Fix things up for any translation table. */ 578 if (trans) 579 for (i = 0; i < NCHAR; ++i) 580 kwset->delta[i] = delta[U(trans[i])]; 581 } 582 583 /* Delta2 portion of a Boyer-Moore search. *TP is the string text 584 pointer; it is updated in place. EP is the end of the string text, 585 and SP the end of the pattern. LEN is the pattern length; it must 586 be at least 2. TRANS, if nonnull, is the input translation table. 587 GC1 and GC2 are the last and second-from last bytes of the pattern, 588 transliterated by TRANS; the caller precomputes them for 589 efficiency. If D1 is nonnull, it is a delta1 table for shifting *TP 590 when failing. KWSET->shift says how much to shift. */ 591 static inline bool 592 bm_delta2_search (char const **tpp, char const *ep, char const *sp, 593 ptrdiff_t len, 594 char const *trans, char gc1, char gc2, 595 unsigned char const *d1, kwset_t kwset) 596 { 597 char const *tp = *tpp; 598 ptrdiff_t d = len, skip = 0; 599 600 while (true) 601 { 602 ptrdiff_t i = 2; 603 if (tr (trans, tp[-2]) == gc2) 604 { 605 while (++i <= d) 606 if (tr (trans, tp[-i]) != tr (trans, sp[-i])) 607 break; 608 if (i > d) 609 { 610 for (i = d + skip + 1; i <= len; ++i) 611 if (tr (trans, tp[-i]) != tr (trans, sp[-i])) 612 break; 613 if (i > len) 614 { 615 *tpp = tp - len; 616 return true; 617 } 618 } 619 } 620 621 tp += d = kwset->shift[i - 2]; 622 if (tp > ep) 623 break; 624 if (tr (trans, tp[-1]) != gc1) 625 { 626 if (d1) 627 tp += d1[U(tp[-1])]; 628 break; 629 } 630 skip = i - 1; 631 } 632 633 *tpp = tp; 634 return false; 635 } 636 637 /* Return the address of the first byte in the buffer S (of size N) 638 that matches the terminal byte specified by KWSET, or NULL if there 639 is no match. KWSET->gc1 should be nonnegative. */ 640 static char const * 641 memchr_kwset (char const *s, ptrdiff_t n, kwset_t kwset) 642 { 643 char const *slim = s + n; 644 if (kwset->gc1help < 0) 645 { 646 for (; s < slim; s++) 647 if (kwset->next[U(*s)]) 648 return s; 649 } 650 else 651 { 652 int small_heuristic = 2; 653 size_t small_bytes = small_heuristic * sizeof (unsigned long int); 654 while (s < slim) 655 { 656 if (kwset->next[U(*s)]) 657 return s; 658 s++; 659 if ((uintptr_t) s % small_bytes == 0) 660 return memchr2 (s, kwset->gc1, kwset->gc1help, slim - s); 661 } 662 } 663 return NULL; 664 } 665 666 /* Fast Boyer-Moore search (inlinable version). */ 667 static inline ptrdiff_t _GL_ATTRIBUTE_PURE 668 bmexec_trans (kwset_t kwset, char const *text, ptrdiff_t size) 669 { 670 assume (0 <= size); 671 unsigned char const *d1; 672 char const *ep, *sp, *tp; 673 int d; 674 ptrdiff_t len = kwset->mind; 675 char const *trans = kwset->trans; 676 677 if (len == 0) 678 return 0; 679 if (len > size) 680 return -1; 681 if (len == 1) 682 { 683 tp = memchr_kwset (text, size, kwset); 684 return tp ? tp - text : -1; 685 } 686 687 d1 = kwset->delta; 688 sp = kwset->target + len; 689 tp = text + len; 690 char gc1 = kwset->gc1; 691 char gc2 = kwset->gc2; 692 693 /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ 694 ptrdiff_t len12; 695 if (!INT_MULTIPLY_WRAPV (len, 12, &len12) && len12 < size) 696 /* 11 is not a bug, the initial offset happens only once. */ 697 for (ep = text + size - 11 * len; tp <= ep; ) 698 { 699 char const *tp0 = tp; 700 d = d1[U(tp[-1])], tp += d; 701 d = d1[U(tp[-1])], tp += d; 702 if (d != 0) 703 { 704 d = d1[U(tp[-1])], tp += d; 705 d = d1[U(tp[-1])], tp += d; 706 d = d1[U(tp[-1])], tp += d; 707 if (d != 0) 708 { 709 d = d1[U(tp[-1])], tp += d; 710 d = d1[U(tp[-1])], tp += d; 711 d = d1[U(tp[-1])], tp += d; 712 if (d != 0) 713 { 714 d = d1[U(tp[-1])], tp += d; 715 d = d1[U(tp[-1])], tp += d; 716 717 /* As a heuristic, prefer memchr to seeking by 718 delta1 when the latter doesn't advance much. */ 719 int advance_heuristic = 16 * sizeof (long); 720 if (advance_heuristic <= tp - tp0) 721 continue; 722 tp--; 723 tp = memchr_kwset (tp, text + size - tp, kwset); 724 if (! tp) 725 return -1; 726 tp++; 727 if (ep <= tp) 728 break; 729 } 730 } 731 } 732 if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset)) 733 return tp - text; 734 } 735 736 /* Now only a few characters are left to search. Carefully avoid 737 ever producing an out-of-bounds pointer. */ 738 ep = text + size; 739 d = d1[U(tp[-1])]; 740 while (d <= ep - tp) 741 { 742 d = d1[U((tp += d)[-1])]; 743 if (d != 0) 744 continue; 745 if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, NULL, kwset)) 746 return tp - text; 747 } 748 749 return -1; 750 } 751 752 /* Fast Boyer-Moore search. */ 753 static ptrdiff_t 754 bmexec (kwset_t kwset, char const *text, ptrdiff_t size, 755 struct kwsmatch *kwsmatch, bool longest) 756 { 757 /* Help the compiler inline in two ways, depending on whether 758 kwset->trans is null. */ 759 ptrdiff_t ret = (IGNORE_DUPLICATE_BRANCH_WARNING 760 (kwset->trans 761 ? bmexec_trans (kwset, text, size) 762 : bmexec_trans (kwset, text, size))); 763 if (0 <= ret) 764 { 765 kwsmatch->index = 0; 766 kwsmatch->offset[0] = ret; 767 kwsmatch->size[0] = kwset->mind; 768 } 769 770 return ret; 771 } 772 773 /* Hairy multiple string search with the Aho-Corasick algorithm. 774 (inlinable version) */ 775 static inline ptrdiff_t 776 acexec_trans (kwset_t kwset, char const *text, ptrdiff_t len, 777 struct kwsmatch *kwsmatch, bool longest) 778 { 779 struct trie const *trie, *accept; 780 char const *tp, *left, *lim; 781 struct tree const *tree; 782 char const *trans; 783 784 /* Initialize register copies and look for easy ways out. */ 785 if (len < kwset->mind) 786 return -1; 787 trans = kwset->trans; 788 trie = kwset->trie; 789 lim = text + len; 790 tp = text; 791 792 if (!trie->accepting) 793 { 794 unsigned char c; 795 int gc1 = kwset->gc1; 796 797 while (true) 798 { 799 if (gc1 < 0) 800 { 801 while (! (trie = kwset->next[c = tr (trans, *tp++)])) 802 if (tp >= lim) 803 return -1; 804 } 805 else 806 { 807 tp = memchr_kwset (tp, lim - tp, kwset); 808 if (!tp) 809 return -1; 810 c = tr (trans, *tp++); 811 trie = kwset->next[c]; 812 } 813 814 while (true) 815 { 816 if (trie->accepting) 817 goto match; 818 if (tp >= lim) 819 return -1; 820 c = tr (trans, *tp++); 821 822 for (tree = trie->links; c != tree->label; ) 823 { 824 tree = c < tree->label ? tree->llink : tree->rlink; 825 if (! tree) 826 { 827 trie = trie->fail; 828 if (!trie) 829 { 830 trie = kwset->next[c]; 831 if (trie) 832 goto have_trie; 833 if (tp >= lim) 834 return -1; 835 goto next_c; 836 } 837 if (trie->accepting) 838 { 839 --tp; 840 goto match; 841 } 842 tree = trie->links; 843 } 844 } 845 trie = tree->trie; 846 have_trie:; 847 } 848 next_c:; 849 } 850 } 851 852 match: 853 accept = trie; 854 while (accept->accepting == SIZE_MAX) 855 accept = accept->fail; 856 left = tp - accept->depth; 857 858 /* Try left-most longest match. */ 859 if (longest) 860 { 861 while (tp < lim) 862 { 863 struct trie const *accept1; 864 char const *left1; 865 unsigned char c = tr (trans, *tp++); 866 867 do 868 { 869 tree = trie->links; 870 while (tree && c != tree->label) 871 tree = c < tree->label ? tree->llink : tree->rlink; 872 } 873 while (!tree && (trie = trie->fail) && accept->depth <= trie->depth); 874 875 if (!tree) 876 break; 877 trie = tree->trie; 878 if (trie->accepting) 879 { 880 accept1 = trie; 881 while (accept1->accepting == SIZE_MAX) 882 accept1 = accept1->fail; 883 left1 = tp - accept1->depth; 884 if (left1 <= left) 885 { 886 left = left1; 887 accept = accept1; 888 } 889 } 890 } 891 } 892 893 kwsmatch->index = accept->accepting / 2; 894 kwsmatch->offset[0] = left - text; 895 kwsmatch->size[0] = accept->depth; 896 897 return left - text; 898 } 899 900 /* Hairy multiple string search with Aho-Corasick algorithm. */ 901 static ptrdiff_t 902 acexec (kwset_t kwset, char const *text, ptrdiff_t size, 903 struct kwsmatch *kwsmatch, bool longest) 904 { 905 assume (0 <= size); 906 /* Help the compiler inline in two ways, depending on whether 907 kwset->trans is null. */ 908 return (IGNORE_DUPLICATE_BRANCH_WARNING 909 (kwset->trans 910 ? acexec_trans (kwset, text, size, kwsmatch, longest) 911 : acexec_trans (kwset, text, size, kwsmatch, longest))); 912 } 913 914 /* Find the first instance of a KWSET member in TEXT, which has SIZE bytes. 915 Return the offset (into TEXT) of the first byte of the matching substring, 916 or -1 if no match is found. Upon a match, store details in 917 *KWSMATCH: index of matched keyword, start offset (same as the return 918 value), and length. If LONGEST, find the longest match; otherwise 919 any match will do. */ 920 ptrdiff_t 921 kwsexec (kwset_t kwset, char const *text, ptrdiff_t size, 922 struct kwsmatch *kwsmatch, bool longest) 923 { 924 return kwset->kwsexec (kwset, text, size, kwsmatch, longest); 925 } 926 927 /* Free the components of the given keyword set. */ 928 void 929 kwsfree (kwset_t kwset) 930 { 931 obstack_free (&kwset->obstack, NULL); 932 free (kwset); 933 } 934