1 /* kwset.c - search for any of a set of keywords. 2 Copyright (C) 1989, 1998, 2000, 2005, 2007, 2009-2011 Free Software 3 Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software 17 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 18 02110-1301, USA. */ 19 20 /* Written August 1989 by Mike Haertel. 21 The author may be reached (Email) at the address mike@ai.mit.edu, 22 or (US mail) as Mike Haertel c/o Free Software Foundation. */ 23 24 /* The algorithm implemented by these routines bears a startling resemblence 25 to one discovered by Beate Commentz-Walter, although it is not identical. 26 See "A String Matching Algorithm Fast on the Average," Technical Report, 27 IBM-Germany, Scientific Center Heidelberg, Tiergartenstrasse 15, D-6900 28 Heidelberg, Germany. See also Aho, A.V., and M. Corasick, "Efficient 29 String Matching: An Aid to Bibliographic Search," CACM June 1975, 30 Vol. 18, No. 6, which describes the failure function used below. */ 31 32 #include <config.h> 33 #include <sys/types.h> 34 #include "system.h" 35 #include "kwset.h" 36 #include "obstack.h" 37 38 #define link kwset_link 39 40 #ifdef GREP 41 # include "xalloc.h" 42 # undef malloc 43 # define malloc xmalloc 44 #endif 45 46 #define NCHAR (UCHAR_MAX + 1) 47 #define obstack_chunk_alloc malloc 48 #define obstack_chunk_free free 49 50 #define U(c) ((unsigned char) (c)) 51 52 /* Balanced tree of edges and labels leaving a given trie node. */ 53 struct tree 54 { 55 struct tree *llink; /* Left link; MUST be first field. */ 56 struct tree *rlink; /* Right link (to larger labels). */ 57 struct trie *trie; /* Trie node pointed to by this edge. */ 58 unsigned char label; /* Label on this edge. */ 59 char balance; /* Difference in depths of subtrees. */ 60 }; 61 62 /* Node of a trie representing a set of reversed keywords. */ 63 struct trie 64 { 65 unsigned int accepting; /* Word index of accepted word, or zero. */ 66 struct tree *links; /* Tree of edges leaving this node. */ 67 struct trie *parent; /* Parent of this node. */ 68 struct trie *next; /* List of all trie nodes in level order. */ 69 struct trie *fail; /* Aho-Corasick failure function. */ 70 int depth; /* Depth of this node from the root. */ 71 int shift; /* Shift function for search failures. */ 72 int maxshift; /* Max shift of self and descendents. */ 73 }; 74 75 /* Structure returned opaquely to the caller, containing everything. */ 76 struct kwset 77 { 78 struct obstack obstack; /* Obstack for node allocation. */ 79 int words; /* Number of words in the trie. */ 80 struct trie *trie; /* The trie itself. */ 81 int mind; /* Minimum depth of an accepting node. */ 82 int maxd; /* Maximum depth of any node. */ 83 unsigned char delta[NCHAR]; /* Delta table for rapid search. */ 84 struct trie *next[NCHAR]; /* Table of children of the root. */ 85 char *target; /* Target string if there's only one. */ 86 int mind2; /* Used in Boyer-Moore search for one string. */ 87 char const *trans; /* Character translation table. */ 88 }; 89 90 /* Allocate and initialize a keyword set object, returning an opaque 91 pointer to it. Return NULL if memory is not available. */ 92 kwset_t 93 kwsalloc (char const *trans) 94 { 95 struct kwset *kwset; 96 97 kwset = (struct kwset *) malloc(sizeof (struct kwset)); 98 if (!kwset) 99 return NULL; 100 101 obstack_init(&kwset->obstack); 102 kwset->words = 0; 103 kwset->trie 104 = (struct trie *) obstack_alloc(&kwset->obstack, sizeof (struct trie)); 105 if (!kwset->trie) 106 { 107 kwsfree((kwset_t) kwset); 108 return NULL; 109 } 110 kwset->trie->accepting = 0; 111 kwset->trie->links = NULL; 112 kwset->trie->parent = NULL; 113 kwset->trie->next = NULL; 114 kwset->trie->fail = NULL; 115 kwset->trie->depth = 0; 116 kwset->trie->shift = 0; 117 kwset->mind = INT_MAX; 118 kwset->maxd = -1; 119 kwset->target = NULL; 120 kwset->trans = trans; 121 122 return (kwset_t) kwset; 123 } 124 125 /* This upper bound is valid for CHAR_BIT >= 4 and 126 exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */ 127 #define DEPTH_SIZE (CHAR_BIT + CHAR_BIT/2) 128 129 /* Add the given string to the contents of the keyword set. Return NULL 130 for success, an error message otherwise. */ 131 const char * 132 kwsincr (kwset_t kws, char const *text, size_t len) 133 { 134 struct kwset *kwset; 135 struct trie *trie; 136 unsigned char label; 137 struct tree *link; 138 int depth; 139 struct tree *links[DEPTH_SIZE]; 140 enum { L, R } dirs[DEPTH_SIZE]; 141 struct tree *t, *r, *l, *rl, *lr; 142 143 kwset = (struct kwset *) kws; 144 trie = kwset->trie; 145 text += len; 146 147 /* Descend the trie (built of reversed keywords) character-by-character, 148 installing new nodes when necessary. */ 149 while (len--) 150 { 151 label = kwset->trans ? kwset->trans[U(*--text)] : *--text; 152 153 /* Descend the tree of outgoing links for this trie node, 154 looking for the current character and keeping track 155 of the path followed. */ 156 link = trie->links; 157 links[0] = (struct tree *) &trie->links; 158 dirs[0] = L; 159 depth = 1; 160 161 while (link && label != link->label) 162 { 163 links[depth] = link; 164 if (label < link->label) 165 dirs[depth++] = L, link = link->llink; 166 else 167 dirs[depth++] = R, link = link->rlink; 168 } 169 170 /* The current character doesn't have an outgoing link at 171 this trie node, so build a new trie node and install 172 a link in the current trie node's tree. */ 173 if (!link) 174 { 175 link = (struct tree *) obstack_alloc(&kwset->obstack, 176 sizeof (struct tree)); 177 if (!link) 178 return _("memory exhausted"); 179 link->llink = NULL; 180 link->rlink = NULL; 181 link->trie = (struct trie *) obstack_alloc(&kwset->obstack, 182 sizeof (struct trie)); 183 if (!link->trie) 184 { 185 obstack_free(&kwset->obstack, link); 186 return _("memory exhausted"); 187 } 188 link->trie->accepting = 0; 189 link->trie->links = NULL; 190 link->trie->parent = trie; 191 link->trie->next = NULL; 192 link->trie->fail = NULL; 193 link->trie->depth = trie->depth + 1; 194 link->trie->shift = 0; 195 link->label = label; 196 link->balance = 0; 197 198 /* Install the new tree node in its parent. */ 199 if (dirs[--depth] == L) 200 links[depth]->llink = link; 201 else 202 links[depth]->rlink = link; 203 204 /* Back up the tree fixing the balance flags. */ 205 while (depth && !links[depth]->balance) 206 { 207 if (dirs[depth] == L) 208 --links[depth]->balance; 209 else 210 ++links[depth]->balance; 211 --depth; 212 } 213 214 /* Rebalance the tree by pointer rotations if necessary. */ 215 if (depth && ((dirs[depth] == L && --links[depth]->balance) 216 || (dirs[depth] == R && ++links[depth]->balance))) 217 { 218 switch (links[depth]->balance) 219 { 220 case (char) -2: 221 switch (dirs[depth + 1]) 222 { 223 case L: 224 r = links[depth], t = r->llink, rl = t->rlink; 225 t->rlink = r, r->llink = rl; 226 t->balance = r->balance = 0; 227 break; 228 case R: 229 r = links[depth], l = r->llink, t = l->rlink; 230 rl = t->rlink, lr = t->llink; 231 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 232 l->balance = t->balance != 1 ? 0 : -1; 233 r->balance = t->balance != (char) -1 ? 0 : 1; 234 t->balance = 0; 235 break; 236 default: 237 abort (); 238 } 239 break; 240 case 2: 241 switch (dirs[depth + 1]) 242 { 243 case R: 244 l = links[depth], t = l->rlink, lr = t->llink; 245 t->llink = l, l->rlink = lr; 246 t->balance = l->balance = 0; 247 break; 248 case L: 249 l = links[depth], r = l->rlink, t = r->llink; 250 lr = t->llink, rl = t->rlink; 251 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 252 l->balance = t->balance != 1 ? 0 : -1; 253 r->balance = t->balance != (char) -1 ? 0 : 1; 254 t->balance = 0; 255 break; 256 default: 257 abort (); 258 } 259 break; 260 default: 261 abort (); 262 } 263 264 if (dirs[depth - 1] == L) 265 links[depth - 1]->llink = t; 266 else 267 links[depth - 1]->rlink = t; 268 } 269 } 270 271 trie = link->trie; 272 } 273 274 /* Mark the node we finally reached as accepting, encoding the 275 index number of this word in the keyword set so far. */ 276 if (!trie->accepting) 277 trie->accepting = 1 + 2 * kwset->words; 278 ++kwset->words; 279 280 /* Keep track of the longest and shortest string of the keyword set. */ 281 if (trie->depth < kwset->mind) 282 kwset->mind = trie->depth; 283 if (trie->depth > kwset->maxd) 284 kwset->maxd = trie->depth; 285 286 return NULL; 287 } 288 289 /* Enqueue the trie nodes referenced from the given tree in the 290 given queue. */ 291 static void 292 enqueue (struct tree *tree, struct trie **last) 293 { 294 if (!tree) 295 return; 296 enqueue(tree->llink, last); 297 enqueue(tree->rlink, last); 298 (*last) = (*last)->next = tree->trie; 299 } 300 301 /* Compute the Aho-Corasick failure function for the trie nodes referenced 302 from the given tree, given the failure function for their parent as 303 well as a last resort failure node. */ 304 static void 305 treefails (struct tree const *tree, struct trie const *fail, 306 struct trie *recourse) 307 { 308 struct tree *link; 309 310 if (!tree) 311 return; 312 313 treefails(tree->llink, fail, recourse); 314 treefails(tree->rlink, fail, recourse); 315 316 /* Find, in the chain of fails going back to the root, the first 317 node that has a descendent on the current label. */ 318 while (fail) 319 { 320 link = fail->links; 321 while (link && tree->label != link->label) 322 if (tree->label < link->label) 323 link = link->llink; 324 else 325 link = link->rlink; 326 if (link) 327 { 328 tree->trie->fail = link->trie; 329 return; 330 } 331 fail = fail->fail; 332 } 333 334 tree->trie->fail = recourse; 335 } 336 337 /* Set delta entries for the links of the given tree such that 338 the preexisting delta value is larger than the current depth. */ 339 static void 340 treedelta (struct tree const *tree, 341 unsigned int depth, 342 unsigned char delta[]) 343 { 344 if (!tree) 345 return; 346 treedelta(tree->llink, depth, delta); 347 treedelta(tree->rlink, depth, delta); 348 if (depth < delta[tree->label]) 349 delta[tree->label] = depth; 350 } 351 352 /* Return true if A has every label in B. */ 353 static int 354 hasevery (struct tree const *a, struct tree const *b) 355 { 356 if (!b) 357 return 1; 358 if (!hasevery(a, b->llink)) 359 return 0; 360 if (!hasevery(a, b->rlink)) 361 return 0; 362 while (a && b->label != a->label) 363 if (b->label < a->label) 364 a = a->llink; 365 else 366 a = a->rlink; 367 return !!a; 368 } 369 370 /* Compute a vector, indexed by character code, of the trie nodes 371 referenced from the given tree. */ 372 static void 373 treenext (struct tree const *tree, struct trie *next[]) 374 { 375 if (!tree) 376 return; 377 treenext(tree->llink, next); 378 treenext(tree->rlink, next); 379 next[tree->label] = tree->trie; 380 } 381 382 /* Compute the shift for each trie node, as well as the delta 383 table and next cache for the given keyword set. */ 384 const char * 385 kwsprep (kwset_t kws) 386 { 387 struct kwset *kwset; 388 int i; 389 struct trie *curr; 390 char const *trans; 391 unsigned char delta[NCHAR]; 392 393 kwset = (struct kwset *) kws; 394 395 /* Initial values for the delta table; will be changed later. The 396 delta entry for a given character is the smallest depth of any 397 node at which an outgoing edge is labeled by that character. */ 398 memset(delta, kwset->mind < UCHAR_MAX ? kwset->mind : UCHAR_MAX, NCHAR); 399 400 /* Check if we can use the simple boyer-moore algorithm, instead 401 of the hairy commentz-walter algorithm. */ 402 if (kwset->words == 1 && kwset->trans == NULL) 403 { 404 char c; 405 406 /* Looking for just one string. Extract it from the trie. */ 407 kwset->target = obstack_alloc(&kwset->obstack, kwset->mind); 408 if (!kwset->target) 409 return _("memory exhausted"); 410 for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i) 411 { 412 kwset->target[i] = curr->links->label; 413 curr = curr->links->trie; 414 } 415 /* Build the Boyer Moore delta. Boy that's easy compared to CW. */ 416 for (i = 0; i < kwset->mind; ++i) 417 delta[U(kwset->target[i])] = kwset->mind - (i + 1); 418 /* Find the minimal delta2 shift that we might make after 419 a backwards match has failed. */ 420 c = kwset->target[kwset->mind - 1]; 421 for (i = kwset->mind - 2; i >= 0; --i) 422 if (kwset->target[i] == c) 423 break; 424 kwset->mind2 = kwset->mind - (i + 1); 425 } 426 else 427 { 428 struct trie *fail; 429 struct trie *last, *next[NCHAR]; 430 431 /* Traverse the nodes of the trie in level order, simultaneously 432 computing the delta table, failure function, and shift function. */ 433 for (curr = last = kwset->trie; curr; curr = curr->next) 434 { 435 /* Enqueue the immediate descendents in the level order queue. */ 436 enqueue(curr->links, &last); 437 438 curr->shift = kwset->mind; 439 curr->maxshift = kwset->mind; 440 441 /* Update the delta table for the descendents of this node. */ 442 treedelta(curr->links, curr->depth, delta); 443 444 /* Compute the failure function for the decendents of this node. */ 445 treefails(curr->links, curr->fail, kwset->trie); 446 447 /* Update the shifts at each node in the current node's chain 448 of fails back to the root. */ 449 for (fail = curr->fail; fail; fail = fail->fail) 450 { 451 /* If the current node has some outgoing edge that the fail 452 doesn't, then the shift at the fail should be no larger 453 than the difference of their depths. */ 454 if (!hasevery(fail->links, curr->links)) 455 if (curr->depth - fail->depth < fail->shift) 456 fail->shift = curr->depth - fail->depth; 457 458 /* If the current node is accepting then the shift at the 459 fail and its descendents should be no larger than the 460 difference of their depths. */ 461 if (curr->accepting && fail->maxshift > curr->depth - fail->depth) 462 fail->maxshift = curr->depth - fail->depth; 463 } 464 } 465 466 /* Traverse the trie in level order again, fixing up all nodes whose 467 shift exceeds their inherited maxshift. */ 468 for (curr = kwset->trie->next; curr; curr = curr->next) 469 { 470 if (curr->maxshift > curr->parent->maxshift) 471 curr->maxshift = curr->parent->maxshift; 472 if (curr->shift > curr->maxshift) 473 curr->shift = curr->maxshift; 474 } 475 476 /* Create a vector, indexed by character code, of the outgoing links 477 from the root node. */ 478 for (i = 0; i < NCHAR; ++i) 479 next[i] = NULL; 480 treenext(kwset->trie->links, next); 481 482 if ((trans = kwset->trans) != NULL) 483 for (i = 0; i < NCHAR; ++i) 484 kwset->next[i] = next[U(trans[i])]; 485 else 486 memcpy(kwset->next, next, NCHAR * sizeof(struct trie *)); 487 } 488 489 /* Fix things up for any translation table. */ 490 if ((trans = kwset->trans) != NULL) 491 for (i = 0; i < NCHAR; ++i) 492 kwset->delta[i] = delta[U(trans[i])]; 493 else 494 memcpy(kwset->delta, delta, NCHAR); 495 496 return NULL; 497 } 498 499 /* Fast boyer-moore search. */ 500 static size_t 501 bmexec (kwset_t kws, char const *text, size_t size) 502 { 503 struct kwset const *kwset; 504 unsigned char const *d1; 505 char const *ep, *sp, *tp; 506 int d, gc, i, len, md2; 507 508 kwset = (struct kwset const *) kws; 509 len = kwset->mind; 510 511 if (len == 0) 512 return 0; 513 if (len > size) 514 return -1; 515 if (len == 1) 516 { 517 tp = memchr (text, kwset->target[0], size); 518 return tp ? tp - text : -1; 519 } 520 521 d1 = kwset->delta; 522 sp = kwset->target + len; 523 gc = U(sp[-2]); 524 md2 = kwset->mind2; 525 tp = text + len; 526 527 /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ 528 if (size > 12 * len) 529 /* 11 is not a bug, the initial offset happens only once. */ 530 for (ep = text + size - 11 * len;;) 531 { 532 while (tp <= ep) 533 { 534 d = d1[U(tp[-1])], tp += d; 535 d = d1[U(tp[-1])], tp += d; 536 if (d == 0) 537 goto found; 538 d = d1[U(tp[-1])], tp += d; 539 d = d1[U(tp[-1])], tp += d; 540 d = d1[U(tp[-1])], tp += d; 541 if (d == 0) 542 goto found; 543 d = d1[U(tp[-1])], tp += d; 544 d = d1[U(tp[-1])], tp += d; 545 d = d1[U(tp[-1])], tp += d; 546 if (d == 0) 547 goto found; 548 d = d1[U(tp[-1])], tp += d; 549 d = d1[U(tp[-1])], tp += d; 550 } 551 break; 552 found: 553 if (U(tp[-2]) == gc) 554 { 555 for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) 556 ; 557 if (i > len) 558 return tp - len - text; 559 } 560 tp += md2; 561 } 562 563 /* Now we have only a few characters left to search. We 564 carefully avoid ever producing an out-of-bounds pointer. */ 565 ep = text + size; 566 d = d1[U(tp[-1])]; 567 while (d <= ep - tp) 568 { 569 d = d1[U((tp += d)[-1])]; 570 if (d != 0) 571 continue; 572 if (U(tp[-2]) == gc) 573 { 574 for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) 575 ; 576 if (i > len) 577 return tp - len - text; 578 } 579 d = md2; 580 } 581 582 return -1; 583 } 584 585 /* Hairy multiple string search. */ 586 static size_t _GL_ARG_NONNULL ((4)) 587 cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch) 588 { 589 struct kwset const *kwset; 590 struct trie * const *next; 591 struct trie const *trie; 592 struct trie const *accept; 593 char const *beg, *lim, *mch, *lmch; 594 unsigned char c; 595 unsigned char const *delta; 596 int d; 597 char const *end, *qlim; 598 struct tree const *tree; 599 char const *trans; 600 601 #ifdef lint 602 accept = NULL; 603 #endif 604 605 /* Initialize register copies and look for easy ways out. */ 606 kwset = (struct kwset *) kws; 607 if (len < kwset->mind) 608 return -1; 609 next = kwset->next; 610 delta = kwset->delta; 611 trans = kwset->trans; 612 lim = text + len; 613 end = text; 614 if ((d = kwset->mind) != 0) 615 mch = NULL; 616 else 617 { 618 mch = text, accept = kwset->trie; 619 goto match; 620 } 621 622 if (len >= 4 * kwset->mind) 623 qlim = lim - 4 * kwset->mind; 624 else 625 qlim = NULL; 626 627 while (lim - end >= d) 628 { 629 if (qlim && end <= qlim) 630 { 631 end += d - 1; 632 while ((d = delta[c = *end]) && end < qlim) 633 { 634 end += d; 635 end += delta[U(*end)]; 636 end += delta[U(*end)]; 637 } 638 ++end; 639 } 640 else 641 d = delta[c = (end += d)[-1]]; 642 if (d) 643 continue; 644 beg = end - 1; 645 trie = next[c]; 646 if (trie->accepting) 647 { 648 mch = beg; 649 accept = trie; 650 } 651 d = trie->shift; 652 while (beg > text) 653 { 654 c = trans ? trans[U(*--beg)] : *--beg; 655 tree = trie->links; 656 while (tree && c != tree->label) 657 if (c < tree->label) 658 tree = tree->llink; 659 else 660 tree = tree->rlink; 661 if (tree) 662 { 663 trie = tree->trie; 664 if (trie->accepting) 665 { 666 mch = beg; 667 accept = trie; 668 } 669 } 670 else 671 break; 672 d = trie->shift; 673 } 674 if (mch) 675 goto match; 676 } 677 return -1; 678 679 match: 680 /* Given a known match, find the longest possible match anchored 681 at or before its starting point. This is nearly a verbatim 682 copy of the preceding main search loops. */ 683 if (lim - mch > kwset->maxd) 684 lim = mch + kwset->maxd; 685 lmch = 0; 686 d = 1; 687 while (lim - end >= d) 688 { 689 if ((d = delta[c = (end += d)[-1]]) != 0) 690 continue; 691 beg = end - 1; 692 if (!(trie = next[c])) 693 { 694 d = 1; 695 continue; 696 } 697 if (trie->accepting && beg <= mch) 698 { 699 lmch = beg; 700 accept = trie; 701 } 702 d = trie->shift; 703 while (beg > text) 704 { 705 c = trans ? trans[U(*--beg)] : *--beg; 706 tree = trie->links; 707 while (tree && c != tree->label) 708 if (c < tree->label) 709 tree = tree->llink; 710 else 711 tree = tree->rlink; 712 if (tree) 713 { 714 trie = tree->trie; 715 if (trie->accepting && beg <= mch) 716 { 717 lmch = beg; 718 accept = trie; 719 } 720 } 721 else 722 break; 723 d = trie->shift; 724 } 725 if (lmch) 726 { 727 mch = lmch; 728 goto match; 729 } 730 if (!d) 731 d = 1; 732 } 733 734 kwsmatch->index = accept->accepting / 2; 735 kwsmatch->offset[0] = mch - text; 736 kwsmatch->size[0] = accept->depth; 737 738 return mch - text; 739 } 740 741 /* Search TEXT for a match of any member of the keyword set, KWS. 742 Return the offset (into TEXT) of the first byte of the matching substring, 743 or (size_t) -1 if no match is found. Upon a match, store details in 744 *KWSMATCH: index of matched keyword, start offset (same as the return 745 value), and length. */ 746 size_t 747 kwsexec (kwset_t kws, char const *text, size_t size, struct kwsmatch *kwsmatch) 748 { 749 struct kwset const *kwset = (struct kwset *) kws; 750 if (kwset->words == 1 && kwset->trans == NULL) 751 { 752 size_t ret = bmexec (kws, text, size); 753 if (ret != (size_t) -1) 754 { 755 kwsmatch->index = 0; 756 kwsmatch->offset[0] = ret; 757 kwsmatch->size[0] = kwset->mind; 758 } 759 return ret; 760 } 761 else 762 return cwexec(kws, text, size, kwsmatch); 763 } 764 765 /* Free the components of the given keyword set. */ 766 void 767 kwsfree (kwset_t kws) 768 { 769 struct kwset *kwset; 770 771 kwset = (struct kwset *) kws; 772 obstack_free(&kwset->obstack, NULL); 773 free(kws); 774 } 775