1 /* kwset.c - search for any of a set of keywords. 2 Copyright (C) 1989, 1998, 2000, 2005, 2007, 2009-2014 Free Software 3 Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software 17 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 18 02110-1301, USA. */ 19 20 /* Written August 1989 by Mike Haertel. 21 The author may be reached (Email) at the address mike@ai.mit.edu, 22 or (US mail) as Mike Haertel c/o Free Software Foundation. */ 23 24 /* The algorithm implemented by these routines bears a startling resemblance 25 to one discovered by Beate Commentz-Walter, although it is not identical. 26 See: Commentz-Walter B. A string matching algorithm fast on the average. 27 Lecture Notes in Computer Science 71 (1979), 118-32 28 <http://dx.doi.org/10.1007/3-540-09510-1_10>. 29 See also: Aho AV, Corasick MJ. Efficient string matching: an aid to 30 bibliographic search. CACM 18, 6 (1975), 333-40 31 <http://dx.doi.org/10.1145/360825.360855>, which describes the 32 failure function used below. */ 33 34 #include <config.h> 35 36 #include "kwset.h" 37 38 #include <stdbool.h> 39 #include <stdint.h> 40 #include <sys/types.h> 41 #include "system.h" 42 #include "memchr2.h" 43 #include "obstack.h" 44 #include "xalloc.h" 45 46 #define link kwset_link 47 48 #ifdef GREP 49 # include "xalloc.h" 50 # undef malloc 51 # define malloc xmalloc 52 #endif 53 54 #define NCHAR (UCHAR_MAX + 1) 55 #define obstack_chunk_alloc malloc 56 #define obstack_chunk_free free 57 58 #define U(c) (to_uchar (c)) 59 60 /* Balanced tree of edges and labels leaving a given trie node. */ 61 struct tree 62 { 63 struct tree *llink; /* Left link; MUST be first field. */ 64 struct tree *rlink; /* Right link (to larger labels). */ 65 struct trie *trie; /* Trie node pointed to by this edge. */ 66 unsigned char label; /* Label on this edge. */ 67 char balance; /* Difference in depths of subtrees. */ 68 }; 69 70 /* Node of a trie representing a set of reversed keywords. */ 71 struct trie 72 { 73 size_t accepting; /* Word index of accepted word, or zero. */ 74 struct tree *links; /* Tree of edges leaving this node. */ 75 struct trie *parent; /* Parent of this node. */ 76 struct trie *next; /* List of all trie nodes in level order. */ 77 struct trie *fail; /* Aho-Corasick failure function. */ 78 int depth; /* Depth of this node from the root. */ 79 int shift; /* Shift function for search failures. */ 80 int maxshift; /* Max shift of self and descendants. */ 81 }; 82 83 /* Structure returned opaquely to the caller, containing everything. */ 84 struct kwset 85 { 86 struct obstack obstack; /* Obstack for node allocation. */ 87 ptrdiff_t words; /* Number of words in the trie. */ 88 struct trie *trie; /* The trie itself. */ 89 int mind; /* Minimum depth of an accepting node. */ 90 int maxd; /* Maximum depth of any node. */ 91 unsigned char delta[NCHAR]; /* Delta table for rapid search. */ 92 struct trie *next[NCHAR]; /* Table of children of the root. */ 93 char *target; /* Target string if there's only one. */ 94 int *shift; /* Used in Boyer-Moore search for one string. */ 95 char const *trans; /* Character translation table. */ 96 97 /* If there's only one string, this is the string's last byte, 98 translated via TRANS if TRANS is nonnull. */ 99 char gc1; 100 101 /* Likewise for the string's penultimate byte, if it has two or more 102 bytes. */ 103 char gc2; 104 105 /* If there's only one string, this helps to match the string's last byte. 106 If GC1HELP is negative, only GC1 matches the string's last byte; 107 otherwise at least two bytes match, and B matches if TRANS[B] == GC1. 108 If GC1HELP is in the range 0..(NCHAR - 1), there are exactly two 109 such matches, and GC1HELP is the other match after conversion to 110 unsigned char. If GC1HELP is at least NCHAR, there are three or 111 more such matches; e.g., Greek has three sigma characters that 112 all match when case-folding. */ 113 int gc1help; 114 }; 115 116 /* Use TRANS to transliterate C. A null TRANS does no transliteration. */ 117 static inline char 118 tr (char const *trans, char c) 119 { 120 return trans ? trans[U(c)] : c; 121 } 122 123 /* Allocate and initialize a keyword set object, returning an opaque 124 pointer to it. */ 125 kwset_t 126 kwsalloc (char const *trans) 127 { 128 struct kwset *kwset = xmalloc (sizeof *kwset); 129 130 obstack_init (&kwset->obstack); 131 kwset->words = 0; 132 kwset->trie = obstack_alloc (&kwset->obstack, sizeof *kwset->trie); 133 kwset->trie->accepting = 0; 134 kwset->trie->links = NULL; 135 kwset->trie->parent = NULL; 136 kwset->trie->next = NULL; 137 kwset->trie->fail = NULL; 138 kwset->trie->depth = 0; 139 kwset->trie->shift = 0; 140 kwset->mind = INT_MAX; 141 kwset->maxd = -1; 142 kwset->target = NULL; 143 kwset->trans = trans; 144 145 return kwset; 146 } 147 148 /* This upper bound is valid for CHAR_BIT >= 4 and 149 exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */ 150 #define DEPTH_SIZE (CHAR_BIT + CHAR_BIT/2) 151 152 /* Add the given string to the contents of the keyword set. */ 153 void 154 kwsincr (kwset_t kwset, char const *text, size_t len) 155 { 156 struct trie *trie = kwset->trie; 157 char const *trans = kwset->trans; 158 159 text += len; 160 161 /* Descend the trie (built of reversed keywords) character-by-character, 162 installing new nodes when necessary. */ 163 while (len--) 164 { 165 unsigned char uc = *--text; 166 unsigned char label = trans ? trans[uc] : uc; 167 168 /* Descend the tree of outgoing links for this trie node, 169 looking for the current character and keeping track 170 of the path followed. */ 171 struct tree *link = trie->links; 172 struct tree *links[DEPTH_SIZE]; 173 enum { L, R } dirs[DEPTH_SIZE]; 174 links[0] = (struct tree *) &trie->links; 175 dirs[0] = L; 176 int depth = 1; 177 178 while (link && label != link->label) 179 { 180 links[depth] = link; 181 if (label < link->label) 182 dirs[depth++] = L, link = link->llink; 183 else 184 dirs[depth++] = R, link = link->rlink; 185 } 186 187 /* The current character doesn't have an outgoing link at 188 this trie node, so build a new trie node and install 189 a link in the current trie node's tree. */ 190 if (!link) 191 { 192 link = obstack_alloc (&kwset->obstack, sizeof *link); 193 link->llink = NULL; 194 link->rlink = NULL; 195 link->trie = obstack_alloc (&kwset->obstack, sizeof *link->trie); 196 link->trie->accepting = 0; 197 link->trie->links = NULL; 198 link->trie->parent = trie; 199 link->trie->next = NULL; 200 link->trie->fail = NULL; 201 link->trie->depth = trie->depth + 1; 202 link->trie->shift = 0; 203 link->label = label; 204 link->balance = 0; 205 206 /* Install the new tree node in its parent. */ 207 if (dirs[--depth] == L) 208 links[depth]->llink = link; 209 else 210 links[depth]->rlink = link; 211 212 /* Back up the tree fixing the balance flags. */ 213 while (depth && !links[depth]->balance) 214 { 215 if (dirs[depth] == L) 216 --links[depth]->balance; 217 else 218 ++links[depth]->balance; 219 --depth; 220 } 221 222 /* Rebalance the tree by pointer rotations if necessary. */ 223 if (depth && ((dirs[depth] == L && --links[depth]->balance) 224 || (dirs[depth] == R && ++links[depth]->balance))) 225 { 226 struct tree *t, *r, *l, *rl, *lr; 227 228 switch (links[depth]->balance) 229 { 230 case (char) -2: 231 switch (dirs[depth + 1]) 232 { 233 case L: 234 r = links[depth], t = r->llink, rl = t->rlink; 235 t->rlink = r, r->llink = rl; 236 t->balance = r->balance = 0; 237 break; 238 case R: 239 r = links[depth], l = r->llink, t = l->rlink; 240 rl = t->rlink, lr = t->llink; 241 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 242 l->balance = t->balance != 1 ? 0 : -1; 243 r->balance = t->balance != (char) -1 ? 0 : 1; 244 t->balance = 0; 245 break; 246 default: 247 abort (); 248 } 249 break; 250 case 2: 251 switch (dirs[depth + 1]) 252 { 253 case R: 254 l = links[depth], t = l->rlink, lr = t->llink; 255 t->llink = l, l->rlink = lr; 256 t->balance = l->balance = 0; 257 break; 258 case L: 259 l = links[depth], r = l->rlink, t = r->llink; 260 lr = t->llink, rl = t->rlink; 261 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl; 262 l->balance = t->balance != 1 ? 0 : -1; 263 r->balance = t->balance != (char) -1 ? 0 : 1; 264 t->balance = 0; 265 break; 266 default: 267 abort (); 268 } 269 break; 270 default: 271 abort (); 272 } 273 274 if (dirs[depth - 1] == L) 275 links[depth - 1]->llink = t; 276 else 277 links[depth - 1]->rlink = t; 278 } 279 } 280 281 trie = link->trie; 282 } 283 284 /* Mark the node we finally reached as accepting, encoding the 285 index number of this word in the keyword set so far. */ 286 if (!trie->accepting) 287 trie->accepting = 1 + 2 * kwset->words; 288 ++kwset->words; 289 290 /* Keep track of the longest and shortest string of the keyword set. */ 291 if (trie->depth < kwset->mind) 292 kwset->mind = trie->depth; 293 if (trie->depth > kwset->maxd) 294 kwset->maxd = trie->depth; 295 } 296 297 /* Enqueue the trie nodes referenced from the given tree in the 298 given queue. */ 299 static void 300 enqueue (struct tree *tree, struct trie **last) 301 { 302 if (!tree) 303 return; 304 enqueue(tree->llink, last); 305 enqueue(tree->rlink, last); 306 (*last) = (*last)->next = tree->trie; 307 } 308 309 /* Compute the Aho-Corasick failure function for the trie nodes referenced 310 from the given tree, given the failure function for their parent as 311 well as a last resort failure node. */ 312 static void 313 treefails (struct tree const *tree, struct trie const *fail, 314 struct trie *recourse) 315 { 316 struct tree *link; 317 318 if (!tree) 319 return; 320 321 treefails(tree->llink, fail, recourse); 322 treefails(tree->rlink, fail, recourse); 323 324 /* Find, in the chain of fails going back to the root, the first 325 node that has a descendant on the current label. */ 326 while (fail) 327 { 328 link = fail->links; 329 while (link && tree->label != link->label) 330 if (tree->label < link->label) 331 link = link->llink; 332 else 333 link = link->rlink; 334 if (link) 335 { 336 tree->trie->fail = link->trie; 337 return; 338 } 339 fail = fail->fail; 340 } 341 342 tree->trie->fail = recourse; 343 } 344 345 /* Set delta entries for the links of the given tree such that 346 the preexisting delta value is larger than the current depth. */ 347 static void 348 treedelta (struct tree const *tree, 349 unsigned int depth, 350 unsigned char delta[]) 351 { 352 if (!tree) 353 return; 354 treedelta(tree->llink, depth, delta); 355 treedelta(tree->rlink, depth, delta); 356 if (depth < delta[tree->label]) 357 delta[tree->label] = depth; 358 } 359 360 /* Return true if A has every label in B. */ 361 static int _GL_ATTRIBUTE_PURE 362 hasevery (struct tree const *a, struct tree const *b) 363 { 364 if (!b) 365 return 1; 366 if (!hasevery(a, b->llink)) 367 return 0; 368 if (!hasevery(a, b->rlink)) 369 return 0; 370 while (a && b->label != a->label) 371 if (b->label < a->label) 372 a = a->llink; 373 else 374 a = a->rlink; 375 return !!a; 376 } 377 378 /* Compute a vector, indexed by character code, of the trie nodes 379 referenced from the given tree. */ 380 static void 381 treenext (struct tree const *tree, struct trie *next[]) 382 { 383 if (!tree) 384 return; 385 treenext(tree->llink, next); 386 treenext(tree->rlink, next); 387 next[tree->label] = tree->trie; 388 } 389 390 /* Compute the shift for each trie node, as well as the delta 391 table and next cache for the given keyword set. */ 392 void 393 kwsprep (kwset_t kwset) 394 { 395 char const *trans = kwset->trans; 396 int i; 397 unsigned char deltabuf[NCHAR]; 398 unsigned char *delta = trans ? deltabuf : kwset->delta; 399 400 /* Initial values for the delta table; will be changed later. The 401 delta entry for a given character is the smallest depth of any 402 node at which an outgoing edge is labeled by that character. */ 403 memset (delta, MIN (kwset->mind, UCHAR_MAX), sizeof deltabuf); 404 405 /* Traverse the nodes of the trie in level order, simultaneously 406 computing the delta table, failure function, and shift function. */ 407 struct trie *curr, *last; 408 for (curr = last = kwset->trie; curr; curr = curr->next) 409 { 410 /* Enqueue the immediate descendants in the level order queue. */ 411 enqueue (curr->links, &last); 412 413 curr->shift = kwset->mind; 414 curr->maxshift = kwset->mind; 415 416 /* Update the delta table for the descendants of this node. */ 417 treedelta (curr->links, curr->depth, delta); 418 419 /* Compute the failure function for the descendants of this node. */ 420 treefails (curr->links, curr->fail, kwset->trie); 421 422 /* Update the shifts at each node in the current node's chain 423 of fails back to the root. */ 424 struct trie *fail; 425 for (fail = curr->fail; fail; fail = fail->fail) 426 { 427 /* If the current node has some outgoing edge that the fail 428 doesn't, then the shift at the fail should be no larger 429 than the difference of their depths. */ 430 if (!hasevery (fail->links, curr->links)) 431 if (curr->depth - fail->depth < fail->shift) 432 fail->shift = curr->depth - fail->depth; 433 434 /* If the current node is accepting then the shift at the 435 fail and its descendants should be no larger than the 436 difference of their depths. */ 437 if (curr->accepting && fail->maxshift > curr->depth - fail->depth) 438 fail->maxshift = curr->depth - fail->depth; 439 } 440 } 441 442 /* Traverse the trie in level order again, fixing up all nodes whose 443 shift exceeds their inherited maxshift. */ 444 for (curr = kwset->trie->next; curr; curr = curr->next) 445 { 446 if (curr->maxshift > curr->parent->maxshift) 447 curr->maxshift = curr->parent->maxshift; 448 if (curr->shift > curr->maxshift) 449 curr->shift = curr->maxshift; 450 } 451 452 /* Create a vector, indexed by character code, of the outgoing links 453 from the root node. */ 454 struct trie *nextbuf[NCHAR]; 455 struct trie **next = trans ? nextbuf : kwset->next; 456 memset (next, 0, sizeof nextbuf); 457 treenext (kwset->trie->links, next); 458 if (trans) 459 for (i = 0; i < NCHAR; ++i) 460 kwset->next[i] = next[U(trans[i])]; 461 462 /* Check if we can use the simple boyer-moore algorithm, instead 463 of the hairy commentz-walter algorithm. */ 464 if (kwset->words == 1) 465 { 466 /* Looking for just one string. Extract it from the trie. */ 467 kwset->target = obstack_alloc (&kwset->obstack, kwset->mind); 468 for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i) 469 { 470 kwset->target[i] = curr->links->label; 471 curr = curr->next; 472 } 473 /* Looking for the delta2 shift that we might make after a 474 backwards match has failed. Extract it from the trie. */ 475 if (kwset->mind > 1) 476 { 477 kwset->shift 478 = obstack_alloc (&kwset->obstack, 479 sizeof *kwset->shift * (kwset->mind - 1)); 480 for (i = 0, curr = kwset->trie->next; i < kwset->mind - 1; ++i) 481 { 482 kwset->shift[i] = curr->shift; 483 curr = curr->next; 484 } 485 } 486 487 char gc1 = tr (trans, kwset->target[kwset->mind - 1]); 488 489 /* Set GC1HELP according to whether exactly one, exactly two, or 490 three-or-more characters match GC1. */ 491 int gc1help = -1; 492 if (trans) 493 { 494 char const *equiv1 = memchr (trans, gc1, NCHAR); 495 char const *equiv2 = memchr (equiv1 + 1, gc1, 496 trans + NCHAR - (equiv1 + 1)); 497 if (equiv2) 498 gc1help = (memchr (equiv2 + 1, gc1, trans + NCHAR - (equiv2 + 1)) 499 ? NCHAR 500 : U(gc1) ^ (equiv1 - trans) ^ (equiv2 - trans)); 501 } 502 503 kwset->gc1 = gc1; 504 kwset->gc1help = gc1help; 505 if (kwset->mind > 1) 506 kwset->gc2 = tr (trans, kwset->target[kwset->mind - 2]); 507 } 508 509 /* Fix things up for any translation table. */ 510 if (trans) 511 for (i = 0; i < NCHAR; ++i) 512 kwset->delta[i] = delta[U(trans[i])]; 513 } 514 515 /* Delta2 portion of a Boyer-Moore search. *TP is the string text 516 pointer; it is updated in place. EP is the end of the string text, 517 and SP the end of the pattern. LEN is the pattern length; it must 518 be at least 2. TRANS, if nonnull, is the input translation table. 519 GC1 and GC2 are the last and second-from last bytes of the pattern, 520 transliterated by TRANS; the caller precomputes them for 521 efficiency. If D1 is nonnull, it is a delta1 table for shifting *TP 522 when failing. KWSET->shift says how much to shift. */ 523 static inline bool 524 bm_delta2_search (char const **tpp, char const *ep, char const *sp, int len, 525 char const *trans, char gc1, char gc2, 526 unsigned char const *d1, kwset_t kwset) 527 { 528 char const *tp = *tpp; 529 int d = len, skip = 0; 530 531 while (true) 532 { 533 int i = 2; 534 if (tr (trans, tp[-2]) == gc2) 535 { 536 while (++i <= d) 537 if (tr (trans, tp[-i]) != tr (trans, sp[-i])) 538 break; 539 if (i > d) 540 { 541 for (i = d + skip + 1; i <= len; ++i) 542 if (tr (trans, tp[-i]) != tr (trans, sp[-i])) 543 break; 544 if (i > len) 545 { 546 *tpp = tp - len; 547 return true; 548 } 549 } 550 } 551 552 tp += d = kwset->shift[i - 2]; 553 if (tp > ep) 554 break; 555 if (tr (trans, tp[-1]) != gc1) 556 { 557 if (d1) 558 tp += d1[U(tp[-1])]; 559 break; 560 } 561 skip = i - 1; 562 } 563 564 *tpp = tp; 565 return false; 566 } 567 568 /* Return the address of the first byte in the buffer S (of size N) 569 that matches the last byte specified by KWSET, a singleton. */ 570 static char const * 571 memchr_kwset (char const *s, size_t n, kwset_t kwset) 572 { 573 if (kwset->gc1help < 0) 574 return memchr (s, kwset->gc1, n); 575 int small_heuristic = 2; 576 int small = (- (uintptr_t) s % sizeof (long) 577 + small_heuristic * sizeof (long)); 578 size_t ntrans = kwset->gc1help < NCHAR && small < n ? small : n; 579 char const *slim = s + ntrans; 580 for (; s < slim; s++) 581 if (kwset->trans[U(*s)] == kwset->gc1) 582 return s; 583 n -= ntrans; 584 return n == 0 ? NULL : memchr2 (s, kwset->gc1, kwset->gc1help, n); 585 } 586 587 /* Fast Boyer-Moore search (inlinable version). */ 588 static inline size_t _GL_ATTRIBUTE_PURE 589 bmexec_trans (kwset_t kwset, char const *text, size_t size) 590 { 591 unsigned char const *d1; 592 char const *ep, *sp, *tp; 593 int d; 594 int len = kwset->mind; 595 char const *trans = kwset->trans; 596 597 if (len == 0) 598 return 0; 599 if (len > size) 600 return -1; 601 if (len == 1) 602 { 603 tp = memchr_kwset (text, size, kwset); 604 return tp ? tp - text : -1; 605 } 606 607 d1 = kwset->delta; 608 sp = kwset->target + len; 609 tp = text + len; 610 char gc1 = kwset->gc1; 611 char gc2 = kwset->gc2; 612 613 /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ 614 if (size > 12 * len) 615 /* 11 is not a bug, the initial offset happens only once. */ 616 for (ep = text + size - 11 * len; tp <= ep; ) 617 { 618 char const *tp0 = tp; 619 d = d1[U(tp[-1])], tp += d; 620 d = d1[U(tp[-1])], tp += d; 621 if (d != 0) 622 { 623 d = d1[U(tp[-1])], tp += d; 624 d = d1[U(tp[-1])], tp += d; 625 d = d1[U(tp[-1])], tp += d; 626 if (d != 0) 627 { 628 d = d1[U(tp[-1])], tp += d; 629 d = d1[U(tp[-1])], tp += d; 630 d = d1[U(tp[-1])], tp += d; 631 if (d != 0) 632 { 633 d = d1[U(tp[-1])], tp += d; 634 d = d1[U(tp[-1])], tp += d; 635 636 /* As a heuristic, prefer memchr to seeking by 637 delta1 when the latter doesn't advance much. */ 638 int advance_heuristic = 16 * sizeof (long); 639 if (advance_heuristic <= tp - tp0) 640 goto big_advance; 641 tp--; 642 tp = memchr_kwset (tp, text + size - tp, kwset); 643 if (! tp) 644 return -1; 645 tp++; 646 } 647 } 648 } 649 if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset)) 650 return tp - text; 651 big_advance:; 652 } 653 654 /* Now we have only a few characters left to search. We 655 carefully avoid ever producing an out-of-bounds pointer. */ 656 ep = text + size; 657 d = d1[U(tp[-1])]; 658 while (d <= ep - tp) 659 { 660 d = d1[U((tp += d)[-1])]; 661 if (d != 0) 662 continue; 663 if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, NULL, kwset)) 664 return tp - text; 665 } 666 667 return -1; 668 } 669 670 /* Fast Boyer-Moore search. */ 671 static size_t 672 bmexec (kwset_t kwset, char const *text, size_t size) 673 { 674 /* Help the compiler inline bmexec_trans in two ways, depending on 675 whether kwset->trans is null. */ 676 return (kwset->trans 677 ? bmexec_trans (kwset, text, size) 678 : bmexec_trans (kwset, text, size)); 679 } 680 681 /* Hairy multiple string search. */ 682 static size_t _GL_ARG_NONNULL ((4)) 683 cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) 684 { 685 struct trie * const *next; 686 struct trie const *trie; 687 struct trie const *accept; 688 char const *beg, *lim, *mch, *lmch; 689 unsigned char c; 690 unsigned char const *delta; 691 int d; 692 char const *end, *qlim; 693 struct tree const *tree; 694 char const *trans; 695 696 #ifdef lint 697 accept = NULL; 698 #endif 699 700 /* Initialize register copies and look for easy ways out. */ 701 if (len < kwset->mind) 702 return -1; 703 next = kwset->next; 704 delta = kwset->delta; 705 trans = kwset->trans; 706 lim = text + len; 707 end = text; 708 if ((d = kwset->mind) != 0) 709 mch = NULL; 710 else 711 { 712 mch = text, accept = kwset->trie; 713 goto match; 714 } 715 716 if (len >= 4 * kwset->mind) 717 qlim = lim - 4 * kwset->mind; 718 else 719 qlim = NULL; 720 721 while (lim - end >= d) 722 { 723 if (qlim && end <= qlim) 724 { 725 end += d - 1; 726 while ((d = delta[c = *end]) && end < qlim) 727 { 728 end += d; 729 end += delta[U(*end)]; 730 end += delta[U(*end)]; 731 } 732 ++end; 733 } 734 else 735 d = delta[c = (end += d)[-1]]; 736 if (d) 737 continue; 738 beg = end - 1; 739 trie = next[c]; 740 if (trie->accepting) 741 { 742 mch = beg; 743 accept = trie; 744 } 745 d = trie->shift; 746 while (beg > text) 747 { 748 unsigned char uc = *--beg; 749 c = trans ? trans[uc] : uc; 750 tree = trie->links; 751 while (tree && c != tree->label) 752 if (c < tree->label) 753 tree = tree->llink; 754 else 755 tree = tree->rlink; 756 if (tree) 757 { 758 trie = tree->trie; 759 if (trie->accepting) 760 { 761 mch = beg; 762 accept = trie; 763 } 764 } 765 else 766 break; 767 d = trie->shift; 768 } 769 if (mch) 770 goto match; 771 } 772 return -1; 773 774 match: 775 /* Given a known match, find the longest possible match anchored 776 at or before its starting point. This is nearly a verbatim 777 copy of the preceding main search loops. */ 778 if (lim - mch > kwset->maxd) 779 lim = mch + kwset->maxd; 780 lmch = 0; 781 d = 1; 782 while (lim - end >= d) 783 { 784 if ((d = delta[c = (end += d)[-1]]) != 0) 785 continue; 786 beg = end - 1; 787 if (!(trie = next[c])) 788 { 789 d = 1; 790 continue; 791 } 792 if (trie->accepting && beg <= mch) 793 { 794 lmch = beg; 795 accept = trie; 796 } 797 d = trie->shift; 798 while (beg > text) 799 { 800 unsigned char uc = *--beg; 801 c = trans ? trans[uc] : uc; 802 tree = trie->links; 803 while (tree && c != tree->label) 804 if (c < tree->label) 805 tree = tree->llink; 806 else 807 tree = tree->rlink; 808 if (tree) 809 { 810 trie = tree->trie; 811 if (trie->accepting && beg <= mch) 812 { 813 lmch = beg; 814 accept = trie; 815 } 816 } 817 else 818 break; 819 d = trie->shift; 820 } 821 if (lmch) 822 { 823 mch = lmch; 824 goto match; 825 } 826 if (!d) 827 d = 1; 828 } 829 830 kwsmatch->index = accept->accepting / 2; 831 kwsmatch->offset[0] = mch - text; 832 kwsmatch->size[0] = accept->depth; 833 834 return mch - text; 835 } 836 837 /* Search TEXT for a match of any member of KWSET. 838 Return the offset (into TEXT) of the first byte of the matching substring, 839 or (size_t) -1 if no match is found. Upon a match, store details in 840 *KWSMATCH: index of matched keyword, start offset (same as the return 841 value), and length. */ 842 size_t 843 kwsexec (kwset_t kwset, char const *text, size_t size, 844 struct kwsmatch *kwsmatch) 845 { 846 if (kwset->words == 1) 847 { 848 size_t ret = bmexec (kwset, text, size); 849 if (ret != (size_t) -1) 850 { 851 kwsmatch->index = 0; 852 kwsmatch->offset[0] = ret; 853 kwsmatch->size[0] = kwset->mind; 854 } 855 return ret; 856 } 857 else 858 return cwexec (kwset, text, size, kwsmatch); 859 } 860 861 /* Free the components of the given keyword set. */ 862 void 863 kwsfree (kwset_t kwset) 864 { 865 obstack_free (&kwset->obstack, NULL); 866 free (kwset); 867 } 868