xref: /dragonfly/contrib/grep/src/kwset.c (revision 335b9e93)
1 /* kwset.c - search for any of a set of keywords.
2    Copyright (C) 1989, 1998, 2000, 2005, 2007, 2009-2015 Free Software
3    Foundation, Inc.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3, or (at your option)
8    any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, write to the Free Software
17    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
18    02110-1301, USA.  */
19 
20 /* Written August 1989 by Mike Haertel.
21    The author may be reached (Email) at the address mike@ai.mit.edu,
22    or (US mail) as Mike Haertel c/o Free Software Foundation. */
23 
24 /* The algorithm implemented by these routines bears a startling resemblance
25    to one discovered by Beate Commentz-Walter, although it is not identical.
26    See: Commentz-Walter B. A string matching algorithm fast on the average.
27    Lecture Notes in Computer Science 71 (1979), 118-32
28    <http://dx.doi.org/10.1007/3-540-09510-1_10>.
29    See also: Aho AV, Corasick MJ. Efficient string matching: an aid to
30    bibliographic search. CACM 18, 6 (1975), 333-40
31    <http://dx.doi.org/10.1145/360825.360855>, which describes the
32    failure function used below. */
33 
34 #include <config.h>
35 
36 #include "kwset.h"
37 
38 #include <stdbool.h>
39 #include <stdint.h>
40 #include <sys/types.h>
41 #include "system.h"
42 #include "memchr2.h"
43 #include "obstack.h"
44 #include "xalloc.h"
45 
46 #define link kwset_link
47 
48 #ifdef GREP
49 # include "xalloc.h"
50 # undef malloc
51 # define malloc xmalloc
52 #endif
53 
54 #define NCHAR (UCHAR_MAX + 1)
55 #define obstack_chunk_alloc malloc
56 #define obstack_chunk_free free
57 
58 #define U(c) (to_uchar (c))
59 
60 /* Balanced tree of edges and labels leaving a given trie node. */
61 struct tree
62 {
63   struct tree *llink;		/* Left link; MUST be first field. */
64   struct tree *rlink;		/* Right link (to larger labels). */
65   struct trie *trie;		/* Trie node pointed to by this edge. */
66   unsigned char label;		/* Label on this edge. */
67   char balance;			/* Difference in depths of subtrees. */
68 };
69 
70 /* Node of a trie representing a set of reversed keywords. */
71 struct trie
72 {
73   size_t accepting;		/* Word index of accepted word, or zero. */
74   struct tree *links;		/* Tree of edges leaving this node. */
75   struct trie *parent;		/* Parent of this node. */
76   struct trie *next;		/* List of all trie nodes in level order. */
77   struct trie *fail;		/* Aho-Corasick failure function. */
78   int depth;			/* Depth of this node from the root. */
79   int shift;			/* Shift function for search failures. */
80   int maxshift;			/* Max shift of self and descendants. */
81 };
82 
83 /* Structure returned opaquely to the caller, containing everything. */
84 struct kwset
85 {
86   struct obstack obstack;	/* Obstack for node allocation. */
87   ptrdiff_t words;		/* Number of words in the trie. */
88   struct trie *trie;		/* The trie itself. */
89   int mind;			/* Minimum depth of an accepting node. */
90   int maxd;			/* Maximum depth of any node. */
91   unsigned char delta[NCHAR];	/* Delta table for rapid search. */
92   struct trie *next[NCHAR];	/* Table of children of the root. */
93   char *target;			/* Target string if there's only one. */
94   int *shift;			/* Used in Boyer-Moore search for one string. */
95   char const *trans;		/* Character translation table. */
96 
97   /* If there's only one string, this is the string's last byte,
98      translated via TRANS if TRANS is nonnull.  */
99   char gc1;
100 
101   /* Likewise for the string's penultimate byte, if it has two or more
102      bytes.  */
103   char gc2;
104 
105   /* If there's only one string, this helps to match the string's last byte.
106      If GC1HELP is negative, only GC1 matches the string's last byte;
107      otherwise at least two bytes match, and B matches if TRANS[B] == GC1.
108      If GC1HELP is in the range 0..(NCHAR - 1), there are exactly two
109      such matches, and GC1HELP is the other match after conversion to
110      unsigned char.  If GC1HELP is at least NCHAR, there are three or
111      more such matches; e.g., Greek has three sigma characters that
112      all match when case-folding.  */
113   int gc1help;
114 };
115 
116 /* Use TRANS to transliterate C.  A null TRANS does no transliteration.  */
117 static inline char
118 tr (char const *trans, char c)
119 {
120   return trans ? trans[U(c)] : c;
121 }
122 
123 /* Allocate and initialize a keyword set object, returning an opaque
124    pointer to it.  */
125 kwset_t
126 kwsalloc (char const *trans)
127 {
128   struct kwset *kwset = xmalloc (sizeof *kwset);
129 
130   obstack_init (&kwset->obstack);
131   kwset->words = 0;
132   kwset->trie = obstack_alloc (&kwset->obstack, sizeof *kwset->trie);
133   kwset->trie->accepting = 0;
134   kwset->trie->links = NULL;
135   kwset->trie->parent = NULL;
136   kwset->trie->next = NULL;
137   kwset->trie->fail = NULL;
138   kwset->trie->depth = 0;
139   kwset->trie->shift = 0;
140   kwset->mind = INT_MAX;
141   kwset->maxd = -1;
142   kwset->target = NULL;
143   kwset->trans = trans;
144 
145   return kwset;
146 }
147 
148 /* This upper bound is valid for CHAR_BIT >= 4 and
149    exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */
150 #define DEPTH_SIZE (CHAR_BIT + CHAR_BIT/2)
151 
152 /* Add the given string to the contents of the keyword set.  */
153 void
154 kwsincr (kwset_t kwset, char const *text, size_t len)
155 {
156   struct trie *trie = kwset->trie;
157   char const *trans = kwset->trans;
158 
159   text += len;
160 
161   /* Descend the trie (built of reversed keywords) character-by-character,
162      installing new nodes when necessary. */
163   while (len--)
164     {
165       unsigned char uc = *--text;
166       unsigned char label = trans ? trans[uc] : uc;
167 
168       /* Descend the tree of outgoing links for this trie node,
169          looking for the current character and keeping track
170          of the path followed. */
171       struct tree *link = trie->links;
172       struct tree *links[DEPTH_SIZE];
173       enum { L, R } dirs[DEPTH_SIZE];
174       links[0] = (struct tree *) &trie->links;
175       dirs[0] = L;
176       int depth = 1;
177 
178       while (link && label != link->label)
179         {
180           links[depth] = link;
181           if (label < link->label)
182             dirs[depth++] = L, link = link->llink;
183           else
184             dirs[depth++] = R, link = link->rlink;
185         }
186 
187       /* The current character doesn't have an outgoing link at
188          this trie node, so build a new trie node and install
189          a link in the current trie node's tree. */
190       if (!link)
191         {
192           link = obstack_alloc (&kwset->obstack, sizeof *link);
193           link->llink = NULL;
194           link->rlink = NULL;
195           link->trie = obstack_alloc (&kwset->obstack, sizeof *link->trie);
196           link->trie->accepting = 0;
197           link->trie->links = NULL;
198           link->trie->parent = trie;
199           link->trie->next = NULL;
200           link->trie->fail = NULL;
201           link->trie->depth = trie->depth + 1;
202           link->trie->shift = 0;
203           link->label = label;
204           link->balance = 0;
205 
206           /* Install the new tree node in its parent. */
207           if (dirs[--depth] == L)
208             links[depth]->llink = link;
209           else
210             links[depth]->rlink = link;
211 
212           /* Back up the tree fixing the balance flags. */
213           while (depth && !links[depth]->balance)
214             {
215               if (dirs[depth] == L)
216                 --links[depth]->balance;
217               else
218                 ++links[depth]->balance;
219               --depth;
220             }
221 
222           /* Rebalance the tree by pointer rotations if necessary. */
223           if (depth && ((dirs[depth] == L && --links[depth]->balance)
224                         || (dirs[depth] == R && ++links[depth]->balance)))
225             {
226               struct tree *t, *r, *l, *rl, *lr;
227 
228               switch (links[depth]->balance)
229                 {
230                 case (char) -2:
231                   switch (dirs[depth + 1])
232                     {
233                     case L:
234                       r = links[depth], t = r->llink, rl = t->rlink;
235                       t->rlink = r, r->llink = rl;
236                       t->balance = r->balance = 0;
237                       break;
238                     case R:
239                       r = links[depth], l = r->llink, t = l->rlink;
240                       rl = t->rlink, lr = t->llink;
241                       t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
242                       l->balance = t->balance != 1 ? 0 : -1;
243                       r->balance = t->balance != (char) -1 ? 0 : 1;
244                       t->balance = 0;
245                       break;
246                     default:
247                       abort ();
248                     }
249                   break;
250                 case 2:
251                   switch (dirs[depth + 1])
252                     {
253                     case R:
254                       l = links[depth], t = l->rlink, lr = t->llink;
255                       t->llink = l, l->rlink = lr;
256                       t->balance = l->balance = 0;
257                       break;
258                     case L:
259                       l = links[depth], r = l->rlink, t = r->llink;
260                       lr = t->llink, rl = t->rlink;
261                       t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
262                       l->balance = t->balance != 1 ? 0 : -1;
263                       r->balance = t->balance != (char) -1 ? 0 : 1;
264                       t->balance = 0;
265                       break;
266                     default:
267                       abort ();
268                     }
269                   break;
270                 default:
271                   abort ();
272                 }
273 
274               if (dirs[depth - 1] == L)
275                 links[depth - 1]->llink = t;
276               else
277                 links[depth - 1]->rlink = t;
278             }
279         }
280 
281       trie = link->trie;
282     }
283 
284   /* Mark the node we finally reached as accepting, encoding the
285      index number of this word in the keyword set so far. */
286   if (!trie->accepting)
287     trie->accepting = 1 + 2 * kwset->words;
288   ++kwset->words;
289 
290   /* Keep track of the longest and shortest string of the keyword set. */
291   if (trie->depth < kwset->mind)
292     kwset->mind = trie->depth;
293   if (trie->depth > kwset->maxd)
294     kwset->maxd = trie->depth;
295 }
296 
297 /* Enqueue the trie nodes referenced from the given tree in the
298    given queue. */
299 static void
300 enqueue (struct tree *tree, struct trie **last)
301 {
302   if (!tree)
303     return;
304   enqueue(tree->llink, last);
305   enqueue(tree->rlink, last);
306   (*last) = (*last)->next = tree->trie;
307 }
308 
309 /* Compute the Aho-Corasick failure function for the trie nodes referenced
310    from the given tree, given the failure function for their parent as
311    well as a last resort failure node. */
312 static void
313 treefails (struct tree const *tree, struct trie const *fail,
314            struct trie *recourse)
315 {
316   struct tree *link;
317 
318   if (!tree)
319     return;
320 
321   treefails(tree->llink, fail, recourse);
322   treefails(tree->rlink, fail, recourse);
323 
324   /* Find, in the chain of fails going back to the root, the first
325      node that has a descendant on the current label. */
326   while (fail)
327     {
328       link = fail->links;
329       while (link && tree->label != link->label)
330         if (tree->label < link->label)
331           link = link->llink;
332         else
333           link = link->rlink;
334       if (link)
335         {
336           tree->trie->fail = link->trie;
337           return;
338         }
339       fail = fail->fail;
340     }
341 
342   tree->trie->fail = recourse;
343 }
344 
345 /* Set delta entries for the links of the given tree such that
346    the preexisting delta value is larger than the current depth. */
347 static void
348 treedelta (struct tree const *tree,
349            unsigned int depth,
350            unsigned char delta[])
351 {
352   if (!tree)
353     return;
354   treedelta(tree->llink, depth, delta);
355   treedelta(tree->rlink, depth, delta);
356   if (depth < delta[tree->label])
357     delta[tree->label] = depth;
358 }
359 
360 /* Return true if A has every label in B. */
361 static int _GL_ATTRIBUTE_PURE
362 hasevery (struct tree const *a, struct tree const *b)
363 {
364   if (!b)
365     return 1;
366   if (!hasevery(a, b->llink))
367     return 0;
368   if (!hasevery(a, b->rlink))
369     return 0;
370   while (a && b->label != a->label)
371     if (b->label < a->label)
372       a = a->llink;
373     else
374       a = a->rlink;
375   return !!a;
376 }
377 
378 /* Compute a vector, indexed by character code, of the trie nodes
379    referenced from the given tree. */
380 static void
381 treenext (struct tree const *tree, struct trie *next[])
382 {
383   if (!tree)
384     return;
385   treenext(tree->llink, next);
386   treenext(tree->rlink, next);
387   next[tree->label] = tree->trie;
388 }
389 
390 /* Compute the shift for each trie node, as well as the delta
391    table and next cache for the given keyword set. */
392 void
393 kwsprep (kwset_t kwset)
394 {
395   char const *trans = kwset->trans;
396   int i;
397   unsigned char deltabuf[NCHAR];
398   unsigned char *delta = trans ? deltabuf : kwset->delta;
399 
400   /* Initial values for the delta table; will be changed later.  The
401      delta entry for a given character is the smallest depth of any
402      node at which an outgoing edge is labeled by that character. */
403   memset (delta, MIN (kwset->mind, UCHAR_MAX), sizeof deltabuf);
404 
405   /* Traverse the nodes of the trie in level order, simultaneously
406      computing the delta table, failure function, and shift function.  */
407   struct trie *curr, *last;
408   for (curr = last = kwset->trie; curr; curr = curr->next)
409     {
410       /* Enqueue the immediate descendants in the level order queue.  */
411       enqueue (curr->links, &last);
412 
413       curr->shift = kwset->mind;
414       curr->maxshift = kwset->mind;
415 
416       /* Update the delta table for the descendants of this node.  */
417       treedelta (curr->links, curr->depth, delta);
418 
419       /* Compute the failure function for the descendants of this node.  */
420       treefails (curr->links, curr->fail, kwset->trie);
421 
422       /* Update the shifts at each node in the current node's chain
423          of fails back to the root.  */
424       struct trie *fail;
425       for (fail = curr->fail; fail; fail = fail->fail)
426         {
427           /* If the current node has some outgoing edge that the fail
428              doesn't, then the shift at the fail should be no larger
429              than the difference of their depths.  */
430           if (!hasevery (fail->links, curr->links))
431             if (curr->depth - fail->depth < fail->shift)
432               fail->shift = curr->depth - fail->depth;
433 
434           /* If the current node is accepting then the shift at the
435              fail and its descendants should be no larger than the
436              difference of their depths.  */
437           if (curr->accepting && fail->maxshift > curr->depth - fail->depth)
438             fail->maxshift = curr->depth - fail->depth;
439         }
440     }
441 
442   /* Traverse the trie in level order again, fixing up all nodes whose
443      shift exceeds their inherited maxshift.  */
444   for (curr = kwset->trie->next; curr; curr = curr->next)
445     {
446       if (curr->maxshift > curr->parent->maxshift)
447         curr->maxshift = curr->parent->maxshift;
448       if (curr->shift > curr->maxshift)
449         curr->shift = curr->maxshift;
450     }
451 
452   /* Create a vector, indexed by character code, of the outgoing links
453      from the root node.  */
454   struct trie *nextbuf[NCHAR];
455   struct trie **next = trans ? nextbuf : kwset->next;
456   memset (next, 0, sizeof nextbuf);
457   treenext (kwset->trie->links, next);
458   if (trans)
459     for (i = 0; i < NCHAR; ++i)
460       kwset->next[i] = next[U(trans[i])];
461 
462   /* Check if we can use the simple boyer-moore algorithm, instead
463      of the hairy commentz-walter algorithm. */
464   if (kwset->words == 1)
465     {
466       /* Looking for just one string.  Extract it from the trie. */
467       kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
468       for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i)
469         {
470           kwset->target[i] = curr->links->label;
471           curr = curr->next;
472         }
473       /* Looking for the delta2 shift that we might make after a
474          backwards match has failed.  Extract it from the trie.  */
475       if (kwset->mind > 1)
476         {
477           kwset->shift
478             = obstack_alloc (&kwset->obstack,
479                              sizeof *kwset->shift * (kwset->mind - 1));
480           for (i = 0, curr = kwset->trie->next; i < kwset->mind - 1; ++i)
481             {
482               kwset->shift[i] = curr->shift;
483               curr = curr->next;
484             }
485         }
486 
487       char gc1 = tr (trans, kwset->target[kwset->mind - 1]);
488 
489       /* Set GC1HELP according to whether exactly one, exactly two, or
490          three-or-more characters match GC1.  */
491       int gc1help = -1;
492       if (trans)
493         {
494           char const *equiv1 = memchr (trans, gc1, NCHAR);
495           char const *equiv2 = memchr (equiv1 + 1, gc1,
496                                        trans + NCHAR - (equiv1 + 1));
497           if (equiv2)
498             gc1help = (memchr (equiv2 + 1, gc1, trans + NCHAR - (equiv2 + 1))
499                        ? NCHAR
500                        : U(gc1) ^ (equiv1 - trans) ^ (equiv2 - trans));
501         }
502 
503       kwset->gc1 = gc1;
504       kwset->gc1help = gc1help;
505       if (kwset->mind > 1)
506         kwset->gc2 = tr (trans, kwset->target[kwset->mind - 2]);
507     }
508 
509   /* Fix things up for any translation table. */
510   if (trans)
511     for (i = 0; i < NCHAR; ++i)
512       kwset->delta[i] = delta[U(trans[i])];
513 }
514 
515 /* Delta2 portion of a Boyer-Moore search.  *TP is the string text
516    pointer; it is updated in place.  EP is the end of the string text,
517    and SP the end of the pattern.  LEN is the pattern length; it must
518    be at least 2.  TRANS, if nonnull, is the input translation table.
519    GC1 and GC2 are the last and second-from last bytes of the pattern,
520    transliterated by TRANS; the caller precomputes them for
521    efficiency.  If D1 is nonnull, it is a delta1 table for shifting *TP
522    when failing.  KWSET->shift says how much to shift.  */
523 static inline bool
524 bm_delta2_search (char const **tpp, char const *ep, char const *sp, int len,
525                   char const *trans, char gc1, char gc2,
526                   unsigned char const *d1, kwset_t kwset)
527 {
528   char const *tp = *tpp;
529   int d = len, skip = 0;
530 
531   while (true)
532     {
533       int i = 2;
534       if (tr (trans, tp[-2]) == gc2)
535         {
536           while (++i <= d)
537             if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
538               break;
539           if (i > d)
540             {
541               for (i = d + skip + 1; i <= len; ++i)
542                 if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
543                   break;
544               if (i > len)
545                 {
546                   *tpp = tp - len;
547                   return true;
548                 }
549             }
550         }
551 
552       tp += d = kwset->shift[i - 2];
553       if (tp > ep)
554         break;
555       if (tr (trans, tp[-1]) != gc1)
556         {
557           if (d1)
558             tp += d1[U(tp[-1])];
559           break;
560         }
561       skip = i - 1;
562     }
563 
564   *tpp = tp;
565   return false;
566 }
567 
568 /* Return the address of the first byte in the buffer S (of size N)
569    that matches the last byte specified by KWSET, a singleton.  */
570 static char const *
571 memchr_kwset (char const *s, size_t n, kwset_t kwset)
572 {
573   if (kwset->gc1help < 0)
574     return memchr (s, kwset->gc1, n);
575   int small_heuristic = 2;
576   int small = (- (uintptr_t) s % sizeof (long)
577                + small_heuristic * sizeof (long));
578   size_t ntrans = kwset->gc1help < NCHAR && small < n ? small : n;
579   char const *slim = s + ntrans;
580   for (; s < slim; s++)
581     if (kwset->trans[U(*s)] == kwset->gc1)
582       return s;
583   n -= ntrans;
584   return n == 0 ? NULL : memchr2 (s, kwset->gc1, kwset->gc1help, n);
585 }
586 
587 /* Fast Boyer-Moore search (inlinable version).  */
588 static inline size_t _GL_ATTRIBUTE_PURE
589 bmexec_trans (kwset_t kwset, char const *text, size_t size)
590 {
591   unsigned char const *d1;
592   char const *ep, *sp, *tp;
593   int d;
594   int len = kwset->mind;
595   char const *trans = kwset->trans;
596 
597   if (len == 0)
598     return 0;
599   if (len > size)
600     return -1;
601   if (len == 1)
602     {
603       tp = memchr_kwset (text, size, kwset);
604       return tp ? tp - text : -1;
605     }
606 
607   d1 = kwset->delta;
608   sp = kwset->target + len;
609   tp = text + len;
610   char gc1 = kwset->gc1;
611   char gc2 = kwset->gc2;
612 
613   /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
614   if (size > 12 * len)
615     /* 11 is not a bug, the initial offset happens only once. */
616     for (ep = text + size - 11 * len; tp <= ep; )
617       {
618         char const *tp0 = tp;
619         d = d1[U(tp[-1])], tp += d;
620         d = d1[U(tp[-1])], tp += d;
621         if (d != 0)
622           {
623             d = d1[U(tp[-1])], tp += d;
624             d = d1[U(tp[-1])], tp += d;
625             d = d1[U(tp[-1])], tp += d;
626             if (d != 0)
627               {
628                 d = d1[U(tp[-1])], tp += d;
629                 d = d1[U(tp[-1])], tp += d;
630                 d = d1[U(tp[-1])], tp += d;
631                 if (d != 0)
632                   {
633                     d = d1[U(tp[-1])], tp += d;
634                     d = d1[U(tp[-1])], tp += d;
635 
636                     /* As a heuristic, prefer memchr to seeking by
637                        delta1 when the latter doesn't advance much.  */
638                     int advance_heuristic = 16 * sizeof (long);
639                     if (advance_heuristic <= tp - tp0)
640                       continue;
641                     tp--;
642                     tp = memchr_kwset (tp, text + size - tp, kwset);
643                     if (! tp)
644                       return -1;
645                     tp++;
646                     if (ep <= tp)
647                       break;
648                   }
649               }
650           }
651         if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset))
652           return tp - text;
653       }
654 
655   /* Now we have only a few characters left to search.  We
656      carefully avoid ever producing an out-of-bounds pointer. */
657   ep = text + size;
658   d = d1[U(tp[-1])];
659   while (d <= ep - tp)
660     {
661       d = d1[U((tp += d)[-1])];
662       if (d != 0)
663         continue;
664       if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, NULL, kwset))
665         return tp - text;
666     }
667 
668   return -1;
669 }
670 
671 /* Fast Boyer-Moore search.  */
672 static size_t
673 bmexec (kwset_t kwset, char const *text, size_t size)
674 {
675   /* Help the compiler inline bmexec_trans in two ways, depending on
676      whether kwset->trans is null.  */
677   return (kwset->trans
678           ? bmexec_trans (kwset, text, size)
679           : bmexec_trans (kwset, text, size));
680 }
681 
682 /* Hairy multiple string search. */
683 static size_t _GL_ARG_NONNULL ((4))
684 cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch)
685 {
686   struct trie * const *next;
687   struct trie const *trie;
688   struct trie const *accept;
689   char const *beg, *lim, *mch, *lmch;
690   unsigned char c;
691   unsigned char const *delta;
692   int d;
693   char const *end, *qlim;
694   struct tree const *tree;
695   char const *trans;
696 
697 #ifdef lint
698   accept = NULL;
699 #endif
700 
701   /* Initialize register copies and look for easy ways out. */
702   if (len < kwset->mind)
703     return -1;
704   next = kwset->next;
705   delta = kwset->delta;
706   trans = kwset->trans;
707   lim = text + len;
708   end = text;
709   if ((d = kwset->mind) != 0)
710     mch = NULL;
711   else
712     {
713       mch = text, accept = kwset->trie;
714       goto match;
715     }
716 
717   if (len >= 4 * kwset->mind)
718     qlim = lim - 4 * kwset->mind;
719   else
720     qlim = NULL;
721 
722   while (lim - end >= d)
723     {
724       if (qlim && end <= qlim)
725         {
726           end += d - 1;
727           while ((d = delta[c = *end]) && end < qlim)
728             {
729               end += d;
730               end += delta[U(*end)];
731               end += delta[U(*end)];
732             }
733           ++end;
734         }
735       else
736         d = delta[c = (end += d)[-1]];
737       if (d)
738         continue;
739       beg = end - 1;
740       trie = next[c];
741       if (trie->accepting)
742         {
743           mch = beg;
744           accept = trie;
745         }
746       d = trie->shift;
747       while (beg > text)
748         {
749           unsigned char uc = *--beg;
750           c = trans ? trans[uc] : uc;
751           tree = trie->links;
752           while (tree && c != tree->label)
753             if (c < tree->label)
754               tree = tree->llink;
755             else
756               tree = tree->rlink;
757           if (tree)
758             {
759               trie = tree->trie;
760               if (trie->accepting)
761                 {
762                   mch = beg;
763                   accept = trie;
764                 }
765             }
766           else
767             break;
768           d = trie->shift;
769         }
770       if (mch)
771         goto match;
772     }
773   return -1;
774 
775  match:
776   /* Given a known match, find the longest possible match anchored
777      at or before its starting point.  This is nearly a verbatim
778      copy of the preceding main search loops. */
779   if (lim - mch > kwset->maxd)
780     lim = mch + kwset->maxd;
781   lmch = 0;
782   d = 1;
783   while (lim - end >= d)
784     {
785       if ((d = delta[c = (end += d)[-1]]) != 0)
786         continue;
787       beg = end - 1;
788       if (!(trie = next[c]))
789         {
790           d = 1;
791           continue;
792         }
793       if (trie->accepting && beg <= mch)
794         {
795           lmch = beg;
796           accept = trie;
797         }
798       d = trie->shift;
799       while (beg > text)
800         {
801           unsigned char uc = *--beg;
802           c = trans ? trans[uc] : uc;
803           tree = trie->links;
804           while (tree && c != tree->label)
805             if (c < tree->label)
806               tree = tree->llink;
807             else
808               tree = tree->rlink;
809           if (tree)
810             {
811               trie = tree->trie;
812               if (trie->accepting && beg <= mch)
813                 {
814                   lmch = beg;
815                   accept = trie;
816                 }
817             }
818           else
819             break;
820           d = trie->shift;
821         }
822       if (lmch)
823         {
824           mch = lmch;
825           goto match;
826         }
827       if (!d)
828         d = 1;
829     }
830 
831   kwsmatch->index = accept->accepting / 2;
832   kwsmatch->offset[0] = mch - text;
833   kwsmatch->size[0] = accept->depth;
834 
835   return mch - text;
836 }
837 
838 /* Search TEXT for a match of any member of KWSET.
839    Return the offset (into TEXT) of the first byte of the matching substring,
840    or (size_t) -1 if no match is found.  Upon a match, store details in
841    *KWSMATCH: index of matched keyword, start offset (same as the return
842    value), and length.  */
843 size_t
844 kwsexec (kwset_t kwset, char const *text, size_t size,
845          struct kwsmatch *kwsmatch)
846 {
847   if (kwset->words == 1)
848     {
849       size_t ret = bmexec (kwset, text, size);
850       if (ret != (size_t) -1)
851         {
852           kwsmatch->index = 0;
853           kwsmatch->offset[0] = ret;
854           kwsmatch->size[0] = kwset->mind;
855         }
856       return ret;
857     }
858   else
859     return cwexec (kwset, text, size, kwsmatch);
860 }
861 
862 /* Free the components of the given keyword set. */
863 void
864 kwsfree (kwset_t kwset)
865 {
866   obstack_free (&kwset->obstack, NULL);
867   free (kwset);
868 }
869