xref: /dragonfly/contrib/grep/src/kwset.c (revision a4da4a90)
1 /* kwset.c - search for any of a set of keywords.
2    Copyright (C) 1989, 1998, 2000, 2005, 2007, 2009-2020 Free Software
3    Foundation, Inc.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3, or (at your option)
8    any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, write to the Free Software
17    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
18    02110-1301, USA.  */
19 
20 /* Written August 1989 by Mike Haertel.  */
21 
22 /* For the Aho-Corasick algorithm, see:
23    Aho AV, Corasick MJ. Efficient string matching: an aid to
24    bibliographic search. CACM 18, 6 (1975), 333-40
25    <https://dx.doi.org/10.1145/360825.360855>, which describes the
26    failure function used below.
27 
28    For the Boyer-Moore algorithm, see: Boyer RS, Moore JS.
29    A fast string searching algorithm. CACM 20, 10 (1977), 762-72
30    <https://dx.doi.org/10.1145/359842.359859>.
31 
32    For a survey of more-recent string matching algorithms that might
33    help improve performance, see: Faro S, Lecroq T. The exact online
34    string matching problem: a review of the most recent results.
35    ACM Computing Surveys 45, 2 (2013), 13
36    <https://dx.doi.org/10.1145/2431211.2431212>.  */
37 
38 #include <config.h>
39 
40 #include "kwset.h"
41 
42 #include <stdint.h>
43 #include <sys/types.h>
44 #include "system.h"
45 #include "intprops.h"
46 #include "memchr2.h"
47 #include "obstack.h"
48 #include "xalloc.h"
49 #include "verify.h"
50 
51 #define obstack_chunk_alloc xmalloc
52 #define obstack_chunk_free free
53 
54 static unsigned char
55 U (char ch)
56 {
57   return to_uchar (ch);
58 }
59 
60 /* Balanced tree of edges and labels leaving a given trie node.  */
61 struct tree
62 {
63   struct tree *llink;		/* Left link; MUST be first field.  */
64   struct tree *rlink;		/* Right link (to larger labels).  */
65   struct trie *trie;		/* Trie node pointed to by this edge.  */
66   unsigned char label;		/* Label on this edge.  */
67   char balance;			/* Difference in depths of subtrees.  */
68 };
69 
70 /* Node of a trie representing a set of keywords.  */
71 struct trie
72 {
73   /* If an accepting node, this is either 2*W + 1 where W is the word
74      index, or is SIZE_MAX if Aho-Corasick is in use and FAIL
75      specifies where to look for more info.  If not an accepting node,
76      this is zero.  */
77   size_t accepting;
78 
79   struct tree *links;		/* Tree of edges leaving this node.  */
80   struct trie *parent;		/* Parent of this node.  */
81   struct trie *next;		/* List of all trie nodes in level order.  */
82   struct trie *fail;		/* Aho-Corasick failure function.  */
83   ptrdiff_t depth;		/* Depth of this node from the root.  */
84   ptrdiff_t shift;		/* Shift function for search failures.  */
85   ptrdiff_t maxshift;		/* Max shift of self and descendants.  */
86 };
87 
88 /* Structure returned opaquely to the caller, containing everything.  */
89 struct kwset
90 {
91   struct obstack obstack;	/* Obstack for node allocation.  */
92   ptrdiff_t words;		/* Number of words in the trie.  */
93   struct trie *trie;		/* The trie itself.  */
94   ptrdiff_t mind;		/* Minimum depth of an accepting node.  */
95   ptrdiff_t maxd;		/* Maximum depth of any node.  */
96   unsigned char delta[NCHAR];	/* Delta table for rapid search.  */
97   struct trie *next[NCHAR];	/* Table of children of the root.  */
98   char *target;			/* Target string if there's only one.  */
99   ptrdiff_t *shift;		/* Used in Boyer-Moore search for one
100                                    string.  */
101   char const *trans;		/* Character translation table.  */
102 
103   /* This helps to match a terminal byte, which is the first byte
104      for Aho-Corasick, and the last byte for Boyer-More.  If all the
105      patterns have the same terminal byte (after translation via TRANS
106      if TRANS is nonnull), then this is that byte as an unsigned char.
107      Otherwise this is -1 if there is disagreement among the strings
108      about terminal bytes, and -2 if there are no terminal bytes and
109      no disagreement because all the patterns are empty.  */
110   int gc1;
111 
112   /* This helps to match a terminal byte.  If 0 <= GC1HELP, B is
113      terminal when B == GC1 || B == GC1HELP (note that GC1 == GCHELP
114      is common here).  This is typically faster than evaluating
115      to_uchar (TRANS[B]) == GC1.  */
116   int gc1help;
117 
118   /* If the string has two or more bytes, this is the penultimate byte,
119      after translation via TRANS if TRANS is nonnull.  This variable
120      is used only by Boyer-Moore.  */
121   char gc2;
122 
123   /* kwsexec implementation.  */
124   ptrdiff_t (*kwsexec) (kwset_t, char const *, ptrdiff_t,
125                         struct kwsmatch *, bool);
126 };
127 
128 /* Use TRANS to transliterate C.  A null TRANS does no transliteration.  */
129 static inline char
130 tr (char const *trans, char c)
131 {
132   return trans ? trans[U(c)] : c;
133 }
134 
135 static ptrdiff_t acexec (kwset_t, char const *, ptrdiff_t,
136                          struct kwsmatch *, bool);
137 static ptrdiff_t bmexec (kwset_t, char const *, ptrdiff_t,
138                          struct kwsmatch *, bool);
139 
140 /* Return a newly allocated keyword set.  A nonnull TRANS specifies a
141    table of character translations to be applied to all pattern and
142    search text.  */
143 kwset_t
144 kwsalloc (char const *trans)
145 {
146   struct kwset *kwset = xmalloc (sizeof *kwset);
147 
148   obstack_init (&kwset->obstack);
149   kwset->words = 0;
150   kwset->trie = obstack_alloc (&kwset->obstack, sizeof *kwset->trie);
151   kwset->trie->accepting = 0;
152   kwset->trie->links = NULL;
153   kwset->trie->parent = NULL;
154   kwset->trie->next = NULL;
155   kwset->trie->fail = NULL;
156   kwset->trie->depth = 0;
157   kwset->trie->shift = 0;
158   kwset->mind = PTRDIFF_MAX;
159   kwset->maxd = -1;
160   kwset->target = NULL;
161   kwset->trans = trans;
162   kwset->kwsexec = acexec;
163 
164   return kwset;
165 }
166 
167 /* This upper bound is valid for CHAR_BIT >= 4 and
168    exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }.  */
169 enum { DEPTH_SIZE = CHAR_BIT + CHAR_BIT / 2 };
170 
171 /* Add the given string to the contents of the keyword set.  */
172 void
173 kwsincr (kwset_t kwset, char const *text, ptrdiff_t len)
174 {
175   assume (0 <= len);
176   struct trie *trie = kwset->trie;
177   char const *trans = kwset->trans;
178   bool reverse = kwset->kwsexec == bmexec;
179 
180   if (reverse)
181     text += len;
182 
183   /* Descend the trie (built of keywords) character-by-character,
184      installing new nodes when necessary.  */
185   while (len--)
186     {
187       unsigned char uc = reverse ? *--text : *text++;
188       unsigned char label = trans ? trans[uc] : uc;
189 
190       /* Descend the tree of outgoing links for this trie node,
191          looking for the current character and keeping track
192          of the path followed.  */
193       struct tree *cur = trie->links;
194       struct tree *links[DEPTH_SIZE];
195       enum { L, R } dirs[DEPTH_SIZE];
196       links[0] = (struct tree *) &trie->links;
197       dirs[0] = L;
198       ptrdiff_t depth = 1;
199 
200       while (cur && label != cur->label)
201         {
202           links[depth] = cur;
203           if (label < cur->label)
204             dirs[depth++] = L, cur = cur->llink;
205           else
206             dirs[depth++] = R, cur = cur->rlink;
207         }
208 
209       /* The current character doesn't have an outgoing link at
210          this trie node, so build a new trie node and install
211          a link in the current trie node's tree.  */
212       if (!cur)
213         {
214           cur = obstack_alloc (&kwset->obstack, sizeof *cur);
215           cur->llink = NULL;
216           cur->rlink = NULL;
217           cur->trie = obstack_alloc (&kwset->obstack, sizeof *cur->trie);
218           cur->trie->accepting = 0;
219           cur->trie->links = NULL;
220           cur->trie->parent = trie;
221           cur->trie->next = NULL;
222           cur->trie->fail = NULL;
223           cur->trie->depth = trie->depth + 1;
224           cur->trie->shift = 0;
225           cur->label = label;
226           cur->balance = 0;
227 
228           /* Install the new tree node in its parent.  */
229           if (dirs[--depth] == L)
230             links[depth]->llink = cur;
231           else
232             links[depth]->rlink = cur;
233 
234           /* Back up the tree fixing the balance flags.  */
235           while (depth && !links[depth]->balance)
236             {
237               if (dirs[depth] == L)
238                 --links[depth]->balance;
239               else
240                 ++links[depth]->balance;
241               --depth;
242             }
243 
244           /* Rebalance the tree by pointer rotations if necessary.  */
245           if (depth && ((dirs[depth] == L && --links[depth]->balance)
246                         || (dirs[depth] == R && ++links[depth]->balance)))
247             {
248               struct tree *t, *r, *l, *rl, *lr;
249 
250               switch (links[depth]->balance)
251                 {
252                 case (char) -2:
253                   switch (dirs[depth + 1])
254                     {
255                     case L:
256                       r = links[depth], t = r->llink, rl = t->rlink;
257                       t->rlink = r, r->llink = rl;
258                       t->balance = r->balance = 0;
259                       break;
260                     case R:
261                       r = links[depth], l = r->llink, t = l->rlink;
262                       rl = t->rlink, lr = t->llink;
263                       t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
264                       l->balance = t->balance != 1 ? 0 : -1;
265                       r->balance = t->balance != (char) -1 ? 0 : 1;
266                       t->balance = 0;
267                       break;
268                     default:
269                       abort ();
270                     }
271                   break;
272                 case 2:
273                   switch (dirs[depth + 1])
274                     {
275                     case R:
276                       l = links[depth], t = l->rlink, lr = t->llink;
277                       t->llink = l, l->rlink = lr;
278                       t->balance = l->balance = 0;
279                       break;
280                     case L:
281                       l = links[depth], r = l->rlink, t = r->llink;
282                       lr = t->llink, rl = t->rlink;
283                       t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
284                       l->balance = t->balance != 1 ? 0 : -1;
285                       r->balance = t->balance != (char) -1 ? 0 : 1;
286                       t->balance = 0;
287                       break;
288                     default:
289                       abort ();
290                     }
291                   break;
292                 default:
293                   abort ();
294                 }
295 
296               if (dirs[depth - 1] == L)
297                 links[depth - 1]->llink = t;
298               else
299                 links[depth - 1]->rlink = t;
300             }
301         }
302 
303       trie = cur->trie;
304     }
305 
306   /* Mark the node finally reached as accepting, encoding the
307      index number of this word in the keyword set so far.  */
308   if (!trie->accepting)
309     {
310       size_t words = kwset->words;
311       trie->accepting = 2 * words + 1;
312     }
313   ++kwset->words;
314 
315   /* Keep track of the longest and shortest string of the keyword set.  */
316   if (trie->depth < kwset->mind)
317     kwset->mind = trie->depth;
318   if (trie->depth > kwset->maxd)
319     kwset->maxd = trie->depth;
320 }
321 
322 ptrdiff_t
323 kwswords (kwset_t kwset)
324 {
325   return kwset->words;
326 }
327 
328 /* Enqueue the trie nodes referenced from the given tree in the
329    given queue.  */
330 static void
331 enqueue (struct tree *tree, struct trie **last)
332 {
333   if (!tree)
334     return;
335   enqueue (tree->llink, last);
336   enqueue (tree->rlink, last);
337   (*last) = (*last)->next = tree->trie;
338 }
339 
340 /* Compute the Aho-Corasick failure function for the trie nodes referenced
341    from the given tree, given the failure function for their parent as
342    well as a last resort failure node.  */
343 static void
344 treefails (struct tree const *tree, struct trie const *fail,
345            struct trie *recourse, bool reverse)
346 {
347   struct tree *cur;
348 
349   if (!tree)
350     return;
351 
352   treefails (tree->llink, fail, recourse, reverse);
353   treefails (tree->rlink, fail, recourse, reverse);
354 
355   /* Find, in the chain of fails going back to the root, the first
356      node that has a descendant on the current label.  */
357   while (fail)
358     {
359       cur = fail->links;
360       while (cur && tree->label != cur->label)
361         if (tree->label < cur->label)
362           cur = cur->llink;
363         else
364           cur = cur->rlink;
365       if (cur)
366         {
367           tree->trie->fail = cur->trie;
368           if (!reverse && cur->trie->accepting && !tree->trie->accepting)
369             tree->trie->accepting = SIZE_MAX;
370           return;
371         }
372       fail = fail->fail;
373     }
374 
375   tree->trie->fail = recourse;
376 }
377 
378 /* Set delta entries for the links of the given tree such that
379    the preexisting delta value is larger than the current depth.  */
380 static void
381 treedelta (struct tree const *tree, ptrdiff_t depth, unsigned char delta[])
382 {
383   if (!tree)
384     return;
385   treedelta (tree->llink, depth, delta);
386   treedelta (tree->rlink, depth, delta);
387   if (depth < delta[tree->label])
388     delta[tree->label] = depth;
389 }
390 
391 /* Return true if A has every label in B.  */
392 static bool _GL_ATTRIBUTE_PURE
393 hasevery (struct tree const *a, struct tree const *b)
394 {
395   if (!b)
396     return true;
397   if (!hasevery (a, b->llink))
398     return false;
399   if (!hasevery (a, b->rlink))
400     return false;
401   while (a && b->label != a->label)
402     if (b->label < a->label)
403       a = a->llink;
404     else
405       a = a->rlink;
406   return !!a;
407 }
408 
409 /* Compute a vector, indexed by character code, of the trie nodes
410    referenced from the given tree.  */
411 static void
412 treenext (struct tree const *tree, struct trie *next[])
413 {
414   if (!tree)
415     return;
416   treenext (tree->llink, next);
417   treenext (tree->rlink, next);
418   next[tree->label] = tree->trie;
419 }
420 
421 /* Prepare a built keyword set for use.  */
422 void
423 kwsprep (kwset_t kwset)
424 {
425   char const *trans = kwset->trans;
426   ptrdiff_t i;
427   unsigned char deltabuf[NCHAR];
428   unsigned char *delta = trans ? deltabuf : kwset->delta;
429   struct trie *curr, *last;
430 
431   /* Use Boyer-Moore if just one pattern, Aho-Corasick otherwise.  */
432   bool reverse = kwset->words == 1;
433 
434   if (reverse)
435     {
436       kwset_t new_kwset;
437 
438       /* Enqueue the immediate descendants in the level order queue.  */
439       for (curr = last = kwset->trie; curr; curr = curr->next)
440         enqueue (curr->links, &last);
441 
442       /* Looking for just one string.  Extract it from the trie.  */
443       kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
444       for (i = 0, curr = kwset->trie; i < kwset->mind; ++i)
445         {
446           kwset->target[i] = curr->links->label;
447           curr = curr->next;
448         }
449 
450       new_kwset = kwsalloc (kwset->trans);
451       new_kwset->kwsexec = bmexec;
452       kwsincr (new_kwset, kwset->target, kwset->mind);
453       obstack_free (&kwset->obstack, NULL);
454       *kwset = *new_kwset;
455       free (new_kwset);
456     }
457 
458   /* Initial values for the delta table; will be changed later.  The
459      delta entry for a given character is the smallest depth of any
460      node at which an outgoing edge is labeled by that character.  */
461   memset (delta, MIN (kwset->mind, UCHAR_MAX), sizeof deltabuf);
462 
463   /* Traverse the nodes of the trie in level order, simultaneously
464      computing the delta table, failure function, and shift function.  */
465   for (curr = last = kwset->trie; curr; curr = curr->next)
466     {
467       /* Enqueue the immediate descendants in the level order queue.  */
468       enqueue (curr->links, &last);
469 
470       /* Update the delta table for the descendants of this node.  */
471       treedelta (curr->links, curr->depth, delta);
472 
473       /* Compute the failure function for the descendants of this node.  */
474       treefails (curr->links, curr->fail, kwset->trie, reverse);
475 
476       if (reverse)
477         {
478           curr->shift = kwset->mind;
479           curr->maxshift = kwset->mind;
480 
481           /* Update the shifts at each node in the current node's chain
482              of fails back to the root.  */
483           struct trie *fail;
484           for (fail = curr->fail; fail; fail = fail->fail)
485             {
486               /* If the current node has some outgoing edge that the fail
487                  doesn't, then the shift at the fail should be no larger
488                  than the difference of their depths.  */
489               if (!hasevery (fail->links, curr->links))
490                 if (curr->depth - fail->depth < fail->shift)
491                   fail->shift = curr->depth - fail->depth;
492 
493               /* If the current node is accepting then the shift at the
494                  fail and its descendants should be no larger than the
495                  difference of their depths.  */
496               if (curr->accepting && fail->maxshift > curr->depth - fail->depth)
497                 fail->maxshift = curr->depth - fail->depth;
498             }
499         }
500     }
501 
502   if (reverse)
503     {
504       /* Traverse the trie in level order again, fixing up all nodes whose
505          shift exceeds their inherited maxshift.  */
506       for (curr = kwset->trie->next; curr; curr = curr->next)
507         {
508           if (curr->maxshift > curr->parent->maxshift)
509             curr->maxshift = curr->parent->maxshift;
510           if (curr->shift > curr->maxshift)
511             curr->shift = curr->maxshift;
512         }
513     }
514 
515   /* Create a vector, indexed by character code, of the outgoing links
516      from the root node.  Accumulate GC1 and GC1HELP.  */
517   struct trie *nextbuf[NCHAR];
518   struct trie **next = trans ? nextbuf : kwset->next;
519   memset (next, 0, sizeof nextbuf);
520   treenext (kwset->trie->links, next);
521   int gc1 = -2;
522   int gc1help = -1;
523   for (i = 0; i < NCHAR; i++)
524     {
525       int ti = i;
526       if (trans)
527         {
528           ti = U(trans[i]);
529           kwset->next[i] = next[ti];
530         }
531       if (kwset->next[i])
532         {
533           if (gc1 < -1)
534             {
535               gc1 = ti;
536               gc1help = i;
537             }
538           else if (gc1 == ti)
539             gc1help = gc1help == ti ? i : -1;
540           else if (i == ti && gc1 == gc1help)
541             gc1help = i;
542           else
543             gc1 = -1;
544         }
545     }
546   kwset->gc1 = gc1;
547   kwset->gc1help = gc1help;
548 
549   if (reverse)
550     {
551       /* Looking for just one string.  Extract it from the trie.  */
552       kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
553       for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i)
554         {
555           kwset->target[i] = curr->links->label;
556           curr = curr->next;
557         }
558 
559       if (kwset->mind > 1)
560         {
561           /* Looking for the delta2 shift that might be made after a
562              backwards match has failed.  Extract it from the trie.  */
563           kwset->shift
564             = obstack_alloc (&kwset->obstack,
565                              sizeof *kwset->shift * (kwset->mind - 1));
566           for (i = 0, curr = kwset->trie->next; i < kwset->mind - 1; ++i)
567             {
568               kwset->shift[i] = curr->shift;
569               curr = curr->next;
570             }
571 
572           /* The penultimate byte.  */
573           kwset->gc2 = tr (trans, kwset->target[kwset->mind - 2]);
574         }
575     }
576 
577   /* Fix things up for any translation table.  */
578   if (trans)
579     for (i = 0; i < NCHAR; ++i)
580       kwset->delta[i] = delta[U(trans[i])];
581 }
582 
583 /* Delta2 portion of a Boyer-Moore search.  *TP is the string text
584    pointer; it is updated in place.  EP is the end of the string text,
585    and SP the end of the pattern.  LEN is the pattern length; it must
586    be at least 2.  TRANS, if nonnull, is the input translation table.
587    GC1 and GC2 are the last and second-from last bytes of the pattern,
588    transliterated by TRANS; the caller precomputes them for
589    efficiency.  If D1 is nonnull, it is a delta1 table for shifting *TP
590    when failing.  KWSET->shift says how much to shift.  */
591 static inline bool
592 bm_delta2_search (char const **tpp, char const *ep, char const *sp,
593                   ptrdiff_t len,
594                   char const *trans, char gc1, char gc2,
595                   unsigned char const *d1, kwset_t kwset)
596 {
597   char const *tp = *tpp;
598   ptrdiff_t d = len, skip = 0;
599 
600   while (true)
601     {
602       ptrdiff_t i = 2;
603       if (tr (trans, tp[-2]) == gc2)
604         {
605           while (++i <= d)
606             if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
607               break;
608           if (i > d)
609             {
610               for (i = d + skip + 1; i <= len; ++i)
611                 if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
612                   break;
613               if (i > len)
614                 {
615                   *tpp = tp - len;
616                   return true;
617                 }
618             }
619         }
620 
621       tp += d = kwset->shift[i - 2];
622       if (tp > ep)
623         break;
624       if (tr (trans, tp[-1]) != gc1)
625         {
626           if (d1)
627             tp += d1[U(tp[-1])];
628           break;
629         }
630       skip = i - 1;
631     }
632 
633   *tpp = tp;
634   return false;
635 }
636 
637 /* Return the address of the first byte in the buffer S (of size N)
638    that matches the terminal byte specified by KWSET, or NULL if there
639    is no match.  KWSET->gc1 should be nonnegative.  */
640 static char const *
641 memchr_kwset (char const *s, ptrdiff_t n, kwset_t kwset)
642 {
643   char const *slim = s + n;
644   if (kwset->gc1help < 0)
645     {
646       for (; s < slim; s++)
647         if (kwset->next[U(*s)])
648           return s;
649     }
650   else
651     {
652       int small_heuristic = 2;
653       size_t small_bytes = small_heuristic * sizeof (unsigned long int);
654       while (s < slim)
655         {
656           if (kwset->next[U(*s)])
657             return s;
658           s++;
659           if ((uintptr_t) s % small_bytes == 0)
660             return memchr2 (s, kwset->gc1, kwset->gc1help, slim - s);
661         }
662     }
663   return NULL;
664 }
665 
666 /* Fast Boyer-Moore search (inlinable version).  */
667 static inline ptrdiff_t _GL_ATTRIBUTE_PURE
668 bmexec_trans (kwset_t kwset, char const *text, ptrdiff_t size)
669 {
670   assume (0 <= size);
671   unsigned char const *d1;
672   char const *ep, *sp, *tp;
673   int d;
674   ptrdiff_t len = kwset->mind;
675   char const *trans = kwset->trans;
676 
677   if (len == 0)
678     return 0;
679   if (len > size)
680     return -1;
681   if (len == 1)
682     {
683       tp = memchr_kwset (text, size, kwset);
684       return tp ? tp - text : -1;
685     }
686 
687   d1 = kwset->delta;
688   sp = kwset->target + len;
689   tp = text + len;
690   char gc1 = kwset->gc1;
691   char gc2 = kwset->gc2;
692 
693   /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2).  */
694   ptrdiff_t len12;
695   if (!INT_MULTIPLY_WRAPV (len, 12, &len12) && len12 < size)
696     /* 11 is not a bug, the initial offset happens only once.  */
697     for (ep = text + size - 11 * len; tp <= ep; )
698       {
699         char const *tp0 = tp;
700         d = d1[U(tp[-1])], tp += d;
701         d = d1[U(tp[-1])], tp += d;
702         if (d != 0)
703           {
704             d = d1[U(tp[-1])], tp += d;
705             d = d1[U(tp[-1])], tp += d;
706             d = d1[U(tp[-1])], tp += d;
707             if (d != 0)
708               {
709                 d = d1[U(tp[-1])], tp += d;
710                 d = d1[U(tp[-1])], tp += d;
711                 d = d1[U(tp[-1])], tp += d;
712                 if (d != 0)
713                   {
714                     d = d1[U(tp[-1])], tp += d;
715                     d = d1[U(tp[-1])], tp += d;
716 
717                     /* As a heuristic, prefer memchr to seeking by
718                        delta1 when the latter doesn't advance much.  */
719                     int advance_heuristic = 16 * sizeof (long);
720                     if (advance_heuristic <= tp - tp0)
721                       continue;
722                     tp--;
723                     tp = memchr_kwset (tp, text + size - tp, kwset);
724                     if (! tp)
725                       return -1;
726                     tp++;
727                     if (ep <= tp)
728                       break;
729                   }
730               }
731           }
732         if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset))
733           return tp - text;
734       }
735 
736   /* Now only a few characters are left to search.  Carefully avoid
737      ever producing an out-of-bounds pointer.  */
738   ep = text + size;
739   d = d1[U(tp[-1])];
740   while (d <= ep - tp)
741     {
742       d = d1[U((tp += d)[-1])];
743       if (d != 0)
744         continue;
745       if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, NULL, kwset))
746         return tp - text;
747     }
748 
749   return -1;
750 }
751 
752 /* Fast Boyer-Moore search.  */
753 static ptrdiff_t
754 bmexec (kwset_t kwset, char const *text, ptrdiff_t size,
755         struct kwsmatch *kwsmatch, bool longest)
756 {
757   /* Help the compiler inline in two ways, depending on whether
758      kwset->trans is null.  */
759   ptrdiff_t ret = (IGNORE_DUPLICATE_BRANCH_WARNING
760                    (kwset->trans
761                     ? bmexec_trans (kwset, text, size)
762                     : bmexec_trans (kwset, text, size)));
763   if (0 <= ret)
764     {
765        kwsmatch->index = 0;
766        kwsmatch->offset[0] = ret;
767        kwsmatch->size[0] = kwset->mind;
768     }
769 
770   return ret;
771 }
772 
773 /* Hairy multiple string search with the Aho-Corasick algorithm.
774    (inlinable version)  */
775 static inline ptrdiff_t
776 acexec_trans (kwset_t kwset, char const *text, ptrdiff_t len,
777               struct kwsmatch *kwsmatch, bool longest)
778 {
779   struct trie const *trie, *accept;
780   char const *tp, *left, *lim;
781   struct tree const *tree;
782   char const *trans;
783 
784   /* Initialize register copies and look for easy ways out.  */
785   if (len < kwset->mind)
786     return -1;
787   trans = kwset->trans;
788   trie = kwset->trie;
789   lim = text + len;
790   tp = text;
791 
792   if (!trie->accepting)
793     {
794       unsigned char c;
795       int gc1 = kwset->gc1;
796 
797       while (true)
798         {
799           if (gc1 < 0)
800             {
801               while (! (trie = kwset->next[c = tr (trans, *tp++)]))
802                 if (tp >= lim)
803                   return -1;
804             }
805           else
806             {
807               tp = memchr_kwset (tp, lim - tp, kwset);
808               if (!tp)
809                 return -1;
810               c = tr (trans, *tp++);
811               trie = kwset->next[c];
812             }
813 
814           while (true)
815             {
816               if (trie->accepting)
817                 goto match;
818               if (tp >= lim)
819                 return -1;
820               c = tr (trans, *tp++);
821 
822               for (tree = trie->links; c != tree->label; )
823                 {
824                   tree = c < tree->label ? tree->llink : tree->rlink;
825                   if (! tree)
826                     {
827                       trie = trie->fail;
828                       if (!trie)
829                         {
830                           trie = kwset->next[c];
831                           if (trie)
832                             goto have_trie;
833                           if (tp >= lim)
834                             return -1;
835                           goto next_c;
836                         }
837                       if (trie->accepting)
838                         {
839                           --tp;
840                           goto match;
841                         }
842                       tree = trie->links;
843                     }
844                 }
845               trie = tree->trie;
846             have_trie:;
847             }
848         next_c:;
849         }
850     }
851 
852  match:
853   accept = trie;
854   while (accept->accepting == SIZE_MAX)
855     accept = accept->fail;
856   left = tp - accept->depth;
857 
858   /* Try left-most longest match.  */
859   if (longest)
860     {
861       while (tp < lim)
862         {
863           struct trie const *accept1;
864           char const *left1;
865           unsigned char c = tr (trans, *tp++);
866 
867           do
868             {
869               tree = trie->links;
870               while (tree && c != tree->label)
871                 tree = c < tree->label ? tree->llink : tree->rlink;
872             }
873           while (!tree && (trie = trie->fail) && accept->depth <= trie->depth);
874 
875           if (!tree)
876             break;
877           trie = tree->trie;
878           if (trie->accepting)
879             {
880               accept1 = trie;
881               while (accept1->accepting == SIZE_MAX)
882                 accept1 = accept1->fail;
883               left1 = tp - accept1->depth;
884               if (left1 <= left)
885                 {
886                   left = left1;
887                   accept = accept1;
888                 }
889             }
890         }
891     }
892 
893   kwsmatch->index = accept->accepting / 2;
894   kwsmatch->offset[0] = left - text;
895   kwsmatch->size[0] = accept->depth;
896 
897   return left - text;
898 }
899 
900 /* Hairy multiple string search with Aho-Corasick algorithm.  */
901 static ptrdiff_t
902 acexec (kwset_t kwset, char const *text, ptrdiff_t size,
903         struct kwsmatch *kwsmatch, bool longest)
904 {
905   assume (0 <= size);
906   /* Help the compiler inline in two ways, depending on whether
907      kwset->trans is null.  */
908   return (IGNORE_DUPLICATE_BRANCH_WARNING
909           (kwset->trans
910            ? acexec_trans (kwset, text, size, kwsmatch, longest)
911            : acexec_trans (kwset, text, size, kwsmatch, longest)));
912 }
913 
914 /* Find the first instance of a KWSET member in TEXT, which has SIZE bytes.
915    Return the offset (into TEXT) of the first byte of the matching substring,
916    or -1 if no match is found.  Upon a match, store details in
917    *KWSMATCH: index of matched keyword, start offset (same as the return
918    value), and length.  If LONGEST, find the longest match; otherwise
919    any match will do.  */
920 ptrdiff_t
921 kwsexec (kwset_t kwset, char const *text, ptrdiff_t size,
922          struct kwsmatch *kwsmatch, bool longest)
923 {
924   return kwset->kwsexec (kwset, text, size, kwsmatch, longest);
925 }
926 
927 /* Free the components of the given keyword set.  */
928 void
929 kwsfree (kwset_t kwset)
930 {
931   obstack_free (&kwset->obstack, NULL);
932   free (kwset);
933 }
934