1 /*
2  *  A fast filter for static patterns.
3  *
4  *  Copyright (C) 2013-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5  *  Copyright (C) 2008-2013 Sourcefire, Inc.
6  *
7  *  Authors: Török Edvin
8  *
9  *  This program is free software; you can redistribute it and/or modify
10  *  it under the terms of the GNU General Public License version 2 as
11  *  published by the Free Software Foundation.
12  *
13  *  This program is distributed in the hope that it will be useful,
14  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *  GNU General Public License for more details.
17  *
18  *  You should have received a copy of the GNU General Public License
19  *  along with this program; if not, write to the Free Software
20  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
21  *  MA 02110-1301, USA.
22  */
23 #if HAVE_CONFIG_H
24 #include "clamav-config.h"
25 #endif
26 
27 #include "clamav.h"
28 #include "filtering.h"
29 #include "matcher-ac.h"
30 #include <string.h>
31 #include <assert.h>
32 #include "perflogging.h"
33 /* ----- shift-or filtering -------------- */
34 
35 /*
36  * Description of algorithm:
37  *
38  * Multiple patterns are added to the filter.
39  * The filter retains an approximation of these patterns, which can lead to
40  * false positive matches, but not false negative matches.
41  *
42  * For each position in the filter we retain what qgrams can match at that
43  * position, for example (if we'd use characters as qgrams):
44  * pattern1: atu
45  * pattern2: bzf
46  * pattern3: xat
47  *
48  * filter accepts:
49  * [abx][tza][uft]
50  *
51  * But it also accepts (false positives):
52  * azu, azf, azt, ...
53  *
54  * It doesn't however accept:
55  * aaa, atz, ...
56  *
57  * This is implemented by having a bit-level state-machine with MAXSOPATLEN (=32) states,
58  * each active bit meaning that a state is active.
59  *
60  * The states are activated sequentially, eachtransition decision is made
61  * considering if we can accept the character at position X.
62  * Since we can start a match at any position, position 0 is
63  * reactivated each time.
64  * When the last position is activated, the filter reports a match.
65  * If we can't accept the character at position X, the state remains inactive,
66  * and further states aren't activated (unless we activate this state in the
67  * future).
68  *
69  * Essentially this is an automaton like this:
70  *
71  *  /\    (a|b|x)        (t|z|a)        (u|f|t)
72  * [S1] ---------> [S2] -------> [S3] ---------> [S4] -> match
73  *  \_______________/             |
74  *  \_____________________________/
75  *
76  *
77  * But we are tracking multiple active states at each time (or run N automatons
78  * in parallel if you like, N = number of states).
79  *
80  * We can have S3 and S2 active, meaning that if the next character is
81  * acceptable, it transitions to S1,S3 and S4 being active, otherwise it
82  * transitions to S1 being active.
83  *
84  * Active states can either be represented as a binary 1 or 0, and using
85  * bit-shifting and masking.
86  * If we choose 1, we must use &, and after shifting always reactivate bit 0.
87  * If we choose 0, we must use |, and after shifting we don't need to do
88  * anything (since by shifting a 0 is implicitly introduced).
89  *
90  * This file implements the latter (shift-or) method.
91  *
92  * The discussion above considered pattern to be of same length (or truncated to
93  * be so). In reality patterns are of variable length, and we often have short
94  * pattern.
95  *
96  * Thus another bitmap was introduced, meaning that if (end[Q] == set), then
97  * a pattern can end at this position.
98  * Also we would fill the pattern's position filters quite quickly with only 256
99  * choices for a position, so the algorithm uses overlapping qgrams of length 2:
100  * 'abcd' is 3 qgrams: 'ab','bc','cd'
101  *
102  * The algorithm is very sensitive to the end[Q] filter, since it can have false
103  * positives due to short patterns!
104  * For optimal performance we need:
105  *   - patterns as long as possible
106  *   - probability for end[Q] to match low (avoid 0000, and other common case
107  *   - choose the most "diverse" subset from a long pattern
108  *
109  * diverse = referring to what we are scanning, so that the filter rarely
110  * matches, so this actually means that we *want* to avoid adding more
111  * characters to the filter, if we have 2 patterns:
112  * abxfg, and dalabxpo, it may be preferable to shift the 2nd one so that we
113  * don't add new character at the beginning.
114  *
115  * With NDB signatures there are more challenges to overcome:
116  *    e8??0000000aa
117  *
118  *    will make the filter accept:
119  *    e8<all-256-values-here>, <all-256-values>00, ... 000000aa
120  *
121  *    We should delay the pattern end as long as possible, especially if it is  0000
122  *    The problem is that now the filter accepts 0000 on position 3, regardless
123  *    of what we have on position 1 (even if we have something else than e8), so
124  *    we have to be very careful not to allow 0000 on first position too,
125  *    otherwise the filter will happily accept 000000000000.
126  *
127  * To optimize cache usage there are 2 end filters, one character (fits L1), and one qgram
128  * based (fits L2), both must match for the filter to consider it a match.
129  *
130  *
131  */
132 
133 /*#define DETAILED_DEBUG*/
134 #ifdef DETAILED_DEBUG
135 #define detailed_dbg cli_dbgmsg
136 #else
137 #define detailed_dbg(...)
138 #endif
139 
140 #define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1 << ((val)&0x1f)))
141 #define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1 << ((val)&0x1f)))
142 
filter_init(struct filter * m)143 void filter_init(struct filter *m)
144 {
145     memset(m->B, ~0, sizeof(m->B));
146     memset(m->end, ~0, sizeof(m->end));
147 }
148 
149 /* because we use uint32_t */
150 #define MAXSOPATLEN 8
151 
filter_isset(const struct filter * m,unsigned pos,uint16_t val)152 static inline int filter_isset(const struct filter *m, unsigned pos, uint16_t val)
153 {
154     return !(m->B[val] & (1 << pos));
155 }
156 
filter_set_atpos(struct filter * m,unsigned pos,uint16_t val)157 static inline void filter_set_atpos(struct filter *m, unsigned pos, uint16_t val)
158 {
159     if (!filter_isset(m, pos, val)) {
160         cli_perf_log_count(FILTER_LOAD, pos);
161         m->B[val] &= ~(1 << pos);
162     }
163 }
164 
filter_end_isset(const struct filter * m,unsigned pos,uint16_t a)165 static inline int filter_end_isset(const struct filter *m, unsigned pos, uint16_t a)
166 {
167     return !(m->end[a] & (1 << pos));
168 }
169 
filter_set_end(struct filter * m,unsigned pos,uint16_t a)170 static inline void filter_set_end(struct filter *m, unsigned pos, uint16_t a)
171 {
172     if (!filter_end_isset(m, pos, a)) {
173         cli_perf_log_count(FILTER_END_LOAD, pos);
174         m->end[a] &= ~(1 << pos);
175     }
176 }
177 #define MAX_CHOICES 8
178 /* just an arbitrary limit, if patterns are longer, we cut
179  * the filter can only use MAXSOPATLEN (32) characters,
180  * this longer buffer is needed so that we can choose the "best" subpattern from
181  * it */
182 #define MAXPATLEN 255
183 
184 /* merge another pattern into the filter
185  * add('abc'); add('bcd'); will match [ab][bc][cd] */
filter_add_static(struct filter * m,const unsigned char * pattern,unsigned long len,const char * name)186 int filter_add_static(struct filter *m, const unsigned char *pattern, unsigned long len, const char *name)
187 {
188     uint16_t q = 0;
189     uint8_t j, maxlen;
190     uint32_t best    = 0xffffffff;
191     uint8_t best_pos = 0;
192 
193     UNUSEDPARAM(name);
194 
195     cli_perf_log_count(TRIE_ORIG_LEN, len > 8 ? 8 : len);
196     /* TODO: choose best among MAXCHOICES */
197     /* cut length */
198     if (len > MAXPATLEN) {
199         len = MAXPATLEN;
200     }
201     if (len < 2)
202         return -1;
203 
204     /* we want subsigs to be as long as possible */
205     if (len > 4) {
206         maxlen = len - 4;
207         if (maxlen == 1) maxlen = 2;
208     } else
209         maxlen = 2;
210     for (j = 0; (best < 100 && j < MAX_CHOICES) || (j < maxlen); j++) {
211         uint32_t num = MAXSOPATLEN;
212         uint8_t k;
213         if (j + 2 > len)
214             break;
215         for (k = j; k < len - 1 && (k - j < MAXSOPATLEN); k++) {
216             q = cli_readint16(&pattern[k]);
217             /* we want to favor subsigs that add as little as
218 			 * possible to the filter */
219             num += filter_isset(m, k - j, q) ? 0 : MAXSOPATLEN - (k - j);
220             if ((k == j || k == j + 1) && (q == 0x0000 || q == 0xffff))
221                 num += k == j ? 10000 : 1000; /* bad */
222         }
223         /* it is very important to keep the end set small */
224         num += 10 * (filter_end_isset(m, k - j - 1, q) ? 0 : 1);
225         /* it is very important to have signatures as long as possible
226 		 * */
227         num += 5 * (MAXSOPATLEN - (k - j));
228         /* if we are lower length than threshold penalize */
229         if (k - j + 1 < 4)
230             num += 200;
231         /* favour longer patterns */
232         num -= (2 * MAXSOPATLEN - (k + 1 + j)) * (k - j) / 2;
233 
234         if (num < best) {
235             best     = num;
236             best_pos = j;
237         }
238     }
239 
240     assert(best_pos < len - 1);
241     if (pattern[best_pos] == 0 && pattern[best_pos + 1] == 0) {
242         detailed_dbg("filter (warning): subsignature begins with zero (static): %s\n", name);
243     }
244     pattern += best_pos;
245     len -= best_pos;
246     /* cut length */
247     if (len > MAXSOPATLEN) {
248         len = MAXSOPATLEN;
249     }
250     /* Shift-Or like preprocessing */
251     for (j = 0; j < len - 1; j++) {
252         /* use overlapping little-endian 2-grams. We need them overlapping because matching can start at any position */
253         q = cli_readint16(&pattern[j]);
254         filter_set_atpos(m, j, q);
255     }
256     /* we use variable length patterns, use last character to mark pattern end,
257 	 * can lead to false positives.*/
258     /* mark that at state j, the q-gram q can end the pattern */
259     if (j) {
260         j--;
261         filter_set_end(m, j, q);
262     }
263     return j + 2;
264 }
265 
266 struct char_spec {
267     /* if non-null i-th character = alt[start + step*i]; start+step*i < end;
268 	 */
269     struct cli_ac_special *alt;
270     uint8_t start;
271     uint8_t end;
272     uint8_t step;
273     uint8_t negative;
274 };
275 
spec_ith_char(const struct char_spec * spec,unsigned i)276 static inline unsigned char spec_ith_char(const struct char_spec *spec, unsigned i)
277 {
278     const struct cli_ac_special *alt = spec->alt;
279     if (alt) {
280         assert(alt->type == 1);
281         assert(i < alt->num);
282         return (alt->alt).byte[i];
283     }
284     return i;
285 }
286 
287 #ifndef MIN
288 #define MIN(a, b) ((a) < (b) ? (a) : (b))
289 #endif
290 
291 #define SPEC_FOREACH(spec0, k0, spec1, k1)           \
292     do {                                             \
293         unsigned char c0 = spec_ith_char(spec0, k0); \
294         unsigned char c1 = spec_ith_char(spec1, k1); \
295         unsigned c0end, c1end, cc0, cc1;             \
296         c0end = spec0->negative ? 255 : c0;          \
297         c1end = spec1->negative ? 255 : c1;          \
298         cc0   = spec0->negative ? 0 : c0;            \
299         cc1   = spec1->negative ? 0 : c1;            \
300         for (; cc0 <= c0end; cc0++) {                \
301             for (; cc1 <= c1end; cc1++) {            \
302                 uint16_t a = cc0 | (cc1 << 8);       \
303                 if (spec0->negative && cc0 == c0)    \
304                     continue;                        \
305                 if (spec1->negative && cc1 == c1)    \
306                     continue;
307 
308 #define SPEC_END_FOR \
309     }                \
310     }                \
311     }                \
312     while (0)
313 
314 enum badness {
315     reject,
316     /* try to avoid if possible */
317     avoid_first,
318     avoid_anywhere, /* includes avoid_first! */
319     /* not that bad, but still not best */
320     dontlike,
321     acceptable,
322     like
323 };
get_score(enum badness badness,unsigned i,const struct filter * m,const struct char_spec * spec0,const struct char_spec * spec1,int32_t * score,int32_t * score_end)324 static inline void get_score(enum badness badness, unsigned i, const struct filter *m, const struct char_spec *spec0, const struct char_spec *spec1, int32_t *score, int32_t *score_end)
325 {
326     int32_t base;
327     unsigned k0, k1, num_introduced = 0, num_end_introduced = 0;
328     switch (badness) {
329         case reject:
330             /* not reached */
331             assert(0);
332             base = -0x7fffff;
333             break;
334         case avoid_first:
335             if (!i)
336                 base = -0x700000;
337             else
338                 base = 0;
339             break;
340         case avoid_anywhere:
341             if (!i)
342                 base = -0x720000;
343             else
344                 base = -0x1000;
345             break;
346         case dontlike:
347             base = 0;
348             break;
349         case acceptable:
350             base = 0x200;
351             break;
352         case like:
353             /* a bit better only */
354             base = 0x201;
355             break;
356     }
357     if (base < 0) {
358         *score     = base;
359         *score_end = base;
360         return;
361     }
362     /* at most 256 iterations here, otherwise base would be negative */
363     for (k0 = spec0->start; k0 <= spec0->end; k0 += spec0->step) {
364         for (k1 = spec1->start; k1 <= spec1->end; k1 += spec1->step) {
365             SPEC_FOREACH(spec0, k0, spec1, k1)
366             {
367                 num_introduced += filter_isset(m, i, a);
368                 num_end_introduced += filter_end_isset(m, i, a);
369             }
370             SPEC_END_FOR;
371         }
372     }
373     *score     = base - num_introduced;
374     *score_end = base - num_end_introduced;
375     if (badness == avoid_first && i) {
376         /* what is bad to begin with, is bad at end too */
377         *score_end -= 0x1000;
378     }
379 }
380 
381 struct choice {
382     enum badness base;
383     unsigned begin;
384     unsigned len;
385 };
386 
add_choice(struct choice * choices,unsigned * cnt,unsigned i,unsigned ie,enum badness badness)387 static inline void add_choice(struct choice *choices, unsigned *cnt, unsigned i, unsigned ie, enum badness badness)
388 {
389     struct choice *choice;
390     int i_neg = -1;
391     assert(ie < MAXPATLEN);
392     if (ie < i + 1)
393         return;
394     if (*cnt >= MAX_CHOICES)
395         return;
396     if (badness > avoid_first && *cnt >= (MAX_CHOICES >> 1)) {
397         unsigned j;
398         /* replace very bad picks if we're full */
399         for (j = 0; j < *cnt; j++) {
400             if (choices[j].base < badness) {
401                 if (i_neg == -1 || choices[j].base < choices[i_neg].base) {
402                     i_neg = j;
403                 }
404             }
405         }
406     }
407     if (i_neg != -1) {
408         choice = &choices[i_neg];
409     } else {
410         choice = &choices[(*cnt)++];
411     }
412     choice->begin = i;
413     choice->len   = ie - i + 1;
414     choice->base  = badness;
415 }
416 
spec_iter(const struct char_spec * spec)417 static inline int32_t spec_iter(const struct char_spec *spec)
418 {
419     unsigned count;
420     assert(spec->step);
421     count = (spec->step + spec->end - spec->start) / spec->step;
422     if (spec->negative) /* all chars except itself are added */
423         count *= 254;
424     return count;
425 }
426 
filter_add_acpatt(struct filter * m,const struct cli_ac_patt * pat)427 int filter_add_acpatt(struct filter *m, const struct cli_ac_patt *pat)
428 {
429     unsigned i, j = 0, stop = 0, l = 0;
430     uint16_t k0, k1;
431 
432     struct char_spec chars[MAXPATLEN];
433     enum badness char_badness[MAXPATLEN];
434     unsigned char patc[MAXPATLEN];
435     unsigned altcnt         = 0;
436     int32_t best_score      = -0x7fffffff;
437     unsigned best_score_i   = 0;
438     unsigned best_score_len = 0;
439     struct char_spec *spec0 = NULL, *spec1 = NULL;
440 
441     struct choice choices[MAX_CHOICES];
442     unsigned choices_cnt = 0;
443     unsigned prefix_len  = pat->prefix_length[0];
444     unsigned speci;
445 
446     j = MIN(prefix_len + pat->length[0], MAXPATLEN);
447     for (i = 0; i < j; i++) {
448         const uint16_t p = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len];
449         if ((p & CLI_MATCH_METADATA) != CLI_MATCH_CHAR)
450             break;
451         patc[i] = (uint8_t)p;
452     }
453     if (i == j) {
454         /* all static, use add_static it has better heuristics for this
455 		 * case */
456         return filter_add_static(m, patc, j, pat->virname);
457     }
458     cli_perf_log_count(TRIE_ORIG_LEN, j > 8 ? 8 : j);
459     i = 0;
460     if (!prefix_len) {
461         while ((pat->pattern[i] & CLI_MATCH_METADATA) == CLI_MATCH_SPECIAL) {
462             /* we support only ALT_CHAR, skip the rest */
463             if (pat->special_table[altcnt]->type == 1)
464                 break;
465             altcnt++;
466             i++;
467         }
468     }
469     /* transform AC characters into our representation */
470     for (speci = 0; i < j && !stop; speci++, i++) {
471         struct char_spec *spec = &chars[speci];
472         const uint16_t p       = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len];
473         spec->alt              = NULL;
474         spec->negative         = 0;
475         switch (p & CLI_MATCH_METADATA) {
476             case CLI_MATCH_CHAR:
477                 spec->start = spec->end = (uint8_t)p;
478                 spec->step              = 1;
479                 break;
480             case CLI_MATCH_NOCASE:
481                 if ((uint8_t)p >= 'a' && (uint8_t)p <= 'z') {
482                     spec->start = (uint8_t)p - ('a' - 'A');
483                     spec->end   = (uint8_t)p;
484                     spec->step  = ('a' - 'A');
485                 } else if ((uint8_t)p >= 'A' && (uint8_t)p <= 'Z') {
486                     spec->start = (uint8_t)p;
487                     spec->end   = (uint8_t)p + ('a' - 'A');
488                     spec->step  = ('a' - 'A');
489                 } else {
490                     spec->start = spec->end = (uint8_t)p;
491                     spec->step              = 1;
492                 }
493                 break;
494             case CLI_MATCH_IGNORE:
495                 spec->start = 0x00;
496                 spec->end   = 0xff;
497                 spec->step  = 1;
498                 break;
499             case CLI_MATCH_SPECIAL:
500                 assert(pat->special_table);
501                 /* assert(altcnt < pat->alt); */
502                 assert(pat->special_table[altcnt]);
503                 spec->negative = pat->special_table[altcnt]->negative;
504                 switch (pat->special_table[altcnt++]->type) {
505                     case 1: /* ALT_CHAR */
506                         spec->start = 0;
507                         spec->end   = pat->special_table[altcnt - 1]->num - 1;
508                         spec->step  = 1;
509                         spec->alt   = pat->special_table[altcnt - 1];
510                         break;
511                     default:
512                         stop = 1;
513                         break; /* TODO: should something be done here?
514 					 * */
515                 }
516                 break;
517             case CLI_MATCH_NIBBLE_HIGH:
518                 spec->start = (p & 0xf0);
519                 spec->end   = spec->start | 0x0f;
520                 spec->step  = 1;
521                 break;
522             case CLI_MATCH_NIBBLE_LOW:
523                 spec->start = (p & 0xf);
524                 spec->end   = 0xf0 | spec->start;
525                 spec->step  = 0x10;
526                 break;
527             default:
528                 cli_errmsg("filtering: unknown wildcard character: %d\n", p);
529                 return -1;
530         }
531     }
532     if (stop) --speci;
533     j = speci;
534     if (j < 2) {
535         if (stop)
536             cli_warnmsg("Don't know how to create filter for: %s\n", pat->virname);
537         else
538             cli_warnmsg("Subpattern too short: %s\n", pat->virname);
539         return -1;
540     }
541 
542     for (i = 0; i < j - 1; i++) {
543         int32_t num_iter;
544         /* new qgrams added to the filter */
545         spec0    = &chars[i];
546         spec1    = &chars[i + 1];
547         num_iter = spec_iter(spec0) * spec_iter(spec1);
548 
549         if (num_iter >= 0x100) {
550             if (num_iter == 0x10000)
551                 char_badness[i] = reject;
552             else
553                 char_badness[i] = avoid_anywhere;
554         } else {
555             int8_t binary     = 0;
556             enum badness scor = acceptable;
557             for (k0 = spec0->start; k0 <= spec0->end; k0 += spec0->step) {
558                 for (k1 = spec1->start; k1 <= spec1->end; k1 += spec1->step) {
559                     unsigned char c0 = spec_ith_char(spec0, k0);
560                     unsigned char c1 = spec_ith_char(spec1, k1);
561                     if (spec0->negative || spec1->negative) {
562                         scor = avoid_anywhere;
563                         break;
564                     }
565                     if ((!c0 && !c1) || (c0 == 0xff && c1 == 0xff)) {
566                         scor = avoid_first;
567                         break;
568                     }
569                     if (c0 == c1) {
570                         scor = dontlike;
571                         break;
572                     }
573                     if ((c0 < 32 || c0 > 127) && (c1 < 32 || c1 > 127))
574                         binary = 1;
575                 }
576             }
577             if (scor == acceptable && binary) {
578                 /* slightly favor binary */
579                 scor = like;
580             }
581             char_badness[i] = scor;
582         }
583     }
584 
585     /* try to choose best subpattern */
586 
587     /* calculating the score for all possible i start pos
588 	 * and all possible length is too slow, so choose best among N choices
589 	 * only */
590     for (i = 0; i < j - 1 && choices_cnt < MAX_CHOICES; i++) {
591         enum badness base0 = like, base1 = like;
592         unsigned kend = MIN(j - 1, (i + MAXSOPATLEN) & ~1), k;
593         int ki        = -0xff;
594         /* add 2 scores: pattern with max length, one where we stop at
595 		 * first negative, and one we stop at last positive, but never
596 		 * include reject */
597         assert(kend - 1 < j - 1);
598         if (char_badness[i] == reject)
599             continue;
600         if ((char_badness[i] == avoid_anywhere || char_badness[i] == avoid_first) && choices_cnt > 0)
601             /* if we have another choice don't choose this */
602             continue;
603         while ((kend > i + 3) && char_badness[kend - 1] == reject) kend--;
604         for (k = i; k < kend; k++) {
605             enum badness badness = char_badness[k];
606             if (badness < acceptable) {
607                 if (badness == reject) {
608                     /* this is a never pick */
609                     kend = k;
610                     break;
611                 }
612                 if (badness == avoid_first && k != i)
613                     badness = dontlike;
614                 if (k == i && badness == avoid_anywhere)
615                     badness = avoid_first;
616                 if (ki == -0xff)
617                     ki = k;
618             }
619             base0 = MIN(base0, badness);
620             if (ki == -0xff)
621                 base1 = MIN(base1, badness);
622         }
623         add_choice(choices, &choices_cnt, i, kend, base0);
624         if (ki > (int)i) {
625             /* ki|ki+1|??| */
626             /* try subpattern from after the wildcard */
627             i = ki;
628         }
629         /* if score is positive, it replaces a negative choice */
630     }
631     for (l = 0; l < choices_cnt; l++) {
632         int32_t score;
633         unsigned kend;
634         unsigned k;
635 
636         i     = choices[l].begin;
637         kend  = i + choices[l].len;
638         score = 0;
639 
640         for (k = i; k < kend - 1; k++) {
641             unsigned p = k - i;
642             int32_t iscore, score_end;
643             assert(k < j);
644             get_score(char_badness[k], p, m, &chars[k], &chars[k + 1],
645                       &iscore, &score_end);
646             /* give more importance to the score of the characters
647 			 * at the beginning */
648             /* TODO: tune magic number here */
649             if (p < 6) {
650                 iscore *= (6 - p);
651                 score_end *= (6 - p);
652             }
653             score += iscore;
654             if (score + score_end > best_score) {
655                 /* we may have negative scores, so truncating
656 				 * the pattern could actually get us a higher
657 				 * score */
658                 best_score     = score + score_end;
659                 best_score_len = p + 2;
660                 best_score_i   = i;
661                 assert(i + best_score_len <= j);
662             }
663         }
664     }
665 
666     if (best_score <= -0x7fffffff) {
667         cli_warnmsg("filter rejecting %s due to very bad score: %ld\n", pat->virname, (long)best_score);
668         return -1;
669     }
670     if (choices_cnt == 0) {
671         cli_warnmsg("filter rejecting %s because there are no viable choices", pat->virname);
672         return -1;
673     }
674     assert(best_score_len >= 2);
675     detailed_dbg("filter %s score: %ld, %u (+ %u)\n", pat->virname, (long)best_score, best_score_i, best_score_len);
676     /* Shift-Or like preprocessing */
677     assert(1 < best_score_len);
678     for (i = 0; i < best_score_len - 1; i++) {
679         spec0 = &chars[best_score_i + i];
680         spec1 = &chars[best_score_i + i + 1];
681         /* use overlapping little-endian 2-grams, overlapping because match can start
682 		 * at any position (including odd) */
683 
684         for (k0 = spec0->start; k0 <= spec0->end; k0 += spec0->step) {
685             for (k1 = spec1->start; k1 <= spec1->end; k1 += spec1->step) {
686                 SPEC_FOREACH(spec0, k0, spec1, k1)
687                 {
688                     if (!cc0 && !cc1 && !i) {
689                         detailed_dbg("filter (warning): subsignature begins with zero: %s\n", pat->virname);
690                     }
691                     filter_set_atpos(m, i, a);
692                 }
693                 SPEC_END_FOR;
694             }
695         }
696     }
697 
698     j = best_score_len - 2;
699     if (spec0 && spec1) {
700         for (k0 = spec0->start; k0 <= spec0->end; k0 += spec0->step) {
701             for (k1 = spec1->start; k1 <= spec1->end; k1 += spec1->step) {
702                 SPEC_FOREACH(spec0, k0, spec1, k1)
703                 {
704                     if (!cc0 && !cc1) {
705                         detailed_dbg("filter (warning): subsignature ends with zero: %s\n", pat->virname);
706                     }
707                     filter_set_end(m, j, a);
708                 }
709                 SPEC_END_FOR;
710             }
711         }
712     }
713     return j + 2;
714 }
715 
716 /* state 11110011 means that we may have a match of length min 4, max 5 */
717 
filter_search_ext(const struct filter * m,const unsigned char * data,unsigned long len,struct filter_match_info * inf)718 __hot__ int filter_search_ext(const struct filter *m, const unsigned char *data, unsigned long len, struct filter_match_info *inf)
719 {
720     size_t j;
721     uint8_t state      = ~0;
722     const uint8_t *B   = m->B;
723     const uint8_t *End = m->end;
724 
725     if (len < 2) return -1;
726     /* look for first match */
727     for (j = 0; j < len - 1; j++) {
728         uint8_t match_state_end;
729         const uint16_t q0 = cli_readint16(&data[j]);
730 
731         state           = (state << 1) | B[q0];
732         match_state_end = state | End[q0];
733         if (match_state_end != 0xff) {
734             inf->first_match = j;
735             return 0;
736         }
737     }
738     /* no match, inf is invalid */
739     return -1;
740 }
741 
742 /* this is like a FSM, with multiple active states at the same time.
743  * each bit in "state" means an active state, when a char is encountered
744  * we determine what states can remain active.
745  * The FSM transition rules are expressed as bit-masks */
filter_search(const struct filter * m,const unsigned char * data,unsigned long len)746 long filter_search(const struct filter *m, const unsigned char *data, unsigned long len)
747 {
748     size_t j;
749     uint8_t state      = ~0;
750     const uint8_t *B   = m->B;
751     const uint8_t *End = m->end;
752 
753     /* we use 2-grams, must be higher than 1 */
754     if (len < 2) return -1;
755     /* Shift-Or like search algorithm */
756     for (j = 0; j < len - 1; j++) {
757         const uint16_t q0 = cli_readint16(&data[j]);
758         uint8_t match_end;
759         state = (state << 1) | B[q0];
760         /* state marks with a 0 bit all active states
761 		 * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern
762 		 * if we got two 0's at matching positions, it means we encountered a pattern's end */
763         match_end = state | End[q0];
764         if (match_end != 0xff) {
765 
766             /* if state is reachable, and this character can finish a pattern, assume match */
767             /* to reduce false positives check if qgram can finish the pattern */
768             /* return position of probable match */
769             /* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the
770 			 * longest pattern that could match */
771             return j >= MAXSOPATLEN ? j - MAXSOPATLEN : 0;
772         }
773     }
774     /* no match */
775     return -1;
776 }
777