1 /*
2 * A fast filter for static patterns.
3 *
4 * Copyright (C) 2013-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5 * Copyright (C) 2008-2013 Sourcefire, Inc.
6 *
7 * Authors: Török Edvin
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
21 * MA 02110-1301, USA.
22 */
23 #if HAVE_CONFIG_H
24 #include "clamav-config.h"
25 #endif
26
27 #include "clamav.h"
28 #include "filtering.h"
29 #include "matcher-ac.h"
30 #include <string.h>
31 #include <assert.h>
32 #include "perflogging.h"
33 /* ----- shift-or filtering -------------- */
34
35 /*
36 * Description of algorithm:
37 *
38 * Multiple patterns are added to the filter.
39 * The filter retains an approximation of these patterns, which can lead to
40 * false positive matches, but not false negative matches.
41 *
42 * For each position in the filter we retain what qgrams can match at that
43 * position, for example (if we'd use characters as qgrams):
44 * pattern1: atu
45 * pattern2: bzf
46 * pattern3: xat
47 *
48 * filter accepts:
49 * [abx][tza][uft]
50 *
51 * But it also accepts (false positives):
52 * azu, azf, azt, ...
53 *
54 * It doesn't however accept:
55 * aaa, atz, ...
56 *
57 * This is implemented by having a bit-level state-machine with MAXSOPATLEN (=32) states,
58 * each active bit meaning that a state is active.
59 *
60 * The states are activated sequentially, eachtransition decision is made
61 * considering if we can accept the character at position X.
62 * Since we can start a match at any position, position 0 is
63 * reactivated each time.
64 * When the last position is activated, the filter reports a match.
65 * If we can't accept the character at position X, the state remains inactive,
66 * and further states aren't activated (unless we activate this state in the
67 * future).
68 *
69 * Essentially this is an automaton like this:
70 *
71 * /\ (a|b|x) (t|z|a) (u|f|t)
72 * [S1] ---------> [S2] -------> [S3] ---------> [S4] -> match
73 * \_______________/ |
74 * \_____________________________/
75 *
76 *
77 * But we are tracking multiple active states at each time (or run N automatons
78 * in parallel if you like, N = number of states).
79 *
80 * We can have S3 and S2 active, meaning that if the next character is
81 * acceptable, it transitions to S1,S3 and S4 being active, otherwise it
82 * transitions to S1 being active.
83 *
84 * Active states can either be represented as a binary 1 or 0, and using
85 * bit-shifting and masking.
86 * If we choose 1, we must use &, and after shifting always reactivate bit 0.
87 * If we choose 0, we must use |, and after shifting we don't need to do
88 * anything (since by shifting a 0 is implicitly introduced).
89 *
90 * This file implements the latter (shift-or) method.
91 *
92 * The discussion above considered pattern to be of same length (or truncated to
93 * be so). In reality patterns are of variable length, and we often have short
94 * pattern.
95 *
96 * Thus another bitmap was introduced, meaning that if (end[Q] == set), then
97 * a pattern can end at this position.
98 * Also we would fill the pattern's position filters quite quickly with only 256
99 * choices for a position, so the algorithm uses overlapping qgrams of length 2:
100 * 'abcd' is 3 qgrams: 'ab','bc','cd'
101 *
102 * The algorithm is very sensitive to the end[Q] filter, since it can have false
103 * positives due to short patterns!
104 * For optimal performance we need:
105 * - patterns as long as possible
106 * - probability for end[Q] to match low (avoid 0000, and other common case
107 * - choose the most "diverse" subset from a long pattern
108 *
109 * diverse = referring to what we are scanning, so that the filter rarely
110 * matches, so this actually means that we *want* to avoid adding more
111 * characters to the filter, if we have 2 patterns:
112 * abxfg, and dalabxpo, it may be preferable to shift the 2nd one so that we
113 * don't add new character at the beginning.
114 *
115 * With NDB signatures there are more challenges to overcome:
116 * e8??0000000aa
117 *
118 * will make the filter accept:
119 * e8<all-256-values-here>, <all-256-values>00, ... 000000aa
120 *
121 * We should delay the pattern end as long as possible, especially if it is 0000
122 * The problem is that now the filter accepts 0000 on position 3, regardless
123 * of what we have on position 1 (even if we have something else than e8), so
124 * we have to be very careful not to allow 0000 on first position too,
125 * otherwise the filter will happily accept 000000000000.
126 *
127 * To optimize cache usage there are 2 end filters, one character (fits L1), and one qgram
128 * based (fits L2), both must match for the filter to consider it a match.
129 *
130 *
131 */
132
133 /*#define DETAILED_DEBUG*/
134 #ifdef DETAILED_DEBUG
135 #define detailed_dbg cli_dbgmsg
136 #else
137 #define detailed_dbg(...)
138 #endif
139
140 #define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1 << ((val)&0x1f)))
141 #define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1 << ((val)&0x1f)))
142
filter_init(struct filter * m)143 void filter_init(struct filter *m)
144 {
145 memset(m->B, ~0, sizeof(m->B));
146 memset(m->end, ~0, sizeof(m->end));
147 }
148
149 /* because we use uint32_t */
150 #define MAXSOPATLEN 8
151
filter_isset(const struct filter * m,unsigned pos,uint16_t val)152 static inline int filter_isset(const struct filter *m, unsigned pos, uint16_t val)
153 {
154 return !(m->B[val] & (1 << pos));
155 }
156
filter_set_atpos(struct filter * m,unsigned pos,uint16_t val)157 static inline void filter_set_atpos(struct filter *m, unsigned pos, uint16_t val)
158 {
159 if (!filter_isset(m, pos, val)) {
160 cli_perf_log_count(FILTER_LOAD, pos);
161 m->B[val] &= ~(1 << pos);
162 }
163 }
164
filter_end_isset(const struct filter * m,unsigned pos,uint16_t a)165 static inline int filter_end_isset(const struct filter *m, unsigned pos, uint16_t a)
166 {
167 return !(m->end[a] & (1 << pos));
168 }
169
filter_set_end(struct filter * m,unsigned pos,uint16_t a)170 static inline void filter_set_end(struct filter *m, unsigned pos, uint16_t a)
171 {
172 if (!filter_end_isset(m, pos, a)) {
173 cli_perf_log_count(FILTER_END_LOAD, pos);
174 m->end[a] &= ~(1 << pos);
175 }
176 }
177 #define MAX_CHOICES 8
178 /* just an arbitrary limit, if patterns are longer, we cut
179 * the filter can only use MAXSOPATLEN (32) characters,
180 * this longer buffer is needed so that we can choose the "best" subpattern from
181 * it */
182 #define MAXPATLEN 255
183
184 /* merge another pattern into the filter
185 * add('abc'); add('bcd'); will match [ab][bc][cd] */
filter_add_static(struct filter * m,const unsigned char * pattern,unsigned long len,const char * name)186 int filter_add_static(struct filter *m, const unsigned char *pattern, unsigned long len, const char *name)
187 {
188 uint16_t q = 0;
189 uint8_t j, maxlen;
190 uint32_t best = 0xffffffff;
191 uint8_t best_pos = 0;
192
193 UNUSEDPARAM(name);
194
195 cli_perf_log_count(TRIE_ORIG_LEN, len > 8 ? 8 : len);
196 /* TODO: choose best among MAXCHOICES */
197 /* cut length */
198 if (len > MAXPATLEN) {
199 len = MAXPATLEN;
200 }
201 if (len < 2)
202 return -1;
203
204 /* we want subsigs to be as long as possible */
205 if (len > 4) {
206 maxlen = len - 4;
207 if (maxlen == 1) maxlen = 2;
208 } else
209 maxlen = 2;
210 for (j = 0; (best < 100 && j < MAX_CHOICES) || (j < maxlen); j++) {
211 uint32_t num = MAXSOPATLEN;
212 uint8_t k;
213 if (j + 2 > len)
214 break;
215 for (k = j; k < len - 1 && (k - j < MAXSOPATLEN); k++) {
216 q = cli_readint16(&pattern[k]);
217 /* we want to favor subsigs that add as little as
218 * possible to the filter */
219 num += filter_isset(m, k - j, q) ? 0 : MAXSOPATLEN - (k - j);
220 if ((k == j || k == j + 1) && (q == 0x0000 || q == 0xffff))
221 num += k == j ? 10000 : 1000; /* bad */
222 }
223 /* it is very important to keep the end set small */
224 num += 10 * (filter_end_isset(m, k - j - 1, q) ? 0 : 1);
225 /* it is very important to have signatures as long as possible
226 * */
227 num += 5 * (MAXSOPATLEN - (k - j));
228 /* if we are lower length than threshold penalize */
229 if (k - j + 1 < 4)
230 num += 200;
231 /* favour longer patterns */
232 num -= (2 * MAXSOPATLEN - (k + 1 + j)) * (k - j) / 2;
233
234 if (num < best) {
235 best = num;
236 best_pos = j;
237 }
238 }
239
240 assert(best_pos < len - 1);
241 if (pattern[best_pos] == 0 && pattern[best_pos + 1] == 0) {
242 detailed_dbg("filter (warning): subsignature begins with zero (static): %s\n", name);
243 }
244 pattern += best_pos;
245 len -= best_pos;
246 /* cut length */
247 if (len > MAXSOPATLEN) {
248 len = MAXSOPATLEN;
249 }
250 /* Shift-Or like preprocessing */
251 for (j = 0; j < len - 1; j++) {
252 /* use overlapping little-endian 2-grams. We need them overlapping because matching can start at any position */
253 q = cli_readint16(&pattern[j]);
254 filter_set_atpos(m, j, q);
255 }
256 /* we use variable length patterns, use last character to mark pattern end,
257 * can lead to false positives.*/
258 /* mark that at state j, the q-gram q can end the pattern */
259 if (j) {
260 j--;
261 filter_set_end(m, j, q);
262 }
263 return j + 2;
264 }
265
266 struct char_spec {
267 /* if non-null i-th character = alt[start + step*i]; start+step*i < end;
268 */
269 struct cli_ac_special *alt;
270 uint8_t start;
271 uint8_t end;
272 uint8_t step;
273 uint8_t negative;
274 };
275
spec_ith_char(const struct char_spec * spec,unsigned i)276 static inline unsigned char spec_ith_char(const struct char_spec *spec, unsigned i)
277 {
278 const struct cli_ac_special *alt = spec->alt;
279 if (alt) {
280 assert(alt->type == 1);
281 assert(i < alt->num);
282 return (alt->alt).byte[i];
283 }
284 return i;
285 }
286
287 #ifndef MIN
288 #define MIN(a, b) ((a) < (b) ? (a) : (b))
289 #endif
290
291 #define SPEC_FOREACH(spec0, k0, spec1, k1) \
292 do { \
293 unsigned char c0 = spec_ith_char(spec0, k0); \
294 unsigned char c1 = spec_ith_char(spec1, k1); \
295 unsigned c0end, c1end, cc0, cc1; \
296 c0end = spec0->negative ? 255 : c0; \
297 c1end = spec1->negative ? 255 : c1; \
298 cc0 = spec0->negative ? 0 : c0; \
299 cc1 = spec1->negative ? 0 : c1; \
300 for (; cc0 <= c0end; cc0++) { \
301 for (; cc1 <= c1end; cc1++) { \
302 uint16_t a = cc0 | (cc1 << 8); \
303 if (spec0->negative && cc0 == c0) \
304 continue; \
305 if (spec1->negative && cc1 == c1) \
306 continue;
307
308 #define SPEC_END_FOR \
309 } \
310 } \
311 } \
312 while (0)
313
314 enum badness {
315 reject,
316 /* try to avoid if possible */
317 avoid_first,
318 avoid_anywhere, /* includes avoid_first! */
319 /* not that bad, but still not best */
320 dontlike,
321 acceptable,
322 like
323 };
get_score(enum badness badness,unsigned i,const struct filter * m,const struct char_spec * spec0,const struct char_spec * spec1,int32_t * score,int32_t * score_end)324 static inline void get_score(enum badness badness, unsigned i, const struct filter *m, const struct char_spec *spec0, const struct char_spec *spec1, int32_t *score, int32_t *score_end)
325 {
326 int32_t base;
327 unsigned k0, k1, num_introduced = 0, num_end_introduced = 0;
328 switch (badness) {
329 case reject:
330 /* not reached */
331 assert(0);
332 base = -0x7fffff;
333 break;
334 case avoid_first:
335 if (!i)
336 base = -0x700000;
337 else
338 base = 0;
339 break;
340 case avoid_anywhere:
341 if (!i)
342 base = -0x720000;
343 else
344 base = -0x1000;
345 break;
346 case dontlike:
347 base = 0;
348 break;
349 case acceptable:
350 base = 0x200;
351 break;
352 case like:
353 /* a bit better only */
354 base = 0x201;
355 break;
356 }
357 if (base < 0) {
358 *score = base;
359 *score_end = base;
360 return;
361 }
362 /* at most 256 iterations here, otherwise base would be negative */
363 for (k0 = spec0->start; k0 <= spec0->end; k0 += spec0->step) {
364 for (k1 = spec1->start; k1 <= spec1->end; k1 += spec1->step) {
365 SPEC_FOREACH(spec0, k0, spec1, k1)
366 {
367 num_introduced += filter_isset(m, i, a);
368 num_end_introduced += filter_end_isset(m, i, a);
369 }
370 SPEC_END_FOR;
371 }
372 }
373 *score = base - num_introduced;
374 *score_end = base - num_end_introduced;
375 if (badness == avoid_first && i) {
376 /* what is bad to begin with, is bad at end too */
377 *score_end -= 0x1000;
378 }
379 }
380
381 struct choice {
382 enum badness base;
383 unsigned begin;
384 unsigned len;
385 };
386
add_choice(struct choice * choices,unsigned * cnt,unsigned i,unsigned ie,enum badness badness)387 static inline void add_choice(struct choice *choices, unsigned *cnt, unsigned i, unsigned ie, enum badness badness)
388 {
389 struct choice *choice;
390 int i_neg = -1;
391 assert(ie < MAXPATLEN);
392 if (ie < i + 1)
393 return;
394 if (*cnt >= MAX_CHOICES)
395 return;
396 if (badness > avoid_first && *cnt >= (MAX_CHOICES >> 1)) {
397 unsigned j;
398 /* replace very bad picks if we're full */
399 for (j = 0; j < *cnt; j++) {
400 if (choices[j].base < badness) {
401 if (i_neg == -1 || choices[j].base < choices[i_neg].base) {
402 i_neg = j;
403 }
404 }
405 }
406 }
407 if (i_neg != -1) {
408 choice = &choices[i_neg];
409 } else {
410 choice = &choices[(*cnt)++];
411 }
412 choice->begin = i;
413 choice->len = ie - i + 1;
414 choice->base = badness;
415 }
416
spec_iter(const struct char_spec * spec)417 static inline int32_t spec_iter(const struct char_spec *spec)
418 {
419 unsigned count;
420 assert(spec->step);
421 count = (spec->step + spec->end - spec->start) / spec->step;
422 if (spec->negative) /* all chars except itself are added */
423 count *= 254;
424 return count;
425 }
426
filter_add_acpatt(struct filter * m,const struct cli_ac_patt * pat)427 int filter_add_acpatt(struct filter *m, const struct cli_ac_patt *pat)
428 {
429 unsigned i, j = 0, stop = 0, l = 0;
430 uint16_t k0, k1;
431
432 struct char_spec chars[MAXPATLEN];
433 enum badness char_badness[MAXPATLEN];
434 unsigned char patc[MAXPATLEN];
435 unsigned altcnt = 0;
436 int32_t best_score = -0x7fffffff;
437 unsigned best_score_i = 0;
438 unsigned best_score_len = 0;
439 struct char_spec *spec0 = NULL, *spec1 = NULL;
440
441 struct choice choices[MAX_CHOICES];
442 unsigned choices_cnt = 0;
443 unsigned prefix_len = pat->prefix_length[0];
444 unsigned speci;
445
446 j = MIN(prefix_len + pat->length[0], MAXPATLEN);
447 for (i = 0; i < j; i++) {
448 const uint16_t p = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len];
449 if ((p & CLI_MATCH_METADATA) != CLI_MATCH_CHAR)
450 break;
451 patc[i] = (uint8_t)p;
452 }
453 if (i == j) {
454 /* all static, use add_static it has better heuristics for this
455 * case */
456 return filter_add_static(m, patc, j, pat->virname);
457 }
458 cli_perf_log_count(TRIE_ORIG_LEN, j > 8 ? 8 : j);
459 i = 0;
460 if (!prefix_len) {
461 while ((pat->pattern[i] & CLI_MATCH_METADATA) == CLI_MATCH_SPECIAL) {
462 /* we support only ALT_CHAR, skip the rest */
463 if (pat->special_table[altcnt]->type == 1)
464 break;
465 altcnt++;
466 i++;
467 }
468 }
469 /* transform AC characters into our representation */
470 for (speci = 0; i < j && !stop; speci++, i++) {
471 struct char_spec *spec = &chars[speci];
472 const uint16_t p = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len];
473 spec->alt = NULL;
474 spec->negative = 0;
475 switch (p & CLI_MATCH_METADATA) {
476 case CLI_MATCH_CHAR:
477 spec->start = spec->end = (uint8_t)p;
478 spec->step = 1;
479 break;
480 case CLI_MATCH_NOCASE:
481 if ((uint8_t)p >= 'a' && (uint8_t)p <= 'z') {
482 spec->start = (uint8_t)p - ('a' - 'A');
483 spec->end = (uint8_t)p;
484 spec->step = ('a' - 'A');
485 } else if ((uint8_t)p >= 'A' && (uint8_t)p <= 'Z') {
486 spec->start = (uint8_t)p;
487 spec->end = (uint8_t)p + ('a' - 'A');
488 spec->step = ('a' - 'A');
489 } else {
490 spec->start = spec->end = (uint8_t)p;
491 spec->step = 1;
492 }
493 break;
494 case CLI_MATCH_IGNORE:
495 spec->start = 0x00;
496 spec->end = 0xff;
497 spec->step = 1;
498 break;
499 case CLI_MATCH_SPECIAL:
500 assert(pat->special_table);
501 /* assert(altcnt < pat->alt); */
502 assert(pat->special_table[altcnt]);
503 spec->negative = pat->special_table[altcnt]->negative;
504 switch (pat->special_table[altcnt++]->type) {
505 case 1: /* ALT_CHAR */
506 spec->start = 0;
507 spec->end = pat->special_table[altcnt - 1]->num - 1;
508 spec->step = 1;
509 spec->alt = pat->special_table[altcnt - 1];
510 break;
511 default:
512 stop = 1;
513 break; /* TODO: should something be done here?
514 * */
515 }
516 break;
517 case CLI_MATCH_NIBBLE_HIGH:
518 spec->start = (p & 0xf0);
519 spec->end = spec->start | 0x0f;
520 spec->step = 1;
521 break;
522 case CLI_MATCH_NIBBLE_LOW:
523 spec->start = (p & 0xf);
524 spec->end = 0xf0 | spec->start;
525 spec->step = 0x10;
526 break;
527 default:
528 cli_errmsg("filtering: unknown wildcard character: %d\n", p);
529 return -1;
530 }
531 }
532 if (stop) --speci;
533 j = speci;
534 if (j < 2) {
535 if (stop)
536 cli_warnmsg("Don't know how to create filter for: %s\n", pat->virname);
537 else
538 cli_warnmsg("Subpattern too short: %s\n", pat->virname);
539 return -1;
540 }
541
542 for (i = 0; i < j - 1; i++) {
543 int32_t num_iter;
544 /* new qgrams added to the filter */
545 spec0 = &chars[i];
546 spec1 = &chars[i + 1];
547 num_iter = spec_iter(spec0) * spec_iter(spec1);
548
549 if (num_iter >= 0x100) {
550 if (num_iter == 0x10000)
551 char_badness[i] = reject;
552 else
553 char_badness[i] = avoid_anywhere;
554 } else {
555 int8_t binary = 0;
556 enum badness scor = acceptable;
557 for (k0 = spec0->start; k0 <= spec0->end; k0 += spec0->step) {
558 for (k1 = spec1->start; k1 <= spec1->end; k1 += spec1->step) {
559 unsigned char c0 = spec_ith_char(spec0, k0);
560 unsigned char c1 = spec_ith_char(spec1, k1);
561 if (spec0->negative || spec1->negative) {
562 scor = avoid_anywhere;
563 break;
564 }
565 if ((!c0 && !c1) || (c0 == 0xff && c1 == 0xff)) {
566 scor = avoid_first;
567 break;
568 }
569 if (c0 == c1) {
570 scor = dontlike;
571 break;
572 }
573 if ((c0 < 32 || c0 > 127) && (c1 < 32 || c1 > 127))
574 binary = 1;
575 }
576 }
577 if (scor == acceptable && binary) {
578 /* slightly favor binary */
579 scor = like;
580 }
581 char_badness[i] = scor;
582 }
583 }
584
585 /* try to choose best subpattern */
586
587 /* calculating the score for all possible i start pos
588 * and all possible length is too slow, so choose best among N choices
589 * only */
590 for (i = 0; i < j - 1 && choices_cnt < MAX_CHOICES; i++) {
591 enum badness base0 = like, base1 = like;
592 unsigned kend = MIN(j - 1, (i + MAXSOPATLEN) & ~1), k;
593 int ki = -0xff;
594 /* add 2 scores: pattern with max length, one where we stop at
595 * first negative, and one we stop at last positive, but never
596 * include reject */
597 assert(kend - 1 < j - 1);
598 if (char_badness[i] == reject)
599 continue;
600 if ((char_badness[i] == avoid_anywhere || char_badness[i] == avoid_first) && choices_cnt > 0)
601 /* if we have another choice don't choose this */
602 continue;
603 while ((kend > i + 3) && char_badness[kend - 1] == reject) kend--;
604 for (k = i; k < kend; k++) {
605 enum badness badness = char_badness[k];
606 if (badness < acceptable) {
607 if (badness == reject) {
608 /* this is a never pick */
609 kend = k;
610 break;
611 }
612 if (badness == avoid_first && k != i)
613 badness = dontlike;
614 if (k == i && badness == avoid_anywhere)
615 badness = avoid_first;
616 if (ki == -0xff)
617 ki = k;
618 }
619 base0 = MIN(base0, badness);
620 if (ki == -0xff)
621 base1 = MIN(base1, badness);
622 }
623 add_choice(choices, &choices_cnt, i, kend, base0);
624 if (ki > (int)i) {
625 /* ki|ki+1|??| */
626 /* try subpattern from after the wildcard */
627 i = ki;
628 }
629 /* if score is positive, it replaces a negative choice */
630 }
631 for (l = 0; l < choices_cnt; l++) {
632 int32_t score;
633 unsigned kend;
634 unsigned k;
635
636 i = choices[l].begin;
637 kend = i + choices[l].len;
638 score = 0;
639
640 for (k = i; k < kend - 1; k++) {
641 unsigned p = k - i;
642 int32_t iscore, score_end;
643 assert(k < j);
644 get_score(char_badness[k], p, m, &chars[k], &chars[k + 1],
645 &iscore, &score_end);
646 /* give more importance to the score of the characters
647 * at the beginning */
648 /* TODO: tune magic number here */
649 if (p < 6) {
650 iscore *= (6 - p);
651 score_end *= (6 - p);
652 }
653 score += iscore;
654 if (score + score_end > best_score) {
655 /* we may have negative scores, so truncating
656 * the pattern could actually get us a higher
657 * score */
658 best_score = score + score_end;
659 best_score_len = p + 2;
660 best_score_i = i;
661 assert(i + best_score_len <= j);
662 }
663 }
664 }
665
666 if (best_score <= -0x7fffffff) {
667 cli_warnmsg("filter rejecting %s due to very bad score: %ld\n", pat->virname, (long)best_score);
668 return -1;
669 }
670 if (choices_cnt == 0) {
671 cli_warnmsg("filter rejecting %s because there are no viable choices", pat->virname);
672 return -1;
673 }
674 assert(best_score_len >= 2);
675 detailed_dbg("filter %s score: %ld, %u (+ %u)\n", pat->virname, (long)best_score, best_score_i, best_score_len);
676 /* Shift-Or like preprocessing */
677 assert(1 < best_score_len);
678 for (i = 0; i < best_score_len - 1; i++) {
679 spec0 = &chars[best_score_i + i];
680 spec1 = &chars[best_score_i + i + 1];
681 /* use overlapping little-endian 2-grams, overlapping because match can start
682 * at any position (including odd) */
683
684 for (k0 = spec0->start; k0 <= spec0->end; k0 += spec0->step) {
685 for (k1 = spec1->start; k1 <= spec1->end; k1 += spec1->step) {
686 SPEC_FOREACH(spec0, k0, spec1, k1)
687 {
688 if (!cc0 && !cc1 && !i) {
689 detailed_dbg("filter (warning): subsignature begins with zero: %s\n", pat->virname);
690 }
691 filter_set_atpos(m, i, a);
692 }
693 SPEC_END_FOR;
694 }
695 }
696 }
697
698 j = best_score_len - 2;
699 if (spec0 && spec1) {
700 for (k0 = spec0->start; k0 <= spec0->end; k0 += spec0->step) {
701 for (k1 = spec1->start; k1 <= spec1->end; k1 += spec1->step) {
702 SPEC_FOREACH(spec0, k0, spec1, k1)
703 {
704 if (!cc0 && !cc1) {
705 detailed_dbg("filter (warning): subsignature ends with zero: %s\n", pat->virname);
706 }
707 filter_set_end(m, j, a);
708 }
709 SPEC_END_FOR;
710 }
711 }
712 }
713 return j + 2;
714 }
715
716 /* state 11110011 means that we may have a match of length min 4, max 5 */
717
filter_search_ext(const struct filter * m,const unsigned char * data,unsigned long len,struct filter_match_info * inf)718 __hot__ int filter_search_ext(const struct filter *m, const unsigned char *data, unsigned long len, struct filter_match_info *inf)
719 {
720 size_t j;
721 uint8_t state = ~0;
722 const uint8_t *B = m->B;
723 const uint8_t *End = m->end;
724
725 if (len < 2) return -1;
726 /* look for first match */
727 for (j = 0; j < len - 1; j++) {
728 uint8_t match_state_end;
729 const uint16_t q0 = cli_readint16(&data[j]);
730
731 state = (state << 1) | B[q0];
732 match_state_end = state | End[q0];
733 if (match_state_end != 0xff) {
734 inf->first_match = j;
735 return 0;
736 }
737 }
738 /* no match, inf is invalid */
739 return -1;
740 }
741
742 /* this is like a FSM, with multiple active states at the same time.
743 * each bit in "state" means an active state, when a char is encountered
744 * we determine what states can remain active.
745 * The FSM transition rules are expressed as bit-masks */
filter_search(const struct filter * m,const unsigned char * data,unsigned long len)746 long filter_search(const struct filter *m, const unsigned char *data, unsigned long len)
747 {
748 size_t j;
749 uint8_t state = ~0;
750 const uint8_t *B = m->B;
751 const uint8_t *End = m->end;
752
753 /* we use 2-grams, must be higher than 1 */
754 if (len < 2) return -1;
755 /* Shift-Or like search algorithm */
756 for (j = 0; j < len - 1; j++) {
757 const uint16_t q0 = cli_readint16(&data[j]);
758 uint8_t match_end;
759 state = (state << 1) | B[q0];
760 /* state marks with a 0 bit all active states
761 * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern
762 * if we got two 0's at matching positions, it means we encountered a pattern's end */
763 match_end = state | End[q0];
764 if (match_end != 0xff) {
765
766 /* if state is reachable, and this character can finish a pattern, assume match */
767 /* to reduce false positives check if qgram can finish the pattern */
768 /* return position of probable match */
769 /* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the
770 * longest pattern that could match */
771 return j >= MAXSOPATLEN ? j - MAXSOPATLEN : 0;
772 }
773 }
774 /* no match */
775 return -1;
776 }
777