1 /* ssdeep
2 * Copyright (C) 2002 Andrew Tridgell <tridge@samba.org>
3 * Copyright (C) 2006 ManTech International Corporation
4 * Copyright (C) 2013 Helmut Grohne <helmut@subdivi.de>
5 * Copyright (C) 2017 Tsukasa OI <floss_ssdeep@irq.a4lg.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 * Earlier versions of this code were named fuzzy.c and can be found at:
22 * http://www.samba.org/ftp/unpacked/junkcode/spamsum/
23 * http://ssdeep.sf.net/
24 */
25
26 #include "main.h"
27
28 #include <assert.h>
29 #include <errno.h>
30 #include <limits.h>
31 #include <stdint.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <string.h>
35
36 #include "fuzzy.h"
37 #include "edit_dist.h"
38
39 #if defined(__GNUC__) && __GNUC__ >= 3
40 #define likely(x) __builtin_expect(!!(x), 1)
41 #define unlikely(x) __builtin_expect(!!(x), 0)
42 #else
43 #define likely(x) x
44 #define unlikely(x) x
45 #endif
46
47 #define ROLLING_WINDOW 7
48 #define MIN_BLOCKSIZE 3
49 #define HASH_INIT 0x27
50 #define NUM_BLOCKHASHES 31
51
52 // Enable bit-parallel string processing only if bit-parallel algorithms
53 // are enabled and considered to be efficient.
54 #if !defined(SSDEEP_DISABLE_POSITION_ARRAY) || !SSDEEP_DISABLE_POSITION_ARRAY
55 #if SPAMSUM_LENGTH <= 64 && CHAR_MIN >= -256 && CHAR_MAX <= 256 && (CHAR_MAX - CHAR_MIN + 1) <= 256
56 #define SSDEEP_ENABLE_POSITION_ARRAY
57 #endif
58 #endif
59
60 struct roll_state
61 {
62 unsigned char window[ROLLING_WINDOW];
63 uint32_t h1, h2, h3;
64 uint32_t n;
65 };
66
roll_init(struct roll_state * self)67 static void roll_init(/*@out@*/ struct roll_state *self)
68 {
69 memset(self, 0, sizeof(struct roll_state));
70 }
71
72 /*
73 * a rolling hash, based on the Adler checksum. By using a rolling hash
74 * we can perform auto resynchronisation after inserts/deletes
75
76 * internally, h1 is the sum of the bytes in the window and h2
77 * is the sum of the bytes times the index
78
79 * h3 is a shift/xor based rolling hash, and is mostly needed to ensure that
80 * we can cope with large blocksize values
81 */
roll_hash(struct roll_state * self,unsigned char c)82 static void roll_hash(struct roll_state *self, unsigned char c)
83 {
84 self->h2 -= self->h1;
85 self->h2 += ROLLING_WINDOW * (uint32_t)c;
86
87 self->h1 += (uint32_t)c;
88 self->h1 -= (uint32_t)self->window[self->n];
89
90 self->window[self->n] = c;
91 self->n++;
92 if (self->n == ROLLING_WINDOW)
93 self->n = 0;
94
95 /* The original spamsum AND'ed this value with 0xFFFFFFFF which
96 * in theory should have no effect. This AND has been removed
97 * for performance (jk) */
98 self->h3 <<= 5;
99 self->h3 ^= c;
100 }
101
roll_sum(const struct roll_state * self)102 static uint32_t roll_sum(const struct roll_state *self)
103 {
104 return self->h1 + self->h2 + self->h3;
105 }
106
107 /* A simple non-rolling hash, based on the FNV hash. */
108 #include "sum_table.h"
sum_hash(unsigned char c,unsigned char h)109 static unsigned char sum_hash(unsigned char c, unsigned char h)
110 {
111 return sum_table[h][c & 0x3f];
112 }
113
114 /* A blockhash contains a signature state for a specific (implicit) blocksize.
115 * The blocksize is given by SSDEEP_BS(index). The h and halfh members are the
116 * partial FNV hashes, where halfh stops to be reset after digest is
117 * SPAMSUM_LENGTH/2 long. The halfh hash is needed be able to truncate digest
118 * for the second output hash to stay compatible with ssdeep output. */
119 struct blockhash_context
120 {
121 unsigned int dindex;
122 char digest[SPAMSUM_LENGTH];
123 char halfdigest;
124 char h, halfh;
125 };
126
127 struct fuzzy_state
128 {
129 uint_least64_t total_size;
130 uint_least64_t fixed_size;
131 uint_least64_t reduce_border;
132 unsigned int bhstart, bhend, bhendlimit;
133 unsigned int flags;
134 uint32_t rollmask;
135 struct blockhash_context bh[NUM_BLOCKHASHES];
136 struct roll_state roll;
137 unsigned char lasth;
138 };
139
140 #define FUZZY_STATE_NEED_LASTHASH 1u
141 #define FUZZY_STATE_SIZE_FIXED 2u
142
143 #define SSDEEP_BS(index) (((uint32_t)MIN_BLOCKSIZE) << (index))
144 #define SSDEEP_TOTAL_SIZE_MAX \
145 ((uint_least64_t)SSDEEP_BS(NUM_BLOCKHASHES-1) * SPAMSUM_LENGTH)
146
fuzzy_new(void)147 /*@only@*/ /*@null@*/ struct fuzzy_state *fuzzy_new(void)
148 {
149 struct fuzzy_state *self;
150 if(NULL == (self = malloc(sizeof(struct fuzzy_state))))
151 /* malloc sets ENOMEM */
152 return NULL;
153 self->bhstart = 0;
154 self->bhend = 1;
155 self->bhendlimit = NUM_BLOCKHASHES - 1;
156 self->bh[0].h = HASH_INIT;
157 self->bh[0].halfh = HASH_INIT;
158 self->bh[0].digest[0] = '\0';
159 self->bh[0].halfdigest = '\0';
160 self->bh[0].dindex = 0;
161 self->total_size = 0;
162 self->reduce_border = (uint_least64_t)MIN_BLOCKSIZE * SPAMSUM_LENGTH;
163 self->flags = 0;
164 self->rollmask = 0;
165 roll_init(&self->roll);
166 return self;
167 }
168
fuzzy_clone(const struct fuzzy_state * state)169 /*@only@*/ /*@null@*/ struct fuzzy_state *fuzzy_clone(const struct fuzzy_state *state)
170 {
171 struct fuzzy_state *newstate;
172 if (NULL == (newstate = malloc(sizeof(struct fuzzy_state))))
173 /* malloc sets ENOMEM */
174 return NULL;
175 memcpy(newstate, state, sizeof(struct fuzzy_state));
176 return newstate;
177 }
178
179 #ifdef S_SPLINT_S
180 extern const int EOVERFLOW;
181 #endif
182
fuzzy_set_total_input_length(struct fuzzy_state * state,uint_least64_t total_fixed_length)183 int fuzzy_set_total_input_length(struct fuzzy_state *state, uint_least64_t total_fixed_length)
184 {
185 unsigned int bi = 0;
186 if (total_fixed_length > SSDEEP_TOTAL_SIZE_MAX)
187 {
188 errno = EOVERFLOW;
189 return -1;
190 }
191 if ((state->flags & FUZZY_STATE_SIZE_FIXED) &&
192 state->fixed_size != total_fixed_length)
193 {
194 errno = EINVAL;
195 return -1;
196 }
197 state->flags |= FUZZY_STATE_SIZE_FIXED;
198 state->fixed_size = total_fixed_length;
199 while ((uint_least64_t)SSDEEP_BS(bi) * SPAMSUM_LENGTH < total_fixed_length)
200 {
201 ++bi;
202 if (bi == NUM_BLOCKHASHES - 2)
203 break;
204 }
205 ++bi;
206 state->bhendlimit = bi;
207 return 0;
208 }
209
210
fuzzy_try_fork_blockhash(struct fuzzy_state * self)211 static void fuzzy_try_fork_blockhash(struct fuzzy_state *self)
212 {
213 struct blockhash_context *obh, *nbh;
214 assert(self->bhend > 0);
215 obh = self->bh + (self->bhend - 1);
216 if (self->bhend <= self->bhendlimit)
217 {
218 nbh = obh + 1;
219 nbh->h = obh->h;
220 nbh->halfh = obh->halfh;
221 nbh->digest[0] = '\0';
222 nbh->halfdigest = '\0';
223 nbh->dindex = 0;
224 ++self->bhend;
225 }
226 else if (self->bhend == NUM_BLOCKHASHES &&
227 !(self->flags & FUZZY_STATE_NEED_LASTHASH))
228 {
229 self->flags |= FUZZY_STATE_NEED_LASTHASH;
230 self->lasth = obh->h;
231 }
232 }
233
fuzzy_try_reduce_blockhash(struct fuzzy_state * self)234 static void fuzzy_try_reduce_blockhash(struct fuzzy_state *self)
235 {
236 assert(self->bhstart < self->bhend);
237 if (self->bhend - self->bhstart < 2)
238 /* Need at least two working hashes. */
239 return;
240 if (self->reduce_border >= ((self->flags & FUZZY_STATE_SIZE_FIXED) ? self->fixed_size : self->total_size))
241 /* Initial blocksize estimate would select this or a smaller
242 * blocksize. */
243 return;
244 if (self->bh[self->bhstart + 1].dindex < SPAMSUM_LENGTH / 2)
245 /* Estimate adjustment would select this blocksize. */
246 return;
247 /* At this point we are clearly no longer interested in the
248 * start_blocksize. Get rid of it. */
249 ++self->bhstart;
250 self->reduce_border *= 2;
251 self->rollmask = self->rollmask * 2 + 1;
252 }
253
254 static const char *b64 =
255 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
256
fuzzy_engine_step(struct fuzzy_state * self,unsigned char c)257 static void fuzzy_engine_step(struct fuzzy_state *self, unsigned char c)
258 {
259 uint32_t horg, h;
260 unsigned int i;
261 /* At each character we update the rolling hash and the normal hashes.
262 * When the rolling hash hits a reset value then we emit a normal hash
263 * as a element of the signature and reset the normal hash. */
264 roll_hash(&self->roll, c);
265 horg = roll_sum(&self->roll) + 1;
266 h = horg / (uint32_t)MIN_BLOCKSIZE;
267
268 for (i = self->bhstart; i < self->bhend; ++i)
269 {
270 self->bh[i].h = sum_hash(c, self->bh[i].h);
271 self->bh[i].halfh = sum_hash(c, self->bh[i].halfh);
272 }
273 if (self->flags & FUZZY_STATE_NEED_LASTHASH)
274 self->lasth = sum_hash(c, self->lasth);
275
276 /* 0xffffffff !== -1 (mod 3) */
277 if (!horg)
278 return;
279 /* With growing blocksize almost no runs fail the next test. */
280 if (likely(h & self->rollmask))
281 return;
282 /* Delay computation of modulo as possible. */
283 if (horg % (uint32_t)MIN_BLOCKSIZE)
284 return;
285 h >>= self->bhstart;
286
287 i = self->bhstart;
288 do
289 {
290 /* We have hit a reset point. We now emit hashes which are
291 * based on all characters in the piece of the message between
292 * the last reset point and this one */
293 if (unlikely(0 == self->bh[i].dindex)) {
294 /* Can only happen 30 times. */
295 /* First step for this blocksize. Clone next. */
296 fuzzy_try_fork_blockhash(self);
297 }
298 self->bh[i].digest[self->bh[i].dindex] =
299 b64[self->bh[i].h];
300 self->bh[i].halfdigest = b64[self->bh[i].halfh];
301 if (self->bh[i].dindex < SPAMSUM_LENGTH - 1) {
302 /* We can have a problem with the tail overflowing. The
303 * easiest way to cope with this is to only reset the
304 * normal hash if we have room for more characters in
305 * our signature. This has the effect of combining the
306 * last few pieces of the message into a single piece
307 * */
308 self->bh[i].digest[++(self->bh[i].dindex)] = '\0';
309 self->bh[i].h = HASH_INIT;
310 if (self->bh[i].dindex < SPAMSUM_LENGTH / 2) {
311 self->bh[i].halfh = HASH_INIT;
312 self->bh[i].halfdigest = '\0';
313 }
314 }
315 else
316 fuzzy_try_reduce_blockhash(self);
317 if (h & 1)
318 break;
319 h >>= 1;
320 } while (++i < self->bhend);
321 }
322
fuzzy_update(struct fuzzy_state * self,const unsigned char * buffer,size_t buffer_size)323 int fuzzy_update(struct fuzzy_state *self,
324 const unsigned char *buffer,
325 size_t buffer_size)
326 {
327 if (unlikely(buffer_size > SSDEEP_TOTAL_SIZE_MAX ||
328 SSDEEP_TOTAL_SIZE_MAX - buffer_size < self->total_size )) {
329 self->total_size = SSDEEP_TOTAL_SIZE_MAX + 1;
330 }
331 else
332 self->total_size += buffer_size;
333 for ( ;buffer_size > 0; ++buffer, --buffer_size)
334 fuzzy_engine_step(self, *buffer);
335 return 0;
336 }
337
memcpy_eliminate_sequences(char * dst,const char * src,int n)338 static int memcpy_eliminate_sequences(char *dst,
339 const char *src,
340 int n)
341 {
342 const char *srcend = src + n;
343 assert(n >= 0);
344 if (src < srcend) *dst++ = *src++;
345 if (src < srcend) *dst++ = *src++;
346 if (src < srcend) *dst++ = *src++;
347 while (src < srcend)
348 {
349 if (*src == dst[-1] && *src == dst[-2] && *src == dst[-3])
350 {
351 ++src;
352 --n;
353 }
354 else
355 *dst++ = *src++;
356 }
357 return n;
358 }
359
fuzzy_digest(const struct fuzzy_state * self,char * result,unsigned int flags)360 int fuzzy_digest(const struct fuzzy_state *self,
361 /*@out@*/ char *result,
362 unsigned int flags)
363 {
364 unsigned int bi = self->bhstart;
365 uint32_t h = roll_sum(&self->roll);
366 int i, remain = FUZZY_MAX_RESULT - 1; /* Exclude terminating '\0'. */
367 /* Verify that our elimination was not overeager. */
368 assert(bi == 0 || (uint_least64_t)SSDEEP_BS(bi) / 2 * SPAMSUM_LENGTH <
369 self->total_size);
370
371 if (self->total_size > SSDEEP_TOTAL_SIZE_MAX) {
372 /* The input exceeds data types. */
373 errno = EOVERFLOW;
374 return -1;
375 }
376 /* Fixed size optimization. */
377 if ((self->flags & FUZZY_STATE_SIZE_FIXED) &&
378 self->fixed_size != self->total_size) {
379 errno = EINVAL;
380 return -1;
381 }
382 /* Initial blocksize guess. */
383 while ((uint_least64_t)SSDEEP_BS(bi) * SPAMSUM_LENGTH < self->total_size)
384 ++bi;
385 /* Adapt blocksize guess to actual digest length. */
386 if (bi >= self->bhend)
387 bi = self->bhend - 1;
388 while (bi > self->bhstart && self->bh[bi].dindex < SPAMSUM_LENGTH / 2)
389 --bi;
390 assert (!(bi > 0 && self->bh[bi].dindex < SPAMSUM_LENGTH / 2));
391
392 i = snprintf(result, (size_t)remain, "%lu:", (unsigned long)SSDEEP_BS(bi));
393 if (i <= 0)
394 /* Maybe snprintf has set errno here? */
395 return -1;
396 assert(i < remain);
397 remain -= i;
398 result += i;
399 i = (int)self->bh[bi].dindex;
400 assert(i <= remain);
401 if ((flags & FUZZY_FLAG_ELIMSEQ) != 0)
402 i = memcpy_eliminate_sequences(result, self->bh[bi].digest, i);
403 else
404 memcpy(result, self->bh[bi].digest, (size_t)i);
405 result += i;
406 remain -= i;
407 if (h != 0)
408 {
409 assert(remain > 0);
410 *result = b64[self->bh[bi].h];
411 if((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 ||
412 *result != result[-1] ||
413 *result != result[-2] ||
414 *result != result[-3]) {
415 ++result;
416 --remain;
417 }
418 }
419 else if (self->bh[bi].digest[self->bh[bi].dindex] != '\0') {
420 assert(remain > 0);
421 *result = self->bh[bi].digest[self->bh[bi].dindex];
422 if((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 ||
423 *result != result[-1] ||
424 *result != result[-2] ||
425 *result != result[-3]) {
426 ++result;
427 --remain;
428 }
429 }
430 assert(remain > 0);
431 *result++ = ':';
432 --remain;
433 if (bi < self->bhend - 1)
434 {
435 ++bi;
436 i = (int)self->bh[bi].dindex;
437 if ((flags & FUZZY_FLAG_NOTRUNC) == 0 &&
438 i > SPAMSUM_LENGTH / 2 - 1)
439 i = SPAMSUM_LENGTH / 2 - 1;
440 assert(i <= remain);
441 if ((flags & FUZZY_FLAG_ELIMSEQ) != 0)
442 i = memcpy_eliminate_sequences(result,
443 self->bh[bi].digest, i);
444 else
445 memcpy(result, self->bh[bi].digest, (size_t)i);
446 result += i;
447 remain -= i;
448 if (h != 0) {
449 assert(remain > 0);
450 h = (flags & FUZZY_FLAG_NOTRUNC) != 0 ? self->bh[bi].h :
451 self->bh[bi].halfh;
452 *result = b64[h];
453 if ((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 ||
454 *result != result[-1] ||
455 *result != result[-2] ||
456 *result != result[-3])
457 {
458 ++result;
459 --remain;
460 }
461 }
462 else {
463 i = (flags & FUZZY_FLAG_NOTRUNC) != 0 ?
464 self->bh[bi].digest[self->bh[bi].dindex] : self->bh[bi].halfdigest;
465 if (i != '\0') {
466 assert(remain > 0);
467 *result = i;
468 if ((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 ||
469 *result != result[-1] ||
470 *result != result[-2] ||
471 *result != result[-3])
472 {
473 ++result;
474 --remain;
475 }
476 }
477 }
478 }
479 else if (h != 0)
480 {
481 assert(bi == 0 || bi == NUM_BLOCKHASHES - 1);
482 assert(remain > 0);
483 if (bi == 0)
484 *result++ = b64[self->bh[bi].h];
485 else
486 *result++ = b64[self->lasth];
487 /* No need to bother with FUZZY_FLAG_ELIMSEQ, because this
488 * digest has length 1. */
489 --remain;
490 }
491 *result = '\0';
492 return 0;
493 }
494
fuzzy_free(struct fuzzy_state * self)495 void fuzzy_free(/*@only@*/ struct fuzzy_state *self)
496 {
497 free(self);
498 }
499
fuzzy_hash_buf(const unsigned char * buf,uint32_t buf_len,char * result)500 int fuzzy_hash_buf(const unsigned char *buf,
501 uint32_t buf_len,
502 /*@out@*/ char *result)
503 {
504 struct fuzzy_state *ctx;
505 int ret = -1;
506 if (NULL == (ctx = fuzzy_new()))
507 return -1;
508 if (fuzzy_set_total_input_length(ctx, buf_len) < 0)
509 goto out;
510 if (fuzzy_update(ctx, buf, buf_len) < 0)
511 goto out;
512 if (fuzzy_digest(ctx, result, 0) < 0)
513 goto out;
514 ret = 0;
515 out:
516 fuzzy_free(ctx);
517 return ret;
518 }
519
fuzzy_update_stream(struct fuzzy_state * state,FILE * handle)520 static int fuzzy_update_stream(struct fuzzy_state *state,
521 FILE *handle)
522 {
523 unsigned char buffer[4096];
524 size_t n;
525 for(;;)
526 {
527 n = fread(buffer, 1, 4096, handle);
528 if (0 == n)
529 break;
530 if (fuzzy_update(state, buffer, n) < 0)
531 return -1;
532 }
533 if (ferror(handle) != 0)
534 return -1;
535 return 0;
536 }
537
fuzzy_hash_stream(FILE * handle,char * result)538 int fuzzy_hash_stream(FILE *handle, /*@out@*/ char *result)
539 {
540 struct fuzzy_state *ctx;
541 int ret = -1;
542 if (NULL == (ctx = fuzzy_new()))
543 return -1;
544 if (fuzzy_update_stream(ctx, handle) < 0)
545 goto out;
546 if (fuzzy_digest(ctx, result, 0) < 0)
547 goto out;
548 ret = 0;
549 out:
550 fuzzy_free(ctx);
551 return ret;
552 }
553
554 #ifdef S_SPLINT_S
555 typedef size_t off_t;
556 int fseeko(FILE *, off_t, int);
557 off_t ftello(FILE *);
558 #endif
559
fuzzy_hash_file(FILE * handle,char * result)560 int fuzzy_hash_file(FILE *handle, /*@out@*/ char *result)
561 {
562 off_t fpos, fposend;
563 int status = -1;
564 struct fuzzy_state *ctx;
565 fpos = ftello(handle);
566 if (fpos < 0)
567 return -1;
568 if (fseeko(handle, 0, SEEK_END) < 0)
569 return -1;
570 fposend = ftello(handle);
571 if (fposend < 0)
572 return -1;
573 if (fseeko(handle, 0, SEEK_SET) < 0)
574 return -1;
575 if (NULL == (ctx = fuzzy_new()))
576 return -1;
577 if (fuzzy_set_total_input_length(ctx, (uint_least64_t)fposend) < 0)
578 goto out;
579 if (fuzzy_update_stream(ctx, handle) < 0)
580 goto out;
581 status = fuzzy_digest(ctx, result, 0);
582 out:
583 if (status == 0)
584 {
585 if (fseeko(handle, fpos, SEEK_SET) < 0)
586 return -1;
587 }
588 fuzzy_free(ctx);
589 return status;
590 }
591
fuzzy_hash_filename(const char * filename,char * result)592 int fuzzy_hash_filename(const char *filename, /*@out@*/ char *result)
593 {
594 int status;
595 FILE *handle = fopen(filename, "rb");
596 if (NULL == handle)
597 return -1;
598 status = fuzzy_hash_stream(handle, result);
599 /* We cannot do anything about an fclose failure. */
600 (void)fclose(handle);
601 return status;
602 }
603
604 //
605 // We only accept a match if we have at least one common substring in
606 // the signature of length ROLLING_WINDOW. This dramatically drops the
607 // false positive rate for low score thresholds while having
608 // negligable affect on the rate of spam detection.
609 //
610 // return 1 if the two strings do have a common substring, 0 otherwise
611 //
has_common_substring(const char * s1,size_t s1len,const char * s2,size_t s2len)612 static int has_common_substring(const char *s1, size_t s1len, const char *s2, size_t s2len)
613 {
614 size_t i, j;
615 uint32_t hashes[SPAMSUM_LENGTH - (ROLLING_WINDOW - 1)];
616
617 if (s1len < ROLLING_WINDOW)
618 return 0;
619 if (s2len < ROLLING_WINDOW)
620 return 0;
621
622 // there are many possible algorithms for common substring
623 // detection. In this case I am re-using the rolling hash code
624 // to act as a filter for possible substring matches
625
626 memset(hashes, 0, sizeof(hashes));
627
628 // first compute the windowed rolling hash at each offset in
629 // the first string
630 struct roll_state state;
631 roll_init (&state);
632
633 for (i = 0; i < ROLLING_WINDOW - 1; i++)
634 roll_hash(&state, (unsigned char)s1[i]);
635 for (i = ROLLING_WINDOW - 1; i < s1len; i++)
636 {
637 roll_hash(&state, (unsigned char)s1[i]);
638 hashes[i - (ROLLING_WINDOW - 1)] = roll_sum(&state);
639 }
640 s1len -= (ROLLING_WINDOW - 1);
641
642 roll_init(&state);
643
644 // now for each offset in the second string compute the
645 // rolling hash and compare it to all of the rolling hashes
646 // for the first string. If one matches then we have a
647 // candidate substring match. We then confirm that match with
648 // a direct string comparison */
649 for (j = 0; j < ROLLING_WINDOW - 1; j++)
650 roll_hash(&state, (unsigned char)s2[j]);
651 for (j = 0; j < s2len - (ROLLING_WINDOW - 1); j++)
652 {
653 roll_hash(&state, (unsigned char)s2[j + (ROLLING_WINDOW - 1)]);
654 uint32_t h = roll_sum(&state);
655 for (i = 0; i < s1len; i++)
656 {
657 // confirm the match after checking potential match
658 if (hashes[i] == h && !memcmp(s1 + i, s2 + j, ROLLING_WINDOW))
659 return 1;
660 }
661 }
662
663 return 0;
664 }
665
666
667
668 #ifdef SSDEEP_ENABLE_POSITION_ARRAY
669
670 // position array-based version of has_common_substring
has_common_substring_pa(const unsigned long long * parray,const char * s2,size_t s2len)671 static int has_common_substring_pa(const unsigned long long *parray, const char *s2, size_t s2len)
672 {
673 unsigned long long D;
674 // ROLLING_WINDOW <= s2len <= 64
675 size_t r = ROLLING_WINDOW - 1;
676 size_t l;
677 const char *ch;
678 while (r < s2len)
679 {
680 // because we want to reuse position array for s1,
681 // both s1 and s2 (in the original pseudocode) are reversed.
682 l = r - (ROLLING_WINDOW - 1);
683 ch = &s2[s2len - 1 - r];
684 D = parray[*ch - CHAR_MIN];
685 while (D)
686 {
687 r--;
688 D = (D << 1) & parray[*++ch - CHAR_MIN];
689 if (r == l && D)
690 return 1;
691 }
692 // Boyer-Moore-like skipping
693 r += ROLLING_WINDOW;
694 }
695 return 0;
696 }
697
698 // position array-based version of edit_distn
edit_distn_pa(const unsigned long long * parray,size_t s1len,const char * s2,size_t s2len)699 static int edit_distn_pa(const unsigned long long *parray, size_t s1len, const char *s2, size_t s2len)
700 {
701 unsigned long long pv, nv, ph, nh, zd, mt, x, y;
702 unsigned long long msb;
703 size_t i;
704 // 0 < s1len <= 64
705 int cur = s1len;
706 msb = 1ull << (s1len - 1);
707 pv = -1;
708 nv = 0;
709 for (i = 0; i < s2len; i++)
710 {
711 mt = parray[s2[i] - CHAR_MIN];
712 zd = (((mt & pv) + pv) ^ pv) | mt | nv;
713 nh = pv & zd;
714 if (nh & msb)
715 --cur;
716 x = nv | ~(pv | zd) | (pv & ~mt & 1ull);
717 y = (pv - nh) >> 1;
718 ph = (x + y) ^ y;
719 if (ph & msb)
720 ++cur;
721 x = (ph << 1) | 1ull;
722 nv = x & zd;
723 pv = (nh << 1) | ~(x | zd) | (x & (pv - nh));
724 }
725 return cur;
726 }
727
728 #endif
729
730
731
732 // eliminate sequences of longer than 3 identical characters. These
733 // sequences contain very little information so they tend to just bias
734 // the result unfairly
copy_eliminate_sequences(char ** out,size_t outsize,char ** in,char etoken)735 static int copy_eliminate_sequences(char **out, size_t outsize, char **in, char etoken)
736 {
737 size_t seq = 0;
738 char prev = **in, curr;
739 if (!prev || prev == etoken)
740 return 1;
741 if (!outsize--)
742 return 0;
743 *(*out)++ = prev;
744 ++(*in);
745 while (1)
746 {
747 curr = **in;
748 if (!curr || curr == etoken)
749 return 1;
750 ++(*in);
751 if (curr == prev)
752 {
753 if (++seq >= 3)
754 {
755 seq = 3;
756 continue;
757 }
758 if (!outsize--)
759 return 0;
760 *(*out)++ = curr;
761 }
762 else
763 {
764 if (!outsize--)
765 return 0;
766 *(*out)++ = curr;
767 seq = 0;
768 prev = curr;
769 }
770 }
771 // unreachable
772 return 0;
773 }
774
775 //
776 // this is the low level string scoring algorithm. It takes two strings
777 // and scores them on a scale of 0-100 where 0 is a terrible match and
778 // 100 is a great match. The block_size is used to cope with very small
779 // messages.
780 //
score_strings(const char * s1,size_t s1len,const char * s2,size_t s2len,unsigned long block_size)781 static uint32_t score_strings(const char *s1,
782 size_t s1len,
783 const char *s2,
784 size_t s2len,
785 unsigned long block_size)
786 {
787 uint32_t score;
788
789 #ifdef SSDEEP_ENABLE_POSITION_ARRAY
790 unsigned long long parray[CHAR_MAX - CHAR_MIN + 1];
791 size_t i;
792 // skip short strings
793 if (s1len < ROLLING_WINDOW)
794 return 0;
795 if (s2len < ROLLING_WINDOW)
796 return 0;
797 // construct position array for faster string algorithms
798 memset(parray, 0, sizeof(parray));
799 for (i = 0; i < s1len; i++)
800 parray[s1[i] - CHAR_MIN] |= 1ull << i;
801 // the two strings must have a common substring of length
802 // ROLLING_WINDOW to be candidates
803 if (!has_common_substring_pa(parray, s2, s2len))
804 return 0;
805 // compute the edit distance between the two strings. The edit distance gives
806 // us a pretty good idea of how closely related the two strings are
807 score = edit_distn_pa(parray, s1len, s2, s2len);
808 #else
809 // the two strings must have a common substring of length
810 // ROLLING_WINDOW to be candidates
811 if (!has_common_substring(s1, s1len, s2, s2len))
812 return 0;
813 // compute the edit distance between the two strings. The edit distance gives
814 // us a pretty good idea of how closely related the two strings are
815 score = edit_distn(s1, s1len, s2, s2len);
816 #endif
817
818 // scale the edit distance by the lengths of the two
819 // strings. This changes the score to be a measure of the
820 // proportion of the message that has changed rather than an
821 // absolute quantity. It also copes with the variability of
822 // the string lengths.
823 score = (score * SPAMSUM_LENGTH) / (s1len + s2len);
824
825 // at this stage the score occurs roughly on a 0-SPAMSUM_LENGTH scale,
826 // with 0 being a good match and SPAMSUM_LENGTH being a complete
827 // mismatch
828
829 // rescale to a 0-100 scale (friendlier to humans)
830 score = (100 * score) / SPAMSUM_LENGTH;
831
832 // now re-scale on a 0-100 scale with 0 being a poor match and
833 // 100 being a excellent match.
834 score = 100 - score;
835
836 // printf ("s1len: %"PRIu32" s2len: %"PRIu32"\n", (uint32_t)s1len, (uint32_t)s2len);
837
838 // when the blocksize is small we don't want to exaggerate the match size
839 if (block_size >= (99 + ROLLING_WINDOW) / ROLLING_WINDOW * MIN_BLOCKSIZE)
840 return score;
841 if (score > block_size/MIN_BLOCKSIZE * MIN(s1len, s2len))
842 {
843 score = block_size/MIN_BLOCKSIZE * MIN(s1len, s2len);
844 }
845 return score;
846 }
847
848 //
849 // Given two spamsum strings return a value indicating the degree
850 // to which they match.
851 //
fuzzy_compare(const char * str1,const char * str2)852 int fuzzy_compare(const char *str1, const char *str2)
853 {
854 unsigned long block_size1, block_size2;
855 uint32_t score = 0;
856 size_t s1b1len, s1b2len, s2b1len, s2b2len;
857 char s1b1[SPAMSUM_LENGTH], s1b2[SPAMSUM_LENGTH];
858 char s2b1[SPAMSUM_LENGTH], s2b2[SPAMSUM_LENGTH];
859 char *s1p, *s2p, *tmp;
860
861 if (NULL == str1 || NULL == str2)
862 return -1;
863
864 // each spamsum is prefixed by its block size
865 if (sscanf(str1, "%lu:", &block_size1) != 1 ||
866 sscanf(str2, "%lu:", &block_size2) != 1) {
867 return -1;
868 }
869
870 // if the blocksizes don't match then we are comparing
871 // apples to oranges. This isn't an 'error' per se. We could
872 // have two valid signatures, but they can't be compared.
873 if (block_size1 != block_size2 &&
874 (block_size1 > ULONG_MAX / 2 || block_size1*2 != block_size2) &&
875 (block_size1 % 2 == 1 || block_size1 / 2 != block_size2)) {
876 return 0;
877 }
878
879 // move past the prefix
880 s1p = strchr(str1, ':');
881 s2p = strchr(str2, ':');
882
883 if (!s1p || !s2p) {
884 // badly formed ...
885 return -1;
886 }
887
888 // there is very little information content is sequences of
889 // the same character like 'LLLLL'. Eliminate any sequences
890 // longer than 3 while reading two pieces.
891 // This is especially important when combined with the
892 // has_common_substring() test at score_strings().
893
894 // read the first digest
895 ++s1p;
896 tmp = s1b1;
897 if (!copy_eliminate_sequences(&tmp, SPAMSUM_LENGTH, &s1p, ':'))
898 return -1;
899 s1b1len = tmp - s1b1;
900 if (!*s1p++) {
901 // a signature is malformed - it doesn't have 2 parts
902 return -1;
903 }
904 tmp = s1b2;
905 if (!copy_eliminate_sequences(&tmp, SPAMSUM_LENGTH, &s1p, ','))
906 return -1;
907 s1b2len = tmp - s1b2;
908
909 // read the second digest
910 ++s2p;
911 tmp = s2b1;
912 if (!copy_eliminate_sequences(&tmp, SPAMSUM_LENGTH, &s2p, ':'))
913 return -1;
914 s2b1len = tmp - s2b1;
915 if (!*s2p++) {
916 // a signature is malformed - it doesn't have 2 parts
917 return -1;
918 }
919 tmp = s2b2;
920 if (!copy_eliminate_sequences(&tmp, SPAMSUM_LENGTH, &s2p, ','))
921 return -1;
922 s2b2len = tmp - s2b2;
923
924 // Now that we know the strings are both well formed, are they
925 // identical? We could save ourselves some work here
926 if (block_size1 == block_size2 && s1b1len == s2b1len && s1b2len == s2b2len) {
927 if (!memcmp(s1b1, s2b1, s1b1len) && !memcmp(s1b2, s2b2, s1b2len)) {
928 return 100;
929 }
930 }
931
932 // each signature has a string for two block sizes. We now
933 // choose how to combine the two block sizes. We checked above
934 // that they have at least one block size in common
935 if (block_size1 <= ULONG_MAX / 2) {
936 if (block_size1 == block_size2) {
937 uint32_t score1, score2;
938 score1 = score_strings(s1b1, s1b1len, s2b1, s2b1len, block_size1);
939 score2 = score_strings(s1b2, s1b2len, s2b2, s2b2len, block_size1*2);
940 score = MAX(score1, score2);
941 }
942 else if (block_size1 * 2 == block_size2) {
943 score = score_strings(s2b1, s2b1len, s1b2, s1b2len, block_size2);
944 }
945 else {
946 score = score_strings(s1b1, s1b1len, s2b2, s2b2len, block_size1);
947 }
948 }
949 else {
950 if (block_size1 == block_size2) {
951 score = score_strings(s1b1, s1b1len, s2b1, s2b1len, block_size1);
952 }
953 else if (block_size1 % 2 == 0 && block_size1 / 2 == block_size2) {
954 score = score_strings(s1b1, s1b1len, s2b2, s2b2len, block_size1);
955 }
956 else {
957 score = 0;
958 }
959 }
960
961 return (int)score;
962 }
963