1 /* ssdeep
2  * Copyright (C) 2002 Andrew Tridgell <tridge@samba.org>
3  * Copyright (C) 2006 ManTech International Corporation
4  * Copyright (C) 2013 Helmut Grohne <helmut@subdivi.de>
5  * Copyright (C) 2017 Tsukasa OI <floss_ssdeep@irq.a4lg.com>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
20  *
21  * Earlier versions of this code were named fuzzy.c and can be found at:
22  *     http://www.samba.org/ftp/unpacked/junkcode/spamsum/
23  *     http://ssdeep.sf.net/
24  */
25 
26 #include "main.h"
27 
28 #include <assert.h>
29 #include <errno.h>
30 #include <limits.h>
31 #include <stdint.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <string.h>
35 
36 #include "fuzzy.h"
37 #include "edit_dist.h"
38 
39 #if defined(__GNUC__) && __GNUC__ >= 3
40 #define likely(x)       __builtin_expect(!!(x), 1)
41 #define unlikely(x)     __builtin_expect(!!(x), 0)
42 #else
43 #define likely(x) x
44 #define unlikely(x) x
45 #endif
46 
47 #define ROLLING_WINDOW 7
48 #define MIN_BLOCKSIZE 3
49 #define HASH_INIT 0x27
50 #define NUM_BLOCKHASHES 31
51 
52 // Enable bit-parallel string processing only if bit-parallel algorithms
53 // are enabled and considered to be efficient.
54 #if !defined(SSDEEP_DISABLE_POSITION_ARRAY) || !SSDEEP_DISABLE_POSITION_ARRAY
55 #if SPAMSUM_LENGTH <= 64 && CHAR_MIN >= -256 && CHAR_MAX <= 256 && (CHAR_MAX - CHAR_MIN + 1) <= 256
56 #define SSDEEP_ENABLE_POSITION_ARRAY
57 #endif
58 #endif
59 
60 struct roll_state
61 {
62   unsigned char window[ROLLING_WINDOW];
63   uint32_t h1, h2, h3;
64   uint32_t n;
65 };
66 
roll_init(struct roll_state * self)67 static void roll_init(/*@out@*/ struct roll_state *self)
68 {
69   memset(self, 0, sizeof(struct roll_state));
70 }
71 
72 /*
73  * a rolling hash, based on the Adler checksum. By using a rolling hash
74  * we can perform auto resynchronisation after inserts/deletes
75 
76  * internally, h1 is the sum of the bytes in the window and h2
77  * is the sum of the bytes times the index
78 
79  * h3 is a shift/xor based rolling hash, and is mostly needed to ensure that
80  * we can cope with large blocksize values
81  */
roll_hash(struct roll_state * self,unsigned char c)82 static void roll_hash(struct roll_state *self, unsigned char c)
83 {
84   self->h2 -= self->h1;
85   self->h2 += ROLLING_WINDOW * (uint32_t)c;
86 
87   self->h1 += (uint32_t)c;
88   self->h1 -= (uint32_t)self->window[self->n];
89 
90   self->window[self->n] = c;
91   self->n++;
92   if (self->n == ROLLING_WINDOW)
93     self->n = 0;
94 
95   /* The original spamsum AND'ed this value with 0xFFFFFFFF which
96    * in theory should have no effect. This AND has been removed
97    * for performance (jk) */
98   self->h3 <<= 5;
99   self->h3 ^= c;
100 }
101 
roll_sum(const struct roll_state * self)102 static uint32_t roll_sum(const struct roll_state *self)
103 {
104   return self->h1 + self->h2 + self->h3;
105 }
106 
107 /* A simple non-rolling hash, based on the FNV hash. */
108 #include "sum_table.h"
sum_hash(unsigned char c,unsigned char h)109 static unsigned char sum_hash(unsigned char c, unsigned char h)
110 {
111   return sum_table[h][c & 0x3f];
112 }
113 
114 /* A blockhash contains a signature state for a specific (implicit) blocksize.
115  * The blocksize is given by SSDEEP_BS(index). The h and halfh members are the
116  * partial FNV hashes, where halfh stops to be reset after digest is
117  * SPAMSUM_LENGTH/2 long. The halfh hash is needed be able to truncate digest
118  * for the second output hash to stay compatible with ssdeep output. */
119 struct blockhash_context
120 {
121   unsigned int dindex;
122   char digest[SPAMSUM_LENGTH];
123   char halfdigest;
124   char h, halfh;
125 };
126 
127 struct fuzzy_state
128 {
129   uint_least64_t total_size;
130   uint_least64_t fixed_size;
131   uint_least64_t reduce_border;
132   unsigned int bhstart, bhend, bhendlimit;
133   unsigned int flags;
134   uint32_t rollmask;
135   struct blockhash_context bh[NUM_BLOCKHASHES];
136   struct roll_state roll;
137   unsigned char lasth;
138 };
139 
140 #define FUZZY_STATE_NEED_LASTHASH  1u
141 #define FUZZY_STATE_SIZE_FIXED     2u
142 
143 #define SSDEEP_BS(index) (((uint32_t)MIN_BLOCKSIZE) << (index))
144 #define SSDEEP_TOTAL_SIZE_MAX \
145   ((uint_least64_t)SSDEEP_BS(NUM_BLOCKHASHES-1) * SPAMSUM_LENGTH)
146 
fuzzy_new(void)147 /*@only@*/ /*@null@*/ struct fuzzy_state *fuzzy_new(void)
148 {
149   struct fuzzy_state *self;
150   if(NULL == (self = malloc(sizeof(struct fuzzy_state))))
151     /* malloc sets ENOMEM */
152     return NULL;
153   self->bhstart = 0;
154   self->bhend = 1;
155   self->bhendlimit = NUM_BLOCKHASHES - 1;
156   self->bh[0].h = HASH_INIT;
157   self->bh[0].halfh = HASH_INIT;
158   self->bh[0].digest[0] = '\0';
159   self->bh[0].halfdigest = '\0';
160   self->bh[0].dindex = 0;
161   self->total_size = 0;
162   self->reduce_border = (uint_least64_t)MIN_BLOCKSIZE * SPAMSUM_LENGTH;
163   self->flags = 0;
164   self->rollmask = 0;
165   roll_init(&self->roll);
166   return self;
167 }
168 
fuzzy_clone(const struct fuzzy_state * state)169 /*@only@*/ /*@null@*/ struct fuzzy_state *fuzzy_clone(const struct fuzzy_state *state)
170 {
171   struct fuzzy_state *newstate;
172   if (NULL == (newstate = malloc(sizeof(struct fuzzy_state))))
173     /* malloc sets ENOMEM */
174     return NULL;
175   memcpy(newstate, state, sizeof(struct fuzzy_state));
176   return newstate;
177 }
178 
179 #ifdef S_SPLINT_S
180 extern const int EOVERFLOW;
181 #endif
182 
fuzzy_set_total_input_length(struct fuzzy_state * state,uint_least64_t total_fixed_length)183 int fuzzy_set_total_input_length(struct fuzzy_state *state, uint_least64_t total_fixed_length)
184 {
185   unsigned int bi = 0;
186   if (total_fixed_length > SSDEEP_TOTAL_SIZE_MAX)
187   {
188     errno = EOVERFLOW;
189     return -1;
190   }
191   if ((state->flags & FUZZY_STATE_SIZE_FIXED) &&
192       state->fixed_size != total_fixed_length)
193   {
194     errno = EINVAL;
195     return -1;
196   }
197   state->flags |= FUZZY_STATE_SIZE_FIXED;
198   state->fixed_size = total_fixed_length;
199   while ((uint_least64_t)SSDEEP_BS(bi) * SPAMSUM_LENGTH < total_fixed_length)
200   {
201     ++bi;
202     if (bi == NUM_BLOCKHASHES - 2)
203       break;
204   }
205   ++bi;
206   state->bhendlimit = bi;
207   return 0;
208 }
209 
210 
fuzzy_try_fork_blockhash(struct fuzzy_state * self)211 static void fuzzy_try_fork_blockhash(struct fuzzy_state *self)
212 {
213   struct blockhash_context *obh, *nbh;
214   assert(self->bhend > 0);
215   obh = self->bh + (self->bhend - 1);
216   if (self->bhend <= self->bhendlimit)
217   {
218     nbh = obh + 1;
219     nbh->h = obh->h;
220     nbh->halfh = obh->halfh;
221     nbh->digest[0] = '\0';
222     nbh->halfdigest = '\0';
223     nbh->dindex = 0;
224     ++self->bhend;
225   }
226   else if (self->bhend == NUM_BLOCKHASHES &&
227 	   !(self->flags & FUZZY_STATE_NEED_LASTHASH))
228   {
229     self->flags |= FUZZY_STATE_NEED_LASTHASH;
230     self->lasth = obh->h;
231   }
232 }
233 
fuzzy_try_reduce_blockhash(struct fuzzy_state * self)234 static void fuzzy_try_reduce_blockhash(struct fuzzy_state *self)
235 {
236   assert(self->bhstart < self->bhend);
237   if (self->bhend - self->bhstart < 2)
238     /* Need at least two working hashes. */
239     return;
240   if (self->reduce_border >= ((self->flags & FUZZY_STATE_SIZE_FIXED) ? self->fixed_size : self->total_size))
241     /* Initial blocksize estimate would select this or a smaller
242      * blocksize. */
243     return;
244   if (self->bh[self->bhstart + 1].dindex < SPAMSUM_LENGTH / 2)
245     /* Estimate adjustment would select this blocksize. */
246     return;
247   /* At this point we are clearly no longer interested in the
248    * start_blocksize. Get rid of it. */
249   ++self->bhstart;
250   self->reduce_border *= 2;
251   self->rollmask = self->rollmask * 2 + 1;
252 }
253 
254 static const char *b64 =
255   "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
256 
fuzzy_engine_step(struct fuzzy_state * self,unsigned char c)257 static void fuzzy_engine_step(struct fuzzy_state *self, unsigned char c)
258 {
259   uint32_t horg, h;
260   unsigned int i;
261   /* At each character we update the rolling hash and the normal hashes.
262    * When the rolling hash hits a reset value then we emit a normal hash
263    * as a element of the signature and reset the normal hash. */
264   roll_hash(&self->roll, c);
265   horg = roll_sum(&self->roll) + 1;
266   h = horg / (uint32_t)MIN_BLOCKSIZE;
267 
268   for (i = self->bhstart; i < self->bhend; ++i)
269   {
270     self->bh[i].h = sum_hash(c, self->bh[i].h);
271     self->bh[i].halfh = sum_hash(c, self->bh[i].halfh);
272   }
273   if (self->flags & FUZZY_STATE_NEED_LASTHASH)
274     self->lasth = sum_hash(c, self->lasth);
275 
276   /* 0xffffffff !== -1 (mod 3) */
277   if (!horg)
278     return;
279   /* With growing blocksize almost no runs fail the next test. */
280   if (likely(h & self->rollmask))
281     return;
282   /* Delay computation of modulo as possible. */
283   if (horg % (uint32_t)MIN_BLOCKSIZE)
284     return;
285   h >>= self->bhstart;
286 
287   i = self->bhstart;
288   do
289   {
290     /* We have hit a reset point. We now emit hashes which are
291      * based on all characters in the piece of the message between
292      * the last reset point and this one */
293     if (unlikely(0 == self->bh[i].dindex)) {
294       /* Can only happen 30 times. */
295       /* First step for this blocksize. Clone next. */
296       fuzzy_try_fork_blockhash(self);
297     }
298     self->bh[i].digest[self->bh[i].dindex] =
299       b64[self->bh[i].h];
300     self->bh[i].halfdigest = b64[self->bh[i].halfh];
301     if (self->bh[i].dindex < SPAMSUM_LENGTH - 1) {
302       /* We can have a problem with the tail overflowing. The
303        * easiest way to cope with this is to only reset the
304        * normal hash if we have room for more characters in
305        * our signature. This has the effect of combining the
306        * last few pieces of the message into a single piece
307        * */
308       self->bh[i].digest[++(self->bh[i].dindex)] = '\0';
309       self->bh[i].h = HASH_INIT;
310       if (self->bh[i].dindex < SPAMSUM_LENGTH / 2) {
311 	self->bh[i].halfh = HASH_INIT;
312 	self->bh[i].halfdigest = '\0';
313       }
314     }
315     else
316       fuzzy_try_reduce_blockhash(self);
317     if (h & 1)
318       break;
319     h >>= 1;
320   } while (++i < self->bhend);
321 }
322 
fuzzy_update(struct fuzzy_state * self,const unsigned char * buffer,size_t buffer_size)323 int fuzzy_update(struct fuzzy_state *self,
324 		 const unsigned char *buffer,
325 		 size_t buffer_size)
326 {
327   if (unlikely(buffer_size > SSDEEP_TOTAL_SIZE_MAX ||
328       SSDEEP_TOTAL_SIZE_MAX - buffer_size < self->total_size )) {
329     self->total_size = SSDEEP_TOTAL_SIZE_MAX + 1;
330   }
331   else
332     self->total_size += buffer_size;
333   for ( ;buffer_size > 0; ++buffer, --buffer_size)
334     fuzzy_engine_step(self, *buffer);
335   return 0;
336 }
337 
memcpy_eliminate_sequences(char * dst,const char * src,int n)338 static int memcpy_eliminate_sequences(char *dst,
339 				      const char *src,
340 				      int n)
341 {
342   const char *srcend = src + n;
343   assert(n >= 0);
344   if (src < srcend) *dst++ = *src++;
345   if (src < srcend) *dst++ = *src++;
346   if (src < srcend) *dst++ = *src++;
347   while (src < srcend)
348   {
349     if (*src == dst[-1] && *src == dst[-2] && *src == dst[-3])
350     {
351       ++src;
352       --n;
353     }
354     else
355       *dst++ = *src++;
356   }
357   return n;
358 }
359 
fuzzy_digest(const struct fuzzy_state * self,char * result,unsigned int flags)360 int fuzzy_digest(const struct fuzzy_state *self,
361 		 /*@out@*/ char *result,
362 		 unsigned int flags)
363 {
364   unsigned int bi = self->bhstart;
365   uint32_t h = roll_sum(&self->roll);
366   int i, remain = FUZZY_MAX_RESULT - 1; /* Exclude terminating '\0'. */
367   /* Verify that our elimination was not overeager. */
368   assert(bi == 0 || (uint_least64_t)SSDEEP_BS(bi) / 2 * SPAMSUM_LENGTH <
369 	 self->total_size);
370 
371   if (self->total_size > SSDEEP_TOTAL_SIZE_MAX) {
372     /* The input exceeds data types. */
373     errno = EOVERFLOW;
374     return -1;
375   }
376   /* Fixed size optimization. */
377   if ((self->flags & FUZZY_STATE_SIZE_FIXED) &&
378       self->fixed_size != self->total_size) {
379     errno = EINVAL;
380     return -1;
381   }
382   /* Initial blocksize guess. */
383   while ((uint_least64_t)SSDEEP_BS(bi) * SPAMSUM_LENGTH < self->total_size)
384     ++bi;
385   /* Adapt blocksize guess to actual digest length. */
386   if (bi >= self->bhend)
387     bi = self->bhend - 1;
388   while (bi > self->bhstart && self->bh[bi].dindex < SPAMSUM_LENGTH / 2)
389     --bi;
390   assert (!(bi > 0 && self->bh[bi].dindex < SPAMSUM_LENGTH / 2));
391 
392   i = snprintf(result, (size_t)remain, "%lu:", (unsigned long)SSDEEP_BS(bi));
393   if (i <= 0)
394     /* Maybe snprintf has set errno here? */
395     return -1;
396   assert(i < remain);
397   remain -= i;
398   result += i;
399   i = (int)self->bh[bi].dindex;
400   assert(i <= remain);
401   if ((flags & FUZZY_FLAG_ELIMSEQ) != 0)
402     i = memcpy_eliminate_sequences(result, self->bh[bi].digest, i);
403   else
404     memcpy(result, self->bh[bi].digest, (size_t)i);
405   result += i;
406   remain -= i;
407   if (h != 0)
408   {
409     assert(remain > 0);
410     *result = b64[self->bh[bi].h];
411     if((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 ||
412        *result != result[-1] ||
413        *result != result[-2] ||
414        *result != result[-3]) {
415       ++result;
416       --remain;
417     }
418   }
419   else if (self->bh[bi].digest[self->bh[bi].dindex] != '\0') {
420     assert(remain > 0);
421     *result = self->bh[bi].digest[self->bh[bi].dindex];
422     if((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 ||
423        *result != result[-1] ||
424        *result != result[-2] ||
425        *result != result[-3]) {
426       ++result;
427       --remain;
428     }
429   }
430   assert(remain > 0);
431   *result++ = ':';
432   --remain;
433   if (bi < self->bhend - 1)
434   {
435     ++bi;
436     i = (int)self->bh[bi].dindex;
437     if ((flags & FUZZY_FLAG_NOTRUNC) == 0 &&
438 	i > SPAMSUM_LENGTH / 2 - 1)
439       i = SPAMSUM_LENGTH / 2 - 1;
440     assert(i <= remain);
441     if ((flags & FUZZY_FLAG_ELIMSEQ) != 0)
442       i = memcpy_eliminate_sequences(result,
443 				     self->bh[bi].digest, i);
444     else
445       memcpy(result, self->bh[bi].digest, (size_t)i);
446     result += i;
447     remain -= i;
448     if (h != 0) {
449       assert(remain > 0);
450       h = (flags & FUZZY_FLAG_NOTRUNC) != 0 ? self->bh[bi].h :
451 	self->bh[bi].halfh;
452       *result = b64[h];
453       if ((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 ||
454 	  *result != result[-1] ||
455 	  *result != result[-2] ||
456 	  *result != result[-3])
457       {
458 	++result;
459 	--remain;
460       }
461     }
462     else {
463       i = (flags & FUZZY_FLAG_NOTRUNC) != 0 ?
464 	  self->bh[bi].digest[self->bh[bi].dindex] : self->bh[bi].halfdigest;
465       if (i != '\0') {
466 	assert(remain > 0);
467 	*result = i;
468 	if ((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 ||
469 	    *result != result[-1] ||
470 	    *result != result[-2] ||
471 	    *result != result[-3])
472 	{
473 	  ++result;
474 	  --remain;
475 	}
476       }
477     }
478   }
479   else if (h != 0)
480   {
481     assert(bi == 0 || bi == NUM_BLOCKHASHES - 1);
482     assert(remain > 0);
483     if (bi == 0)
484       *result++ = b64[self->bh[bi].h];
485     else
486       *result++ = b64[self->lasth];
487     /* No need to bother with FUZZY_FLAG_ELIMSEQ, because this
488      * digest has length 1. */
489     --remain;
490   }
491   *result = '\0';
492   return 0;
493 }
494 
fuzzy_free(struct fuzzy_state * self)495 void fuzzy_free(/*@only@*/ struct fuzzy_state *self)
496 {
497   free(self);
498 }
499 
fuzzy_hash_buf(const unsigned char * buf,uint32_t buf_len,char * result)500 int fuzzy_hash_buf(const unsigned char *buf,
501 		   uint32_t buf_len,
502 		   /*@out@*/ char *result)
503 {
504   struct fuzzy_state *ctx;
505   int ret = -1;
506   if (NULL == (ctx = fuzzy_new()))
507     return -1;
508   if (fuzzy_set_total_input_length(ctx, buf_len) < 0)
509     goto out;
510   if (fuzzy_update(ctx, buf, buf_len) < 0)
511     goto out;
512   if (fuzzy_digest(ctx, result, 0) < 0)
513     goto out;
514   ret = 0;
515  out:
516   fuzzy_free(ctx);
517   return ret;
518 }
519 
fuzzy_update_stream(struct fuzzy_state * state,FILE * handle)520 static int fuzzy_update_stream(struct fuzzy_state *state,
521 			       FILE *handle)
522 {
523   unsigned char buffer[4096];
524   size_t n;
525   for(;;)
526   {
527     n = fread(buffer, 1, 4096, handle);
528     if (0 == n)
529       break;
530     if (fuzzy_update(state, buffer, n) < 0)
531       return -1;
532   }
533   if (ferror(handle) != 0)
534     return -1;
535   return 0;
536 }
537 
fuzzy_hash_stream(FILE * handle,char * result)538 int fuzzy_hash_stream(FILE *handle, /*@out@*/ char *result)
539 {
540   struct fuzzy_state *ctx;
541   int ret = -1;
542   if (NULL == (ctx = fuzzy_new()))
543     return -1;
544   if (fuzzy_update_stream(ctx, handle) < 0)
545     goto out;
546   if (fuzzy_digest(ctx, result, 0) < 0)
547     goto out;
548   ret = 0;
549  out:
550   fuzzy_free(ctx);
551   return ret;
552 }
553 
554 #ifdef S_SPLINT_S
555 typedef size_t off_t;
556 int fseeko(FILE *, off_t, int);
557 off_t ftello(FILE *);
558 #endif
559 
fuzzy_hash_file(FILE * handle,char * result)560 int fuzzy_hash_file(FILE *handle, /*@out@*/ char *result)
561 {
562   off_t fpos, fposend;
563   int status = -1;
564   struct fuzzy_state *ctx;
565   fpos = ftello(handle);
566   if (fpos < 0)
567     return -1;
568   if (fseeko(handle, 0, SEEK_END) < 0)
569     return -1;
570   fposend = ftello(handle);
571   if (fposend < 0)
572     return -1;
573   if (fseeko(handle, 0, SEEK_SET) < 0)
574     return -1;
575   if (NULL == (ctx = fuzzy_new()))
576     return -1;
577   if (fuzzy_set_total_input_length(ctx, (uint_least64_t)fposend) < 0)
578     goto out;
579   if (fuzzy_update_stream(ctx, handle) < 0)
580     goto out;
581   status = fuzzy_digest(ctx, result, 0);
582 out:
583   if (status == 0)
584   {
585     if (fseeko(handle, fpos, SEEK_SET) < 0)
586       return -1;
587   }
588   fuzzy_free(ctx);
589   return status;
590 }
591 
fuzzy_hash_filename(const char * filename,char * result)592 int fuzzy_hash_filename(const char *filename, /*@out@*/ char *result)
593 {
594   int status;
595   FILE *handle = fopen(filename, "rb");
596   if (NULL == handle)
597     return -1;
598   status = fuzzy_hash_stream(handle, result);
599   /* We cannot do anything about an fclose failure. */
600   (void)fclose(handle);
601   return status;
602 }
603 
604 //
605 // We only accept a match if we have at least one common substring in
606 // the signature of length ROLLING_WINDOW. This dramatically drops the
607 // false positive rate for low score thresholds while having
608 // negligable affect on the rate of spam detection.
609 //
610 // return 1 if the two strings do have a common substring, 0 otherwise
611 //
has_common_substring(const char * s1,size_t s1len,const char * s2,size_t s2len)612 static int has_common_substring(const char *s1, size_t s1len, const char *s2, size_t s2len)
613 {
614   size_t i, j;
615   uint32_t hashes[SPAMSUM_LENGTH - (ROLLING_WINDOW - 1)];
616 
617   if (s1len < ROLLING_WINDOW)
618     return 0;
619   if (s2len < ROLLING_WINDOW)
620     return 0;
621 
622   // there are many possible algorithms for common substring
623   // detection. In this case I am re-using the rolling hash code
624   // to act as a filter for possible substring matches
625 
626   memset(hashes, 0, sizeof(hashes));
627 
628   // first compute the windowed rolling hash at each offset in
629   // the first string
630   struct roll_state state;
631   roll_init (&state);
632 
633   for (i = 0; i < ROLLING_WINDOW - 1; i++)
634     roll_hash(&state, (unsigned char)s1[i]);
635   for (i = ROLLING_WINDOW - 1; i < s1len; i++)
636   {
637     roll_hash(&state, (unsigned char)s1[i]);
638     hashes[i - (ROLLING_WINDOW - 1)] = roll_sum(&state);
639   }
640   s1len -= (ROLLING_WINDOW - 1);
641 
642   roll_init(&state);
643 
644   // now for each offset in the second string compute the
645   // rolling hash and compare it to all of the rolling hashes
646   // for the first string. If one matches then we have a
647   // candidate substring match. We then confirm that match with
648   // a direct string comparison */
649   for (j = 0; j < ROLLING_WINDOW - 1; j++)
650     roll_hash(&state, (unsigned char)s2[j]);
651   for (j = 0; j < s2len - (ROLLING_WINDOW - 1); j++)
652   {
653     roll_hash(&state, (unsigned char)s2[j + (ROLLING_WINDOW - 1)]);
654     uint32_t h = roll_sum(&state);
655     for (i = 0; i < s1len; i++)
656     {
657       // confirm the match after checking potential match
658       if (hashes[i] == h && !memcmp(s1 + i, s2 + j, ROLLING_WINDOW))
659 	  return 1;
660     }
661   }
662 
663   return 0;
664 }
665 
666 
667 
668 #ifdef SSDEEP_ENABLE_POSITION_ARRAY
669 
670 // position array-based version of has_common_substring
has_common_substring_pa(const unsigned long long * parray,const char * s2,size_t s2len)671 static int has_common_substring_pa(const unsigned long long *parray, const char *s2, size_t s2len)
672 {
673   unsigned long long D;
674   // ROLLING_WINDOW <= s2len <= 64
675   size_t r = ROLLING_WINDOW - 1;
676   size_t l;
677   const char *ch;
678   while (r < s2len)
679   {
680     // because we want to reuse position array for s1,
681     // both s1 and s2 (in the original pseudocode) are reversed.
682     l = r - (ROLLING_WINDOW - 1);
683     ch = &s2[s2len - 1 - r];
684     D = parray[*ch - CHAR_MIN];
685     while (D)
686     {
687       r--;
688       D = (D << 1) & parray[*++ch - CHAR_MIN];
689       if (r == l && D)
690 	return 1;
691     }
692     // Boyer-Moore-like skipping
693     r += ROLLING_WINDOW;
694   }
695   return 0;
696 }
697 
698 // position array-based version of edit_distn
edit_distn_pa(const unsigned long long * parray,size_t s1len,const char * s2,size_t s2len)699 static int edit_distn_pa(const unsigned long long *parray, size_t s1len, const char *s2, size_t s2len)
700 {
701   unsigned long long pv, nv, ph, nh, zd, mt, x, y;
702   unsigned long long msb;
703   size_t i;
704   // 0 < s1len <= 64
705   int cur = s1len;
706   msb = 1ull << (s1len - 1);
707   pv = -1;
708   nv = 0;
709   for (i = 0; i < s2len; i++)
710   {
711     mt = parray[s2[i] - CHAR_MIN];
712     zd = (((mt & pv) + pv) ^ pv) | mt | nv;
713     nh = pv & zd;
714     if (nh & msb)
715       --cur;
716     x  = nv | ~(pv | zd) | (pv & ~mt & 1ull);
717     y  = (pv - nh) >> 1;
718     ph = (x + y) ^ y;
719     if (ph & msb)
720       ++cur;
721     x  = (ph << 1) | 1ull;
722     nv = x & zd;
723     pv = (nh << 1) | ~(x | zd) | (x & (pv - nh));
724   }
725   return cur;
726 }
727 
728 #endif
729 
730 
731 
732 // eliminate sequences of longer than 3 identical characters. These
733 // sequences contain very little information so they tend to just bias
734 // the result unfairly
copy_eliminate_sequences(char ** out,size_t outsize,char ** in,char etoken)735 static int copy_eliminate_sequences(char **out, size_t outsize, char **in, char etoken)
736 {
737   size_t seq = 0;
738   char prev = **in, curr;
739   if (!prev || prev == etoken)
740     return 1;
741   if (!outsize--)
742     return 0;
743   *(*out)++ = prev;
744   ++(*in);
745   while (1)
746   {
747     curr = **in;
748     if (!curr || curr == etoken)
749       return 1;
750     ++(*in);
751     if (curr == prev)
752     {
753       if (++seq >= 3)
754       {
755 	seq = 3;
756 	continue;
757       }
758       if (!outsize--)
759 	return 0;
760       *(*out)++ = curr;
761     }
762     else
763     {
764       if (!outsize--)
765 	return 0;
766       *(*out)++ = curr;
767       seq = 0;
768       prev = curr;
769     }
770   }
771   // unreachable
772   return 0;
773 }
774 
775 //
776 // this is the low level string scoring algorithm. It takes two strings
777 // and scores them on a scale of 0-100 where 0 is a terrible match and
778 // 100 is a great match. The block_size is used to cope with very small
779 // messages.
780 //
score_strings(const char * s1,size_t s1len,const char * s2,size_t s2len,unsigned long block_size)781 static uint32_t score_strings(const char *s1,
782 			      size_t      s1len,
783 			      const char *s2,
784 			      size_t      s2len,
785 			      unsigned long block_size)
786 {
787   uint32_t score;
788 
789 #ifdef SSDEEP_ENABLE_POSITION_ARRAY
790   unsigned long long parray[CHAR_MAX - CHAR_MIN + 1];
791   size_t i;
792   // skip short strings
793   if (s1len < ROLLING_WINDOW)
794     return 0;
795   if (s2len < ROLLING_WINDOW)
796     return 0;
797   // construct position array for faster string algorithms
798   memset(parray, 0, sizeof(parray));
799   for (i = 0; i < s1len; i++)
800     parray[s1[i] - CHAR_MIN] |= 1ull << i;
801   // the two strings must have a common substring of length
802   // ROLLING_WINDOW to be candidates
803   if (!has_common_substring_pa(parray, s2, s2len))
804     return 0;
805   // compute the edit distance between the two strings. The edit distance gives
806   // us a pretty good idea of how closely related the two strings are
807   score = edit_distn_pa(parray, s1len, s2, s2len);
808 #else
809   // the two strings must have a common substring of length
810   // ROLLING_WINDOW to be candidates
811   if (!has_common_substring(s1, s1len, s2, s2len))
812     return 0;
813   // compute the edit distance between the two strings. The edit distance gives
814   // us a pretty good idea of how closely related the two strings are
815   score = edit_distn(s1, s1len, s2, s2len);
816 #endif
817 
818   // scale the edit distance by the lengths of the two
819   // strings. This changes the score to be a measure of the
820   // proportion of the message that has changed rather than an
821   // absolute quantity. It also copes with the variability of
822   // the string lengths.
823   score = (score * SPAMSUM_LENGTH) / (s1len + s2len);
824 
825   // at this stage the score occurs roughly on a 0-SPAMSUM_LENGTH scale,
826   // with 0 being a good match and SPAMSUM_LENGTH being a complete
827   // mismatch
828 
829   // rescale to a 0-100 scale (friendlier to humans)
830   score = (100 * score) / SPAMSUM_LENGTH;
831 
832   // now re-scale on a 0-100 scale with 0 being a poor match and
833   // 100 being a excellent match.
834   score = 100 - score;
835 
836   //  printf ("s1len: %"PRIu32"  s2len: %"PRIu32"\n", (uint32_t)s1len, (uint32_t)s2len);
837 
838   // when the blocksize is small we don't want to exaggerate the match size
839   if (block_size >= (99 + ROLLING_WINDOW) / ROLLING_WINDOW * MIN_BLOCKSIZE)
840     return score;
841   if (score > block_size/MIN_BLOCKSIZE * MIN(s1len, s2len))
842   {
843     score = block_size/MIN_BLOCKSIZE * MIN(s1len, s2len);
844   }
845   return score;
846 }
847 
848 //
849 // Given two spamsum strings return a value indicating the degree
850 // to which they match.
851 //
fuzzy_compare(const char * str1,const char * str2)852 int fuzzy_compare(const char *str1, const char *str2)
853 {
854   unsigned long block_size1, block_size2;
855   uint32_t score = 0;
856   size_t s1b1len, s1b2len, s2b1len, s2b2len;
857   char s1b1[SPAMSUM_LENGTH], s1b2[SPAMSUM_LENGTH];
858   char s2b1[SPAMSUM_LENGTH], s2b2[SPAMSUM_LENGTH];
859   char *s1p, *s2p, *tmp;
860 
861   if (NULL == str1 || NULL == str2)
862     return -1;
863 
864   // each spamsum is prefixed by its block size
865   if (sscanf(str1, "%lu:", &block_size1) != 1 ||
866       sscanf(str2, "%lu:", &block_size2) != 1) {
867     return -1;
868   }
869 
870   // if the blocksizes don't match then we are comparing
871   // apples to oranges. This isn't an 'error' per se. We could
872   // have two valid signatures, but they can't be compared.
873   if (block_size1 != block_size2 &&
874       (block_size1 > ULONG_MAX / 2 || block_size1*2 != block_size2) &&
875       (block_size1 % 2 == 1 || block_size1 / 2 != block_size2)) {
876     return 0;
877   }
878 
879   // move past the prefix
880   s1p = strchr(str1, ':');
881   s2p = strchr(str2, ':');
882 
883   if (!s1p || !s2p) {
884     // badly formed ...
885     return -1;
886   }
887 
888   // there is very little information content is sequences of
889   // the same character like 'LLLLL'. Eliminate any sequences
890   // longer than 3 while reading two pieces.
891   // This is especially important when combined with the
892   // has_common_substring() test at score_strings().
893 
894   // read the first digest
895   ++s1p;
896   tmp = s1b1;
897   if (!copy_eliminate_sequences(&tmp, SPAMSUM_LENGTH, &s1p, ':'))
898     return -1;
899   s1b1len = tmp - s1b1;
900   if (!*s1p++) {
901     // a signature is malformed - it doesn't have 2 parts
902     return -1;
903   }
904   tmp = s1b2;
905   if (!copy_eliminate_sequences(&tmp, SPAMSUM_LENGTH, &s1p, ','))
906     return -1;
907   s1b2len = tmp - s1b2;
908 
909   // read the second digest
910   ++s2p;
911   tmp = s2b1;
912   if (!copy_eliminate_sequences(&tmp, SPAMSUM_LENGTH, &s2p, ':'))
913     return -1;
914   s2b1len = tmp - s2b1;
915   if (!*s2p++) {
916     // a signature is malformed - it doesn't have 2 parts
917     return -1;
918   }
919   tmp = s2b2;
920   if (!copy_eliminate_sequences(&tmp, SPAMSUM_LENGTH, &s2p, ','))
921     return -1;
922   s2b2len = tmp - s2b2;
923 
924   // Now that we know the strings are both well formed, are they
925   // identical? We could save ourselves some work here
926   if (block_size1 == block_size2 && s1b1len == s2b1len && s1b2len == s2b2len) {
927     if (!memcmp(s1b1, s2b1, s1b1len) && !memcmp(s1b2, s2b2, s1b2len)) {
928       return 100;
929     }
930   }
931 
932   // each signature has a string for two block sizes. We now
933   // choose how to combine the two block sizes. We checked above
934   // that they have at least one block size in common
935   if (block_size1 <= ULONG_MAX / 2) {
936     if (block_size1 == block_size2) {
937       uint32_t score1, score2;
938       score1 = score_strings(s1b1, s1b1len, s2b1, s2b1len, block_size1);
939       score2 = score_strings(s1b2, s1b2len, s2b2, s2b2len, block_size1*2);
940       score = MAX(score1, score2);
941     }
942     else if (block_size1 * 2 == block_size2) {
943       score = score_strings(s2b1, s2b1len, s1b2, s1b2len, block_size2);
944     }
945     else {
946       score = score_strings(s1b1, s1b1len, s2b2, s2b2len, block_size1);
947     }
948   }
949   else {
950     if (block_size1 == block_size2) {
951       score = score_strings(s1b1, s1b1len, s2b1, s2b1len, block_size1);
952     }
953     else if (block_size1 % 2 == 0 && block_size1 / 2 == block_size2) {
954       score = score_strings(s1b1, s1b1len, s2b2, s2b2len, block_size1);
955     }
956     else {
957       score = 0;
958     }
959   }
960 
961   return (int)score;
962 }
963