1 // ssdeep
2 // (C) Copyright 2012 Kyrus
3 //
4 // $Id$
5 //
6 // This program is free software; you can redistribute it and/or modify
7 // it under the terms of the GNU General Public License as published by
8 // the Free Software Foundation; either version 2 of the License, or
9 // (at your option) any later version.
10 //
11 // This program is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 // GNU General Public License for more details.
15 //
16 //You should have received a copy of the GNU General Public License
17 // along with this program; if not, write to the Free Software
18 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19 
20 
21 #include "match.h"
22 
23 // The longest line we should encounter when reading files of known hashes
24 #define MAX_STR_LEN  2048
25 
26 #define MIN_SUBSTR_LEN 7
27 
28 // ------------------------------------------------------------------
29 // SIGNATURE FILE FUNCTIONS
30 // ------------------------------------------------------------------
31 
32 /// Open a file of known hashes and determine if it's valid
33 ///
34 /// @param s State variable
35 /// @param fn filename to open
36 ///
37 /// @return Returns false success, true on error
sig_file_open(state * s,const char * fn)38 bool sig_file_open(state *s, const char * fn)
39 {
40   if (NULL == s or NULL == fn)
41     return true;
42 
43   s->known_handle = fopen(fn,"rb");
44   if (NULL == s->known_handle)
45   {
46     if ( ! (MODE(mode_silent)) )
47       perror(fn);
48     return true;
49   }
50 
51   // The first line of the file should contain a valid ssdeep header.
52   char buffer[MAX_STR_LEN];
53   if (NULL == fgets(buffer,MAX_STR_LEN,s->known_handle))
54   {
55     if ( ! (MODE(mode_silent)) )
56       perror(fn);
57     fclose(s->known_handle);
58     return true;
59   }
60 
61   chop_line(buffer);
62 
63   if (strncmp(buffer,SSDEEPV1_0_HEADER,MAX_STR_LEN) and
64       strncmp(buffer,SSDEEPV1_1_HEADER,MAX_STR_LEN))
65   {
66     if ( ! (MODE(mode_silent)) )
67       print_error(s,"%s: Invalid file header.", fn);
68     fclose(s->known_handle);
69     return true;
70   }
71 
72   // We've now read the first line
73   s->line_number = 1;
74   s->known_fn = strdup(fn);
75 
76   return false;
77 }
78 
79 
80 
81 /// @brief Read the next entry in a file of known hashes and convert
82 /// it to a Filedata
83 ///
84 /// @param s State variable
85 /// @param f Structure where to store the data we read
86 ///
87 /// @return Returns true if there is no entry to read or on error.
88 /// Otherwise, false.
sig_file_next(state * s,Filedata ** f)89 bool sig_file_next(state *s, Filedata ** f)
90 {
91   if (NULL == s or NULL == f or NULL == s->known_handle)
92     return true;
93 
94   char buffer[MAX_STR_LEN];
95   memset(buffer,0,MAX_STR_LEN);
96   if (NULL == fgets(buffer,MAX_STR_LEN,s->known_handle))
97     return true;
98 
99   s->line_number++;
100   chop_line(buffer);
101 
102   try
103   {
104     *f = new Filedata(std::string(buffer),s->known_fn);
105   }
106   catch (std::bad_alloc)
107   {
108     // This can happen on a badly formatted line, or a blank one.
109     // We don't display errors on blank lines.
110     if (strlen(buffer) > 0)
111       print_error(s,
112 		  "%s: Bad hash in line %llu",
113 		  s->known_fn,
114 		  s->line_number);
115 
116     return true;
117   }
118 
119   return false;
120 }
121 
122 
sig_file_close(state * s)123 bool sig_file_close(state *s)
124 {
125   if (NULL == s)
126     return true;
127 
128   free(s->known_fn);
129 
130   if (s->known_handle != NULL)
131     return true;
132 
133   if (fclose(s->known_handle))
134     return true;
135 
136   return false;
137 }
138 
139 
sig_file_end(state * s)140 bool sig_file_end(state *s)
141 {
142   return (feof(s->known_handle));
143 }
144 
145 
146 
147 // ------------------------------------------------------------------
148 // MATCHING FUNCTIONS
149 // ------------------------------------------------------------------
150 
display_clusters(const state * s)151 void display_clusters(const state *s)
152 {
153   if (NULL == s)
154     return;
155 
156   std::set<std::set<Filedata *> *>::const_iterator it;
157   for (it = s->all_clusters.begin(); it != s->all_clusters.end() ; ++it)
158   {
159     print_status("** Cluster size %u", (*it)->size());
160     std::set<Filedata *>::const_iterator cit;
161     for (cit = (*it)->begin() ; cit != (*it)->end() ; ++cit)
162     {
163       display_filename(stdout,(*cit)->get_filename(),FALSE);
164       print_status("");
165     }
166 
167     print_status("");
168   }
169 }
170 
171 
cluster_add(Filedata * dest,Filedata * src)172 void cluster_add(Filedata * dest, Filedata * src)
173 {
174   dest->get_cluster()->insert(src);
175   src->set_cluster(dest->get_cluster());
176 }
177 
178 
cluster_join(state * s,Filedata * a,Filedata * b)179 void cluster_join(state *s, Filedata * a, Filedata * b)
180 {
181   // If these items are already in the same cluster there is nothing to do
182   if (a->get_cluster() == b->get_cluster())
183     return;
184 
185   Filedata * dest, * src;
186   // Combine the smaller cluster into the larger cluster for speed
187   // (fewer items to move)
188   if (a->get_cluster()->size() > b->get_cluster()->size())
189   {
190     dest = a;
191     src  = b;
192   }
193   else
194   {
195     dest = b;
196     src  = a;
197   }
198 
199   // Add members of src to dest
200   std::set<Filedata *>::const_iterator it;
201   for (it =  src->get_cluster()->begin() ;
202        it != src->get_cluster()->end() ;
203        ++it)
204   {
205     dest->get_cluster()->insert(*it);
206   }
207 
208   // Remove the old cluster
209   s->all_clusters.erase(src->get_cluster());
210   // This call sets the cluster to NULL. Do not access the src
211   // cluster after this call!
212   src->clear_cluster();
213 
214   src->set_cluster(dest->get_cluster());
215 }
216 
217 
handle_clustering(state * s,Filedata * a,Filedata * b)218 void handle_clustering(state *s, Filedata *a, Filedata *b)
219 {
220   bool a_has = a->has_cluster(), b_has = b->has_cluster();
221 
222   // In the easiest case, one of these has a cluster and one doesn't
223   if (a_has and not b_has)
224   {
225     cluster_add(a,b);
226     return;
227   }
228   if (b_has and not a_has)
229   {
230     cluster_add(b,a);
231     return;
232   }
233 
234   // Combine existing clusters
235   if (a_has and b_has)
236   {
237     cluster_join(s,a,b);
238     return;
239   }
240 
241   // Create a new cluster
242   std::set<Filedata *> * cluster = new std::set<Filedata *>();
243   cluster->insert(a);
244   cluster->insert(b);
245 
246   s->all_clusters.insert(cluster);
247 
248   a->set_cluster(cluster);
249   b->set_cluster(cluster);
250 }
251 
252 
253 
handle_match(state * s,Filedata * a,Filedata * b,int score)254 void handle_match(state *s,
255 		  Filedata *a,
256 		  Filedata *b,
257 		  int score)
258 {
259   if (s->mode & mode_csv)
260   {
261     printf("\"");
262     display_filename(stdout,a->get_filename(),TRUE);
263     printf("\",\"");
264     display_filename(stdout,b->get_filename(),TRUE);
265     print_status("\",%u", score);
266   }
267   else if (s->mode & mode_cluster)
268   {
269     handle_clustering(s,a,b);
270   }
271   else
272   {
273     // The match file names may be empty. If so, we don't print them
274     // or the colon which separates them from the filename
275     if (a->has_match_file())
276       printf ("%s:", a->get_match_file().c_str());
277     display_filename(stdout,a->get_filename(),FALSE);
278     printf (" matches ");
279     if (b->has_match_file())
280       printf ("%s:", b->get_match_file().c_str());
281     display_filename(stdout,b->get_filename(),FALSE);
282     print_status(" (%u)", score);
283   }
284 }
285 
286 
match_compare(state * s,Filedata * f)287 bool match_compare(state *s, Filedata * f)
288 {
289   if (NULL == s)
290     fatal_error("%s: Null state passed into match_compare", __progname);
291 
292   bool status = false;
293   size_t fn_len = _tcslen(f->get_filename());
294 
295   std::vector<Filedata* >::const_iterator it;
296   for (it = s->all_files.begin() ; it != s->all_files.end() ; ++it)
297   {
298     // When in pretty mode, we still want to avoid printing
299     // A matches A (100).
300     if (s->mode & mode_match_pretty)
301     {
302       if (!(_tcsncmp(f->get_filename(),
303 		     (*it)->get_filename(),
304 		     std::max(fn_len,_tcslen((*it)->get_filename())))) and
305 	  (f->get_signature() == (*it)->get_signature()))
306       {
307 	// Unless these results from different matching files (such as
308 	// what happens in sigcompare mode). That being said, we have to
309 	// be careful to avoid NULL values such as when working in
310 	// normal pretty print mode.
311 	if (not(f->has_match_file()) or
312 	    f->get_match_file() == (*it)->get_match_file())
313 	  continue;
314       }
315     }
316 
317     int score =  fuzzy_compare(f->get_signature().c_str(),
318 			       (*it)->get_signature().c_str());
319     if (-1 == score)
320       print_error(s, "%s: Bad hashes in comparison", __progname);
321     else
322     {
323       if (score > s->threshold or MODE(mode_display_all))
324       {
325 	handle_match(s,f,(*it),score);
326 	status = true;
327       }
328     }
329   }
330 
331   return status;
332 }
333 
334 
find_matches_in_known(state * s)335 bool find_matches_in_known(state *s)
336 {
337   if (NULL == s)
338     return true;
339 
340   // Walk the vector which contains all of the known files
341   std::vector<Filedata *>::const_iterator it;
342   for (it = s->all_files.begin() ; it != s->all_files.end() ; ++it)
343   {
344     bool status = match_compare(s,*it);
345     // In pretty mode and sigcompare mode we need to display a blank
346     // line after each file. In clustering mode we don't display anything
347     // right now.
348     if (status and not(MODE(mode_cluster)))
349       print_status("");
350   }
351 
352   return false;
353 }
354 
355 
match_add(state * s,Filedata * f)356 bool match_add(state *s, Filedata * f) {
357   if (NULL == s)
358     return true;
359 
360   s->all_files.push_back(f);
361 
362   return false;
363 }
364 
365 
match_load(state * s,const char * fn)366 bool match_load(state *s, const char *fn) {
367   if (NULL == s or NULL == fn)
368     return true;
369 
370   if (sig_file_open(s,fn))
371     return true;
372 
373   bool status;
374 
375   do {
376     Filedata * f;
377     status = sig_file_next(s,&f);
378     if (not status) {
379       if (match_add(s,f)) {
380 	// One bad hash doesn't mean this load was a failure.
381 	// We don't change the return status because match_add failed.
382 	print_error(s, "%s: unable to insert hash", fn);
383 	break;
384       }
385     }
386   } while (not sig_file_end(s));
387 
388   sig_file_close(s);
389 
390   return false;
391 }
392 
393 
match_compare_unknown(state * s,const char * fn)394 bool match_compare_unknown(state *s, const char * fn)
395 {
396   if (NULL == s or NULL == fn)
397     return true;
398 
399   if (sig_file_open(s,fn))
400     return true;
401 
402   bool status;
403 
404   do
405   {
406     Filedata *f;
407     status = sig_file_next(s,&f);
408     if (not status)
409       match_compare(s,f);
410   } while (not sig_file_end(s));
411 
412   sig_file_close(s);
413 
414   return false;
415 }
416 
417