1 // ssdeep
2 // (C) Copyright 2012 Kyrus
3 //
4 // $Id$
5 //
6 // This program is free software; you can redistribute it and/or modify
7 // it under the terms of the GNU General Public License as published by
8 // the Free Software Foundation; either version 2 of the License, or
9 // (at your option) any later version.
10 //
11 // This program is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 //
16 //You should have received a copy of the GNU General Public License
17 // along with this program; if not, write to the Free Software
18 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
20
21 #include "match.h"
22
23 // The longest line we should encounter when reading files of known hashes
24 #define MAX_STR_LEN 2048
25
26 #define MIN_SUBSTR_LEN 7
27
28 // ------------------------------------------------------------------
29 // SIGNATURE FILE FUNCTIONS
30 // ------------------------------------------------------------------
31
32 /// Open a file of known hashes and determine if it's valid
33 ///
34 /// @param s State variable
35 /// @param fn filename to open
36 ///
37 /// @return Returns false success, true on error
sig_file_open(state * s,const char * fn)38 bool sig_file_open(state *s, const char * fn)
39 {
40 if (NULL == s or NULL == fn)
41 return true;
42
43 s->known_handle = fopen(fn,"rb");
44 if (NULL == s->known_handle)
45 {
46 if ( ! (MODE(mode_silent)) )
47 perror(fn);
48 return true;
49 }
50
51 // The first line of the file should contain a valid ssdeep header.
52 char buffer[MAX_STR_LEN];
53 if (NULL == fgets(buffer,MAX_STR_LEN,s->known_handle))
54 {
55 if ( ! (MODE(mode_silent)) )
56 perror(fn);
57 fclose(s->known_handle);
58 return true;
59 }
60
61 chop_line(buffer);
62
63 if (strncmp(buffer,SSDEEPV1_0_HEADER,MAX_STR_LEN) and
64 strncmp(buffer,SSDEEPV1_1_HEADER,MAX_STR_LEN))
65 {
66 if ( ! (MODE(mode_silent)) )
67 print_error(s,"%s: Invalid file header.", fn);
68 fclose(s->known_handle);
69 return true;
70 }
71
72 // We've now read the first line
73 s->line_number = 1;
74 s->known_fn = strdup(fn);
75
76 return false;
77 }
78
79
80
81 /// @brief Read the next entry in a file of known hashes and convert
82 /// it to a Filedata
83 ///
84 /// @param s State variable
85 /// @param f Structure where to store the data we read
86 ///
87 /// @return Returns true if there is no entry to read or on error.
88 /// Otherwise, false.
sig_file_next(state * s,Filedata ** f)89 bool sig_file_next(state *s, Filedata ** f)
90 {
91 if (NULL == s or NULL == f or NULL == s->known_handle)
92 return true;
93
94 char buffer[MAX_STR_LEN];
95 memset(buffer,0,MAX_STR_LEN);
96 if (NULL == fgets(buffer,MAX_STR_LEN,s->known_handle))
97 return true;
98
99 s->line_number++;
100 chop_line(buffer);
101
102 try
103 {
104 *f = new Filedata(std::string(buffer),s->known_fn);
105 }
106 catch (std::bad_alloc)
107 {
108 // This can happen on a badly formatted line, or a blank one.
109 // We don't display errors on blank lines.
110 if (strlen(buffer) > 0)
111 print_error(s,
112 "%s: Bad hash in line %llu",
113 s->known_fn,
114 s->line_number);
115
116 return true;
117 }
118
119 return false;
120 }
121
122
sig_file_close(state * s)123 bool sig_file_close(state *s)
124 {
125 if (NULL == s)
126 return true;
127
128 free(s->known_fn);
129
130 if (s->known_handle != NULL)
131 return true;
132
133 if (fclose(s->known_handle))
134 return true;
135
136 return false;
137 }
138
139
sig_file_end(state * s)140 bool sig_file_end(state *s)
141 {
142 return (feof(s->known_handle));
143 }
144
145
146
147 // ------------------------------------------------------------------
148 // MATCHING FUNCTIONS
149 // ------------------------------------------------------------------
150
display_clusters(const state * s)151 void display_clusters(const state *s)
152 {
153 if (NULL == s)
154 return;
155
156 std::set<std::set<Filedata *> *>::const_iterator it;
157 for (it = s->all_clusters.begin(); it != s->all_clusters.end() ; ++it)
158 {
159 print_status("** Cluster size %u", (*it)->size());
160 std::set<Filedata *>::const_iterator cit;
161 for (cit = (*it)->begin() ; cit != (*it)->end() ; ++cit)
162 {
163 display_filename(stdout,(*cit)->get_filename(),FALSE);
164 print_status("");
165 }
166
167 print_status("");
168 }
169 }
170
171
cluster_add(Filedata * dest,Filedata * src)172 void cluster_add(Filedata * dest, Filedata * src)
173 {
174 dest->get_cluster()->insert(src);
175 src->set_cluster(dest->get_cluster());
176 }
177
178
cluster_join(state * s,Filedata * a,Filedata * b)179 void cluster_join(state *s, Filedata * a, Filedata * b)
180 {
181 // If these items are already in the same cluster there is nothing to do
182 if (a->get_cluster() == b->get_cluster())
183 return;
184
185 Filedata * dest, * src;
186 // Combine the smaller cluster into the larger cluster for speed
187 // (fewer items to move)
188 if (a->get_cluster()->size() > b->get_cluster()->size())
189 {
190 dest = a;
191 src = b;
192 }
193 else
194 {
195 dest = b;
196 src = a;
197 }
198
199 // Add members of src to dest
200 std::set<Filedata *>::const_iterator it;
201 for (it = src->get_cluster()->begin() ;
202 it != src->get_cluster()->end() ;
203 ++it)
204 {
205 dest->get_cluster()->insert(*it);
206 }
207
208 // Remove the old cluster
209 s->all_clusters.erase(src->get_cluster());
210 // This call sets the cluster to NULL. Do not access the src
211 // cluster after this call!
212 src->clear_cluster();
213
214 src->set_cluster(dest->get_cluster());
215 }
216
217
handle_clustering(state * s,Filedata * a,Filedata * b)218 void handle_clustering(state *s, Filedata *a, Filedata *b)
219 {
220 bool a_has = a->has_cluster(), b_has = b->has_cluster();
221
222 // In the easiest case, one of these has a cluster and one doesn't
223 if (a_has and not b_has)
224 {
225 cluster_add(a,b);
226 return;
227 }
228 if (b_has and not a_has)
229 {
230 cluster_add(b,a);
231 return;
232 }
233
234 // Combine existing clusters
235 if (a_has and b_has)
236 {
237 cluster_join(s,a,b);
238 return;
239 }
240
241 // Create a new cluster
242 std::set<Filedata *> * cluster = new std::set<Filedata *>();
243 cluster->insert(a);
244 cluster->insert(b);
245
246 s->all_clusters.insert(cluster);
247
248 a->set_cluster(cluster);
249 b->set_cluster(cluster);
250 }
251
252
253
handle_match(state * s,Filedata * a,Filedata * b,int score)254 void handle_match(state *s,
255 Filedata *a,
256 Filedata *b,
257 int score)
258 {
259 if (s->mode & mode_csv)
260 {
261 printf("\"");
262 display_filename(stdout,a->get_filename(),TRUE);
263 printf("\",\"");
264 display_filename(stdout,b->get_filename(),TRUE);
265 print_status("\",%u", score);
266 }
267 else if (s->mode & mode_cluster)
268 {
269 handle_clustering(s,a,b);
270 }
271 else
272 {
273 // The match file names may be empty. If so, we don't print them
274 // or the colon which separates them from the filename
275 if (a->has_match_file())
276 printf ("%s:", a->get_match_file().c_str());
277 display_filename(stdout,a->get_filename(),FALSE);
278 printf (" matches ");
279 if (b->has_match_file())
280 printf ("%s:", b->get_match_file().c_str());
281 display_filename(stdout,b->get_filename(),FALSE);
282 print_status(" (%u)", score);
283 }
284 }
285
286
match_compare(state * s,Filedata * f)287 bool match_compare(state *s, Filedata * f)
288 {
289 if (NULL == s)
290 fatal_error("%s: Null state passed into match_compare", __progname);
291
292 bool status = false;
293 size_t fn_len = _tcslen(f->get_filename());
294
295 std::vector<Filedata* >::const_iterator it;
296 for (it = s->all_files.begin() ; it != s->all_files.end() ; ++it)
297 {
298 // When in pretty mode, we still want to avoid printing
299 // A matches A (100).
300 if (s->mode & mode_match_pretty)
301 {
302 if (!(_tcsncmp(f->get_filename(),
303 (*it)->get_filename(),
304 std::max(fn_len,_tcslen((*it)->get_filename())))) and
305 (f->get_signature() == (*it)->get_signature()))
306 {
307 // Unless these results from different matching files (such as
308 // what happens in sigcompare mode). That being said, we have to
309 // be careful to avoid NULL values such as when working in
310 // normal pretty print mode.
311 if (not(f->has_match_file()) or
312 f->get_match_file() == (*it)->get_match_file())
313 continue;
314 }
315 }
316
317 int score = fuzzy_compare(f->get_signature().c_str(),
318 (*it)->get_signature().c_str());
319 if (-1 == score)
320 print_error(s, "%s: Bad hashes in comparison", __progname);
321 else
322 {
323 if (score > s->threshold or MODE(mode_display_all))
324 {
325 handle_match(s,f,(*it),score);
326 status = true;
327 }
328 }
329 }
330
331 return status;
332 }
333
334
find_matches_in_known(state * s)335 bool find_matches_in_known(state *s)
336 {
337 if (NULL == s)
338 return true;
339
340 // Walk the vector which contains all of the known files
341 std::vector<Filedata *>::const_iterator it;
342 for (it = s->all_files.begin() ; it != s->all_files.end() ; ++it)
343 {
344 bool status = match_compare(s,*it);
345 // In pretty mode and sigcompare mode we need to display a blank
346 // line after each file. In clustering mode we don't display anything
347 // right now.
348 if (status and not(MODE(mode_cluster)))
349 print_status("");
350 }
351
352 return false;
353 }
354
355
match_add(state * s,Filedata * f)356 bool match_add(state *s, Filedata * f) {
357 if (NULL == s)
358 return true;
359
360 s->all_files.push_back(f);
361
362 return false;
363 }
364
365
match_load(state * s,const char * fn)366 bool match_load(state *s, const char *fn) {
367 if (NULL == s or NULL == fn)
368 return true;
369
370 if (sig_file_open(s,fn))
371 return true;
372
373 bool status;
374
375 do {
376 Filedata * f;
377 status = sig_file_next(s,&f);
378 if (not status) {
379 if (match_add(s,f)) {
380 // One bad hash doesn't mean this load was a failure.
381 // We don't change the return status because match_add failed.
382 print_error(s, "%s: unable to insert hash", fn);
383 break;
384 }
385 }
386 } while (not sig_file_end(s));
387
388 sig_file_close(s);
389
390 return false;
391 }
392
393
match_compare_unknown(state * s,const char * fn)394 bool match_compare_unknown(state *s, const char * fn)
395 {
396 if (NULL == s or NULL == fn)
397 return true;
398
399 if (sig_file_open(s,fn))
400 return true;
401
402 bool status;
403
404 do
405 {
406 Filedata *f;
407 status = sig_file_next(s,&f);
408 if (not status)
409 match_compare(s,f);
410 } while (not sig_file_end(s));
411
412 sig_file_close(s);
413
414 return false;
415 }
416
417