1 // $Id$
2
3 /** hashlist.cpp
4 * Implements a list of hashes for local database, searching, etc.
5 * Currently done with a map; could be done with an unordered set.
6 * Contains the logic for performing the audit.
7 * Formerly this code was in audit.cpp and match.cpp.
8 */
9
10 #include "main.h"
11 #include <new>
12 #include <iostream>
13
14 /// Add a fi to the hash list.
15 ///
16 /// Be sure that the hash is all lower case, because that's what we
17 /// use internally.
add_file(file_data_t * fi,int alg_num)18 void hashlist::hashmap::add_file(file_data_t *fi,int alg_num)
19 {
20 if (fi->hash_hex[alg_num].size())
21 {
22 std::string hexhash = fi->hash_hex[alg_num];
23 for (std::string::iterator it = hexhash.begin();it!=hexhash.end();it++)
24 {
25 if (isupper(*it))
26 *it = tolower(*it);
27 }
28 insert(std::pair<std::string,file_data_t *>(hexhash,fi));
29 }
30 }
31
32
33 /**
34 * Adds a file_data_t pointer to the hashlist.
35 * Does not copy the object.
36 * Object will be modified if there is a match.
37 */
add_fdt(file_data_t * fi)38 void hashlist::add_fdt(file_data_t *fi)
39 {
40 push_back(fi); // retain our copy
41 for(int i=0;i<NUM_ALGORITHMS;i++){ // and add for each algorithm
42 hashmaps[i].add_file(fi,i); // and point to the back
43 };
44 }
45
46 /**
47 * search for a hash with an (optional) given filename.
48 * Return the first hash that matches the filename.
49 * If nothing matches the filename, return the first hash that matches.
50 * If a match is found, set file_number in the hash that is found.
51 * Not sure I like modifying the store, but it's okay for now.
52 */
find_hash(hashid_t alg,const std::string & hash_hex,const std::string & file_name,uint64_t file_number)53 file_data_t *hashlist::find_hash(hashid_t alg,
54 const std::string &hash_hex,
55 const std::string &file_name,
56 uint64_t file_number)
57 {
58 if(opt_debug>2)
59 std::cerr << "find_hash alg=" << alg << " hash_hex=" << hash_hex <<
60 " fn=" << file_name << " file_number=" << file_number;
61 std::pair<hashmap::iterator,hashmap::iterator> match;
62 match = this->hashmaps[alg].equal_range(hash_hex);
63 if (match.first==match.second)
64 {
65 if (opt_debug>2)
66 std::cerr << " RETURNS 0\n";
67 return 0; // nothing found
68 }
69
70 for (hashmap::iterator it = match.first; it!=match.second; ++it)
71 {
72 if ((*it).second->file_name == file_name)
73 {
74 if (file_number)
75 (*it).second->matched_file_number = file_number;
76 if (opt_debug)
77 std::cerr << " RETURNS EXACT MATCH " << file_number << "\n";
78 return (*it).second;
79 }
80 }
81
82 // No exact matches; return the first match
83 if (file_number)
84 (*match.first).second->matched_file_number = file_number;
85 if (opt_debug)
86 std::cerr << " RETURNS FIRST MATCH " << file_number << "\n";
87 return (*match.first).second;
88 }
89
90
91 ///
92 /// Search for the provided fdt in the hashlist and return the status of the match.
93 /// Match on name if possible; otherwise match on just the hash codes.
94 ///
search(const file_data_hasher_t * fdht,file_data_t ** matched_,bool case_sensitive)95 hashlist::searchstatus_t hashlist::search(const file_data_hasher_t *fdht,
96 file_data_t ** matched_,
97 bool case_sensitive)
98 {
99 // Iterate through each of the hashes in the haslist until we find a match.
100 for (int alg = 0 ; alg < NUM_ALGORITHMS ; ++alg)
101 {
102 // Only search hash functions that are in use and hashes that are in the fdt
103 if (hashes[alg].inuse==0 || fdht->hash_hex[alg].size()==0)
104 {
105 continue;
106 }
107
108 // Find the best match using find_hash
109 file_data_t *matched = find_hash((hashid_t)alg,
110 fdht->hash_hex[alg],
111 fdht->file_name,
112 fdht->file_number);
113
114 if (not matched)
115 {
116 // No match
117 continue;
118 }
119
120 if (matched_)
121 *matched_ = matched; // note the match
122
123 // Verify that all of the other hash functions for *it match fdt as well,
124 // but only for the cases when we have a hash for both the master file
125 // and the target file.
126 for (int j=0 ; j<NUM_ALGORITHMS ; j++)
127 {
128 if (hashes[j].inuse and
129 j != alg and
130 fdht->hash_hex[j].size() and
131 matched->hash_hex[j].size())
132 {
133 if (fdht->hash_hex[j] != matched->hash_hex[j])
134 {
135 // We have found a hash collision for one algorithm, but not all
136 // of them. For example, MD5(A) == MD5(B), but SHA1(A) != SHA1(B).
137 // See http://www.win.tue.nl/hashclash/ for a program to create these.
138 return status_partial_match;
139 }
140 }
141 }
142
143 // If we got here we matched on all of the hashes.
144 // Which is to be expected.
145 // Check to see if the sizes are the same.
146 if (fdht->file_bytes != matched->file_bytes)
147 {
148 // Amazing. We found two files that have the same hash but different
149 // file sizes. This has never happened before in the history of the world.
150 // Call the newspapers!
151 return status_file_size_mismatch;
152 }
153
154 // See if the hashes are the same but the name changed.
155 if (case_sensitive)
156 {
157 if (fdht->file_name != matched->file_name)
158 return status_file_name_mismatch;
159 }
160 else
161 {
162 if (strcasecmp(fdht->file_name.c_str(), matched->file_name.c_str()))
163 return status_file_name_mismatch;
164 }
165
166 // If we get here, then all of the hash matches for all of the
167 // algorithms have been checked and found to be equal if present.
168 return status_match;
169 }
170
171 // If we get here, nothing ever matched.
172 return status_no_match;
173 }
174
175
176
177 ///
178 /// Returns the file type of a given input file.
179 /// fn is provided so that error messages can be printed.
180 ///
identify_format(class display * ocb,const std::string & fn,FILE * handle)181 hashlist::hashfile_format hashlist::identify_format(class display *ocb,
182 const std::string &fn,
183 FILE *handle)
184 {
185 char buf[MAX_STRING_LENGTH];
186
187 // Find the header
188 if ((fgets(buf,MAX_STRING_LENGTH,handle)) == NULL) {
189 return file_unknown;
190 }
191
192 chop_line(buf);
193
194 if ( ! STRINGS_EQUAL(buf,HASHDEEP_HEADER_10)) {
195 return file_unknown;
196 }
197
198 // Find which hashes are in this file
199 if ((fgets(buf,MAX_STRING_LENGTH,handle)) == NULL) {
200 return file_unknown;
201 }
202
203 chop_line(buf);
204
205 // We don't use STRINGS_EQUAL here because we only care about
206 // the first ten characters for right now.
207 if (strncasecmp("%%%% size,",buf,10)) {
208 return file_unknown;
209 }
210
211 /**
212 * Remember previously loaded hashes.
213 */
214 std::string previously_enabled_algorithms = last_enabled_algorithms;
215
216 // Skip the "%%%% size," when parsing the list of hashes
217 enable_hashing_algorithms_from_hashdeep_file(ocb,fn,buf + 10);
218
219
220 // If the set of hashes now in use doesn't match those previously in use,
221 // give a warning.
222 if (previously_enabled_algorithms.size()>0
223 && previously_enabled_algorithms != last_enabled_algorithms){
224 if(ocb) ocb->error("%s: Hashes not in same format as previously loaded",
225 fn.c_str());
226 }
227 return file_hashdeep_10;
228 }
229
230
231 /*
232 * Examine the list of hashing algorithms in the file,
233 * enable them and note their order. If the last algorithm is 'filename', ignore it.
234 */
235
enable_hashing_algorithms_from_hashdeep_file(class display * ocb,const std::string & fn,std::string val)236 void hashlist::enable_hashing_algorithms_from_hashdeep_file(class display *ocb,const std::string &fn,std::string val)
237 {
238 // The first position is always the file size, so we start with an
239 // the first position of one.
240 uint8_t num_columns = 1;
241
242 last_enabled_algorithms = val;
243 std::vector<std::string> algs = split(val,',');
244 for(std::vector<std::string>::iterator it = algs.begin(); it!=algs.end(); it++){
245 std::string name = *it;
246 lowercase(name);
247 if(name=="filename")
248 {
249 // Special value to denote the filename
250 filename_column = num_columns;
251 continue;
252 }
253 hashid_t id = algorithm_t::get_hashid_for_name(name);
254 if(id==alg_unknown){
255 if(ocb){
256 ocb->error("%s: Badly formatted file", fn.c_str());
257 ocb->try_msg();
258 }
259 exit(EXIT_FAILURE);
260 }
261
262 /* Found a known algorithm */
263 hashes[id].inuse = TRUE;
264 hash_column[num_columns] = id;
265 num_columns++;
266 }
267 }
268
269
dump_hashlist()270 void hashlist::dump_hashlist()
271 {
272 std::cout << "md5,sha1,bytes,filename matched\n";
273 for (hashlist::const_iterator it = begin(); it!=end(); it++)
274 {
275 std::cout << (*it)->hash_hex[alg_md5] << "," << (*it)->hash_hex[alg_sha1] << ","
276 << (*it)->file_bytes << "," << (*it)->file_name
277 << "\tmatched=" << (*it)->matched_file_number << "\n";
278 }
279 }
280
total_matched()281 uint64_t hashlist::total_matched()
282 {
283 uint64_t total = 0;
284 for (hashlist::const_iterator it = begin(); it!=end(); it++)
285 {
286 if ( (*it)->matched_file_number > 0)
287 total++;
288 }
289
290 return total;
291 }
292
293
294 //
295 // Loads a file of known hashes.
296 // First identifies the file type, then reads the file.
297 //
298 hashlist::loadstatus_t
load_hash_file(display * ocb,const std::string & fn)299 hashlist::load_hash_file(display *ocb,const std::string &fn)
300 {
301 loadstatus_t status = loadstatus_ok;
302 hashfile_format type;
303
304 FILE *hl_handle = fopen(fn.c_str(),"rb");
305 if (NULL == hl_handle)
306 {
307 if (ocb)
308 ocb->error("%s: %s", fn.c_str(), strerror(errno));
309 return status_file_error;
310 }
311
312 type = identify_format(ocb,fn,hl_handle);
313 if (file_unknown == type)
314 {
315 if (ocb)
316 ocb->error("%s: Unable to identify file format", fn.c_str());
317 fclose(hl_handle);
318 hl_handle = 0;
319 return status_unknown_filetype;
320 }
321
322 bool contains_bad_lines = false;
323 bool record_valid;
324
325 // We start our counter at line number two for the two lines
326 // of header we've already read
327 uint64_t line_number = 2;
328
329 // TODO: Read the line directly into a std::string
330 char line[MAX_STRING_LENGTH];
331 while (fgets(line,MAX_STRING_LENGTH,hl_handle))
332 {
333 line_number++;
334
335 // Lines starting with a pound sign are comments and can be ignored
336 if ('#' == line[0])
337 continue;
338
339 // C++ typically fails with a bad_alloc, but you can make it return null
340 // http://www.cplusplus.com/reference/std/new/bad_alloc/
341 // http://www.cplusplus.com/reference/std/new/nothrow/
342 file_data_t *t = new (std::nothrow) file_data_t();
343 if (NULL == t)
344 {
345 ocb->fatal_error("%s: Out of memory in line %" PRIu64,
346 fn.c_str(), line_number);
347 }
348
349 chop_line(line);
350 record_valid = true;
351
352 // Convert the input line to a string for easier manipulations
353 std::string line_as_string(line);
354 std::vector<std::string> fields = split(line_as_string,',');
355
356 size_t column_number;
357 // The offset of the current word within this line. Used for filenames.
358 size_t offset_in_line = 0;
359 for (column_number=0 ; column_number<fields.size() ; column_number++)
360 {
361 std::string word = fields[column_number];
362
363 if (column_number == filename_column)
364 {
365 // If the filename contained commas, it was split
366 // incorrectly by the 'split' statememt above. The filename
367 // will be split across more than one column.
368 // To be safe, we grab everything from where this field starts
369 // to the end of the line, and call that the 'filename'.
370 // (This also avoids a problem
371 // when the filename is the same as one of the hashes, which
372 // happens now and again.)
373 t->file_name = line_as_string.substr(offset_in_line, std::string::npos);
374
375 // This should be the last column, so we break out now.
376 break;
377 }
378
379 // The extra +1 is for the comma
380 offset_in_line += word.size() + 1;
381
382 // The first column should always be the file size
383 if (0 == column_number)
384 {
385 t->file_bytes = (uint64_t)strtoll(word.c_str(),NULL,10);
386 continue;
387 }
388
389 // All other columns should contain a valid hash in hex
390 if ( !algorithm_t::valid_hash(hash_column[column_number],word))
391 {
392 if (ocb)
393 ocb->error("%s: Invalid %s hash in line %" PRIu64,
394 fn.c_str(),
395 hashes[hash_column[column_number]].name.c_str(),
396 line_number);
397 contains_bad_lines = true;
398 record_valid = false;
399 // Break out (done = true) and then process the next line
400 break;
401 }
402
403 // Convert the hash to a std::string and save it
404 lowercase(word);
405 t->hash_hex[hash_column[column_number]] = word;
406 }
407
408 if (record_valid)
409 add_fdt(t);
410 }
411
412 fclose(hl_handle);
413 hl_handle = 0;
414
415 if (contains_bad_lines)
416 return status_contains_bad_hashes;
417
418 return status;
419 }
420
421
422 /**
423 * We don't use this function anymore, but it's handy to have just in case
424 */
searchstatus_to_str(searchstatus_t val)425 const char *hashlist::searchstatus_to_str(searchstatus_t val)
426 {
427 switch (val)
428 {
429 case searchstatus_ok: return "ok";
430 case status_match: return "complete match";
431 case status_partial_match: return "partial match";
432 case status_file_size_mismatch: return "file size mismatch";
433 case status_file_name_mismatch: return "file name mismatch";
434 case status_no_match: return "no match";
435
436 default:
437 return "unknown";
438 }
439 }
440
441
442
443
444