1 // $Id$
2 
3 /** hashlist.cpp
4  * Implements a list of hashes for local database, searching, etc.
5  * Currently done with a map; could be done with an unordered set.
6  * Contains the logic for performing the audit.
7  * Formerly this code was in audit.cpp and match.cpp.
8  */
9 
10 #include "main.h"
11 #include <new>
12 #include <iostream>
13 
14 /// Add a fi to the hash list.
15 ///
16 /// Be sure that the hash is all lower case, because that's what we
17 /// use internally.
add_file(file_data_t * fi,int alg_num)18 void hashlist::hashmap::add_file(file_data_t *fi,int alg_num)
19 {
20     if (fi->hash_hex[alg_num].size())
21     {
22       std::string hexhash = fi->hash_hex[alg_num];
23       for (std::string::iterator it = hexhash.begin();it!=hexhash.end();it++)
24       {
25 	if (isupper(*it))
26 	  *it = tolower(*it);
27       }
28       insert(std::pair<std::string,file_data_t *>(hexhash,fi));
29     }
30 }
31 
32 
33 /**
34  * Adds a file_data_t pointer to the hashlist.
35  * Does not copy the object.
36  * Object will be modified if there is a match.
37  */
add_fdt(file_data_t * fi)38 void hashlist::add_fdt(file_data_t *fi)
39 {
40     push_back(fi);			// retain our copy
41     for(int i=0;i<NUM_ALGORITHMS;i++){	// and add for each algorithm
42 	hashmaps[i].add_file(fi,i); // and point to the back
43     };
44 }
45 
46 /**
47  * search for a hash with an (optional) given filename.
48  * Return the first hash that matches the filename.
49  * If nothing matches the filename, return the first hash that matches.
50  * If a match is found, set file_number in the hash that is found.
51  * Not sure I like modifying the store, but it's okay for now.
52  */
find_hash(hashid_t alg,const std::string & hash_hex,const std::string & file_name,uint64_t file_number)53 file_data_t *hashlist::find_hash(hashid_t alg,
54 				 const std::string &hash_hex,
55 				 const std::string &file_name,
56 				 uint64_t file_number)
57 {
58     if(opt_debug>2)
59       std::cerr << "find_hash alg=" << alg << " hash_hex=" << hash_hex <<
60 	" fn=" << file_name << " file_number=" << file_number;
61     std::pair<hashmap::iterator,hashmap::iterator> match;
62     match = this->hashmaps[alg].equal_range(hash_hex);
63     if (match.first==match.second)
64     {
65       if (opt_debug>2)
66 	std::cerr << " RETURNS 0\n";
67       return 0; // nothing found
68     }
69 
70     for (hashmap::iterator it = match.first; it!=match.second; ++it)
71     {
72       if ((*it).second->file_name == file_name)
73       {
74 	if (file_number)
75 	  (*it).second->matched_file_number = file_number;
76 	if (opt_debug)
77 	  std::cerr << " RETURNS EXACT MATCH " << file_number << "\n";
78 	return (*it).second;
79       }
80     }
81 
82     // No exact matches; return the first match
83     if (file_number)
84       (*match.first).second->matched_file_number = file_number;
85     if (opt_debug)
86       std::cerr << " RETURNS FIRST MATCH " << file_number << "\n";
87     return (*match.first).second;
88 }
89 
90 
91 ///
92 /// Search for the provided fdt in the hashlist and return the status of the match.
93 /// Match on name if possible; otherwise match on just the hash codes.
94 ///
search(const file_data_hasher_t * fdht,file_data_t ** matched_,bool case_sensitive)95 hashlist::searchstatus_t hashlist::search(const file_data_hasher_t *fdht,
96 					  file_data_t ** matched_,
97 					  bool case_sensitive)
98 {
99   // Iterate through each of the hashes in the haslist until we find a match.
100   for (int alg = 0 ; alg < NUM_ALGORITHMS ; ++alg)
101   {
102     // Only search hash functions that are in use and hashes that are in the fdt
103     if (hashes[alg].inuse==0 || fdht->hash_hex[alg].size()==0)
104     {
105       continue;
106     }
107 
108     // Find the best match using find_hash
109     file_data_t *matched = find_hash((hashid_t)alg,
110 				     fdht->hash_hex[alg],
111 				     fdht->file_name,
112 				     fdht->file_number);
113 
114     if (not matched)
115     {
116       // No match
117       continue;
118     }
119 
120     if (matched_)
121       *matched_ = matched; // note the match
122 
123     // Verify that all of the other hash functions for *it match fdt as well,
124     // but only for the cases when we have a hash for both the master file
125     // and the target file.
126     for (int j=0 ; j<NUM_ALGORITHMS ; j++)
127     {
128       if (hashes[j].inuse and
129 	  j != alg and
130 	  fdht->hash_hex[j].size() and
131 	  matched->hash_hex[j].size())
132       {
133 	if (fdht->hash_hex[j] != matched->hash_hex[j])
134 	{
135 	  // We have found a hash collision for one algorithm, but not all
136 	  // of them. For example, MD5(A) == MD5(B), but SHA1(A) != SHA1(B).
137 	  // See http://www.win.tue.nl/hashclash/ for a program to create these.
138 	  return status_partial_match;
139 	}
140       }
141     }
142 
143     // If we got here we matched on all of the hashes.
144     // Which is to be expected.
145     // Check to see if the sizes are the same.
146     if (fdht->file_bytes != matched->file_bytes)
147     {
148       // Amazing. We found two files that have the same hash but different
149       // file sizes. This has never happened before in the history of the world.
150       // Call the newspapers!
151       return status_file_size_mismatch;
152     }
153 
154     // See if the hashes are the same but the name changed.
155     if (case_sensitive)
156     {
157       if (fdht->file_name != matched->file_name)
158 	return status_file_name_mismatch;
159     }
160     else
161     {
162       if (strcasecmp(fdht->file_name.c_str(), matched->file_name.c_str()))
163 	return status_file_name_mismatch;
164     }
165 
166     // If we get here, then all of the hash matches for all of the
167     // algorithms have been checked and found to be equal if present.
168     return status_match;
169   }
170 
171   // If we get here, nothing ever matched.
172   return status_no_match;
173 }
174 
175 
176 
177 ///
178 /// Returns the file type of a given input file.
179 /// fn is provided so that error messages can be printed.
180 ///
identify_format(class display * ocb,const std::string & fn,FILE * handle)181 hashlist::hashfile_format hashlist::identify_format(class display *ocb,
182 						    const std::string &fn,
183 						    FILE *handle)
184 {
185     char buf[MAX_STRING_LENGTH];
186 
187     // Find the header
188     if ((fgets(buf,MAX_STRING_LENGTH,handle)) == NULL) {
189 	return file_unknown;
190     }
191 
192     chop_line(buf);
193 
194     if ( ! STRINGS_EQUAL(buf,HASHDEEP_HEADER_10)) {
195 	return file_unknown;
196     }
197 
198     // Find which hashes are in this file
199     if ((fgets(buf,MAX_STRING_LENGTH,handle)) == NULL) {
200 	return file_unknown;
201     }
202 
203     chop_line(buf);
204 
205     // We don't use STRINGS_EQUAL here because we only care about
206     // the first ten characters for right now.
207     if (strncasecmp("%%%% size,",buf,10))  {
208 	return file_unknown;
209     }
210 
211     /**
212      * Remember previously loaded hashes.
213      */
214     std::string previously_enabled_algorithms = last_enabled_algorithms;
215 
216     // Skip the "%%%% size," when parsing the list of hashes
217     enable_hashing_algorithms_from_hashdeep_file(ocb,fn,buf + 10);
218 
219 
220     // If the set of hashes now in use doesn't match those previously in use,
221     // give a warning.
222     if (previously_enabled_algorithms.size()>0
223 	&& previously_enabled_algorithms != last_enabled_algorithms){
224 	if(ocb) ocb->error("%s: Hashes not in same format as previously loaded",
225 				 fn.c_str());
226     }
227     return file_hashdeep_10;
228 }
229 
230 
231 /*
232  * Examine the list of hashing algorithms in the file,
233  * enable them and note their order. If the last algorithm is 'filename', ignore it.
234  */
235 
enable_hashing_algorithms_from_hashdeep_file(class display * ocb,const std::string & fn,std::string val)236 void hashlist::enable_hashing_algorithms_from_hashdeep_file(class display *ocb,const std::string &fn,std::string val)
237 {
238     // The first position is always the file size, so we start with an
239     // the first position of one.
240     uint8_t num_columns = 1;
241 
242     last_enabled_algorithms = val;
243     std::vector<std::string> algs = split(val,',');
244     for(std::vector<std::string>::iterator it = algs.begin(); it!=algs.end(); it++){
245 	std::string name = *it;
246 	lowercase(name);
247 	if(name=="filename")
248   {
249     // Special value to denote the filename
250     filename_column = num_columns;
251     continue;
252   }
253 	hashid_t id = algorithm_t::get_hashid_for_name(name);
254 	if(id==alg_unknown){
255 	    if(ocb){
256 		ocb->error("%s: Badly formatted file", fn.c_str());
257 		ocb->try_msg();
258 	    }
259 	    exit(EXIT_FAILURE);
260 	}
261 
262 	/* Found a known algorithm */
263 	hashes[id].inuse = TRUE;
264 	hash_column[num_columns] = id;
265 	num_columns++;
266     }
267 }
268 
269 
dump_hashlist()270 void hashlist::dump_hashlist()
271 {
272     std::cout << "md5,sha1,bytes,filename   matched\n";
273     for (hashlist::const_iterator it = begin(); it!=end(); it++)
274     {
275       std::cout << (*it)->hash_hex[alg_md5] << "," << (*it)->hash_hex[alg_sha1] << ","
276 		<< (*it)->file_bytes << "," << (*it)->file_name
277 		<< "\tmatched=" << (*it)->matched_file_number << "\n";
278     }
279 }
280 
total_matched()281 uint64_t hashlist::total_matched()
282 {
283     uint64_t total = 0;
284     for (hashlist::const_iterator it = begin(); it!=end(); it++)
285     {
286       if ( (*it)->matched_file_number > 0)
287 	  total++;
288     }
289 
290     return total;
291 }
292 
293 
294 //
295 // Loads a file of known hashes.
296 // First identifies the file type, then reads the file.
297  //
298 hashlist::loadstatus_t
load_hash_file(display * ocb,const std::string & fn)299 hashlist::load_hash_file(display *ocb,const std::string &fn)
300 {
301   loadstatus_t status = loadstatus_ok;
302   hashfile_format type;
303 
304   FILE *hl_handle = fopen(fn.c_str(),"rb");
305   if (NULL == hl_handle)
306   {
307     if (ocb)
308       ocb->error("%s: %s", fn.c_str(), strerror(errno));
309     return status_file_error;
310   }
311 
312   type = identify_format(ocb,fn,hl_handle);
313   if (file_unknown == type)
314   {
315     if (ocb)
316       ocb->error("%s: Unable to identify file format", fn.c_str());
317     fclose(hl_handle);
318     hl_handle = 0;
319     return status_unknown_filetype;
320   }
321 
322   bool contains_bad_lines = false;
323   bool record_valid;
324 
325   // We start our counter at line number two for the two lines
326   // of header we've already read
327   uint64_t line_number = 2;
328 
329   // TODO: Read the line directly into a std::string
330   char line[MAX_STRING_LENGTH];
331   while (fgets(line,MAX_STRING_LENGTH,hl_handle))
332   {
333     line_number++;
334 
335     // Lines starting with a pound sign are comments and can be ignored
336     if ('#' == line[0])
337       continue;
338 
339     // C++ typically fails with a bad_alloc, but you can make it return null
340     // http://www.cplusplus.com/reference/std/new/bad_alloc/
341     // http://www.cplusplus.com/reference/std/new/nothrow/
342     file_data_t *t = new (std::nothrow) file_data_t();
343     if (NULL == t)
344     {
345       ocb->fatal_error("%s: Out of memory in line %" PRIu64,
346 		       fn.c_str(), line_number);
347     }
348 
349     chop_line(line);
350     record_valid = true;
351 
352     // Convert the input line to a string for easier manipulations
353     std::string line_as_string(line);
354     std::vector<std::string> fields = split(line_as_string,',');
355 
356     size_t column_number;
357     // The offset of the current word within this line. Used for filenames.
358     size_t offset_in_line = 0;
359     for (column_number=0 ; column_number<fields.size() ; column_number++)
360     {
361       std::string word = fields[column_number];
362 
363       if (column_number == filename_column)
364       {
365 	// If the filename contained commas, it was split
366 	// incorrectly by the 'split' statememt above. The filename
367 	// will be split across more than one column.
368 	// To be safe, we grab everything from where this field starts
369 	// to the end of the line, and call that the 'filename'.
370 	// (This also avoids a problem
371 	// when the filename is the same as one of the hashes, which
372 	// happens now and again.)
373 	t->file_name = line_as_string.substr(offset_in_line, std::string::npos);
374 
375 	// This should be the last column, so we break out now.
376 	break;
377       }
378 
379       // The extra +1 is for the comma
380       offset_in_line += word.size() + 1;
381 
382       // The first column should always be the file size
383       if (0 == column_number)
384       {
385 	t->file_bytes = (uint64_t)strtoll(word.c_str(),NULL,10);
386 	continue;
387       }
388 
389       // All other columns should contain a valid hash in hex
390       if ( !algorithm_t::valid_hash(hash_column[column_number],word))
391       {
392 	if (ocb)
393 	  ocb->error("%s: Invalid %s hash in line %" PRIu64,
394 		     fn.c_str(),
395 		     hashes[hash_column[column_number]].name.c_str(),
396 		     line_number);
397 	contains_bad_lines = true;
398 	record_valid = false;
399 	// Break out (done = true) and then process the next line
400 	break;
401       }
402 
403       // Convert the hash to a std::string and save it
404       lowercase(word);
405       t->hash_hex[hash_column[column_number]] = word;
406     }
407 
408     if (record_valid)
409       add_fdt(t);
410   }
411 
412   fclose(hl_handle);
413   hl_handle = 0;
414 
415   if (contains_bad_lines)
416     return status_contains_bad_hashes;
417 
418   return status;
419 }
420 
421 
422 /**
423  * We don't use this function anymore, but it's handy to have just in case
424  */
searchstatus_to_str(searchstatus_t val)425 const char *hashlist::searchstatus_to_str(searchstatus_t val)
426 {
427     switch (val)
428       {
429       case searchstatus_ok:           return "ok";
430       case status_match:              return "complete match";
431       case status_partial_match:      return "partial match";
432       case status_file_size_mismatch: return "file size mismatch";
433       case status_file_name_mismatch: return "file name mismatch";
434       case status_no_match:           return "no match";
435 
436       default:
437 	return "unknown";
438     }
439 }
440 
441 
442 
443 
444