1 /*
2  * main.h:
3  *
4  * This is the main file included by all other modules in md5deep/hashdeep/etc.
5  *
6  * It includes:
7  * common.h - the common system include files
8  * xml.h    - the C++ XML system.
9  * hash function headers
10  *
11  * C++ STL stuff.
12  *
13  * It then creates all the C++ classes and structures used.
14  *
15  * $Id$
16  */
17 
18 
19 #ifndef __MAIN_H
20 #define __MAIN_H
21 
22 #include "common.h"
23 #include "xml.h"
24 
25 #ifdef HAVE_PTHREAD
26 #include "threadpool.h"
27 #endif
28 
29 #include <map>
30 #include <vector>
31 
32 #if !defined(VERSION) && defined(PACKAGE_VERSION)
33 #define VERSION PACKAGE_VERSION
34 #endif
35 
36 #define VERBOSE		 1
37 #define MORE_VERBOSE	 2
38 #define INSANELY_VERBOSE 3
39 
40 /* These describe the version of the file format being used, not
41  *   the version of the program.
42  */
43 #define HASHDEEP_PREFIX     "%%%% "
44 #define HASHDEEP_HEADER_10  "%%%% HASHDEEP-1.0"
45 
46 /* HOW TO ADD A NEW HASHING ALGORITHM
47   * Add a value for the algorithm to the hashid_t enumeration
48   * Add the functions to compute the hashes. There should be three functions,
    an initialization routine, an update routine, and a finalize routine.
50     The convention, for an algorithm "foo", is
51     foo_init, foo_update, and foo_final.
52   * Add your new code to Makefile.am under hashdeep_SOURCES
  * Add a call to insert the algorithm in algorithm_t::load_hashing_algorithms
54   * See if you need to increase MAX_ALGORITHM_NAME_LENGTH or
55     MAX_ALGORITHM_CONTEXT_SIZE for your algorithm in common.h
56   * Update the usage function and man page to include the function
57   */
58 
/**
 * Identifiers for the hash algorithms known to the program.
 * Enumerators are numbered consecutively starting at zero.
 */
typedef enum {
  alg_md5 = 0,
  alg_sha1,
  alg_sha256,
  alg_tiger,
  alg_whirlpool,
  alg_sha3,

  /* Sentinel: alg_unknown has to stay the final enumerator because
   * many functions use it as a loop terminator. */
  alg_unknown
} hashid_t;
71 
72 inline std::ostream & operator << (std::ostream &os,const hashid_t &h)
73 {
74   switch (h)
75   {
76   case alg_md5:       os << "alg_md5" ; break ;
77   case alg_sha1:      os << "alg_sha1" ; break ;
78   case alg_sha256:    os << "alg_sha256" ; break ;
79   case alg_tiger:     os << "alg_tiger" ; break ;
80   case alg_whirlpool: os << "alg_whirlpool" ; break ;
81   case alg_sha3:      os << "alg_sha3" ; break ;
82   case alg_unknown:   os << "alg_unknown" ; break ;
83   }
84 
85   return os;
86 }
87 
88 #define NUM_ALGORITHMS  alg_unknown
89 
90 /* Which ones are enabled by default */
91 #define DEFAULT_ENABLE_MD5         TRUE
92 #define DEFAULT_ENABLE_SHA1        FALSE
93 #define DEFAULT_ENABLE_SHA256      TRUE
94 #define DEFAULT_ENABLE_TIGER       FALSE
95 #define DEFAULT_ENABLE_WHIRLPOOL   FALSE
96 #define DEFAULT_ENABLE_SHA3        FALSE
97 
/**
 * iomode names the strategies that can be used to read file data and
 * converts a textual selector into one of them.
 */
class iomode {
public:
    static const int buffered=0;			// use fopen, fread, fclose
    static const int unbuffered=1;			// use open, read, close
    static const int mmapped=2;				// use open, mmap, close

    /**
     * Convert a selector string into an iomode constant.
     *
     * Accepted forms are the exact strings "0", "1" and "2", or any string
     * whose first character is 'b', 'u' or 'm' respectively.
     *
     * @param str  the selector text
     * @return     iomode::buffered, iomode::unbuffered or iomode::mmapped
     *
     * Anything else prints a diagnostic on std::cerr and trips assert(0).
     * When assertions are compiled out the function falls back to
     * iomode::unbuffered.
     */
    static int toiomode(const std::string &str){
        // An empty selector has no first character; '\0' matches nothing below.
        const char first = str.empty() ? '\0' : str[0];

        if(str=="0" || first=='b') return iomode::buffered;
        if(str=="1" || first=='u') return iomode::unbuffered;
        if(str=="2" || first=='m') return iomode::mmapped;

        // Terminate the line so that whatever is printed next (the assert
        // report, or later output when NDEBUG is set) starts on its own line.
        std::cerr << "Invalid iomode '" << str << "'" << '\n';
        assert(0);
        return iomode::unbuffered;	// default
    }
};
112 
113 /* This class holds the information known about each hash algorithm.
114  * It's sort of like the EVP system in OpenSSL.
115  *
116  * In version 3 the list of known hashes was stored here as well.
117  * That has been moved to the hashlist database (further down).
118  *
119 * Right now we are using some global variables; the better way to do this
120  * would be with a C++ singleton.
121  *
122  * Perhaps the correct way to do this would be a global C++ vector of objects?
123  */
124 class algorithm_t {
125 public:
126     bool		inuse;		// true if we are using this algorithm
127     std::string		name;		// name of algorithm
128     size_t		bit_length;	// 128 for MD5
129     hashid_t		id;		// usually the position in the array...
130 
131     /* The hashing functions */
132     void ( *f_init)(void *ctx);
133     void ( *f_update)(void *ctx, const unsigned char *buf, size_t len );
134     void ( *f_finalize)(void *ctx, unsigned char *);
135 
136     /* The methods */
137     static void add_algorithm(hashid_t pos, const char *name, uint16_t bits,
138 			      void ( *func_init)(void *ctx),
139 			      void ( *func_update)(void *ctx, const unsigned char *buf, size_t len ),
140 			      void ( *func_finalize)(void *ctx, unsigned char *),
141 			      int inuse);
142     static void load_hashing_algorithms();
143     static void clear_algorithms_inuse();
144     static void enable_hashing_algorithms(std::string var);  // enable the algorithms in 'var'; var can be 'all'
145     static hashid_t get_hashid_for_name(std::string name);   // return the hashid_t for 'name'
146     static bool valid_hex(const std::string &buf);	     // returns true if buf contains only hex characters
147     static bool valid_hash(hashid_t alg,const std::string &buf); // returns true if buf is a valid hash for hashid_t a
148     static int  algorithms_in_use_count(); // returns count of algorithms in use
149 };
150 
151 extern algorithm_t     hashes[NUM_ALGORITHMS];		// which hash algorithms are available and in use
152 
153 
154 
155 /** status_t describes exit codes for the program
156  *
157  */
/**
 * status_t wraps the 32-bit exit/status code of the program.
 *
 * The code starts at 0.  It can be replaced outright with set() or have
 * flag bits or'ed into it with add().  All read-only operations are const
 * so that a const status_t can be inspected.
 */
class status_t  {
private:
    int32_t code;
public:
    status_t():code(0){}

    static const int32_t status_ok = EXIT_SUCCESS; // 0
    static const int32_t status_EXIT_FAILURE = EXIT_FAILURE;
    static const int32_t status_out_of_memory = -2;
    static const int32_t status_invalid_hash = -3;
    static const int32_t status_unknown_error = -4;
    static const int32_t status_omg_ponies = -5;

    /*
     * Return values for the program
     * RBF - Document these return values for hashdeep
     * A successful run has these or'ed together
     */
    static const int32_t STATUS_UNUSED_HASHES = 1;
    static const int32_t STATUS_INPUT_DID_NOT_MATCH = 2;
    static const int32_t STATUS_USER_ERROR = 64;
    static const int32_t STATUS_INTERNAL_ERROR = 128;

    /** Or the bits of val into the stored code. */
    void add(int32_t val){ code |= val; }

    /** Replace the stored code with val. */
    void set(int32_t val){ code = val; }

    /** Return the stored code. */
    int32_t get_status() const { return code; }

    /** Compare the stored code with a raw value. */
    bool operator==(int32_t v) const { return this->code==v; }
    bool operator!=(int32_t v) const { return this->code!=v; }
};
185 
186 
187 #ifdef _WIN32
188 typedef __time64_t	timestamp_t;
189 typedef std::wstring	filename_t;
190 #else
191 typedef time_t		timestamp_t;
192 typedef std::string	filename_t;
193 #endif
194 
195 /**
196  * file_metadata_t contains metadata information about a file.
197  * It also includes a stat call that returns the inode information
198  * and link count even on windows, where the API is different than stat.
199  * Note that we only include information we care about in this program
200  *
 * The implementation is in dig.cpp.
202  */
203 
204 
205 /* strangely, we define our own file types */
/* The program's own classification of file kinds.  Values count up
 * from zero, except stat_unknown which is pinned at 254. */
typedef enum {
    stat_regular = 0,
    stat_directory,
    stat_door,
    stat_block,
    stat_character,
    stat_pipe,
    stat_socket,
    stat_symlink,

    stat_unknown = 254
} file_types;
217 
218 class file_metadata_t {
219 public:
220     static file_types decode_file_type(const struct __stat64 &sb);
221 
222     // stat a file, print an error and return -1 if it fails, otherwise return 0
223     static int stat(const filename_t &path,file_metadata_t *m,class display &ocb);
224     class fileid_t {				      // uniquely defines a file on this system
225     public:
fileid_t()226 	fileid_t():dev(0),ino(0){};
fileid_t(uint64_t dev_,uint64_t ino_)227 	fileid_t(uint64_t dev_,uint64_t ino_):dev(dev_),ino(ino_){};
228 	uint64_t	dev;			      // device number
229 	uint64_t	ino;			      // inode number
230     };
file_metadata_t()231     file_metadata_t():fileid(),nlink(0),size(0),ctime(0),mtime(0),atime(0){};
file_metadata_t(fileid_t fileid_,uint64_t nlink_,uint64_t size_,timestamp_t ctime_,timestamp_t mtime_,timestamp_t atime_)232     file_metadata_t(fileid_t fileid_,uint64_t nlink_,uint64_t size_,timestamp_t ctime_,timestamp_t mtime_,
233 		    timestamp_t atime_):fileid(fileid_),nlink(nlink_),size(size_),ctime(ctime_),mtime(mtime_),atime(atime_){};
234     fileid_t	fileid;
235     uint64_t	nlink;
236     uint64_t	size;
237     timestamp_t ctime;
238     timestamp_t mtime;
239     timestamp_t atime;
240 
241 };
242 
243 /** file_data_t contains information about a file.
 * It can be created by hashing an actual file, or by reading a file of hashes.
245  * The object is simple so that the built in C++ shallow copy will make a proper copy of it.
246  * Note that all hashes are currently stored as a hex string. That incurs a 2x memory overhead.
247  * This will be changed.
248  */
249 class file_data_t {
250 public:
file_data_t()251     file_data_t():file_bytes(0),matched_file_number(0){
252     };
~file_data_t()253     virtual ~file_data_t(){}		// required because we subclass
254 
255     std::string hash_hex[NUM_ALGORITHMS];	     // the hash in hex of the entire file
256     std::string	hash512_hex[NUM_ALGORITHMS];	     // hash of the first 512 bytes, for triage mode
257     std::string	file_name;		// just the file_name; native on POSIX; UTF-8 on Windows.
258 
259     uint64_t    file_bytes;		// how many bytes were actually read
260     uint64_t    matched_file_number;	 // file number that we matched.; 0 if no match
261 
262 };
263 
264 /**
265  * hash_context stores information for a specific hash.
266  * which may for a piece of a file or an entire file
267  */
268 class hash_context_obj {
269 public:;
hash_context_obj()270     hash_context_obj():read_offset(0),read_len(0){}
271 
272     /* Information for the hashing underway */
273     uint8_t	hash_context[NUM_ALGORITHMS][MAX_ALGORITHM_CONTEXT_SIZE];
274 
275     /* The actual hashing */
276     void multihash_initialize();
277     void multihash_update(const unsigned char *buffer,size_t bufsize);
278     void multihash_finalize(std::string dest[]);
279 
280     // for piecewise hashing: where this segment was actually read
281     uint64_t	read_offset;		// where the segment we read started
282     uint64_t	read_len;		// how many bytes were read and hashed
283 };
284 
285 
286 /** file_data_hasher_t is a subclass of file_data_t.
287  * It contains additional information necessary to actually hash a file.
288  */
289 class file_data_hasher_t : public file_data_t {
290 private:
291     static uint64_t	next_file_number;
292     static mutex_t	fdh_lock;
293 public:
stat_megs()294     uint64_t	stat_megs() const {	// return how many megabytes is the file in MB?
295 	return stat_bytes / ONE_MEGABYTE;
296     }
297     static const size_t MD5DEEP_IDEAL_BLOCK_SIZE = 8192;
file_data_hasher_t(class display * ocb_)298     file_data_hasher_t(class display *ocb_):
299 	ocb(ocb_),			// where we put results
300 	handle(0),
301 	fd(-1),
302 	base(0),bounds(0),		// for mmap
303 	file_number(0),ctime(0),mtime(0),atime(0),stat_bytes(0),
304 	start_time(0),last_time(0),eof(false),workerid(-1){
305 	file_number = ++next_file_number;
306     };
~file_data_hasher_t()307     virtual ~file_data_hasher_t(){
308 	if(handle){
309 	    fclose(handle);
310 	    handle = 0;
311 	}
312 	if(fd){
313 #ifdef HAVE_MMAP
314 	    if(base) munmap((void *)base,bounds);
315 #endif
316 	    close(fd);
317 	    fd = 0;
318 	}
319     }
320 
is_stdin()321     bool is_stdin(){ return handle==stdin; }
322 
323     /* The actual file to hash */
324     filename_t file_name_to_hash;
325 
326     /* Where the results go */
327     class display *ocb;
328 
329     /* How we read the data */
330     FILE        *handle;		// the file we are reading
331     int		fd;			// fd used for unbuffered and mmap
332     const unsigned char *base;		// base of mapped file
333     size_t	bounds;			// size of the mapped file
334 
335     std::string		triage_info;	// if true, must print on output
336     std::stringstream	dfxml_hash;	// the DFXML hash digest for the piece just hashed;
337 					// used to build piecewise
338     uint64_t	file_number;
339     void	append_dfxml_for_byterun();
340     void	compute_dfxml(bool known_hash,const hash_context_obj *hc);
341 
342     timestamp_t	ctime;		// ctime; previously 'timestamp'
343     timestamp_t	mtime;
344     timestamp_t	atime;
345 
346     // How many bytes (and megs) we think are in the file, via stat(2)
347     // and how many bytes we've actually read in the file
348     uint64_t    stat_bytes;		// how much stat returned
349 
350     /* When we started the hashing, and when was the last time a display was printed,
351      * for printing status updates.
352      */
353     time_t	start_time, last_time;	// of hashing
354     bool	eof;			// end of file encountered while reading
355     int		workerid;		// my worker id, or -1 if there is none
set_workerid(int id)356     void	set_workerid(int id){workerid=id;}
357 
358     /* multithreaded hash implementation is these functions in hash.cpp.
359      * hash() is called to hash each file and record the results.
360      * Return codes are both stored in display return_code and returned
361      * 0 - for success, -1 for error
362      */
363     // called to actually do the computation; returns true if successful
364     // and fills in the read_offset and read_len
365     void dfxml_timeout(const std::string &tag,const timestamp_t &val);
366     void dfxml_write_hashes(std::string hex_hashes[],int indent);
367     bool compute_hash(uint64_t request_start,uint64_t request_len,hash_context_obj *segment,hash_context_obj *file);
368     void hash();	// called to hash each file and record results
369 };
370 
371 
372 /** The hashlist holds a list of file_data_t objects.
 * display::known is used to hold the audit file that is loaded.
 * display::seen is used to hold the hashes seen on the current run.
375  * We store multiple maps for each algorithm number which map the hash hex code
376  * to the pointer as well.
377  *
378  * the hashlist.cpp file contains the implementation. It's largely taken
379  * from the v3 audit.cpp and match.cpp files.
380  */
381 class hashlist : public std::vector<file_data_t *> {
382     /**
383      * The largest number of columns we can expect in a file of hashes
384      * (knowns).  Normally this should be the number of hash
385      * algorithms plus a column for file size, file name, and, well,
386      * some fudge factors. Any values after this number will be
387      * ignored. For example, if the user invokes the program as:
388      *
389      * hashdeep -c md5,md5,md5,md5,...,md5,md5,md5,md5,md5,md5,md5,whirlpool
390      *
391      * the whirlpool will not be registered.
392      */
393 
394 public:;
395     static const int MAX_KNOWN_COLUMNS= NUM_ALGORITHMS+ 6;
396     typedef enum {
397 	/* return codes from loading a hash list */
398 	loadstatus_ok = 0,
399 	status_unknown_filetype,
400 	status_contains_bad_hashes,
401 	status_contains_no_hashes,
402 	status_file_error
403     } loadstatus_t;
404 
405     typedef enum   {
406 	searchstatus_ok = 0,
407 
408 	/* Matching hashes */
409 	status_match,			// all hashes match
410 	status_partial_match,	 /* One or more hashes match, but not all */
411 	status_file_size_mismatch,   /* Implies all hashes match */
412 	status_file_name_mismatch,   /* Implies all hashes and file size match */
413 	status_no_match             /* Implies none of the hashes match */
414     } searchstatus_t;
415     static const char *searchstatus_to_str(searchstatus_t val);
416 
417     // Types of files that contain known hashes
418     typedef enum   {
419 	file_plain,
420 	file_bsd,
421 	file_hashkeeper,
422 	file_nsrl_15,
423 	file_nsrl_20,
424 	file_encase3,
425 	file_encase4,
426 	file_ilook,
427 
428 	// Files generated by md5deep with the ten digit filesize at the start
429 	// of each line
430 	file_md5deep_size,
431 	file_hashdeep_10,
432 	file_unknown
433     } hashfile_format;
434 
435     class hashmap : public  std::multimap<std::string,file_data_t *> {
436     public:;
437 	void add_file(file_data_t *fi,int alg_num);
438     };
439     hashmap		hashmaps[NUM_ALGORITHMS];
440 
441     /****************************************************************
442      ** Search functions follow
443      ** It's not entirely clear why we have two search functions, but we do.
444      ** Perhaps one is from md5deep and the other is from hashdeep
445      ****************************************************************/
446 
447     /**
448      * hashlist.cpp
449      * find_hash finds the 'best match', which ideally is a match for both the hash and the filename.
450      */
451     file_data_t	*find_hash(hashid_t alg,const std::string &hash_hex,
452 				   const std::string &file_name,
453 				   uint64_t file_number);
454 
455     /**
456      * look up a fdt by hash code(s) and return if it is present or not.
457      * optionally return a pointer to it as well.
458      */
459     searchstatus_t	search(const file_data_hasher_t *fdht, file_data_t ** matched, bool case_sensitive) ;
460     uint64_t		total_matched(); // return the total matched from all calls to search()
461 
462     /****************************************************************/
463 
464     /**
465      * Figure out the format of a hashlist file and load it.
466      * Both of these functions take the file name and the open handle.
467      * They read from the handle and just use the filename for printing error messages.
468      */
469     void		enable_hashing_algorithms_from_hashdeep_file(class display *ocb,
470 								     const std::string &fn,std::string val);
471 
472     std::string		last_enabled_algorithms; // a string with the algorithms that were enabled last
473     hashid_t		hash_column[NUM_ALGORITHMS]; // maps a column number to a hashid;
474 						     // the order columns appear in the file being loaded.
475     uint8_t   filename_column;  // Column number which should contain the filename
476     hashfile_format	identify_format(class display *ocb,const std::string &fn,FILE *handle);
477     loadstatus_t	load_hash_file(class display *ocb,const std::string &fn); // not tstring! always ASCII
478 
479     void		dump_hashlist(); // send contents to stdout
480 
481     /**
482      * add_fdt adds a file_data_t record to the hashlist, and its hashes to all the hashmaps.
483      * @param fi - a file_data_t to add. Don't erase it; we're going to use it (and modify it)
484      */
485     void add_fdt(file_data_t *fi);
486 };
487 
488 /* Primary modes of operation (primary_function) */
typedef enum {
  primary_compute   = 0,
  primary_match     = 1,
  primary_match_neg = 2,
  primary_audit     = 3
} primary_t;
495 
496 
497 // These are the types of files that we can match against
498 #define TYPE_PLAIN        0
499 #define TYPE_BSD          1
500 #define TYPE_HASHKEEPER   2
501 #define TYPE_NSRL_15      3
502 #define TYPE_NSRL_20      4
503 #define TYPE_ILOOK        5
504 #define TYPE_ILOOK3       6
505 #define TYPE_ILOOK4       7
506 #define TYPE_MD5DEEP_SIZE 8
507 #define TYPE_ENCASE       9
508 #define TYPE_UNKNOWN    254
509 
510 /* audit mode stats */
class audit_stats {
public:
    /** All counters start at zero. */
    audit_stats():
        exact(0),expect(0),partial(0),
        moved(0),unused(0),unknown(0),total(0)
    {}

    /* For audit mode, the number of each type of file */
    uint64_t    exact, expect, partial;
    uint64_t    moved, unused, unknown, total;

    /** Reset every counter to zero. */
    void clear(){
        exact = expect = partial = 0;
        moved = unused = unknown = total = 0;
    }
};
528 
529 /** display describes how information is output.
530  * There is only one OCB (it is a singleton).
531  * It needs to be mutex protected.
532  *
533  * The hashing happens in lots of threads and then calls the output
 * classes in output_control_block to actually do the outputting. The
535  * problem here is that one of the things that is done is looking up,
536  * so the searches into "known" and "seen" also need to be
537  * protected. Hence "known" and "seen" appear in the
538  * output_control_block, and not elsewhere, and all of the access to
539  * them needs to be mediated.
540  *
541  * It also needs to maintain all of the state for audit mode.
542  * Finally, it maintains options for reading
543  * (e.g. buffered, unbuffered, or memory-mapped I/O)
544  *
545  * It is a class because it is protected and is passed around.
546  */
547 class display {
548  private:
549     mutable mutex_t	M;	// lock for anything in output section
lock()550     void lock() const	{ M.lock(); }
unlock()551     void unlock() const { M.unlock(); }
552 
553     /* all display state variables are protected by M and must be private */
554     std::ostream	*out;		// where things get sent
555     std::ofstream       myoutstream;	// if we open it
556     std::string		utf8_banner;	// banner to be displayed
557     bool		banner_displayed;	// has the header been shown (text output)
558     XML			*dfxml;			/* output in DFXML */
559 
560     /* The set of known values; typically read from the audit file */
561     hashlist		known;		// hashes read from the -k file
562     hashlist		seen;		// hashes seen on this hashing run; from the command line
563     class audit_stats	match;		// for the audit mode
564     status_t		return_code;	// prevously returned by hash() and dig().
565 
566  public:
display()567  display():
568     out(&std::cout),
569       banner_displayed(0),dfxml(0),
570       mode_triage(false),
571       mode_not_matched(false),mode_quiet(false),mode_timestamp(false),
572       mode_barename(false),
573       mode_size(false),mode_size_all(false),
574       opt_silent(false),
575       opt_verbose(0),
576       opt_estimate(false),
577       opt_relative(false),
578       opt_unicode_escape(false),
579       opt_mode_match(false),
580       opt_mode_match_neg(false),
581       opt_csv(false),
582       opt_asterisk(false),
583       opt_zero(false),
584       opt_display_size(false),
585       opt_display_hash(false),
586       opt_show_matched(false),
587       opt_case_sensitive(true),
588       opt_iomode(iomode::buffered),	// by default, use buffered
589 #ifdef HAVE_PTHREAD
590       opt_threadcount(threadpool::numCPU()),
591       tp(0),
592 #else
593       opt_threadcount(0),
594 #endif
595       size_threshold(0),
596       piecewise_size(0),
597       primary_function(primary_compute){
598       }
599 
600     /* These variables are read-only after threading starts */
601     bool	mode_triage;
602     bool	mode_not_matched;
603     bool	mode_quiet;
604     bool	mode_timestamp;
605     bool	mode_barename;
606     bool	mode_size;
607     bool	mode_size_all;
608     std::string	opt_outfilename;
609     bool	opt_silent;
610     int		opt_verbose;
611     bool	opt_estimate;
612     bool	opt_relative;
613     bool	opt_unicode_escape;
614     bool	opt_mode_match;
615     bool	opt_mode_match_neg;
616     bool	opt_csv;
617     bool	opt_asterisk;
618     bool	opt_zero;
619     bool	opt_display_size;
620     bool	opt_display_hash;
621     bool	opt_show_matched;
622     bool        opt_case_sensitive;
623     int		opt_iomode;
624     int		opt_threadcount;
625 
626 #ifdef HAVE_PTHREAD
627     threadpool		*tp;
628 #endif
629 
630     // When only hashing files larger/smaller than a given threshold
631     uint64_t        size_threshold;
632     uint64_t        piecewise_size;    // non-zero for piecewise mode
633     primary_t       primary_function;    /* what do we want to do? */
634 
635 
636     /* Functions for working */
637 
638     void	set_outfilename(std::string outfilename);
639 
640     /* Return code support */
get_return_code()641     int32_t	get_return_code(){ lock(); int ret = return_code.get_status(); unlock(); return ret; }
set_return_code(status_t code)642     void	set_return_code(status_t code){ lock(); return_code = code; unlock(); }
set_return_code(int32_t code)643     void	set_return_code(int32_t code){ lock(); return_code.set(code); unlock(); }
set_return_code_if_not_ok(status_t code)644     void	set_return_code_if_not_ok(status_t code){
645 	lock();
646 	if(code!=status_t::status_ok) return_code = code;
647 	unlock();
648     }
649 
650     /* DFXML support */
651 
xml_open(FILE * out_)652     void	xml_open(FILE *out_){
653 	lock();
654 	dfxml = new XML(out_);
655 	unlock();
656     }
657     void dfxml_startup(int argc,char **argv);
658     void dfxml_shutdown();
659     void dfxml_timeout(const std::string &tag,const timestamp_t &val);
660     void dfxml_write(file_data_hasher_t *fdht);
661 
662 
663     /* Known hash database interface */
664     /* Display the unused files and return the count */
665     uint64_t	compute_unused(bool show_display,std::string annotation);
set_utf8_banner(std::string utf8_banner_)666     void	set_utf8_banner(std::string utf8_banner_){
667 	utf8_banner = utf8_banner_;
668     }
669 
670 
671     static	mutex_t		portable_gmtime_mutex;
672     struct tm  *portable_gmtime(struct tm *my_time,const timestamp_t *t);
673     void	try_msg(void);
674 
675     void	display_banner_if_needed();
676     void	display_match_result(file_data_hasher_t *fdht,const hash_context_obj *hc);
677 
678     void	md5deep_display_match_result(file_data_hasher_t *fdht,const hash_context_obj *hc);
679     void	md5deep_display_hash(file_data_hasher_t *fdht,const hash_context_obj *hc);
680 
681     void	display_hash(file_data_hasher_t *fdht,const hash_context_obj *hc);
682     void	display_hash_simple(file_data_hasher_t *fdt,const hash_context_obj *hc);
683 
684     /* The following routines are for printing and outputing filenames.
685      *
686      * fmt_filename formats the filename.
687      * On Windows this version outputs as UTF-8 unless unicode quoting is requested,
688      * in which case Unicode characters are emited as U+xxxx.
689      * For example, the Unicode smiley character ☺ is output as U+263A.
690      *
691      */
692     std::string	fmt_size(const file_data_t *fdh) const;
693     std::string fmt_filename(const std::string  &fn) const;
694 #ifdef _WIN32
695     std::string fmt_filename(const std::wstring &fn) const;
696 #endif
fmt_filename(const file_data_t * fdt)697     std::string fmt_filename(const file_data_t *fdt) const {
698 	return fmt_filename(fdt->file_name);
699     }
700     void	writeln(std::ostream *s,const std::string &str);    // writes a line with NEWLINE and locking
701 
702     // Display an ordinary message with newline added
703     void	status(const char *fmt, ...) __attribute__((format(printf, 2, 0))); // note that 1 is 'self'
704 
705     // Display an error message if not in silent mode
706     void	error(const char *fmt, ...) __attribute__((format(printf, 2, 0)));
707 
708     // Display an error message if not in silent mode and exit
709     void	fatal_error(const char *fmt, ...) __attribute__((format(printf, 2, 0))) __attribute__ ((__noreturn__));
710     // Display an error message, ask user to contact the developer,
711     void	internal_error(const char *fmt, ...) __attribute__((format(printf, 2, 0))) __attribute__ ((__noreturn__));
712     void	print_debug(const char *fmt, ...) __attribute__((format(printf, 2, 0)));
713     void	error_filename(const std::string &fn, const char *fmt, ...) __attribute__((format(printf, 3, 0))) ;
714 #ifdef _WIN32
715     void	error_filename(const std::wstring &fn, const char *fmt, ...) __attribute__((format(printf, 3, 0)));
716 #endif
717 
718     /* these versions extract the filename and the annotation if it is present.
719      */
720 
721     /* known hash database and realtime stats.
722      * Note that this is not locked() and unlocked().
723      * It can only be run from the main thread before fork.
724      */
load_hash_file(const std::string & fn)725     hashlist::loadstatus_t load_hash_file(const std::string &fn){
726 	hashlist::loadstatus_t ret = known.load_hash_file(this,fn);
727 	return ret;
728     }
729 
730     /** These are multi-threaded */
731 
known_size()732     uint64_t known_size() const {
733 	lock();
734 	uint64_t ret= known.size();
735 	unlock();
736 	return ret;
737     }
find_hash(hashid_t alg,const std::string & hash_hex,const std::string & file_name,uint64_t file_number)738     const file_data_t *find_hash(hashid_t alg,const std::string &hash_hex,
739 				 const std::string &file_name,
740 				 uint64_t file_number){
741 	lock();
742 	const file_data_t *ret = known.find_hash(alg,hash_hex,file_name,file_number);
743 	unlock();
744 	return ret;
745     }
746     void	clear_realtime_stats();
747     void	display_realtime_stats(const file_data_hasher_t *fdht,const hash_context_obj *hc,time_t elapsed);
hashes_loaded()748     bool	hashes_loaded() const{ lock(); bool ret = known.size()>0; unlock(); return ret; }
add_fdt(file_data_t * fdt)749     void	add_fdt(file_data_t *fdt){ lock(); known.add_fdt(fdt); unlock(); }
750 
751     /* audit mode */
752     int		audit_update(file_data_hasher_t *fdt);
753     int		audit_check();		// performs an audit; return 0 if pass, -1 if fail
754     void	display_audit_results(); // sets return code if fails
755     void	finalize_matching();
756 
757     /* hash.cpp: Actually trigger the hashing. */
758     void	hash_file(const tstring &file_name);
759     void	hash_stdin();
dump_hashlist()760     void	dump_hashlist(){ lock(); known.dump_hashlist(); unlock(); }
761 };
762 
763 /**
764  * The 'state' class holds the state of the hashdeep/md5deep program.
765  * This includes:
766  * startup parameters
767  * known - the list of hashes in the hash database.
768  * seen  - the list of hashes that have been seen this time through.
769  */
770 
771 
772 class global {
773 public:
774     static tstring getcwd();			// returns the current directory
775     static tstring get_realpath(const tstring &fn); // returns the full path
776     static std::string get_realpath8(const tstring &fn);  // returns the full path in UTF-8
777     static std::string escape_utf8(const std::string &fn); // turns "⦿" to "U+29BF"
778 #ifdef _WIN32
779     static std::string make_utf8(const std::wstring &tfn) ;
780 #endif
make_utf8(const std::string & tfn)781     static std::string make_utf8(const std::string &tfn){return tfn;}
782 };
783 
784 /* On Win32, allow output of wstr's by converting them to UTF-8 */
785 #ifdef _WIN32
786 inline std::ostream & operator <<(std::ostream &os,const std::wstring &wstr) {
787     os << global::make_utf8(wstr);
788     return os;
789 }
790 #endif
791 
792 class state {
793 public:;
794 
state()795  state():mode_recursive(false),	// do we recurse?
796       mode_warn_only(false),	// for loading hash files
797 
798       // these determine which files get hashed
799       mode_expert(false),
800       mode_regular(false),
801       mode_directory(false),
802       mode_door(false),
803       mode_block(false),
804       mode_character(false),
805       mode_pipe(false),
806       mode_socket(false),
807       mode_symlink(false),
808       mode_winpe(false),
809 
810       // command line argument
811       argc(0),argv(0),
812 
813       // these have something to do with hash files that are loaded
814       h_field(0),
815       h_plain(0),h_bsd(0),
816       h_md5deep_size(0),
817       h_hashkeeper(0),h_ilook(0),h_ilook3(0),h_ilook4(0), h_nsrl20(0), h_encase(0),
818       usage_count(0)		// allows -hh to print extra help
819 	{};
820 
821     bool	mode_recursive;
822     bool	mode_warn_only;
823 
824     // which files do we hash.
825     bool	mode_expert;
826     bool	mode_regular;
827     bool	mode_directory;
828     bool	mode_door;
829     bool	mode_block;
830     bool	mode_character;
831     bool	mode_pipe;
832     bool	mode_socket;
833     bool	mode_symlink;
834     bool        mode_winpe;
835 
836 
837     /* Command line arguments */
838     std::string opt_input_list;		// file with a list of files to read
839     int		argc;
840 #ifdef _WIN32
841     wchar_t     **argv;			// never allocated, never freed
842 #else
843     char	**argv;
844 #endif
845 
846     // configuration and output
847     display	ocb;		// output control block
848 
849     // Which filetypes this algorithm supports and their position in the file
850     uint8_t	 h_field;		// which field to extract from a hash file.
851     uint8_t      h_plain, h_bsd, h_md5deep_size, h_hashkeeper;
852     uint8_t      h_ilook, h_ilook3, h_ilook4, h_nsrl20, h_encase;
853 
854     void	md5deep_add_hash(char *h, char *fn); // explicitly add a hash
855     void	setup_expert_mode(char *arg);
856 
857     /* main.cpp */
858     uint64_t	find_block_size(std::string input_str);
859     int		usage_count;
860     bool	opt_enable_mac_cc;
861     tstring	generate_filename(const tstring &input);
862     void	hashdeep_usage();
863     std::string	make_banner();
864     void	md5deep_usage();
865     void	hashdeep_check_flags_okay();
866     void	check_wow64();
867     void	md5deep_check_flags_okay();
868     int		hashdeep_process_command_line(int argc,char **argv);
869     void	md5deep_check_matching_modes();
870     void        hashdeep_check_matching_modes();
871     int		md5deep_process_command_line(int argc,char **argv);
872 #ifdef _WIN32
873     int		prepare_windows_command_line();
874 #endif
875 
876     /* files.cpp
877      * Not quite sure what to do with this stuff yet...
878      */
879 
880     void	md5deep_load_match_file(const char *fn);
881     int		find_hash_in_line(char *buf, int fileType, char *filename);
882     int		parse_encase_file(const char *fn,FILE *f,uint32_t num_expected_hashes);
883     int		find_plain_hash(char *buf,char *known_fn); // returns FALSE if error
884     int         find_md5deep_size_hash(char *buf, char *known_fn);
885     int		find_bsd_hash(char *buf, char *fn);
886     int		find_rigid_hash(char *buf,  char *fn, unsigned int fn_location, unsigned int hash_location);
887     int		find_ilook_hash(char *buf, char *known_fn);
888     int		check_for_encase(FILE *f,uint32_t *expected_hashes);
889 
890     /* dig.cpp
891      *
892      * Note the file typing system needs to be able to display errors...
893      */
894 
895     class dir_table_t : public std::set<tstring>{
896     };
897     dir_table_t dir_table;
898     void	done_processing_dir(const tstring &fn_);
899     void	processing_dir(const tstring &fn_);
900     bool	have_processed_dir(const tstring &fn_);
901 
902 
903     int		identify_hash_file_type(FILE *f,uint32_t *expected_hashes); // identify the hash file type
904     bool	should_hash_symlink(const tstring &fn,file_types *link_type);
905     bool        should_hash_winpe(const tstring &fn);
906     bool	should_hash_expert(const tstring &fn, file_types type);
907     bool	should_hash(const tstring &fn);
908 
909     /* file_type returns the file type of a string.
910      * If an error is found and ocb is provided, send the error to ocb.
911      * If filesize and timestamp are provided, give them.
912      */
913     static file_types file_type(const filename_t &fn,class display *ocb,uint64_t *filesize,
914 				timestamp_t *ctime,timestamp_t *mtime,timestamp_t *atime);
915 #ifdef _WIN32
916     bool	is_junction_point(const std::wstring &fn);
917 #endif
918     void	clean_name_posix(std::string &fn);
919     void	process_dir(const tstring &path);
920     void	dig_normal(const tstring &path);	// posix  & win32
921     void	dig_win32(const tstring &path);	// win32 only; calls dig_normal
922     static	void dig_self_test();
923 
hashes_loaded()924     bool hashes_loaded(){
925 	return ocb.hashes_loaded();
926     }
927 
928     int main(int argc,char **argv);	// main
929     void sanity_check(int condition,const char *msg);
930 
931 };
932 
933 /**
934  * the files class knows how to read various hash file types
935  */
936 
/* Due to an inadvertent code fork several years ago, this program has different usage
 * and output when run as 'md5deep' than when run as 'hashdeep'. We call this the
 * 'md5deep_mode' and track it with the variables below.
 */
941 
942 /* main.cpp */
943 extern bool	md5deep_mode;		// if true, then we were run as md5deep, sha1deep, etc.
944 extern int	opt_debug;		// for debugging
945 extern hashid_t opt_md5deep_mode_algorithm;	// for when we are in MD5DEEP mode
946 
947 
948 std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems);
949 std::vector<std::string> split(const std::string &s, char delim);
950 void lowercase(std::string &s);
951 extern std::string progname;		// formerly const char *__progname
952 
953 // ------------------------------------------------------------------
954 // HELPER FUNCTIONS
955 //
956 // helper.cpp
957 // ------------------------------------------------------------------
958 
void chop_line(char *s);

// Return the size, in bytes of an open file stream. On error, return -1
off_t find_file_size(FILE *f,class display *ocb);
961 
962 // ------------------------------------------------------------------
963 // MAIN PROCESSING
964 // ------------------------------------------------------------------
965 /* dig.cpp */
// check the string-processing
void dig_self_test();
967 
968 
969 
970 
971 #endif /* ifndef __MAIN_H */
972