1 // Author:  Bruce Allen
2 // Created: 2/25/2013
3 //
4 // The software provided here is released by the Naval Postgraduate
5 // School, an agency of the U.S. Department of Navy.  The software
6 // bears no warranty, either expressed or implied. NPS does not assume
7 // legal liability nor responsibility for a User's use of the software
8 // or the results of such use.
9 //
10 // Please note that within the United States, copyright protection,
11 // under Section 105 of the United States Code, Title 17, is not
12 // available for any work of the United States Government and/or for
13 // any works created by United States Government employees. User
14 // acknowledges that this software contains work which was created by
15 // NPS government employees and is therefore in the public domain and
16 // not subject to copyright.
17 //
18 // Released into the public domain on February 25, 2013 by Bruce Allen.
19 
20 /**
21  * \file
22  * Provides hashdb commands.
23  */
24 
25 #ifndef COMMANDS_HPP
26 #define COMMANDS_HPP
27 #include "../src_libhashdb/hashdb.hpp"
28 #include "import_tab.hpp"
29 #include "import_json.hpp"
30 #include "export_json.hpp"
31 #include "scan_list.hpp"
32 #include "adder.hpp"
33 #include "adder_set.hpp"
34 
35 // Standard includes
36 #include <cerrno>
37 #include <cstdlib>
38 #include <cstdio>
39 #include <string>
40 #include <sstream>
41 #include <iostream>
42 #include <algorithm>
43 #include <vector>
44 
45 // leave alone else create using existing settings if new
create_if_new(const std::string & hashdb_dir,const std::string & from_hashdb_dir,const std::string & command_string)46 void create_if_new(const std::string& hashdb_dir,
47                    const std::string& from_hashdb_dir,
48                    const std::string& command_string) {
49 
50   std::string error_message;
51   hashdb::settings_t settings;
52 
53   // try to read hashdb_dir settings
54   error_message = hashdb::read_settings(hashdb_dir, settings);
55   if (error_message.size() == 0) {
56     // hashdb_dir already exists
57     return;
58   }
59 
60   // no hashdb_dir, so read from_hashdb_dir settings
61   error_message = hashdb::read_settings(from_hashdb_dir, settings);
62   if (error_message.size() != 0) {
63     // bad since from_hashdb_dir is not valid
64     std::cerr << "Error: " << error_message << "\n";
65     exit(1);
66   }
67 
68   // create hashdb_dir using from_hashdb_dir settings
69   error_message = hashdb::create_hashdb(hashdb_dir, settings, command_string);
70   if (error_message.size() != 0) {
71     // bad since from_hashdb_dir is not valid
72     std::cerr << "Error: " << error_message << "\n";
73     exit(1);
74   }
75 }
76 
77 // require hashdb_dir else fail
require_hashdb_dir(const std::string & hashdb_dir)78 static void require_hashdb_dir(const std::string& hashdb_dir) {
79   std::string error_message;
80   hashdb::settings_t settings;
81   error_message = hashdb::read_settings(hashdb_dir, settings);
82   if (error_message.size() != 0) {
83     std::cerr << "Error: " << error_message << "\n";
84     exit(1);
85   }
86 }
87 
print_header(const std::string & cmd)88 static void print_header(const std::string& cmd) {
89   std::cout << "# command: " << cmd << "\n"
90             << "# hashdb-Version: " << PACKAGE_VERSION << "\n";
91 }
92 
// helper
/**
 * Return 16 bytes of random hash.
 *
 * Each byte comes from its own call to rand(), truncated to a char.
 * uint32_t chunks are not used because windows rand only uses 15 bits.
 */
std::string random_binary_hash() {
  const size_t hash_length = 16;
  std::string binary_hash;
  binary_hash.reserve(hash_length);
  while (binary_hash.size() < hash_length) {
    binary_hash.push_back(static_cast<char>(rand()));
  }
  return binary_hash;
}
105 
106 namespace commands {
107 
108 // ************************************************************
109 // helpers
110 // ************************************************************
/**
 * Holds the input stream selected by a filename.
 *
 * The filename "-" selects std::cin.  Any other name is opened as a
 * std::ifstream allocated by this object and deleted in its destructor.
 * If the file cannot be opened, an error is printed to stderr and the
 * program exits with status 1.
 */
class in_ptr_t {
  public:
  in_ptr_t(const std::string& in_filename) : in(NULL) {
    if (in_filename == "-") {
      // read from standard input, which this object does not own
      in = &std::cin;
      return;
    }

    std::ifstream* file_in = new std::ifstream(in_filename.c_str());
    if (file_in->is_open()) {
      in = file_in;
      return;
    }

    std::cerr << "Error: Cannot open " << in_filename
              << ": " << strerror(errno) << "\n";
    exit(1);
  }

  ~in_ptr_t() {
    // only a stream allocated by the constructor is released
    if (in == &std::cin) {
      return;
    }
    delete in;
  }

  // access the selected stream
  std::istream* operator()() {
    return in;
  }

  private:
  std::istream* in;

  // do not allow copy or assignment
  in_ptr_t(const in_ptr_t&);
  in_ptr_t& operator=(const in_ptr_t&);
};
144 
/**
 * Holds the output stream selected by a filename.
 *
 * The filename "-" selects std::cout.  Any other name is opened as a
 * std::ofstream allocated by this object and deleted in its destructor.
 * If the file cannot be opened, an error is printed to stderr and the
 * program exits with status 1.
 */
class out_ptr_t {
  public:
  out_ptr_t(const std::string& out_filename) : out(NULL) {
    if (out_filename == "-") {
      // write to standard output, which this object does not own
      out = &std::cout;
      return;
    }

    std::ofstream* file_out = new std::ofstream(out_filename.c_str());
    if (file_out->is_open()) {
      out = file_out;
      return;
    }

    std::cerr << "Error: Cannot open " << out_filename
              << ": " << strerror(errno) << "\n";
    exit(1);
  }

  ~out_ptr_t() {
    // only a stream allocated by the constructor is released
    if (out == &std::cout) {
      return;
    }
    delete out;
  }

  // access the selected stream
  std::ostream* operator()() {
    return out;
  }

  private:
  std::ostream* out;

  // do not allow copy or assignment
  out_ptr_t(const out_ptr_t&);
  out_ptr_t& operator=(const out_ptr_t&);
};
178 
179   // ************************************************************
180   // new database
181   // ************************************************************
create(const std::string & hashdb_dir,const hashdb::settings_t & settings,const std::string & cmd)182   void create(const std::string& hashdb_dir,
183               const hashdb::settings_t& settings,
184               const std::string& cmd) {
185 
186     std::string error_message;
187     error_message = hashdb::create_hashdb(hashdb_dir, settings, cmd);
188 
189   if (error_message.size() == 0) {
190       std::cout << "New database created.\n";
191     } else {
192       std::cerr << "Error: " << error_message << "\n";
193       exit(1);
194     }
195   }
196 
197   // ************************************************************
198   // import/export
199   // ************************************************************
200   // import recursively from path
ingest(const std::string & hashdb_dir,const std::string & ingest_path,const size_t step_size,const std::string & repository_name,const std::string & whitelist_dir,const bool disable_recursive_processing,const bool disable_calculate_entropy,const bool disable_calculate_labels,const std::string & cmd)201   static void ingest(const std::string& hashdb_dir,
202                      const std::string& ingest_path,
203                      const size_t step_size,
204                      const std::string& repository_name,
205                      const std::string& whitelist_dir,
206                      const bool disable_recursive_processing,
207                      const bool disable_calculate_entropy,
208                      const bool disable_calculate_labels,
209                      const std::string& cmd) {
210 
211     // ingest
212     std::string error_message = hashdb::ingest(
213                     hashdb_dir, ingest_path, step_size, repository_name,
214                     whitelist_dir,
215                     disable_recursive_processing,
216                     disable_calculate_entropy,
217                     disable_calculate_labels,
218                     cmd);
219     if (error_message.size() != 0) {
220       std::cerr << "Error: " << error_message << "\n";
221       exit(1);
222     }
223   }
224 
225   // import_tab
import_tab(const std::string & hashdb_dir,const std::string & tab_file,const std::string & repository_name,const std::string & whitelist_dir,const std::string & cmd)226   static void import_tab(const std::string& hashdb_dir,
227                      const std::string& tab_file,
228                      const std::string& repository_name,
229                          const std::string& whitelist_dir,
230                      const std::string& cmd) {
231 
232     // validate hashdb_dir path
233     require_hashdb_dir(hashdb_dir);
234 
235     // resources
236     hashdb::import_manager_t manager(hashdb_dir, cmd);
237     hashdb::scan_manager_t* whitelist_manager = NULL;
238     if (whitelist_dir != "") {
239       require_hashdb_dir(whitelist_dir);
240       whitelist_manager = new hashdb::scan_manager_t(whitelist_dir);
241     }
242     progress_tracker_t progress_tracker(hashdb_dir, 0, cmd);
243 
244     // open the tab file for reading
245     in_ptr_t in_ptr(tab_file);
246     ::import_tab(manager, repository_name, tab_file, whitelist_manager,
247                  progress_tracker, *in_ptr());
248 
249     // done
250     if (whitelist_manager != NULL) {
251       delete whitelist_manager;
252     }
253   }
254 
255   // import json
import_json(const std::string & hashdb_dir,const std::string & json_file,const std::string & cmd)256   static void import_json(const std::string& hashdb_dir,
257                           const std::string& json_file,
258                           const std::string& cmd) {
259 
260     // validate hashdb_dir path
261     require_hashdb_dir(hashdb_dir);
262 
263     // resources
264     hashdb::import_manager_t manager(hashdb_dir, cmd);
265     progress_tracker_t progress_tracker(hashdb_dir, 0, cmd);
266 
267     // open the JSON file for reading
268     in_ptr_t in_ptr(json_file);
269     ::import_json(manager, progress_tracker, *in_ptr());
270   }
271 
272   // export json
export_json(const std::string & hashdb_dir,const std::string & json_file,const std::string & cmd)273   static void export_json(const std::string& hashdb_dir,
274                           const std::string& json_file,
275                           const std::string& cmd) {
276 
277     // validate hashdb_dir path
278     require_hashdb_dir(hashdb_dir);
279 
280     // resources
281     hashdb::scan_manager_t manager(hashdb_dir);
282     progress_tracker_t progress_tracker(hashdb_dir, manager.size_hashes(), cmd);
283 
284     // open the JSON file for writing
285     out_ptr_t out_ptr(json_file);
286 
287     // print header to file
288     *out_ptr() << "# command: '" << cmd << "'\n"
289                << "# hashdb-Version: " << PACKAGE_VERSION << "\n";
290 
291     // export the hashdb
292     ::export_json_hashes(manager, progress_tracker, *out_ptr());
293     ::export_json_sources(manager, *out_ptr());
294   }
295 
296   // export json range
export_json_range(const std::string & hashdb_dir,const std::string & json_file,const std::string & begin_block_hash,const std::string & end_block_hash,const std::string & cmd)297   static void export_json_range(const std::string& hashdb_dir,
298                                 const std::string& json_file,
299                                 const std::string& begin_block_hash,
300                                 const std::string& end_block_hash,
301                                 const std::string& cmd) {
302 
303     // validate hashdb_dir path
304     require_hashdb_dir(hashdb_dir);
305 
306     // resources
307     hashdb::scan_manager_t manager(hashdb_dir);
308     progress_tracker_t progress_tracker(hashdb_dir, manager.size_hashes(), cmd);
309 
310     // open the JSON file for writing
311     out_ptr_t out_ptr(json_file);
312 
313     // print header to file
314     *out_ptr() << "# command: '" << cmd << "'\n"
315                << "# hashdb-Version: " << PACKAGE_VERSION << "\n";
316 
317     // export the range to the hashdb
318     ::export_json_range(manager, begin_block_hash, end_block_hash,
319                         progress_tracker, *out_ptr());
320   }
321 
322   // ************************************************************
323   // database manipulation
324   // ************************************************************
325   // add
add(const std::string & hashdb_dir,const std::string & dest_dir,const std::string & cmd)326   static void add(const std::string& hashdb_dir,
327                   const std::string& dest_dir,
328                   const std::string& cmd) {
329 
330     // validate hashdb directories, maybe make dest_dir
331     require_hashdb_dir(hashdb_dir);
332     create_if_new(dest_dir, hashdb_dir, cmd);
333 
334     // resources
335     hashdb::scan_manager_t manager_a(hashdb_dir);
336     hashdb::import_manager_t manager_b(dest_dir, cmd);
337     progress_tracker_t progress_tracker(
338                                 dest_dir, manager_a.size_hashes(), cmd);
339     adder_t adder(&manager_a, &manager_b, &progress_tracker);
340 
341     // add data for binary_hash from A to B
342     std::string binary_hash = manager_a.first_hash();
343     while (binary_hash.size() != 0) {
344       // add the hash
345       adder.add(binary_hash);
346       binary_hash = manager_a.next_hash(binary_hash);
347     }
348   }
349 
350   // add_multiple
351   // Flow:
352   //   1) Create an ordered multimap of key=hash, value=producer_t
353   //      where key is the first key from a producer.
354   //   2) Consume elements from the ordered multimap and copy them
355   //      until the producers are depleted.  Do not enque when a producer
356   //      is depleted.  Done when the ordered multimap becomes empty.
add_multiple(const std::vector<std::string> & p_hashdb_dirs,const std::string & cmd)357   static void add_multiple(const std::vector<std::string>& p_hashdb_dirs,
358                            const std::string& cmd) {
359 
360     std::vector<std::string> hashdb_dirs = p_hashdb_dirs;
361 
362     // read then strip off dest_dir from end of list
363     const std::string dest_dir = hashdb_dirs.back();
364     hashdb_dirs.pop_back();
365 
366     // validate hashdb directories, maybe make dest_dir
367     for (std::vector<std::string>::const_iterator it = hashdb_dirs.begin();
368                     it != hashdb_dirs.end(); ++it) {
369       require_hashdb_dir(*it);
370     }
371     create_if_new(dest_dir, hashdb_dirs[0], cmd);
372 
373     // open the consumer at dest_dir
374     hashdb::import_manager_t consumer(dest_dir, cmd);
375 
376     // calculate the total hash records for the tracker
377     size_t total_hash_records = 0;
378     for (std::vector<std::string>::const_iterator it = hashdb_dirs.begin();
379                     it != hashdb_dirs.end(); ++it) {
380       hashdb::scan_manager_t scan_manager(*it);
381       total_hash_records += scan_manager.size_hashes();
382     }
383 
384     // start progress tracker
385     progress_tracker_t progress_tracker(dest_dir, total_hash_records, cmd);
386 
387     // define the ordered multimap of key=hash, value=producer_t
388     typedef std::pair<hashdb::scan_manager_t*, adder_t*> producer_t;
389     typedef std::pair<std::string, producer_t> ordered_producers_value_t;
390     typedef std::multimap<std::string, producer_t> ordered_producers_t;
391 
392     // create the multimap of ordered producers
393     ordered_producers_t ordered_producers;
394 
395     // open the producers
396     for (std::vector<std::string>::const_iterator it = hashdb_dirs.begin();
397                     it != hashdb_dirs.end(); ++it) {
398       std::string hashdb_dir = *it;
399       hashdb::scan_manager_t* producer = new hashdb::scan_manager_t(hashdb_dir);
400       std::string binary_hash = producer->first_hash();
401       if (binary_hash.size() != 0) {
402         // the producer is not empty, so enqueue it
403         // create the adder
404         adder_t* adder = new adder_t(producer, &consumer, &progress_tracker);
405         ordered_producers.insert(ordered_producers_value_t(binary_hash,
406                                       producer_t(producer, adder)));
407 
408       // also track total hashes to be processed
409       total_hash_records += producer->size_hashes();
410 
411       } else {
412         // no hashes for this producer so close it
413         delete producer;
414       }
415     }
416 
417     // add ordered hashes from producers until all hashes are consumed
418     while (ordered_producers.size() != 0) {
419       // get the hash, producer, and adder for the first hash
420       ordered_producers_t::iterator it = ordered_producers.begin();
421       hashdb::scan_manager_t* producer = it->second.first;
422       adder_t* adder = it->second.second;
423 
424       // add the hash to the consumer
425       adder->add(it->first);
426 
427       // get the next hash from this producer
428       std::string binary_hash = producer->next_hash(it->first);
429 
430       // remove this hash, producer_t entry
431       ordered_producers.erase(it);
432 
433       if (binary_hash.size() != 0) {
434         // hash exists so add the hash, producer, and adder
435         ordered_producers.insert(ordered_producers_value_t(binary_hash,
436                                       producer_t(producer, adder)));
437       } else {
438         // no hashes for this producer so close it
439         delete producer;
440         delete adder;
441       }
442     }
443   }
444 
445   // add_repository
add_repository(const std::string & hashdb_dir,const std::string & dest_dir,const std::string & repository_name,const std::string & cmd)446   static void add_repository(const std::string& hashdb_dir,
447                              const std::string& dest_dir,
448                              const std::string& repository_name,
449                              const std::string& cmd) {
450 
451     // validate hashdb directories, maybe make dest_dir
452     require_hashdb_dir(hashdb_dir);
453     create_if_new(dest_dir, hashdb_dir, cmd);
454 
455     // resources
456     hashdb::scan_manager_t manager_a(hashdb_dir);
457     hashdb::import_manager_t manager_b(dest_dir, cmd);
458     progress_tracker_t progress_tracker(dest_dir,
459                                         manager_a.size_hashes(), cmd);
460     adder_t adder(&manager_a, &manager_b, repository_name, &progress_tracker);
461 
462     // add data for binary_hash from A to B
463     std::string binary_hash = manager_a.first_hash();
464     while (binary_hash.size() != 0) {
465       // add the hash
466       adder.add_repository(binary_hash);
467       binary_hash = manager_a.next_hash(binary_hash);
468     }
469   }
470 
471   // add_range
add_range(const std::string & hashdb_dir,const std::string & dest_dir,const size_t m,const size_t n,const std::string & cmd)472   static void add_range(const std::string& hashdb_dir,
473                         const std::string& dest_dir,
474                         const size_t m,
475                         const size_t n,
476                         const std::string& cmd) {
477 
478     // validate hashdb directories, maybe make dest_dir
479     require_hashdb_dir(hashdb_dir);
480     create_if_new(dest_dir, hashdb_dir, cmd);
481 
482     // resources
483     hashdb::scan_manager_t manager_a(hashdb_dir);
484     hashdb::import_manager_t manager_b(dest_dir, cmd);
485     progress_tracker_t progress_tracker(dest_dir,
486                                         manager_a.size_hashes(), cmd);
487     adder_t adder(&manager_a, &manager_b, &progress_tracker);
488 
489     // add data for binary_hash from A to B
490     std::string binary_hash = manager_a.first_hash();
491     while (binary_hash.size() != 0) {
492       // add the hash
493       adder.add_range(binary_hash, m, n);
494       binary_hash = manager_a.next_hash(binary_hash);
495     }
496   }
497 
498   // intersect A and B into C
intersect(const std::string & hashdb_dir1,const std::string & hashdb_dir2,const std::string & dest_dir,const std::string & cmd)499   static void intersect(const std::string& hashdb_dir1,
500                         const std::string& hashdb_dir2,
501                         const std::string& dest_dir,
502                         const std::string& cmd) {
503 
504     // validate hashdb directories, maybe make dest_dir
505     require_hashdb_dir(hashdb_dir1);
506     require_hashdb_dir(hashdb_dir2);
507     create_if_new(dest_dir, hashdb_dir1, cmd);
508 
509     // resources
510     hashdb::scan_manager_t manager_a(hashdb_dir1);
511     hashdb::scan_manager_t manager_b(hashdb_dir2);
512     hashdb::import_manager_t manager_c(dest_dir, cmd);
513     progress_tracker_t progress_tracker(dest_dir, manager_a.size_hashes(), cmd);
514     adder_set_t adder_set(&manager_a, &manager_b, &manager_c,
515                                                            &progress_tracker);
516 
517     // iterate A to intersect A and B into C
518     std::string binary_hash = manager_a.first_hash();
519     while (binary_hash.size() != 0) {
520       adder_set.intersect(binary_hash);
521       binary_hash = manager_a.next_hash(binary_hash);
522     }
523   }
524 
525   // intersect_hash
intersect_hash(const std::string & hashdb_dir1,const std::string & hashdb_dir2,const std::string & dest_dir,const std::string & cmd)526   static void intersect_hash(const std::string& hashdb_dir1,
527                              const std::string& hashdb_dir2,
528                              const std::string& dest_dir,
529                              const std::string& cmd) {
530 
531     // validate hashdb directories, maybe make dest_dir
532     require_hashdb_dir(hashdb_dir1);
533     require_hashdb_dir(hashdb_dir2);
534     create_if_new(dest_dir, hashdb_dir1, cmd);
535 
536     // resources
537     hashdb::scan_manager_t manager_a(hashdb_dir1);
538     hashdb::scan_manager_t manager_b(hashdb_dir2);
539     hashdb::import_manager_t manager_c(dest_dir, cmd);
540     progress_tracker_t progress_tracker(dest_dir, manager_a.size_hashes(), cmd);
541     adder_set_t adder_set(&manager_a, &manager_b, &manager_c,
542                                                           & progress_tracker);
543 
544     // iterate A to intersect_hash A and B into C
545     std::string binary_hash = manager_a.first_hash();
546     while (binary_hash.size() != 0) {
547       adder_set.intersect_hash(binary_hash);
548       binary_hash = manager_a.next_hash(binary_hash);
549     }
550   }
551 
552   // subtract
subtract(const std::string & hashdb_dir1,const std::string & hashdb_dir2,const std::string & dest_dir,const std::string & cmd)553   static void subtract(const std::string& hashdb_dir1,
554                        const std::string& hashdb_dir2,
555                        const std::string& dest_dir,
556                        const std::string& cmd) {
557 
558     // validate hashdb directories, maybe make dest_dir
559     require_hashdb_dir(hashdb_dir1);
560     require_hashdb_dir(hashdb_dir2);
561     create_if_new(dest_dir, hashdb_dir1, cmd);
562 
563     // resources
564     hashdb::scan_manager_t manager_a(hashdb_dir1);
565     hashdb::scan_manager_t manager_b(hashdb_dir2);
566     hashdb::import_manager_t manager_c(dest_dir, cmd);
567     progress_tracker_t progress_tracker(dest_dir, manager_a.size_hashes(), cmd);
568     adder_set_t adder_set(&manager_a, &manager_b, &manager_c,
569                                                           &progress_tracker);
570 
571     // iterate A to add A to C if A hash and source not in B
572     std::string binary_hash = manager_a.first_hash();
573     while (binary_hash.size() != 0) {
574 
575       // add A to C if A hash and source not in B
576       adder_set.subtract(binary_hash);
577       binary_hash = manager_a.next_hash(binary_hash);
578     }
579   }
580 
581   // subtract_hash
subtract_hash(const std::string & hashdb_dir1,const std::string & hashdb_dir2,const std::string & dest_dir,const std::string & cmd)582   static void subtract_hash(const std::string& hashdb_dir1,
583                             const std::string& hashdb_dir2,
584                             const std::string& dest_dir,
585                             const std::string& cmd) {
586 
587     // validate hashdb directories, maybe make dest_dir
588     require_hashdb_dir(hashdb_dir1);
589     require_hashdb_dir(hashdb_dir2);
590     create_if_new(dest_dir, hashdb_dir1, cmd);
591 
592     // resources
593     hashdb::scan_manager_t manager_a(hashdb_dir1);
594     hashdb::scan_manager_t manager_b(hashdb_dir2);
595     hashdb::import_manager_t manager_c(dest_dir, cmd);
596     progress_tracker_t progress_tracker(dest_dir, manager_a.size_hashes(), cmd);
597     adder_set_t adder_set(&manager_a, &manager_b, &manager_c,
598                                                           &progress_tracker);
599 
600     // iterate A to add A to C if A hash not in B
601     std::string binary_hash = manager_a.first_hash();
602     while (binary_hash.size() != 0) {
603 
604       // add A to C if A hash not in B
605       adder_set.subtract_hash(binary_hash);
606       binary_hash = manager_a.next_hash(binary_hash);
607     }
608   }
609 
610   // subtract_repository
subtract_repository(const std::string & hashdb_dir,const std::string & dest_dir,const std::string & repository_name,const std::string & cmd)611   static void subtract_repository(const std::string& hashdb_dir,
612                                   const std::string& dest_dir,
613                                   const std::string& repository_name,
614                                   const std::string& cmd) {
615 
616     // validate hashdb directories, maybe make dest_dir
617     require_hashdb_dir(hashdb_dir);
618     create_if_new(dest_dir, hashdb_dir, cmd);
619 
620     // resources
621     hashdb::scan_manager_t manager_a(hashdb_dir);
622     hashdb::import_manager_t manager_b(dest_dir, cmd);
623     progress_tracker_t progress_tracker(dest_dir,
624                                         manager_a.size_hashes(), cmd);
625     adder_t adder(&manager_a, &manager_b, repository_name, &progress_tracker);
626 
627     // add data for binary_hash from A to B
628     std::string binary_hash = manager_a.first_hash();
629     while (binary_hash.size() != 0) {
630       // add the hash
631       adder.add_non_repository(binary_hash);
632       binary_hash = manager_a.next_hash(binary_hash);
633     }
634   }
635 
636   // ************************************************************
637   // scan
638   // ************************************************************
639   // scan
scan_list(const std::string & hashdb_dir,const std::string & hashes_file,const hashdb::scan_mode_t scan_mode,const std::string & cmd)640   static void scan_list(const std::string& hashdb_dir,
641                         const std::string& hashes_file,
642                         const hashdb::scan_mode_t scan_mode,
643                         const std::string& cmd) {
644 
645     // validate hashdb_dir path
646     require_hashdb_dir(hashdb_dir);
647 
648     // resources
649     hashdb::scan_manager_t manager(hashdb_dir);
650 
651     // open the hashes list file for reading
652     in_ptr_t in_ptr(hashes_file);
653 
654     // print header information
655     print_header(cmd);
656 
657     // scan the list
658     ::scan_list(manager, *in_ptr(), scan_mode);
659 
660     // done
661     std::cout << "# scan_list completed.\n";
662   }
663 
664   // scan_hash
scan_hash(const std::string & hashdb_dir,const std::string & hex_block_hash,const hashdb::scan_mode_t scan_mode,const std::string & cmd)665   static void scan_hash(const std::string& hashdb_dir,
666                         const std::string& hex_block_hash,
667                         const hashdb::scan_mode_t scan_mode,
668                         const std::string& cmd) {
669 
670     // validate hashdb_dir path
671     require_hashdb_dir(hashdb_dir);
672 
673     // get the binary hash
674     std::string binary_hash = hashdb::hex_to_bin(hex_block_hash);
675 
676     // reject invalid input
677     if (binary_hash == "") {
678       std::cerr << "Error: Invalid hash: '" << hex_block_hash << "'\n";
679       exit(1);
680     }
681 
682     // open DB
683     hashdb::scan_manager_t scan_manager(hashdb_dir);
684 
685     // scan
686     std::string expanded_text = scan_manager.find_hash_json(
687                                                   scan_mode, binary_hash);
688 
689     if (expanded_text.size() != 0) {
690       std::cout << expanded_text << std::endl;
691     } else {
692       std::cout << "Hash not found for '" << hex_block_hash << "'\n";
693     }
694   }
695 
696   // scan_media
scan_media(const std::string & hashdb_dir,const std::string & media_image_filename,const size_t step_size,const bool disable_recursive_processing,const hashdb::scan_mode_t scan_mode,const std::string & cmd)697   static void scan_media(const std::string& hashdb_dir,
698                          const std::string& media_image_filename,
699                          const size_t step_size,
700                          const bool disable_recursive_processing,
701                          const hashdb::scan_mode_t scan_mode,
702                          const std::string& cmd) {
703 
704     // print header information
705     print_header(cmd);
706 
707     // scan
708     std::string error_message = hashdb::scan_media(hashdb_dir,
709                              media_image_filename, step_size,
710                              disable_recursive_processing, scan_mode);
711     if (error_message.size() == 0) {
712       std::cout << "# scan_media completed.\n";
713     } else {
714       std::cerr << "Error: " << error_message << "\n";
715       exit(1);
716     }
717   }
718 
719   // ************************************************************
720   // statistics
721   // ************************************************************
722   // size
size(const std::string & hashdb_dir,const std::string & cmd)723   static void size(const std::string& hashdb_dir,
724                    const std::string& cmd) {
725 
726     // validate hashdb_dir path
727     require_hashdb_dir(hashdb_dir);
728 
729     // open DB
730     hashdb::scan_manager_t manager(hashdb_dir);
731 
732     std::cout << manager.size() << std::endl;
733   }
734 
735   // sources
sources(const std::string & hashdb_dir,const std::string & cmd)736   static void sources(const std::string& hashdb_dir,
737                       const std::string& cmd) {
738 
739     // validate hashdb_dir path
740     require_hashdb_dir(hashdb_dir);
741 
742     // open DB
743     hashdb::scan_manager_t manager(hashdb_dir);
744 
745     // print the sources
746     ::export_json_sources(manager, std::cout);
747   }
748 
749   // histogram
histogram(const std::string & hashdb_dir,const std::string & cmd)750   static void histogram(const std::string& hashdb_dir,
751                         const std::string& cmd) {
752 
753     // validate hashdb_dir path
754     require_hashdb_dir(hashdb_dir);
755 
756     // open DB
757     hashdb::scan_manager_t manager(hashdb_dir);
758 
759     // print header information
760     print_header(cmd);
761 
762     // start progress tracker
763     progress_tracker_t progress_tracker(hashdb_dir, manager.size_hashes(), cmd);
764 
765     // total number of hashes in the database
766     uint64_t total_hashes = 0;
767 
768     // total number of distinct hashes
769     uint64_t total_distinct_hashes = 0;
770 
771     // hash histogram as <count, number of hashes with count>
772     std::map<uint32_t, uint64_t> hash_histogram;
773 
774     // space for variables
775     uint64_t k_entropy;
776     std::string block_label;
777     uint64_t count;
778     hashdb::source_sub_counts_t source_sub_counts;
779 
780     // iterate over hashdb and set variables for calculating the histogram
781     std::string binary_hash = manager.first_hash();
782 
783     // note if the DB is empty
784     if (binary_hash.size() == 0) {
785       std::cout << "The map is empty.\n";
786     }
787 
788     while (binary_hash.size() != 0) {
789       manager.find_hash(binary_hash, k_entropy, block_label, count,
790                         source_sub_counts);
791       // update total hashes observed
792       total_hashes += count;
793       // update total distinct hashes
794       if (count == 1) {
795         ++total_distinct_hashes;
796       }
797 
798       // update hash_histogram information
799       // look for existing entry
800       std::map<uint32_t, uint64_t>::iterator hash_histogram_it =
801                                               hash_histogram.find(count);
802       if (hash_histogram_it == hash_histogram.end()) {
803 
804         // this is the first hash found with this count value
805         // so start a new element for it
806         hash_histogram.insert(std::pair<uint32_t, uint64_t>(count, 1));
807 
808       } else {
809 
810         // increment existing value for number of hashes with this count
811         uint64_t old_number = hash_histogram_it->second;
812         hash_histogram.erase(count);
813         hash_histogram.insert(std::pair<uint32_t, uint64_t>(
814                                            count, old_number + 1));
815       }
816 
817       // move forward
818       progress_tracker.track_hash_data(source_sub_counts.size());
819       binary_hash = manager.next_hash(binary_hash);
820     }
821 
822     // show totals
823     std::cout << "{\"total_hashes\": " << total_hashes << ", "
824               << "\"total_distinct_hashes\": " << total_distinct_hashes << "}\n";
825 
826     // show hash histogram as <count, number of hashes with count>
827     std::map<uint32_t, uint64_t>::iterator hash_histogram_it2;
828     for (hash_histogram_it2 = hash_histogram.begin();
829          hash_histogram_it2 != hash_histogram.end(); ++hash_histogram_it2) {
830       std::cout << "{\"duplicates\":" << hash_histogram_it2->first
831                 << ", \"distinct_hashes\":" << hash_histogram_it2->second
832                 << ", \"total\":" << hash_histogram_it2->first *
833                                  hash_histogram_it2->second << "}\n";
834     }
835   }
836 
837   // duplicates
duplicates(const std::string & hashdb_dir,const std::string & number_string,const hashdb::scan_mode_t scan_mode,const std::string & cmd)838   static void duplicates(const std::string& hashdb_dir,
839                          const std::string& number_string,
840                          const hashdb::scan_mode_t scan_mode,
841                          const std::string& cmd) {
842 
843     // validate hashdb_dir path
844     require_hashdb_dir(hashdb_dir);
845 
846     // convert duplicates string to number
847     uint32_t number = atoi(number_string.c_str());
848 
849     // open DB
850     hashdb::scan_manager_t manager(hashdb_dir);
851 
852     // there is nothing to report if the map is empty
853     if (manager.size_hashes() == 0) {
854       std::cout << "The map is empty.\n";
855       return;
856     }
857 
858     // print header information
859     print_header(cmd);
860 
861     // start progress tracker
862     progress_tracker_t progress_tracker(hashdb_dir, manager.size_hashes(), cmd);
863 
864     bool any_found = false;
865 
866     // space for variables
867     uint64_t k_entropy;
868     std::string block_label;
869     uint64_t count;
870     hashdb::source_sub_counts_t source_sub_counts;
871 
872     // iterate over hashdb and set variables for finding duplicates
873     std::string binary_hash = manager.first_hash();
874 
875     while (binary_hash.size() != 0) {
876       manager.find_hash(binary_hash, k_entropy, block_label, count,
877                                   source_sub_counts);
878       if (count == number) {
879         // show hash with requested duplicates number
880         std::string expanded_text = manager.find_hash_json(
881                                                     scan_mode, binary_hash);
882         std::cout << hashdb::bin_to_hex(binary_hash) << "\t"
883                   << expanded_text << "\n";
884         any_found = true;
885       }
886 
887       // move forward
888       progress_tracker.track_hash_data(source_sub_counts.size());
889       binary_hash = manager.next_hash(binary_hash);
890     }
891 
892     // say so if nothing was found
893     if (!any_found) {
894       std::cout << "No hashes were found with this count.\n";
895       return;
896     }
897   }
898 
899   // hash_table
hash_table(const std::string & hashdb_dir,const std::string & hex_file_hash,const hashdb::scan_mode_t scan_mode,const std::string & cmd)900   static void hash_table(const std::string& hashdb_dir,
901                          const std::string& hex_file_hash,
902                          const hashdb::scan_mode_t scan_mode,
903                          const std::string& cmd) {
904 
905     // validate hashdb_dir path
906     require_hashdb_dir(hashdb_dir);
907 
908     // open DB
909     hashdb::scan_manager_t manager(hashdb_dir);
910 
911     // source data
912     std::string file_binary_hash = hashdb::hex_to_bin(hex_file_hash);
913     uint64_t filesize = 0;
914     std::string file_type = "";
915     uint64_t zero_count = 0;
916     uint64_t nonprobative_count = 0;
917 
918     // see if this source is even present
919     bool has_source_data = manager.find_source_data(file_binary_hash,
920                        filesize, file_type, zero_count, nonprobative_count);
921     if (has_source_data == false) {
922       // the source is not present
923       std::cout << "There is no source with this file hash\n";
924       return;
925     }
926 
927     // print header information
928     print_header(cmd);
929 
930     // start progress tracker
931     progress_tracker_t progress_tracker(hashdb_dir, manager.size_hashes(), cmd);
932 
933     // space for variables
934     uint64_t k_entropy;
935     std::string block_label;
936     uint64_t count;
937     hashdb::source_sub_counts_t source_sub_counts;
938 
939     // look for hashes that belong to this source
940     // get the first hash
941     std::string binary_hash = manager.first_hash();
942     while (binary_hash.size() != 0) {
943 
944       // read hash data for the hash
945       manager.find_hash(binary_hash, k_entropy, block_label, count,
946                                                     source_sub_counts);
947 
948       // find sources that match the source we are looking for
949       for (hashdb::source_sub_counts_t::const_iterator it =
950                        source_sub_counts.begin();
951                        it!= source_sub_counts.end(); ++it) {
952         if (it->file_hash == file_binary_hash) {
953 
954           // the source matches so print the hash and move on
955           std::string expanded_text = manager.find_hash_json(
956                                                     scan_mode, binary_hash);
957           std::cout << hashdb::bin_to_hex(binary_hash) << "\t" << expanded_text
958                     << "\n";
959           break;
960         }
961       }
962 
963       // move forward
964       progress_tracker.track_hash_data(source_sub_counts.size());
965       binary_hash = manager.next_hash(binary_hash);
966     }
967   }
968 
969   // read_media
read_media(const std::string & media_image_filename,const std::string & media_offset,const std::string & count_string)970   static void read_media(const std::string& media_image_filename,
971                          const std::string& media_offset,
972                          const std::string& count_string) {
973 
974     // convert count string to number
975     const uint64_t count = s_to_uint64(count_string);
976 
977     // read the bytes
978     std::string bytes;
979     std::string error_message = hashdb::read_media(
980                          media_image_filename, media_offset, count, bytes);
981 
982     if (error_message.size() == 0) {
983       // print the bytes to stdout
984       std::cout << bytes << std::flush;
985     } else {
986       // print the error to stderr
987       std::cerr << "Error: " << error_message << "\n";
988       exit(1);
989     }
990   }
991 
992   // read_media_size
read_media_size(const std::string & media_image_filename)993   static void read_media_size(const std::string& media_image_filename) {
994 
995     // read the media size
996     uint64_t media_size;
997     std::string error_message = hashdb::read_media_size(
998                          media_image_filename, media_size);
999 
1000     if (error_message.size() == 0) {
1001       // print the bytes to stdout
1002       std::cout << media_size << "\n";
1003     } else {
1004       // print the error to stderr
1005       std::cerr << "Error: " << error_message << "\n";
1006       exit(1);
1007     }
1008   }
1009 
1010   // ************************************************************
1011   // performance analysis
1012   // ************************************************************
1013   // add_random
add_random(const std::string & hashdb_dir,const std::string & count_string,const std::string & cmd)1014   static void add_random(const std::string& hashdb_dir,
1015                          const std::string& count_string,
1016                          const std::string& cmd) {
1017 
1018     // validate hashdb_dir path
1019     require_hashdb_dir(hashdb_dir);
1020 
1021     // convert count string to number
1022     const uint64_t count = s_to_uint64(count_string);
1023 
1024     // read settings for byte alignment
1025     hashdb::settings_t settings;
1026     std::string error_message = hashdb::read_settings(hashdb_dir, settings);
1027     if (error_message.size() != 0) {
1028       std::cerr << "Error: " << error_message << "\n";
1029       exit(1);
1030     }
1031 
1032     // initialize random seed
1033     srand (time(NULL));
1034 
1035     // open manager
1036     hashdb::import_manager_t manager(hashdb_dir, cmd);
1037 
1038     // start progress tracker
1039     progress_tracker_t progress_tracker(hashdb_dir, count, cmd);
1040 
1041     // set up the source
1042     std::string file_binary_hash = hashdb::hex_to_bin("00");
1043     manager.insert_source_name(file_binary_hash, "add_random_repository_name",
1044                                "add_random_filename");
1045     manager.insert_source_data(file_binary_hash, 0, "", 0, 0);
1046 
1047     // get start index for this run
1048     uint64_t start_index = manager.size_hashes();
1049     if (start_index > 1) {
1050       --start_index;
1051     }
1052 
1053     // insert count random hshes into the database
1054     for (uint64_t i=0; i<count; i++) {
1055 
1056       // add hash
1057       manager.insert_hash(random_binary_hash(), 0.0, "", file_binary_hash);
1058 
1059       // update progress tracker
1060       progress_tracker.track();
1061     }
1062   }
1063 
1064   // scan_random
scan_random(const std::string & hashdb_dir,const std::string & count_string,const hashdb::scan_mode_t scan_mode,const std::string & cmd)1065   static void scan_random(const std::string& hashdb_dir,
1066                           const std::string& count_string,
1067                           const hashdb::scan_mode_t scan_mode,
1068                           const std::string& cmd) {
1069 
1070     // validate hashdb_dir path
1071     require_hashdb_dir(hashdb_dir);
1072 
1073     // convert count string to number
1074     const uint64_t count = s_to_uint64(count_string);
1075 
1076     // initialize random seed
1077     srand (time(NULL)+1); // ensure seed is different by advancing 1 second
1078 
1079     // open manager
1080     hashdb::scan_manager_t manager(hashdb_dir);
1081 
1082     // start progress tracker
1083     progress_tracker_t progress_tracker(hashdb_dir, count, cmd);
1084 
1085     // scan random hashes where hash values are unlikely to match
1086     for (uint64_t i=1; i<=count; ++i) {
1087       std::string binary_hash = random_binary_hash();
1088 
1089       std::string expanded_text = manager.find_hash_json(
1090                                                     scan_mode, binary_hash);
1091 
1092       if (expanded_text.size() != 0) {
1093         std::cout << "Match found, hash "
1094                   << hashdb::bin_to_hex(binary_hash)
1095                   << ": " << expanded_text << "\n";
1096       }
1097 
1098       // update progress tracker
1099       progress_tracker.track();
1100     }
1101   }
1102 
1103   // add_same
1104   // add same hash but different source offset
add_same(const std::string & hashdb_dir,const std::string & count_string,const std::string & cmd)1105   static void add_same(const std::string& hashdb_dir,
1106                        const std::string& count_string,
1107                        const std::string& cmd) {
1108 
1109     // validate hashdb_dir path
1110     require_hashdb_dir(hashdb_dir);
1111 
1112     // convert count string to number
1113     const uint64_t count = s_to_uint64(count_string);
1114 
1115     // read settings for byte alignment
1116     hashdb::settings_t settings;
1117     std::string error_message = hashdb::read_settings(hashdb_dir, settings);
1118     if (error_message.size() != 0) {
1119       std::cerr << "Error: " << error_message << "\n";
1120       exit(1);
1121     }
1122 
1123     // open manager
1124     hashdb::import_manager_t manager(hashdb_dir, cmd);
1125 
1126     // start progress tracker
1127     progress_tracker_t progress_tracker(hashdb_dir, count, cmd);
1128 
1129     // set up the source
1130     std::string file_binary_hash = hashdb::hex_to_bin("00");
1131     manager.insert_source_name(file_binary_hash, "add_same_repository_name",
1132                                "add_same_filename");
1133     manager.insert_source_data(file_binary_hash, 0, "", 0, 0);
1134 
1135     // hash to use
1136     std::string binary_hash =
1137                    hashdb::hex_to_bin("80000000000000000000000000000000");
1138 
1139     // get start index for this run
1140     uint64_t start_index = manager.size_hashes();
1141     if (start_index > 1) {
1142       --start_index;
1143     }
1144 
1145     // insert count same hshes into the database
1146     for (uint64_t i=0; i<count; i++) {
1147 
1148       // add hash
1149       manager.insert_hash(binary_hash, 0.0, "", file_binary_hash);
1150 
1151       // update progress tracker
1152       progress_tracker.track();
1153     }
1154   }
1155 
1156   // scan_same
scan_same(const std::string & hashdb_dir,const std::string & count_string,const hashdb::scan_mode_t scan_mode,const std::string & cmd)1157   static void scan_same(const std::string& hashdb_dir,
1158                         const std::string& count_string,
1159                         const hashdb::scan_mode_t scan_mode,
1160                         const std::string& cmd) {
1161 
1162     // validate hashdb_dir path
1163     require_hashdb_dir(hashdb_dir);
1164 
1165     // convert count string to number
1166     const uint64_t count = s_to_uint64(count_string);
1167 
1168     // open manager
1169     hashdb::scan_manager_t manager(hashdb_dir);
1170 
1171     // start progress tracker
1172     progress_tracker_t progress_tracker(hashdb_dir, count, cmd);
1173 
1174     // hash to use
1175     std::string binary_hash =
1176                    hashdb::hex_to_bin("80000000000000000000000000000000");
1177 
1178     // scan same hash repeatedly
1179     for (uint64_t i=1; i<=count; ++i) {
1180       std::string expanded_text = manager.find_hash_json(
1181                                                     scan_mode, binary_hash);
1182 
1183       if (expanded_text.size() == 0) {
1184         std::cout << "Match not found, hash "
1185                   << hashdb::bin_to_hex(binary_hash)
1186                   << ": " << expanded_text << "\n";
1187       }
1188 
1189       // update progress tracker
1190       progress_tracker.track();
1191     }
1192   }
1193 
1194   // test_scan_stream
test_scan_stream(const std::string & hashdb_dir,const std::string & count_string,const hashdb::scan_mode_t scan_mode,const std::string & cmd)1195   static void test_scan_stream(const std::string& hashdb_dir,
1196                                const std::string& count_string,
1197                                const hashdb::scan_mode_t scan_mode,
1198                                const std::string& cmd) {
1199 
1200     const size_t list_size = 10000;
1201 
1202     // validate hashdb_dir path
1203     require_hashdb_dir(hashdb_dir);
1204 
1205     // convert count string to number
1206     const uint64_t count = s_to_uint64(count_string);
1207 
1208     // open manager
1209     hashdb::scan_manager_t manager(hashdb_dir);
1210 
1211     // open scan_stream
1212     hashdb::scan_stream_t scan_stream(&manager, 16, scan_mode);
1213 
1214     // start progress tracker
1215     progress_tracker_t progress_tracker(hashdb_dir, list_size * count, cmd);
1216 
1217     // hash to use
1218     std::string binary_hash =
1219                    hashdb::hex_to_bin("80000000000000000000000000000000");
1220 
1221     // prepare the unscanned record of 10,000
1222     std::stringstream ss;
1223     for (size_t i=0; i<list_size; ++i) {
1224       const uint16_t index_length = std::to_string(i).size();
1225       ss << binary_hash;
1226       ss.write(reinterpret_cast<const char*>(&index_length), sizeof(uint16_t));
1227       ss << i;
1228     }
1229     const std::string unscanned(ss.str());
1230 
1231     // put/get data
1232     for (uint64_t i=1; i<=count; ++i) {
1233       scan_stream.put(unscanned);
1234       const std::string scanned = scan_stream.get();
1235       if (scanned.size() > 0) {
1236         progress_tracker.track_count(list_size);
1237       }
1238     }
1239 
1240     // get data until processing is done
1241     while (!scan_stream.empty()) {
1242       const std::string scanned = scan_stream.get();
1243       if (scanned.size() > 0) {
1244         progress_tracker.track_count(list_size);
1245       }
1246     }
1247   }
1248 }
1249 
1250 #endif
1251 
1252