1 // Author: Bruce Allen
2 // Created: 2/25/2013
3 //
4 // The software provided here is released by the Naval Postgraduate
5 // School, an agency of the U.S. Department of Navy. The software
6 // bears no warranty, either expressed or implied. NPS does not assume
7 // legal liability nor responsibility for a User's use of the software
8 // or the results of such use.
9 //
10 // Please note that within the United States, copyright protection,
11 // under Section 105 of the United States Code, Title 17, is not
12 // available for any work of the United States Government and/or for
13 // any works created by United States Government employees. User
14 // acknowledges that this software contains work which was created by
15 // NPS government employees and is therefore in the public domain and
16 // not subject to copyright.
17 //
18 // Released into the public domain on February 25, 2013 by Bruce Allen.
19
20 /**
21 * \file
22 * Provides hashdb commands.
23 */
24
25 #ifndef COMMANDS_HPP
26 #define COMMANDS_HPP
27 #include "../src_libhashdb/hashdb.hpp"
28 #include "import_tab.hpp"
29 #include "import_json.hpp"
30 #include "export_json.hpp"
31 #include "scan_list.hpp"
32 #include "adder.hpp"
33 #include "adder_set.hpp"
34
35 // Standard includes
36 #include <cerrno>
37 #include <cstdlib>
38 #include <cstdio>
39 #include <string>
40 #include <sstream>
41 #include <iostream>
42 #include <algorithm>
43 #include <vector>
44
45 // leave alone else create using existing settings if new
create_if_new(const std::string & hashdb_dir,const std::string & from_hashdb_dir,const std::string & command_string)46 void create_if_new(const std::string& hashdb_dir,
47 const std::string& from_hashdb_dir,
48 const std::string& command_string) {
49
50 std::string error_message;
51 hashdb::settings_t settings;
52
53 // try to read hashdb_dir settings
54 error_message = hashdb::read_settings(hashdb_dir, settings);
55 if (error_message.size() == 0) {
56 // hashdb_dir already exists
57 return;
58 }
59
60 // no hashdb_dir, so read from_hashdb_dir settings
61 error_message = hashdb::read_settings(from_hashdb_dir, settings);
62 if (error_message.size() != 0) {
63 // bad since from_hashdb_dir is not valid
64 std::cerr << "Error: " << error_message << "\n";
65 exit(1);
66 }
67
68 // create hashdb_dir using from_hashdb_dir settings
69 error_message = hashdb::create_hashdb(hashdb_dir, settings, command_string);
70 if (error_message.size() != 0) {
71 // bad since from_hashdb_dir is not valid
72 std::cerr << "Error: " << error_message << "\n";
73 exit(1);
74 }
75 }
76
77 // require hashdb_dir else fail
require_hashdb_dir(const std::string & hashdb_dir)78 static void require_hashdb_dir(const std::string& hashdb_dir) {
79 std::string error_message;
80 hashdb::settings_t settings;
81 error_message = hashdb::read_settings(hashdb_dir, settings);
82 if (error_message.size() != 0) {
83 std::cerr << "Error: " << error_message << "\n";
84 exit(1);
85 }
86 }
87
print_header(const std::string & cmd)88 static void print_header(const std::string& cmd) {
89 std::cout << "# command: " << cmd << "\n"
90 << "# hashdb-Version: " << PACKAGE_VERSION << "\n";
91 }
92
93 // helper
94 /**
95 * Return 16 bytes of random hash.
96 */
/**
 * Return a 16-byte string filled from rand().
 *
 * One rand() call is made per byte and its result is narrowed to char.
 * Per the original note, wider values are not taken from a single call
 * because Windows rand() only uses 15 bits.
 */
std::string random_binary_hash() {
  const size_t hash_size = 16;
  std::string hash;
  hash.reserve(hash_size);
  while (hash.size() < hash_size) {
    hash.push_back(static_cast<char>(rand()));
  }
  return hash;
}
105
106 namespace commands {
107
108 // ************************************************************
109 // helpers
110 // ************************************************************
/**
 * Holds the input stream selected by a filename: std::cin when the
 * filename is "-", otherwise a std::ifstream allocated for that file.
 * The destructor deletes the stream unless it is std::cin.
 */
class in_ptr_t {
  private:
  std::istream* in;

  // copying is disabled: declared private and never defined
  in_ptr_t(const in_ptr_t&);
  in_ptr_t& operator=(const in_ptr_t&);

  // pick std::cin for "-", else open the file or report and exit(1)
  static std::istream* open_input(const std::string& filename) {
    if (filename == "-") {
      return &std::cin;
    }
    std::ifstream* file_stream = new std::ifstream(filename.c_str());
    if (file_stream->is_open()) {
      return file_stream;
    }
    std::cerr << "Error: Cannot open " << filename
              << ": " << strerror(errno) << "\n";
    exit(1);
  }

  public:
  in_ptr_t(const std::string& in_filename) : in(open_input(in_filename)) {
  }

  ~in_ptr_t() {
    // std::cin is not owned by this object
    if (in == &std::cin) {
      return;
    }
    delete in;
  }

  // access the selected stream
  std::istream* operator()() {
    return in;
  }
};
144
/**
 * Holds the output stream selected by a filename: std::cout when the
 * filename is "-", otherwise a std::ofstream allocated for that file.
 * The destructor deletes the stream unless it is std::cout.
 */
class out_ptr_t {
  private:
  std::ostream* out;

  // copying is disabled: declared private and never defined
  out_ptr_t(const out_ptr_t&);
  out_ptr_t& operator=(const out_ptr_t&);

  // pick std::cout for "-", else open the file or report and exit(1)
  static std::ostream* open_output(const std::string& filename) {
    if (filename == "-") {
      return &std::cout;
    }
    std::ofstream* file_stream = new std::ofstream(filename.c_str());
    if (file_stream->is_open()) {
      return file_stream;
    }
    std::cerr << "Error: Cannot open " << filename
              << ": " << strerror(errno) << "\n";
    exit(1);
  }

  public:
  out_ptr_t(const std::string& out_filename)
                                  : out(open_output(out_filename)) {
  }

  ~out_ptr_t() {
    // std::cout is not owned by this object
    if (out == &std::cout) {
      return;
    }
    delete out;
  }

  // access the selected stream
  std::ostream* operator()() {
    return out;
  }
};
178
179 // ************************************************************
180 // new database
181 // ************************************************************
create(const std::string & hashdb_dir,const hashdb::settings_t & settings,const std::string & cmd)182 void create(const std::string& hashdb_dir,
183 const hashdb::settings_t& settings,
184 const std::string& cmd) {
185
186 std::string error_message;
187 error_message = hashdb::create_hashdb(hashdb_dir, settings, cmd);
188
189 if (error_message.size() == 0) {
190 std::cout << "New database created.\n";
191 } else {
192 std::cerr << "Error: " << error_message << "\n";
193 exit(1);
194 }
195 }
196
197 // ************************************************************
198 // import/export
199 // ************************************************************
200 // import recursively from path
ingest(const std::string & hashdb_dir,const std::string & ingest_path,const size_t step_size,const std::string & repository_name,const std::string & whitelist_dir,const bool disable_recursive_processing,const bool disable_calculate_entropy,const bool disable_calculate_labels,const std::string & cmd)201 static void ingest(const std::string& hashdb_dir,
202 const std::string& ingest_path,
203 const size_t step_size,
204 const std::string& repository_name,
205 const std::string& whitelist_dir,
206 const bool disable_recursive_processing,
207 const bool disable_calculate_entropy,
208 const bool disable_calculate_labels,
209 const std::string& cmd) {
210
211 // ingest
212 std::string error_message = hashdb::ingest(
213 hashdb_dir, ingest_path, step_size, repository_name,
214 whitelist_dir,
215 disable_recursive_processing,
216 disable_calculate_entropy,
217 disable_calculate_labels,
218 cmd);
219 if (error_message.size() != 0) {
220 std::cerr << "Error: " << error_message << "\n";
221 exit(1);
222 }
223 }
224
225 // import_tab
import_tab(const std::string & hashdb_dir,const std::string & tab_file,const std::string & repository_name,const std::string & whitelist_dir,const std::string & cmd)226 static void import_tab(const std::string& hashdb_dir,
227 const std::string& tab_file,
228 const std::string& repository_name,
229 const std::string& whitelist_dir,
230 const std::string& cmd) {
231
232 // validate hashdb_dir path
233 require_hashdb_dir(hashdb_dir);
234
235 // resources
236 hashdb::import_manager_t manager(hashdb_dir, cmd);
237 hashdb::scan_manager_t* whitelist_manager = NULL;
238 if (whitelist_dir != "") {
239 require_hashdb_dir(whitelist_dir);
240 whitelist_manager = new hashdb::scan_manager_t(whitelist_dir);
241 }
242 progress_tracker_t progress_tracker(hashdb_dir, 0, cmd);
243
244 // open the tab file for reading
245 in_ptr_t in_ptr(tab_file);
246 ::import_tab(manager, repository_name, tab_file, whitelist_manager,
247 progress_tracker, *in_ptr());
248
249 // done
250 if (whitelist_manager != NULL) {
251 delete whitelist_manager;
252 }
253 }
254
255 // import json
import_json(const std::string & hashdb_dir,const std::string & json_file,const std::string & cmd)256 static void import_json(const std::string& hashdb_dir,
257 const std::string& json_file,
258 const std::string& cmd) {
259
260 // validate hashdb_dir path
261 require_hashdb_dir(hashdb_dir);
262
263 // resources
264 hashdb::import_manager_t manager(hashdb_dir, cmd);
265 progress_tracker_t progress_tracker(hashdb_dir, 0, cmd);
266
267 // open the JSON file for reading
268 in_ptr_t in_ptr(json_file);
269 ::import_json(manager, progress_tracker, *in_ptr());
270 }
271
272 // export json
export_json(const std::string & hashdb_dir,const std::string & json_file,const std::string & cmd)273 static void export_json(const std::string& hashdb_dir,
274 const std::string& json_file,
275 const std::string& cmd) {
276
277 // validate hashdb_dir path
278 require_hashdb_dir(hashdb_dir);
279
280 // resources
281 hashdb::scan_manager_t manager(hashdb_dir);
282 progress_tracker_t progress_tracker(hashdb_dir, manager.size_hashes(), cmd);
283
284 // open the JSON file for writing
285 out_ptr_t out_ptr(json_file);
286
287 // print header to file
288 *out_ptr() << "# command: '" << cmd << "'\n"
289 << "# hashdb-Version: " << PACKAGE_VERSION << "\n";
290
291 // export the hashdb
292 ::export_json_hashes(manager, progress_tracker, *out_ptr());
293 ::export_json_sources(manager, *out_ptr());
294 }
295
296 // export json range
export_json_range(const std::string & hashdb_dir,const std::string & json_file,const std::string & begin_block_hash,const std::string & end_block_hash,const std::string & cmd)297 static void export_json_range(const std::string& hashdb_dir,
298 const std::string& json_file,
299 const std::string& begin_block_hash,
300 const std::string& end_block_hash,
301 const std::string& cmd) {
302
303 // validate hashdb_dir path
304 require_hashdb_dir(hashdb_dir);
305
306 // resources
307 hashdb::scan_manager_t manager(hashdb_dir);
308 progress_tracker_t progress_tracker(hashdb_dir, manager.size_hashes(), cmd);
309
310 // open the JSON file for writing
311 out_ptr_t out_ptr(json_file);
312
313 // print header to file
314 *out_ptr() << "# command: '" << cmd << "'\n"
315 << "# hashdb-Version: " << PACKAGE_VERSION << "\n";
316
317 // export the range to the hashdb
318 ::export_json_range(manager, begin_block_hash, end_block_hash,
319 progress_tracker, *out_ptr());
320 }
321
322 // ************************************************************
323 // database manipulation
324 // ************************************************************
325 // add
add(const std::string & hashdb_dir,const std::string & dest_dir,const std::string & cmd)326 static void add(const std::string& hashdb_dir,
327 const std::string& dest_dir,
328 const std::string& cmd) {
329
330 // validate hashdb directories, maybe make dest_dir
331 require_hashdb_dir(hashdb_dir);
332 create_if_new(dest_dir, hashdb_dir, cmd);
333
334 // resources
335 hashdb::scan_manager_t manager_a(hashdb_dir);
336 hashdb::import_manager_t manager_b(dest_dir, cmd);
337 progress_tracker_t progress_tracker(
338 dest_dir, manager_a.size_hashes(), cmd);
339 adder_t adder(&manager_a, &manager_b, &progress_tracker);
340
341 // add data for binary_hash from A to B
342 std::string binary_hash = manager_a.first_hash();
343 while (binary_hash.size() != 0) {
344 // add the hash
345 adder.add(binary_hash);
346 binary_hash = manager_a.next_hash(binary_hash);
347 }
348 }
349
350 // add_multiple
351 // Flow:
352 // 1) Create an ordered multimap of key=hash, value=producer_t
353 // where key is the first key from a producer.
354 // 2) Consume elements from the ordered multimap and copy them
355 // until the producers are depleted. Do not enqueue when a producer
356 // is depleted. Done when the ordered multimap becomes empty.
add_multiple(const std::vector<std::string> & p_hashdb_dirs,const std::string & cmd)357 static void add_multiple(const std::vector<std::string>& p_hashdb_dirs,
358 const std::string& cmd) {
359
360 std::vector<std::string> hashdb_dirs = p_hashdb_dirs;
361
362 // read then strip off dest_dir from end of list
363 const std::string dest_dir = hashdb_dirs.back();
364 hashdb_dirs.pop_back();
365
366 // validate hashdb directories, maybe make dest_dir
367 for (std::vector<std::string>::const_iterator it = hashdb_dirs.begin();
368 it != hashdb_dirs.end(); ++it) {
369 require_hashdb_dir(*it);
370 }
371 create_if_new(dest_dir, hashdb_dirs[0], cmd);
372
373 // open the consumer at dest_dir
374 hashdb::import_manager_t consumer(dest_dir, cmd);
375
376 // calculate the total hash records for the tracker
377 size_t total_hash_records = 0;
378 for (std::vector<std::string>::const_iterator it = hashdb_dirs.begin();
379 it != hashdb_dirs.end(); ++it) {
380 hashdb::scan_manager_t scan_manager(*it);
381 total_hash_records += scan_manager.size_hashes();
382 }
383
384 // start progress tracker
385 progress_tracker_t progress_tracker(dest_dir, total_hash_records, cmd);
386
387 // define the ordered multimap of key=hash, value=producer_t
388 typedef std::pair<hashdb::scan_manager_t*, adder_t*> producer_t;
389 typedef std::pair<std::string, producer_t> ordered_producers_value_t;
390 typedef std::multimap<std::string, producer_t> ordered_producers_t;
391
392 // create the multimap of ordered producers
393 ordered_producers_t ordered_producers;
394
395 // open the producers
396 for (std::vector<std::string>::const_iterator it = hashdb_dirs.begin();
397 it != hashdb_dirs.end(); ++it) {
398 std::string hashdb_dir = *it;
399 hashdb::scan_manager_t* producer = new hashdb::scan_manager_t(hashdb_dir);
400 std::string binary_hash = producer->first_hash();
401 if (binary_hash.size() != 0) {
402 // the producer is not empty, so enqueue it
403 // create the adder
404 adder_t* adder = new adder_t(producer, &consumer, &progress_tracker);
405 ordered_producers.insert(ordered_producers_value_t(binary_hash,
406 producer_t(producer, adder)));
407
408 // also track total hashes to be processed
409 total_hash_records += producer->size_hashes();
410
411 } else {
412 // no hashes for this producer so close it
413 delete producer;
414 }
415 }
416
417 // add ordered hashes from producers until all hashes are consumed
418 while (ordered_producers.size() != 0) {
419 // get the hash, producer, and adder for the first hash
420 ordered_producers_t::iterator it = ordered_producers.begin();
421 hashdb::scan_manager_t* producer = it->second.first;
422 adder_t* adder = it->second.second;
423
424 // add the hash to the consumer
425 adder->add(it->first);
426
427 // get the next hash from this producer
428 std::string binary_hash = producer->next_hash(it->first);
429
430 // remove this hash, producer_t entry
431 ordered_producers.erase(it);
432
433 if (binary_hash.size() != 0) {
434 // hash exists so add the hash, producer, and adder
435 ordered_producers.insert(ordered_producers_value_t(binary_hash,
436 producer_t(producer, adder)));
437 } else {
438 // no hashes for this producer so close it
439 delete producer;
440 delete adder;
441 }
442 }
443 }
444
445 // add_repository
add_repository(const std::string & hashdb_dir,const std::string & dest_dir,const std::string & repository_name,const std::string & cmd)446 static void add_repository(const std::string& hashdb_dir,
447 const std::string& dest_dir,
448 const std::string& repository_name,
449 const std::string& cmd) {
450
451 // validate hashdb directories, maybe make dest_dir
452 require_hashdb_dir(hashdb_dir);
453 create_if_new(dest_dir, hashdb_dir, cmd);
454
455 // resources
456 hashdb::scan_manager_t manager_a(hashdb_dir);
457 hashdb::import_manager_t manager_b(dest_dir, cmd);
458 progress_tracker_t progress_tracker(dest_dir,
459 manager_a.size_hashes(), cmd);
460 adder_t adder(&manager_a, &manager_b, repository_name, &progress_tracker);
461
462 // add data for binary_hash from A to B
463 std::string binary_hash = manager_a.first_hash();
464 while (binary_hash.size() != 0) {
465 // add the hash
466 adder.add_repository(binary_hash);
467 binary_hash = manager_a.next_hash(binary_hash);
468 }
469 }
470
471 // add_range
add_range(const std::string & hashdb_dir,const std::string & dest_dir,const size_t m,const size_t n,const std::string & cmd)472 static void add_range(const std::string& hashdb_dir,
473 const std::string& dest_dir,
474 const size_t m,
475 const size_t n,
476 const std::string& cmd) {
477
478 // validate hashdb directories, maybe make dest_dir
479 require_hashdb_dir(hashdb_dir);
480 create_if_new(dest_dir, hashdb_dir, cmd);
481
482 // resources
483 hashdb::scan_manager_t manager_a(hashdb_dir);
484 hashdb::import_manager_t manager_b(dest_dir, cmd);
485 progress_tracker_t progress_tracker(dest_dir,
486 manager_a.size_hashes(), cmd);
487 adder_t adder(&manager_a, &manager_b, &progress_tracker);
488
489 // add data for binary_hash from A to B
490 std::string binary_hash = manager_a.first_hash();
491 while (binary_hash.size() != 0) {
492 // add the hash
493 adder.add_range(binary_hash, m, n);
494 binary_hash = manager_a.next_hash(binary_hash);
495 }
496 }
497
498 // intersect A and B into C
intersect(const std::string & hashdb_dir1,const std::string & hashdb_dir2,const std::string & dest_dir,const std::string & cmd)499 static void intersect(const std::string& hashdb_dir1,
500 const std::string& hashdb_dir2,
501 const std::string& dest_dir,
502 const std::string& cmd) {
503
504 // validate hashdb directories, maybe make dest_dir
505 require_hashdb_dir(hashdb_dir1);
506 require_hashdb_dir(hashdb_dir2);
507 create_if_new(dest_dir, hashdb_dir1, cmd);
508
509 // resources
510 hashdb::scan_manager_t manager_a(hashdb_dir1);
511 hashdb::scan_manager_t manager_b(hashdb_dir2);
512 hashdb::import_manager_t manager_c(dest_dir, cmd);
513 progress_tracker_t progress_tracker(dest_dir, manager_a.size_hashes(), cmd);
514 adder_set_t adder_set(&manager_a, &manager_b, &manager_c,
515 &progress_tracker);
516
517 // iterate A to intersect A and B into C
518 std::string binary_hash = manager_a.first_hash();
519 while (binary_hash.size() != 0) {
520 adder_set.intersect(binary_hash);
521 binary_hash = manager_a.next_hash(binary_hash);
522 }
523 }
524
525 // intersect_hash
intersect_hash(const std::string & hashdb_dir1,const std::string & hashdb_dir2,const std::string & dest_dir,const std::string & cmd)526 static void intersect_hash(const std::string& hashdb_dir1,
527 const std::string& hashdb_dir2,
528 const std::string& dest_dir,
529 const std::string& cmd) {
530
531 // validate hashdb directories, maybe make dest_dir
532 require_hashdb_dir(hashdb_dir1);
533 require_hashdb_dir(hashdb_dir2);
534 create_if_new(dest_dir, hashdb_dir1, cmd);
535
536 // resources
537 hashdb::scan_manager_t manager_a(hashdb_dir1);
538 hashdb::scan_manager_t manager_b(hashdb_dir2);
539 hashdb::import_manager_t manager_c(dest_dir, cmd);
540 progress_tracker_t progress_tracker(dest_dir, manager_a.size_hashes(), cmd);
541 adder_set_t adder_set(&manager_a, &manager_b, &manager_c,
542 & progress_tracker);
543
544 // iterate A to intersect_hash A and B into C
545 std::string binary_hash = manager_a.first_hash();
546 while (binary_hash.size() != 0) {
547 adder_set.intersect_hash(binary_hash);
548 binary_hash = manager_a.next_hash(binary_hash);
549 }
550 }
551
552 // subtract
subtract(const std::string & hashdb_dir1,const std::string & hashdb_dir2,const std::string & dest_dir,const std::string & cmd)553 static void subtract(const std::string& hashdb_dir1,
554 const std::string& hashdb_dir2,
555 const std::string& dest_dir,
556 const std::string& cmd) {
557
558 // validate hashdb directories, maybe make dest_dir
559 require_hashdb_dir(hashdb_dir1);
560 require_hashdb_dir(hashdb_dir2);
561 create_if_new(dest_dir, hashdb_dir1, cmd);
562
563 // resources
564 hashdb::scan_manager_t manager_a(hashdb_dir1);
565 hashdb::scan_manager_t manager_b(hashdb_dir2);
566 hashdb::import_manager_t manager_c(dest_dir, cmd);
567 progress_tracker_t progress_tracker(dest_dir, manager_a.size_hashes(), cmd);
568 adder_set_t adder_set(&manager_a, &manager_b, &manager_c,
569 &progress_tracker);
570
571 // iterate A to add A to C if A hash and source not in B
572 std::string binary_hash = manager_a.first_hash();
573 while (binary_hash.size() != 0) {
574
575 // add A to C if A hash and source not in B
576 adder_set.subtract(binary_hash);
577 binary_hash = manager_a.next_hash(binary_hash);
578 }
579 }
580
581 // subtract_hash
subtract_hash(const std::string & hashdb_dir1,const std::string & hashdb_dir2,const std::string & dest_dir,const std::string & cmd)582 static void subtract_hash(const std::string& hashdb_dir1,
583 const std::string& hashdb_dir2,
584 const std::string& dest_dir,
585 const std::string& cmd) {
586
587 // validate hashdb directories, maybe make dest_dir
588 require_hashdb_dir(hashdb_dir1);
589 require_hashdb_dir(hashdb_dir2);
590 create_if_new(dest_dir, hashdb_dir1, cmd);
591
592 // resources
593 hashdb::scan_manager_t manager_a(hashdb_dir1);
594 hashdb::scan_manager_t manager_b(hashdb_dir2);
595 hashdb::import_manager_t manager_c(dest_dir, cmd);
596 progress_tracker_t progress_tracker(dest_dir, manager_a.size_hashes(), cmd);
597 adder_set_t adder_set(&manager_a, &manager_b, &manager_c,
598 &progress_tracker);
599
600 // iterate A to add A to C if A hash not in B
601 std::string binary_hash = manager_a.first_hash();
602 while (binary_hash.size() != 0) {
603
604 // add A to C if A hash not in B
605 adder_set.subtract_hash(binary_hash);
606 binary_hash = manager_a.next_hash(binary_hash);
607 }
608 }
609
610 // subtract_repository
subtract_repository(const std::string & hashdb_dir,const std::string & dest_dir,const std::string & repository_name,const std::string & cmd)611 static void subtract_repository(const std::string& hashdb_dir,
612 const std::string& dest_dir,
613 const std::string& repository_name,
614 const std::string& cmd) {
615
616 // validate hashdb directories, maybe make dest_dir
617 require_hashdb_dir(hashdb_dir);
618 create_if_new(dest_dir, hashdb_dir, cmd);
619
620 // resources
621 hashdb::scan_manager_t manager_a(hashdb_dir);
622 hashdb::import_manager_t manager_b(dest_dir, cmd);
623 progress_tracker_t progress_tracker(dest_dir,
624 manager_a.size_hashes(), cmd);
625 adder_t adder(&manager_a, &manager_b, repository_name, &progress_tracker);
626
627 // add data for binary_hash from A to B
628 std::string binary_hash = manager_a.first_hash();
629 while (binary_hash.size() != 0) {
630 // add the hash
631 adder.add_non_repository(binary_hash);
632 binary_hash = manager_a.next_hash(binary_hash);
633 }
634 }
635
636 // ************************************************************
637 // scan
638 // ************************************************************
639 // scan
scan_list(const std::string & hashdb_dir,const std::string & hashes_file,const hashdb::scan_mode_t scan_mode,const std::string & cmd)640 static void scan_list(const std::string& hashdb_dir,
641 const std::string& hashes_file,
642 const hashdb::scan_mode_t scan_mode,
643 const std::string& cmd) {
644
645 // validate hashdb_dir path
646 require_hashdb_dir(hashdb_dir);
647
648 // resources
649 hashdb::scan_manager_t manager(hashdb_dir);
650
651 // open the hashes list file for reading
652 in_ptr_t in_ptr(hashes_file);
653
654 // print header information
655 print_header(cmd);
656
657 // scan the list
658 ::scan_list(manager, *in_ptr(), scan_mode);
659
660 // done
661 std::cout << "# scan_list completed.\n";
662 }
663
664 // scan_hash
scan_hash(const std::string & hashdb_dir,const std::string & hex_block_hash,const hashdb::scan_mode_t scan_mode,const std::string & cmd)665 static void scan_hash(const std::string& hashdb_dir,
666 const std::string& hex_block_hash,
667 const hashdb::scan_mode_t scan_mode,
668 const std::string& cmd) {
669
670 // validate hashdb_dir path
671 require_hashdb_dir(hashdb_dir);
672
673 // get the binary hash
674 std::string binary_hash = hashdb::hex_to_bin(hex_block_hash);
675
676 // reject invalid input
677 if (binary_hash == "") {
678 std::cerr << "Error: Invalid hash: '" << hex_block_hash << "'\n";
679 exit(1);
680 }
681
682 // open DB
683 hashdb::scan_manager_t scan_manager(hashdb_dir);
684
685 // scan
686 std::string expanded_text = scan_manager.find_hash_json(
687 scan_mode, binary_hash);
688
689 if (expanded_text.size() != 0) {
690 std::cout << expanded_text << std::endl;
691 } else {
692 std::cout << "Hash not found for '" << hex_block_hash << "'\n";
693 }
694 }
695
696 // scan_media
scan_media(const std::string & hashdb_dir,const std::string & media_image_filename,const size_t step_size,const bool disable_recursive_processing,const hashdb::scan_mode_t scan_mode,const std::string & cmd)697 static void scan_media(const std::string& hashdb_dir,
698 const std::string& media_image_filename,
699 const size_t step_size,
700 const bool disable_recursive_processing,
701 const hashdb::scan_mode_t scan_mode,
702 const std::string& cmd) {
703
704 // print header information
705 print_header(cmd);
706
707 // scan
708 std::string error_message = hashdb::scan_media(hashdb_dir,
709 media_image_filename, step_size,
710 disable_recursive_processing, scan_mode);
711 if (error_message.size() == 0) {
712 std::cout << "# scan_media completed.\n";
713 } else {
714 std::cerr << "Error: " << error_message << "\n";
715 exit(1);
716 }
717 }
718
719 // ************************************************************
720 // statistics
721 // ************************************************************
722 // size
size(const std::string & hashdb_dir,const std::string & cmd)723 static void size(const std::string& hashdb_dir,
724 const std::string& cmd) {
725
726 // validate hashdb_dir path
727 require_hashdb_dir(hashdb_dir);
728
729 // open DB
730 hashdb::scan_manager_t manager(hashdb_dir);
731
732 std::cout << manager.size() << std::endl;
733 }
734
735 // sources
sources(const std::string & hashdb_dir,const std::string & cmd)736 static void sources(const std::string& hashdb_dir,
737 const std::string& cmd) {
738
739 // validate hashdb_dir path
740 require_hashdb_dir(hashdb_dir);
741
742 // open DB
743 hashdb::scan_manager_t manager(hashdb_dir);
744
745 // print the sources
746 ::export_json_sources(manager, std::cout);
747 }
748
749 // histogram
histogram(const std::string & hashdb_dir,const std::string & cmd)750 static void histogram(const std::string& hashdb_dir,
751 const std::string& cmd) {
752
753 // validate hashdb_dir path
754 require_hashdb_dir(hashdb_dir);
755
756 // open DB
757 hashdb::scan_manager_t manager(hashdb_dir);
758
759 // print header information
760 print_header(cmd);
761
762 // start progress tracker
763 progress_tracker_t progress_tracker(hashdb_dir, manager.size_hashes(), cmd);
764
765 // total number of hashes in the database
766 uint64_t total_hashes = 0;
767
768 // total number of distinct hashes
769 uint64_t total_distinct_hashes = 0;
770
771 // hash histogram as <count, number of hashes with count>
772 std::map<uint32_t, uint64_t> hash_histogram;
773
774 // space for variables
775 uint64_t k_entropy;
776 std::string block_label;
777 uint64_t count;
778 hashdb::source_sub_counts_t source_sub_counts;
779
780 // iterate over hashdb and set variables for calculating the histogram
781 std::string binary_hash = manager.first_hash();
782
783 // note if the DB is empty
784 if (binary_hash.size() == 0) {
785 std::cout << "The map is empty.\n";
786 }
787
788 while (binary_hash.size() != 0) {
789 manager.find_hash(binary_hash, k_entropy, block_label, count,
790 source_sub_counts);
791 // update total hashes observed
792 total_hashes += count;
793 // update total distinct hashes
794 if (count == 1) {
795 ++total_distinct_hashes;
796 }
797
798 // update hash_histogram information
799 // look for existing entry
800 std::map<uint32_t, uint64_t>::iterator hash_histogram_it =
801 hash_histogram.find(count);
802 if (hash_histogram_it == hash_histogram.end()) {
803
804 // this is the first hash found with this count value
805 // so start a new element for it
806 hash_histogram.insert(std::pair<uint32_t, uint64_t>(count, 1));
807
808 } else {
809
810 // increment existing value for number of hashes with this count
811 uint64_t old_number = hash_histogram_it->second;
812 hash_histogram.erase(count);
813 hash_histogram.insert(std::pair<uint32_t, uint64_t>(
814 count, old_number + 1));
815 }
816
817 // move forward
818 progress_tracker.track_hash_data(source_sub_counts.size());
819 binary_hash = manager.next_hash(binary_hash);
820 }
821
822 // show totals
823 std::cout << "{\"total_hashes\": " << total_hashes << ", "
824 << "\"total_distinct_hashes\": " << total_distinct_hashes << "}\n";
825
826 // show hash histogram as <count, number of hashes with count>
827 std::map<uint32_t, uint64_t>::iterator hash_histogram_it2;
828 for (hash_histogram_it2 = hash_histogram.begin();
829 hash_histogram_it2 != hash_histogram.end(); ++hash_histogram_it2) {
830 std::cout << "{\"duplicates\":" << hash_histogram_it2->first
831 << ", \"distinct_hashes\":" << hash_histogram_it2->second
832 << ", \"total\":" << hash_histogram_it2->first *
833 hash_histogram_it2->second << "}\n";
834 }
835 }
836
837 // duplicates
duplicates(const std::string & hashdb_dir,const std::string & number_string,const hashdb::scan_mode_t scan_mode,const std::string & cmd)838 static void duplicates(const std::string& hashdb_dir,
839 const std::string& number_string,
840 const hashdb::scan_mode_t scan_mode,
841 const std::string& cmd) {
842
843 // validate hashdb_dir path
844 require_hashdb_dir(hashdb_dir);
845
846 // convert duplicates string to number
847 uint32_t number = atoi(number_string.c_str());
848
849 // open DB
850 hashdb::scan_manager_t manager(hashdb_dir);
851
852 // there is nothing to report if the map is empty
853 if (manager.size_hashes() == 0) {
854 std::cout << "The map is empty.\n";
855 return;
856 }
857
858 // print header information
859 print_header(cmd);
860
861 // start progress tracker
862 progress_tracker_t progress_tracker(hashdb_dir, manager.size_hashes(), cmd);
863
864 bool any_found = false;
865
866 // space for variables
867 uint64_t k_entropy;
868 std::string block_label;
869 uint64_t count;
870 hashdb::source_sub_counts_t source_sub_counts;
871
872 // iterate over hashdb and set variables for finding duplicates
873 std::string binary_hash = manager.first_hash();
874
875 while (binary_hash.size() != 0) {
876 manager.find_hash(binary_hash, k_entropy, block_label, count,
877 source_sub_counts);
878 if (count == number) {
879 // show hash with requested duplicates number
880 std::string expanded_text = manager.find_hash_json(
881 scan_mode, binary_hash);
882 std::cout << hashdb::bin_to_hex(binary_hash) << "\t"
883 << expanded_text << "\n";
884 any_found = true;
885 }
886
887 // move forward
888 progress_tracker.track_hash_data(source_sub_counts.size());
889 binary_hash = manager.next_hash(binary_hash);
890 }
891
892 // say so if nothing was found
893 if (!any_found) {
894 std::cout << "No hashes were found with this count.\n";
895 return;
896 }
897 }
898
899 // hash_table
hash_table(const std::string & hashdb_dir,const std::string & hex_file_hash,const hashdb::scan_mode_t scan_mode,const std::string & cmd)900 static void hash_table(const std::string& hashdb_dir,
901 const std::string& hex_file_hash,
902 const hashdb::scan_mode_t scan_mode,
903 const std::string& cmd) {
904
905 // validate hashdb_dir path
906 require_hashdb_dir(hashdb_dir);
907
908 // open DB
909 hashdb::scan_manager_t manager(hashdb_dir);
910
911 // source data
912 std::string file_binary_hash = hashdb::hex_to_bin(hex_file_hash);
913 uint64_t filesize = 0;
914 std::string file_type = "";
915 uint64_t zero_count = 0;
916 uint64_t nonprobative_count = 0;
917
918 // see if this source is even present
919 bool has_source_data = manager.find_source_data(file_binary_hash,
920 filesize, file_type, zero_count, nonprobative_count);
921 if (has_source_data == false) {
922 // the source is not present
923 std::cout << "There is no source with this file hash\n";
924 return;
925 }
926
927 // print header information
928 print_header(cmd);
929
930 // start progress tracker
931 progress_tracker_t progress_tracker(hashdb_dir, manager.size_hashes(), cmd);
932
933 // space for variables
934 uint64_t k_entropy;
935 std::string block_label;
936 uint64_t count;
937 hashdb::source_sub_counts_t source_sub_counts;
938
939 // look for hashes that belong to this source
940 // get the first hash
941 std::string binary_hash = manager.first_hash();
942 while (binary_hash.size() != 0) {
943
944 // read hash data for the hash
945 manager.find_hash(binary_hash, k_entropy, block_label, count,
946 source_sub_counts);
947
948 // find sources that match the source we are looking for
949 for (hashdb::source_sub_counts_t::const_iterator it =
950 source_sub_counts.begin();
951 it!= source_sub_counts.end(); ++it) {
952 if (it->file_hash == file_binary_hash) {
953
954 // the source matches so print the hash and move on
955 std::string expanded_text = manager.find_hash_json(
956 scan_mode, binary_hash);
957 std::cout << hashdb::bin_to_hex(binary_hash) << "\t" << expanded_text
958 << "\n";
959 break;
960 }
961 }
962
963 // move forward
964 progress_tracker.track_hash_data(source_sub_counts.size());
965 binary_hash = manager.next_hash(binary_hash);
966 }
967 }
968
969 // read_media
read_media(const std::string & media_image_filename,const std::string & media_offset,const std::string & count_string)970 static void read_media(const std::string& media_image_filename,
971 const std::string& media_offset,
972 const std::string& count_string) {
973
974 // convert count string to number
975 const uint64_t count = s_to_uint64(count_string);
976
977 // read the bytes
978 std::string bytes;
979 std::string error_message = hashdb::read_media(
980 media_image_filename, media_offset, count, bytes);
981
982 if (error_message.size() == 0) {
983 // print the bytes to stdout
984 std::cout << bytes << std::flush;
985 } else {
986 // print the error to stderr
987 std::cerr << "Error: " << error_message << "\n";
988 exit(1);
989 }
990 }
991
992 // read_media_size
read_media_size(const std::string & media_image_filename)993 static void read_media_size(const std::string& media_image_filename) {
994
995 // read the media size
996 uint64_t media_size;
997 std::string error_message = hashdb::read_media_size(
998 media_image_filename, media_size);
999
1000 if (error_message.size() == 0) {
1001 // print the bytes to stdout
1002 std::cout << media_size << "\n";
1003 } else {
1004 // print the error to stderr
1005 std::cerr << "Error: " << error_message << "\n";
1006 exit(1);
1007 }
1008 }
1009
1010 // ************************************************************
1011 // performance analysis
1012 // ************************************************************
1013 // add_random
add_random(const std::string & hashdb_dir,const std::string & count_string,const std::string & cmd)1014 static void add_random(const std::string& hashdb_dir,
1015 const std::string& count_string,
1016 const std::string& cmd) {
1017
1018 // validate hashdb_dir path
1019 require_hashdb_dir(hashdb_dir);
1020
1021 // convert count string to number
1022 const uint64_t count = s_to_uint64(count_string);
1023
1024 // read settings for byte alignment
1025 hashdb::settings_t settings;
1026 std::string error_message = hashdb::read_settings(hashdb_dir, settings);
1027 if (error_message.size() != 0) {
1028 std::cerr << "Error: " << error_message << "\n";
1029 exit(1);
1030 }
1031
1032 // initialize random seed
1033 srand (time(NULL));
1034
1035 // open manager
1036 hashdb::import_manager_t manager(hashdb_dir, cmd);
1037
1038 // start progress tracker
1039 progress_tracker_t progress_tracker(hashdb_dir, count, cmd);
1040
1041 // set up the source
1042 std::string file_binary_hash = hashdb::hex_to_bin("00");
1043 manager.insert_source_name(file_binary_hash, "add_random_repository_name",
1044 "add_random_filename");
1045 manager.insert_source_data(file_binary_hash, 0, "", 0, 0);
1046
1047 // get start index for this run
1048 uint64_t start_index = manager.size_hashes();
1049 if (start_index > 1) {
1050 --start_index;
1051 }
1052
1053 // insert count random hshes into the database
1054 for (uint64_t i=0; i<count; i++) {
1055
1056 // add hash
1057 manager.insert_hash(random_binary_hash(), 0.0, "", file_binary_hash);
1058
1059 // update progress tracker
1060 progress_tracker.track();
1061 }
1062 }
1063
1064 // scan_random
scan_random(const std::string & hashdb_dir,const std::string & count_string,const hashdb::scan_mode_t scan_mode,const std::string & cmd)1065 static void scan_random(const std::string& hashdb_dir,
1066 const std::string& count_string,
1067 const hashdb::scan_mode_t scan_mode,
1068 const std::string& cmd) {
1069
1070 // validate hashdb_dir path
1071 require_hashdb_dir(hashdb_dir);
1072
1073 // convert count string to number
1074 const uint64_t count = s_to_uint64(count_string);
1075
1076 // initialize random seed
1077 srand (time(NULL)+1); // ensure seed is different by advancing 1 second
1078
1079 // open manager
1080 hashdb::scan_manager_t manager(hashdb_dir);
1081
1082 // start progress tracker
1083 progress_tracker_t progress_tracker(hashdb_dir, count, cmd);
1084
1085 // scan random hashes where hash values are unlikely to match
1086 for (uint64_t i=1; i<=count; ++i) {
1087 std::string binary_hash = random_binary_hash();
1088
1089 std::string expanded_text = manager.find_hash_json(
1090 scan_mode, binary_hash);
1091
1092 if (expanded_text.size() != 0) {
1093 std::cout << "Match found, hash "
1094 << hashdb::bin_to_hex(binary_hash)
1095 << ": " << expanded_text << "\n";
1096 }
1097
1098 // update progress tracker
1099 progress_tracker.track();
1100 }
1101 }
1102
1103 // add_same
1104 // add same hash but different source offset
add_same(const std::string & hashdb_dir,const std::string & count_string,const std::string & cmd)1105 static void add_same(const std::string& hashdb_dir,
1106 const std::string& count_string,
1107 const std::string& cmd) {
1108
1109 // validate hashdb_dir path
1110 require_hashdb_dir(hashdb_dir);
1111
1112 // convert count string to number
1113 const uint64_t count = s_to_uint64(count_string);
1114
1115 // read settings for byte alignment
1116 hashdb::settings_t settings;
1117 std::string error_message = hashdb::read_settings(hashdb_dir, settings);
1118 if (error_message.size() != 0) {
1119 std::cerr << "Error: " << error_message << "\n";
1120 exit(1);
1121 }
1122
1123 // open manager
1124 hashdb::import_manager_t manager(hashdb_dir, cmd);
1125
1126 // start progress tracker
1127 progress_tracker_t progress_tracker(hashdb_dir, count, cmd);
1128
1129 // set up the source
1130 std::string file_binary_hash = hashdb::hex_to_bin("00");
1131 manager.insert_source_name(file_binary_hash, "add_same_repository_name",
1132 "add_same_filename");
1133 manager.insert_source_data(file_binary_hash, 0, "", 0, 0);
1134
1135 // hash to use
1136 std::string binary_hash =
1137 hashdb::hex_to_bin("80000000000000000000000000000000");
1138
1139 // get start index for this run
1140 uint64_t start_index = manager.size_hashes();
1141 if (start_index > 1) {
1142 --start_index;
1143 }
1144
1145 // insert count same hshes into the database
1146 for (uint64_t i=0; i<count; i++) {
1147
1148 // add hash
1149 manager.insert_hash(binary_hash, 0.0, "", file_binary_hash);
1150
1151 // update progress tracker
1152 progress_tracker.track();
1153 }
1154 }
1155
1156 // scan_same
scan_same(const std::string & hashdb_dir,const std::string & count_string,const hashdb::scan_mode_t scan_mode,const std::string & cmd)1157 static void scan_same(const std::string& hashdb_dir,
1158 const std::string& count_string,
1159 const hashdb::scan_mode_t scan_mode,
1160 const std::string& cmd) {
1161
1162 // validate hashdb_dir path
1163 require_hashdb_dir(hashdb_dir);
1164
1165 // convert count string to number
1166 const uint64_t count = s_to_uint64(count_string);
1167
1168 // open manager
1169 hashdb::scan_manager_t manager(hashdb_dir);
1170
1171 // start progress tracker
1172 progress_tracker_t progress_tracker(hashdb_dir, count, cmd);
1173
1174 // hash to use
1175 std::string binary_hash =
1176 hashdb::hex_to_bin("80000000000000000000000000000000");
1177
1178 // scan same hash repeatedly
1179 for (uint64_t i=1; i<=count; ++i) {
1180 std::string expanded_text = manager.find_hash_json(
1181 scan_mode, binary_hash);
1182
1183 if (expanded_text.size() == 0) {
1184 std::cout << "Match not found, hash "
1185 << hashdb::bin_to_hex(binary_hash)
1186 << ": " << expanded_text << "\n";
1187 }
1188
1189 // update progress tracker
1190 progress_tracker.track();
1191 }
1192 }
1193
1194 // test_scan_stream
test_scan_stream(const std::string & hashdb_dir,const std::string & count_string,const hashdb::scan_mode_t scan_mode,const std::string & cmd)1195 static void test_scan_stream(const std::string& hashdb_dir,
1196 const std::string& count_string,
1197 const hashdb::scan_mode_t scan_mode,
1198 const std::string& cmd) {
1199
1200 const size_t list_size = 10000;
1201
1202 // validate hashdb_dir path
1203 require_hashdb_dir(hashdb_dir);
1204
1205 // convert count string to number
1206 const uint64_t count = s_to_uint64(count_string);
1207
1208 // open manager
1209 hashdb::scan_manager_t manager(hashdb_dir);
1210
1211 // open scan_stream
1212 hashdb::scan_stream_t scan_stream(&manager, 16, scan_mode);
1213
1214 // start progress tracker
1215 progress_tracker_t progress_tracker(hashdb_dir, list_size * count, cmd);
1216
1217 // hash to use
1218 std::string binary_hash =
1219 hashdb::hex_to_bin("80000000000000000000000000000000");
1220
1221 // prepare the unscanned record of 10,000
1222 std::stringstream ss;
1223 for (size_t i=0; i<list_size; ++i) {
1224 const uint16_t index_length = std::to_string(i).size();
1225 ss << binary_hash;
1226 ss.write(reinterpret_cast<const char*>(&index_length), sizeof(uint16_t));
1227 ss << i;
1228 }
1229 const std::string unscanned(ss.str());
1230
1231 // put/get data
1232 for (uint64_t i=1; i<=count; ++i) {
1233 scan_stream.put(unscanned);
1234 const std::string scanned = scan_stream.get();
1235 if (scanned.size() > 0) {
1236 progress_tracker.track_count(list_size);
1237 }
1238 }
1239
1240 // get data until processing is done
1241 while (!scan_stream.empty()) {
1242 const std::string scanned = scan_stream.get();
1243 if (scanned.size() > 0) {
1244 progress_tracker.track_count(list_size);
1245 }
1246 }
1247 }
1248 }
1249
1250 #endif
1251
1252