1 // Author:  Bruce Allen
2 // Created: 2/25/2013
3 //
4 // The software provided here is released by the Naval Postgraduate
5 // School, an agency of the U.S. Department of Navy.  The software
6 // bears no warranty, either expressed or implied. NPS does not assume
7 // legal liability nor responsibility for a User's use of the software
8 // or the results of such use.
9 //
10 // Please note that within the United States, copyright protection,
11 // under Section 105 of the United States Code, Title 17, is not
12 // available for any work of the United States Government and/or for
13 // any works created by United States Government employees. User
14 // acknowledges that this software contains work which was created by
15 // NPS government employees and is therefore in the public domain and
16 // not subject to copyright.
17 //
18 // Released into the public domain on February 25, 2013 by Bruce Allen.
19 
20 /**
21  * \file
22  * Implementation code for the hashdb library.
23  */
24 
25 #include <config.h>
26 // this process of getting WIN32 defined was inspired
27 // from i686-w64-mingw32/sys-root/mingw/include/windows.h.
28 // All this to include winsock2.h before windows.h to avoid a warning.
29 #if defined(__MINGW64__) && defined(__cplusplus)
30 #  ifndef WIN32
31 #    define WIN32
32 #  endif
33 #endif
34 #ifdef WIN32
35   // including winsock2.h now keeps an included header somewhere from
36   // including windows.h first, resulting in a warning.
37   #include <winsock2.h>
38   #include "fsync.h"      // for simulation of linux fsync
39 #endif
40 #include "hashdb.hpp"
41 #include <string>
42 #include <sstream>
43 #include <vector>
44 #include <stdint.h>
45 #include <climits>
46 #ifndef HAVE_CXX11
47 #include <cassert>
48 #endif
49 #include <sys/stat.h>   // for mkdir
50 #include <fcntl.h>      // scan_stream_f
51 #include <time.h>       // for timestamp
52 #include <sys/types.h>  // for timestamp
53 #include <sys/time.h>   // for timestamp
54 #include <unistd.h>     // for pipe
55 #include "file_modes.h"
56 #include "settings_manager.hpp"
57 #include "lmdb_hash_data_manager.hpp"
58 #include "lmdb_hash_manager.hpp"
59 #include "lmdb_source_data_manager.hpp"
60 #include "lmdb_source_id_manager.hpp"
61 #include "lmdb_source_name_manager.hpp"
62 #include "logger.hpp"
63 #include "locked_member.hpp"
64 #include "lmdb_changes.hpp"
65 #include "rapidjson.h"
66 #include "writer.h"
67 #include "document.h"
68 #include "crc32.h"      // for find_expanded_hash_json
69 
70 // ************************************************************
71 // version of the hashdb library
72 // ************************************************************
73 /**
74  * Version of the hashdb library, same as hashdb::version.
75  */
76 extern "C"
hashdb_version()77 const char* hashdb_version() {
78   return PACKAGE_VERSION;
79 }
80 
81 namespace hashdb {
82 
83   // ************************************************************
84   // private helper functions
85   // ************************************************************
86   // obtain rapidjson::Value type from a std::string
v(const std::string & s,rapidjson::Document::AllocatorType & allocator)87   static rapidjson::Value v(const std::string& s,
88                             rapidjson::Document::AllocatorType& allocator) {
89     rapidjson::Value value;
90     value.SetString(s.c_str(), s.size(), allocator);
91     return value;
92   }
93 
94   // helper for producing expanded source for a source ID
provide_source_information(const hashdb::scan_manager_t & manager,const std::string file_hash,rapidjson::Document::AllocatorType & allocator,rapidjson::Value & json_source)95   static void provide_source_information(
96                         const hashdb::scan_manager_t& manager,
97                         const std::string file_hash,
98                         rapidjson::Document::AllocatorType& allocator,
99                         rapidjson::Value& json_source) {
100 
101     // fields to hold source information
102     uint64_t filesize;
103     std::string file_type;
104     uint64_t zero_count;
105     uint64_t nonprobative_count;
106     hashdb::source_names_t* source_names(new hashdb::source_names_t);
107 
108     // read source data
109     manager.find_source_data(file_hash, filesize, file_type,
110                              zero_count, nonprobative_count);
111 
112     // provide source data
113     const std::string hex_file_hash = hashdb::bin_to_hex(file_hash);
114 
115     // value for strings
116     json_source.AddMember("file_hash", v(hex_file_hash, allocator), allocator);
117     json_source.AddMember("filesize", filesize, allocator);
118     json_source.AddMember("file_type", v(file_type, allocator), allocator);
119     json_source.AddMember("zero_count", zero_count, allocator);
120     json_source.AddMember("nonprobative_count", nonprobative_count, allocator);
121 
122     // read source names
123     manager.find_source_names(file_hash, *source_names);
124 
125     // name_pairs object
126     rapidjson::Value json_name_pairs(rapidjson::kArrayType);
127 
128     // provide names
129     hashdb::source_names_t::const_iterator it;
130     for (it = source_names->begin(); it != source_names->end(); ++it) {
131       // repository name
132       json_name_pairs.PushBack(v(it->first, allocator), allocator);
133       // filename
134       json_name_pairs.PushBack(v(it->second, allocator), allocator);
135     }
136     json_source.AddMember("name_pairs", json_name_pairs, allocator);
137 
138     delete source_names;
139   }
140 
calculate_crc(const hashdb::source_sub_counts_t & source_sub_counts)141   static uint32_t calculate_crc(
142                        const hashdb::source_sub_counts_t& source_sub_counts) {
143 
144     // calculate the CRC for the sources
145     uint32_t crc = 0;
146     for (hashdb::source_sub_counts_t::const_iterator it =
147          source_sub_counts.begin(); it != source_sub_counts.end(); ++it) {
148       crc = hashdb::crc32(crc, static_cast<uint8_t*>(static_cast<void*>(
149                           const_cast<char*>((*it).file_hash.c_str()))),
150                           it->file_hash.size());
151     }
152     return crc;
153   }
154 
155   // ************************************************************
156   // version of the hashdb library
157   // ************************************************************
158   /**
159    * Version of the hashdb library.
160    */
161   extern "C"
version()162   const char* version() {
163     return PACKAGE_VERSION;
164   }
165 
166   // ************************************************************
167   // misc support interfaces
168   // ************************************************************
169   /**
170    * Return "" if hashdb is created else reason if not.
171    * The current implementation may abort if something worse than a simple
172    * path problem happens.
173    */
create_hashdb(const std::string & hashdb_dir,const hashdb::settings_t & settings,const std::string & command_string)174   std::string create_hashdb(const std::string& hashdb_dir,
175                             const hashdb::settings_t& settings,
176                             const std::string& command_string) {
177 
178     // path must be empty
179     if (access(hashdb_dir.c_str(), F_OK) == 0) {
180       return "Path '" + hashdb_dir + "' already exists.";
181     }
182 
183     // create the new hashdb directory
184     int status;
185 #ifdef WIN32
186     status = mkdir(hashdb_dir.c_str());
187 #else
188     status = mkdir(hashdb_dir.c_str(),0777);
189 #endif
190     if (status != 0) {
191       return "Unable to create new hashdb database at path '"
192                      + hashdb_dir + "'.";
193     }
194 
195     // create the settings file
196     std::string error_message = hashdb::write_settings(hashdb_dir, settings);
197     if (error_message.size() != 0) {
198       return error_message;
199     }
200 
201     // create new LMDB stores
202     lmdb_hash_data_manager_t(hashdb_dir, RW_NEW);
203     lmdb_hash_manager_t(hashdb_dir, RW_NEW);
204     lmdb_source_data_manager_t(hashdb_dir, RW_NEW);
205     lmdb_source_id_manager_t(hashdb_dir, RW_NEW);
206     lmdb_source_name_manager_t(hashdb_dir, RW_NEW);
207 
208     // create the log
209     logger_t(hashdb_dir, command_string);
210 
211     return "";
212   }
213 
214   // ************************************************************
215   // source sub_counts
216   // ************************************************************
source_sub_count_t(const std::string & p_file_hash,const uint64_t p_sub_count)217   source_sub_count_t::source_sub_count_t(const std::string& p_file_hash,
218                     const uint64_t p_sub_count) :
219           file_hash(p_file_hash),
220           sub_count(p_sub_count) {
221     }
operator <(const source_sub_count_t & that) const222   bool source_sub_count_t::operator<(const source_sub_count_t& that) const {
223     return (file_hash < that.file_hash);
224   }
225 
226   // ************************************************************
227   // settings
228   // ************************************************************
settings_t()229   settings_t::settings_t() :
230          settings_version(settings_t::CURRENT_SETTINGS_VERSION),
231          block_size(512) {
232   }
233 
settings_string() const234   std::string settings_t::settings_string() const {
235     std::stringstream ss;
236     ss << "{\"settings_version\":" << settings_version
237        << ", \"block_size\":" << block_size
238        << "}";
239     return ss.str();
240   }
241 
242   // ************************************************************
243   // import
244   // ************************************************************
import_manager_t(const std::string & hashdb_dir,const std::string & command_string)245   import_manager_t::import_manager_t(const std::string& hashdb_dir,
246                                      const std::string& command_string) :
247           // LMDB managers
248           lmdb_hash_data_manager(0),
249           lmdb_hash_manager(0),
250           lmdb_source_data_manager(0),
251           lmdb_source_id_manager(0),
252           lmdb_source_name_manager(0),
253 
254           // log
255           logger(new logger_t(hashdb_dir, command_string)),
256           changes(new hashdb::lmdb_changes_t) {
257 
258     // open managers
259     lmdb_hash_data_manager = new lmdb_hash_data_manager_t(hashdb_dir,
260                                                           RW_MODIFY);
261     lmdb_hash_manager = new lmdb_hash_manager_t(hashdb_dir, RW_MODIFY);
262     lmdb_source_data_manager = new lmdb_source_data_manager_t(hashdb_dir,
263                                                               RW_MODIFY);
264     lmdb_source_id_manager = new lmdb_source_id_manager_t(hashdb_dir,
265                                                               RW_MODIFY);
266     lmdb_source_name_manager = new lmdb_source_name_manager_t(hashdb_dir,
267                                                               RW_MODIFY);
268   }
269 
~import_manager_t()270   import_manager_t::~import_manager_t() {
271 
272     // show changes
273     logger->add_lmdb_changes(*changes);
274     std::cout << *changes;
275 
276     // close resources
277     delete lmdb_hash_data_manager;
278     delete lmdb_hash_manager;
279     delete lmdb_source_data_manager;
280     delete lmdb_source_id_manager;
281     delete lmdb_source_name_manager;
282     delete logger;
283     delete changes;
284   }
285 
insert_source_name(const std::string & file_hash,const std::string & repository_name,const std::string & filename)286   void import_manager_t::insert_source_name(
287                           const std::string& file_hash,
288                           const std::string& repository_name,
289                           const std::string& filename) {
290     if (file_hash.size() == 0) {
291       std::cerr << "Error: insert_source_name called with empty file_hash\n";
292       return;
293     }
294     uint64_t source_id;
295     bool is_new_id = lmdb_source_id_manager->insert(file_hash, *changes,
296                                                     source_id);
297     lmdb_source_name_manager->insert(source_id, repository_name, filename,
298                                      *changes);
299 
300     // If the source ID is new then add a blank source data record just to keep
301     // from breaking the reverse look-up done in scan_manager_t.
302     if (is_new_id == true) {
303       lmdb_source_data_manager->insert(source_id, file_hash, 0, "", 0, 0,
304                                        *changes);
305     }
306   }
307 
insert_source_data(const std::string & file_hash,const uint64_t filesize,const std::string & file_type,const uint64_t zero_count,const uint64_t nonprobative_count)308   void import_manager_t::insert_source_data(
309                           const std::string& file_hash,
310                           const uint64_t filesize,
311                           const std::string& file_type,
312                           const uint64_t zero_count,
313                           const uint64_t nonprobative_count) {
314     if (file_hash.size() == 0) {
315       std::cerr << "Error: insert_source_data called with empty file_hash\n";
316       return;
317     }
318     uint64_t source_id;
319     lmdb_source_id_manager->insert(file_hash, *changes, source_id);
320     lmdb_source_data_manager->insert(source_id, file_hash,
321                filesize, file_type, zero_count, nonprobative_count, *changes);
322   }
323 
324   // add whether file hash is present or not, used during ingest
insert_hash(const std::string & block_hash,const uint64_t k_entropy,const std::string & block_label,const std::string & file_hash)325   void import_manager_t::insert_hash(const std::string& block_hash,
326                           const uint64_t k_entropy,
327                           const std::string& block_label,
328                           const std::string& file_hash) {
329 
330     if (block_hash.size() == 0) {
331       std::cerr << "Error: insert_hash called with empty block_hash\n";
332       return;
333     }
334     if (file_hash.size() == 0) {
335       std::cerr << "Error: insert_hash called with empty file_hash\n";
336       return;
337     }
338 
339     uint64_t source_id;
340     bool is_new_id = lmdb_source_id_manager->insert(file_hash, *changes,
341                                                     source_id);
342 
343     // insert hash into hash data manager and hash manager
344     const size_t count = lmdb_hash_data_manager->insert(
345                  block_hash, k_entropy, block_label,
346                  source_id, *changes);
347     lmdb_hash_manager->insert(block_hash, count, *changes);
348 
349     // If the source ID is new then add a blank source data record just to keep
350     // from breaking the reverse look-up done in scan_manager_t.
351     if (is_new_id == true) {
352       lmdb_source_data_manager->insert(source_id, file_hash, 0, "", 0, 0,
353                                        *changes);
354     }
355   }
356 
357   // add only if file hash is not present, use during merge
merge_hash(const std::string & block_hash,const uint64_t k_entropy,const std::string & block_label,const std::string & file_hash,const uint64_t sub_count)358   void import_manager_t::merge_hash(const std::string& block_hash,
359                                     const uint64_t k_entropy,
360                                     const std::string& block_label,
361                                     const std::string& file_hash,
362                                     const uint64_t sub_count) {
363 
364     if (block_hash.size() == 0) {
365       std::cerr << "Error: insert_hash called with empty block_hash\n";
366       return;
367     }
368     if (file_hash.size() == 0) {
369       std::cerr << "Error: insert_hash called with empty file_hash\n";
370       return;
371     }
372 
373     uint64_t source_id;
374     bool is_new_id = lmdb_source_id_manager->insert(file_hash, *changes,
375                                                     source_id);
376 
377     // merge hash into hash data manager
378     const size_t count = lmdb_hash_data_manager->merge(
379                  block_hash, k_entropy, block_label,
380                  source_id, sub_count, *changes);
381 
382     // insert hash into hash manager
383     lmdb_hash_manager->insert(block_hash, count, *changes);
384 
385     // If the source ID is new then add a blank source data record just to keep
386     // from breaking the reverse look-up done in scan_manager_t.
387     if (is_new_id == true) {
388       lmdb_source_data_manager->insert(source_id, file_hash, 0, "", 0, 0,
389                                        *changes);
390     }
391   }
392 
393   // import JSON hash or source, return "" or error
import_json(const std::string & json_string)394   std::string import_manager_t::import_json(
395                           const std::string& json_string) {
396 
397     // open input as a JSON DOM document
398     rapidjson::Document document;
399     if (document.Parse(json_string.c_str()).HasParseError() ||
400         !document.IsObject()) {
401       return "Invalid JSON syntax";
402     }
403 
404     // block_hash or file_hash
405     if (document.HasMember("block_hash")) {
406 
407       // block_hash
408       if (!document["block_hash"].IsString()) {
409         return "Invalid block_hash field";
410       }
411       const std::string block_hash = hashdb::hex_to_bin(
412                                          document["block_hash"].GetString());
413 
414       // entropy (optional)
415       uint64_t k_entropy = 0;
416       if (document.HasMember("k_entropy")) {
417         if (document["k_entropy"].IsUint64()) {
418           k_entropy = document["k_entropy"].GetUint64();
419         } else {
420           return "Invalid k_entropy field";
421         }
422       }
423 
424       // block_label (optional)
425       std::string block_label = "";
426       if (document.HasMember("block_label")) {
427         if (document["block_label"].IsString()) {
428           block_label = document["block_label"].GetString();
429         } else {
430           return "Invalid block_label field";
431         }
432       }
433 
434       // source_sub_counts:[]
435       if (!document.HasMember("source_sub_counts") ||
436                     !document["source_sub_counts"].IsArray()) {
437         return "Invalid source_sub_counts field";
438       }
439       const rapidjson::Value& json_source_sub_counts =
440                                         document["source_sub_counts"];
441       hashdb::source_sub_counts_t* source_sub_counts =
442                                         new hashdb::source_sub_counts_t;
443       for (rapidjson::SizeType i = 0;
444            i+1 < json_source_sub_counts.Size(); i+=2) {
445 
446         // source hash
447         if (!json_source_sub_counts[i+0].IsString()) {
448           delete source_sub_counts;
449           return "Invalid source hash in source_sub_counts";
450         }
451         const std::string file_hash = hashdb::hex_to_bin(
452                                         json_source_sub_counts[i].GetString());
453 
454         // sub_count
455         if (!json_source_sub_counts[i+1].IsUint64()) {
456           delete source_sub_counts;
457           return "Invalid sub_count in source_sub_counts";
458         }
459         const uint64_t sub_count = json_source_sub_counts[i+1].GetUint64();
460 
461         // add hash data for this source and source sub_count
462         merge_hash(block_hash, k_entropy, block_label, file_hash, sub_count);
463       }
464 
465       delete source_sub_counts;
466       return "";
467 
468     } else if (document.HasMember("file_hash")) {
469 
470       // parse file_hash
471       if (!document.HasMember("file_hash") ||
472                     !document["file_hash"].IsString()) {
473         return "Invalid file_hash field";
474       }
475       const std::string file_hash = hashdb::hex_to_bin(
476                                        document["file_hash"].GetString());
477 
478       // parse filesize
479       if (!document.HasMember("filesize") ||
480                     !document["filesize"].IsUint64()) {
481         return "Invalid filesize field";
482       }
483       const uint64_t filesize = document["filesize"].GetUint64();
484 
485       // parse file_type (optional)
486       std::string file_type = "";
487       if (document.HasMember("file_type")) {
488         if (document["file_type"].IsString()) {
489           file_type = document["file_type"].GetString();
490         } else {
491           return "Invalid file_type field";
492         }
493       }
494 
495       // zero_count (optional)
496       uint64_t zero_count = 0;
497       if (document.HasMember("zero_count")) {
498         if (document["zero_count"].IsUint64()) {
499           zero_count = document["zero_count"].GetUint64();
500         } else {
501           return "Invalid zero_count field";
502         }
503       }
504 
505       // nonprobative_count (optional)
506       uint64_t nonprobative_count = 0;
507       if (document.HasMember("nonprobative_count")) {
508         if (document["nonprobative_count"].IsUint64()) {
509           nonprobative_count = document["nonprobative_count"].GetUint64();
510         } else {
511           return "Invalid nonprobative_count field";
512         }
513       }
514 
515       // parse name_pairs:[]
516       if (!document.HasMember("name_pairs") ||
517                     !document["name_pairs"].IsArray()) {
518         return "Invalid name_pairs field";
519       }
520       const rapidjson::Value& json_names = document["name_pairs"];
521       hashdb::source_names_t* names = new hashdb::source_names_t;
522       for (rapidjson::SizeType i = 0; i< json_names.Size(); i+=2) {
523 
524         // parse repository name
525         if (!json_names[i].IsString()) {
526           delete names;
527           return "Invalid repository name in name_pairs field";
528         }
529         const std::string repository_name = json_names[i].GetString();
530 
531         // parse filename
532         if (!json_names[i+1].IsString()) {
533           delete names;
534           return "Invalid filename in name_pairs field";
535         }
536         const std::string filename = json_names[i+1].GetString();
537 
538         // add repository name, filename pair
539         names->insert(hashdb::source_name_t(repository_name, filename));
540       }
541 
542       // everything worked so insert the source data and source names
543       insert_source_data(file_hash,
544                          filesize, file_type, zero_count, nonprobative_count);
545       for (hashdb::source_names_t::const_iterator it = names->begin();
546            it != names->end(); ++it) {
547         insert_source_name(file_hash, it->first, it->second);
548       }
549 
550       delete names;
551       return "";
552 
553     } else {
554       return "A block_hash or file_hash field is required";
555     }
556   }
557 
has_source(const std::string & file_hash) const558   bool import_manager_t::has_source(const std::string& file_hash) const {
559     uint64_t source_id;
560     return lmdb_source_id_manager->find(file_hash, source_id);
561   }
562 
first_source() const563   std::string import_manager_t::first_source() const {
564     return lmdb_source_id_manager->first_source();
565   }
566 
next_source(const std::string & file_hash) const567   std::string import_manager_t::next_source(const std::string& file_hash) const {
568     return lmdb_source_id_manager->next_source(file_hash);
569   }
570 
size() const571   std::string import_manager_t::size() const {
572     std::stringstream ss;
573     ss << "{\"hash_data_store\":" << lmdb_hash_data_manager->size()
574        << ", \"hash_store\":" << lmdb_hash_manager->size()
575        << ", \"source_data_store\":" << lmdb_source_data_manager->size()
576        << ", \"source_id_store\":" << lmdb_source_id_manager->size()
577        << ", \"source_name_store\":" << lmdb_source_name_manager->size()
578        << "}";
579     return ss.str();
580   }
581 
size_hashes() const582   size_t import_manager_t::size_hashes() const {
583     return lmdb_hash_data_manager->size();
584   }
585 
size_sources() const586   size_t import_manager_t::size_sources() const {
587     return lmdb_source_id_manager->size();
588   }
589 
590   // ************************************************************
591   // scan
592   // ************************************************************
scan_manager_t(const std::string & hashdb_dir)593   scan_manager_t::scan_manager_t(const std::string& hashdb_dir) :
594           // LMDB managers
595           lmdb_hash_data_manager(0),
596           lmdb_hash_manager(0),
597           lmdb_source_data_manager(0),
598           lmdb_source_id_manager(0),
599           lmdb_source_name_manager(0),
600 
601           // for find_expanded_hash_json
602           hashes(new locked_member_t),
603           sources(new locked_member_t) {
604 
605     // open managers
606     lmdb_hash_data_manager = new lmdb_hash_data_manager_t(hashdb_dir,
607                                                           READ_ONLY);
608     lmdb_hash_manager = new lmdb_hash_manager_t(hashdb_dir, READ_ONLY);
609     lmdb_source_data_manager = new lmdb_source_data_manager_t(hashdb_dir,
610                                                               READ_ONLY);
611     lmdb_source_id_manager = new lmdb_source_id_manager_t(hashdb_dir,
612                                                               READ_ONLY);
613     lmdb_source_name_manager = new lmdb_source_name_manager_t(hashdb_dir,
614                                                               READ_ONLY);
615   }
616 
~scan_manager_t()617   scan_manager_t::~scan_manager_t() {
618     delete lmdb_hash_data_manager;
619     delete lmdb_hash_manager;
620     delete lmdb_source_data_manager;
621     delete lmdb_source_id_manager;
622     delete lmdb_source_name_manager;
623 
624     // for find_expanded_hash_json
625     delete hashes;
626     delete sources;
627   }
628 
find_hash_json(const hashdb::scan_mode_t scan_mode,const std::string & block_hash)629   std::string scan_manager_t::find_hash_json(
630                    const hashdb::scan_mode_t scan_mode,
631                    const std::string& block_hash) {
632 
633     // delegate to low-level handler
634     switch(scan_mode) {
635 
636       // EXPANDED
637       case hashdb::scan_mode_t::EXPANDED:
638         return find_expanded_hash_json(false, block_hash);
639 
640       // EXPANDED_OPTIMIZED
641       case hashdb::scan_mode_t::EXPANDED_OPTIMIZED:
642         return find_expanded_hash_json(true, block_hash);
643 
644       // COUNT
645       case hashdb::scan_mode_t::COUNT:
646         return find_hash_count_json(block_hash);
647 
648       // APPROXIMATE_COUNT
649       case hashdb::scan_mode_t::APPROXIMATE_COUNT:
650         return find_approximate_hash_count_json(block_hash);
651 
652       default: assert(0); std::exit(1);
653     }
654   }
655 
656   // Find expanded hash, optimized with caching, return JSON.
657   // If optimizing, cache hashes and sources.
find_expanded_hash_json(const bool optimizing,const std::string & block_hash)658   std::string scan_manager_t::find_expanded_hash_json(
659                     const bool optimizing, const std::string& block_hash) {
660 
661     // fields to hold the scan
662     uint64_t k_entropy;
663     std::string block_label;
664     uint64_t count;
665     hashdb::source_sub_counts_t* source_sub_counts =
666                                            new hashdb::source_sub_counts_t;
667 
668     // scan
669     bool matched = scan_manager_t::find_hash(block_hash,
670                            k_entropy, block_label, count, *source_sub_counts);
671 
672     // done if no match
673     if (matched == false) {
674       delete source_sub_counts;
675       return "";
676     }
677 
678     // prepare JSON
679     rapidjson::Document json_doc;
680     rapidjson::Document::AllocatorType& allocator = json_doc.GetAllocator();
681     json_doc.SetObject();
682 
683     // block_hash
684     std::string hex_block_hash = hashdb::bin_to_hex(block_hash);
685     json_doc.AddMember("block_hash", v(hex_block_hash, allocator), allocator);
686 
687     // report hash if not caching or this is the first time for the hash
688     if (!optimizing || hashes->locked_insert(block_hash)) {
689 
690       // add entropy
691       json_doc.AddMember("k_entropy", k_entropy, allocator);
692 
693       // add block_label
694       json_doc.AddMember("block_label", v(block_label, allocator), allocator);
695 
696       // add count
697       json_doc.AddMember("count", count, allocator);
698 
699       // add source_list_id
700       uint32_t crc = calculate_crc(*source_sub_counts);
701       json_doc.AddMember("source_list_id", crc, allocator);
702 
703       // the sources array
704       rapidjson::Value json_sources(rapidjson::kArrayType);
705 
706       // add each source object
707       for (hashdb::source_sub_counts_t::const_iterator it =
708            source_sub_counts->begin(); it != source_sub_counts->end(); ++it) {
709         if (!optimizing || sources->locked_insert(it->file_hash)) {
710 
711           // create a json_source object for the json_sources array
712           rapidjson::Value json_source(rapidjson::kObjectType);
713 
714           // provide the complete source information for this source
715           provide_source_information(*this, it->file_hash, allocator,
716                                      json_source);
717           json_sources.PushBack(json_source, allocator);
718         }
719       }
720       json_doc.AddMember("sources", json_sources, allocator);
721 
722       // add source_sub_counts as pairs of file hash, sub_count
723       rapidjson::Value json_source_sub_counts(rapidjson::kArrayType);
724 
725       for (hashdb::source_sub_counts_t::const_iterator it =
726            source_sub_counts->begin(); it != source_sub_counts->end(); ++it) {
727 
728         // file hash
729         json_source_sub_counts.PushBack(
730                    v(hashdb::bin_to_hex(it->file_hash), allocator), allocator);
731 
732         // sub_count
733         json_source_sub_counts.PushBack(it->sub_count, allocator);
734 
735       }
736       json_doc.AddMember("source_sub_counts", json_source_sub_counts,
737                          allocator);
738     }
739 
740     delete source_sub_counts;
741 
742     // return JSON text
743     rapidjson::StringBuffer strbuf;
744     rapidjson::Writer<rapidjson::StringBuffer> writer(strbuf);
745     json_doc.Accept(writer);
746     return strbuf.GetString();
747   }
748 
749   // find hash, return associated hash and source data
find_hash(const std::string & block_hash,uint64_t & k_entropy,std::string & block_label,uint64_t & count,source_sub_counts_t & source_sub_counts) const750   bool scan_manager_t::find_hash(
751                const std::string& block_hash,
752                uint64_t& k_entropy,
753                std::string& block_label,
754                uint64_t& count,
755                source_sub_counts_t& source_sub_counts) const {
756 
757     // clear fields
758     k_entropy = 0;
759     block_label = "";
760     count = 0;
761     source_sub_counts.clear();
762 
763     if (block_hash.size() == 0) {
764       std::cerr << "Error: find_hash called with empty block_hash\n";
765       return false;
766     }
767 
768     // first check hash store
769     if (lmdb_hash_manager->find(block_hash) == 0) {
770       // hash is not present so return false
771       return false;
772     }
773 
774     // hash may be present so read hash using hash data manager
775     hashdb::source_id_sub_counts_t* source_id_sub_counts =
776                 new hashdb::source_id_sub_counts_t;
777     bool has_hash = lmdb_hash_data_manager->find(block_hash, k_entropy,
778                                   block_label, count, *source_id_sub_counts);
779     if (has_hash) {
780       // build source_sub_count from source_id_sub_count
781       for (hashdb::source_id_sub_counts_t::const_iterator it =
782            source_id_sub_counts->begin(); it != source_id_sub_counts->end();
783            ++it) {
784 
785         // space for unused returned source variables
786         std::string file_hash;
787         uint64_t filesize;
788         std::string file_type;
789         uint64_t zero_count;
790         uint64_t nonprobative_count;
791 
792         // get file_hash from source_id
793         bool source_data_found = lmdb_source_data_manager->find(
794                                 it->source_id, file_hash,
795                                 filesize, file_type,
796                                 zero_count, nonprobative_count);
797 
798         // source_data must have a source_id to match the source_id in hash_data
799         if (source_data_found == false) {
800           assert(0);
801         }
802 
803         // add the source sub_counts
804         source_sub_counts.insert(hashdb::source_sub_count_t(file_hash,
805                                                             it->sub_count));
806       }
807       delete source_id_sub_counts;
808       return true;
809 
810     } else {
811       // no action, lmdb_hash_data_manager.find clears out fields
812       delete source_id_sub_counts;
813       return false;
814     }
815   }
816 
817   // export hash, return result as JSON string
export_hash_json(const std::string & block_hash) const818   std::string scan_manager_t::export_hash_json(
819                const std::string& block_hash) const {
820 
821     // hash fields
822     uint64_t k_entropy;
823     std::string block_label;
824     uint64_t unused_count;
825     hashdb::source_sub_counts_t* source_sub_counts =
826                                 new hashdb::source_sub_counts_t;
827 
828     // scan
829     bool found_hash = find_hash(block_hash, k_entropy, block_label,
830                                 unused_count, *source_sub_counts);
831 
832     std::string json_hash_string;
833     if (found_hash) {
834 
835       // prepare JSON
836       rapidjson::Document json_doc;
837       rapidjson::Document::AllocatorType& allocator = json_doc.GetAllocator();
838       json_doc.SetObject();
839 
840       // put in hash data
841       std::string hex_block_hash = hashdb::bin_to_hex(block_hash);
842       json_doc.AddMember("block_hash", v(hex_block_hash, allocator), allocator);
843       json_doc.AddMember("k_entropy", k_entropy, allocator);
844       json_doc.AddMember("block_label", v(block_label, allocator), allocator);
845 
846       // put in source_sub_counts as pairs of file hash, sub_count
847       rapidjson::Value json_source_sub_counts(rapidjson::kArrayType);
848 
849       for (hashdb::source_sub_counts_t::const_iterator it =
850            source_sub_counts->begin(); it != source_sub_counts->end(); ++it) {
851 
852         // file hash
853         json_source_sub_counts.PushBack(
854                    v(hashdb::bin_to_hex(it->file_hash), allocator), allocator);
855 
856         // sub_count
857         json_source_sub_counts.PushBack(it->sub_count, allocator);
858 
859       }
860       json_doc.AddMember("source_sub_counts", json_source_sub_counts,
861                          allocator);
862 
863       // write JSON text
864       rapidjson::StringBuffer strbuf;
865       rapidjson::Writer<rapidjson::StringBuffer> writer(strbuf);
866       json_doc.Accept(writer);
867       json_hash_string = strbuf.GetString();
868 
869     } else {
870       // not found
871       json_hash_string = "";
872     }
873 
874     delete source_sub_counts;
875     return json_hash_string;
876   }
877 
878   // find hash count
find_hash_count(const std::string & block_hash) const879   size_t scan_manager_t::find_hash_count(
880                                     const std::string& block_hash) const {
881 
882     if (block_hash.size() == 0) {
883       std::cerr << "Error: find_hash_count called with empty block_hash\n";
884       return 0;
885     }
886 
887     return lmdb_hash_data_manager->find_count(block_hash);
888   }
889 
890   // find hash count JSON
find_hash_count_json(const std::string & block_hash) const891   std::string scan_manager_t::find_hash_count_json(
892                                     const std::string& block_hash) const {
893 
894     // get count
895     size_t count = find_hash_count(block_hash);
896 
897     // no match
898     if (count == 0) {
899       return "";
900     }
901 
902     // return JSON with count
903     // prepare JSON
904     rapidjson::Document json_doc;
905     rapidjson::Document::AllocatorType& allocator = json_doc.GetAllocator();
906     json_doc.SetObject();
907 
908     // block hash
909     std::string hex_block_hash = hashdb::bin_to_hex(block_hash);
910     json_doc.AddMember("block_hash", v(hex_block_hash, allocator), allocator);
911 
912     // count
913     json_doc.AddMember("count", (uint64_t)count, allocator);
914 
915     // write JSON text
916     rapidjson::StringBuffer strbuf;
917     rapidjson::Writer<rapidjson::StringBuffer> writer(strbuf);
918     json_doc.Accept(writer);
919     return strbuf.GetString();
920   }
921 
find_approximate_hash_count(const std::string & block_hash) const922   size_t scan_manager_t::find_approximate_hash_count(
923                                     const std::string& block_hash) const {
924     if (block_hash.size() == 0) {
925       std::cerr << "Error: find_approximate_hash_count called with empty block_hash\n";
926       return 0;
927     }
928 
929     return lmdb_hash_manager->find(block_hash);
930   }
931 
932   // find approximate hash count JSON
find_approximate_hash_count_json(const std::string & block_hash) const933   std::string scan_manager_t::find_approximate_hash_count_json(
934                                     const std::string& block_hash) const {
935 
936     // get approximate count
937     size_t approximate_count =
938            find_approximate_hash_count(block_hash);
939 
940     // no match
941     if (approximate_count == 0) {
942       return "";
943     }
944 
945     // return JSON with approximate count
946     // prepare JSON
947     rapidjson::Document json_doc;
948     rapidjson::Document::AllocatorType& allocator = json_doc.GetAllocator();
949     json_doc.SetObject();
950 
951     // block hash
952     std::string hex_block_hash = hashdb::bin_to_hex(block_hash);
953     json_doc.AddMember("block_hash", v(hex_block_hash, allocator), allocator);
954 
955     // approximate count
956     json_doc.AddMember("approximate_count",
957                                   (uint64_t)approximate_count, allocator);
958 
959     // write JSON text
960     rapidjson::StringBuffer strbuf;
961     rapidjson::Writer<rapidjson::StringBuffer> writer(strbuf);
962     json_doc.Accept(writer);
963     return strbuf.GetString();
964   }
965 
find_source_data(const std::string & file_hash,uint64_t & filesize,std::string & file_type,uint64_t & zero_count,uint64_t & nonprobative_count) const966   bool scan_manager_t::find_source_data(
967                         const std::string& file_hash,
968                         uint64_t& filesize,
969                         std::string& file_type,
970                         uint64_t& zero_count,
971                         uint64_t& nonprobative_count) const {
972 
973     if (file_hash.size() == 0) {
974       std::cerr << "Error: find_source_data called with empty file_hash\n";
975       return false;
976     }
977 
978     // read source_id
979     uint64_t source_id;
980     bool has_id = lmdb_source_id_manager->find(file_hash, source_id);
981     if (has_id == false) {
982       // no source ID for this file_hash
983       filesize = 0;
984       file_type = "";
985       zero_count = 0;
986       nonprobative_count = 0;
987       return false;
988     } else {
989 
990       // read source data associated with this source ID
991       std::string returned_file_hash;
992       bool source_data_found = lmdb_source_data_manager->find(source_id,
993                              returned_file_hash, filesize, file_type,
994                              zero_count, nonprobative_count);
995 
996       // if source data is found, make sure the file binary hash is right
997       if (source_data_found == true &&
998                          file_hash != returned_file_hash) {
999         assert(0);
1000       }
1001     }
1002     return true;
1003   }
1004 
find_source_names(const std::string & file_hash,source_names_t & source_names) const1005   bool scan_manager_t::find_source_names(const std::string& file_hash,
1006                          source_names_t& source_names) const {
1007 
1008     if (file_hash.size() == 0) {
1009       std::cerr << "Error: find_source_names called with empty file_hash\n";
1010       return false;
1011     }
1012 
1013     // read source_id
1014     uint64_t source_id;
1015     bool has_id = lmdb_source_id_manager->find(file_hash, source_id);
1016     if (has_id == false) {
1017       // no source ID for this file_hash
1018       source_names.clear();
1019       return false;
1020     } else {
1021       // source
1022       return lmdb_source_name_manager->find(source_id, source_names);
1023     }
1024   }
1025 
1026   // export source, return result as JSON string
export_source_json(const std::string & file_hash) const1027   std::string scan_manager_t::export_source_json(
1028                                const std::string& file_hash) const {
1029 
1030     // source fields
1031     uint64_t filesize;
1032     std::string file_type;
1033     uint64_t zero_count;
1034     uint64_t nonprobative_count;
1035 
1036     // prepare JSON
1037     rapidjson::Document json_doc;
1038     rapidjson::Document::AllocatorType& allocator = json_doc.GetAllocator();
1039     json_doc.SetObject();
1040 
1041     // get source data
1042     bool has_source_data = find_source_data(file_hash, filesize,
1043                                  file_type, zero_count, nonprobative_count);
1044     if (!has_source_data) {
1045       return "";
1046     }
1047 
1048     // source found
1049 
1050     // set source data
1051     std::string hex_file_hash = hashdb::bin_to_hex(file_hash);
1052     json_doc.AddMember("file_hash", v(hex_file_hash, allocator), allocator);
1053     json_doc.AddMember("filesize", filesize, allocator);
1054     json_doc.AddMember("file_type", v(file_type, allocator), allocator);
1055     json_doc.AddMember("zero_count", zero_count, allocator);
1056     json_doc.AddMember("nonprobative_count", nonprobative_count, allocator);
1057 
1058     // get source names
1059     hashdb::source_names_t* source_names = new hashdb::source_names_t;
1060     find_source_names(file_hash, *source_names);
1061 
1062     // name_pairs object
1063     rapidjson::Value json_name_pairs(rapidjson::kArrayType);
1064 
1065     // provide names
1066     for (hashdb::source_names_t::const_iterator it = source_names->begin();
1067          it != source_names->end(); ++it) {
1068       // repository name
1069       json_name_pairs.PushBack(v(it->first, allocator), allocator);
1070       // filename
1071       json_name_pairs.PushBack(v(it->second, allocator), allocator);
1072     }
1073     json_doc.AddMember("name_pairs", json_name_pairs, allocator);
1074 
1075     // done with source names
1076     delete source_names;
1077 
1078     // write JSON text
1079     rapidjson::StringBuffer strbuf;
1080     rapidjson::Writer<rapidjson::StringBuffer> writer(strbuf);
1081     json_doc.Accept(writer);
1082     return strbuf.GetString();
1083   }
1084 
first_hash() const1085   std::string scan_manager_t::first_hash() const {
1086     return lmdb_hash_data_manager->first_hash();
1087   }
1088 
next_hash(const std::string & block_hash) const1089   std::string scan_manager_t::next_hash(const std::string& block_hash) const {
1090     if (block_hash.size() == 0) {
1091       std::cerr << "Error: next_hash called with empty block_hash\n";
1092       return "";
1093     }
1094     return lmdb_hash_data_manager->next_hash(block_hash);
1095   }
1096 
first_source() const1097   std::string scan_manager_t::first_source() const {
1098     return lmdb_source_id_manager->first_source();
1099   }
1100 
next_source(const std::string & file_hash) const1101   std::string scan_manager_t::next_source(const std::string& file_hash) const {
1102     if (file_hash.size() == 0) {
1103       std::cerr << "Error: next_source called with empty file_hash\n";
1104       return "";
1105     }
1106     return lmdb_source_id_manager->next_source(file_hash);
1107   }
1108 
size() const1109   std::string scan_manager_t::size() const {
1110     std::stringstream ss;
1111     ss << "{\"hash_data_store\":" << lmdb_hash_data_manager->size()
1112        << ", \"hash_store\":" << lmdb_hash_manager->size()
1113        << ", \"source_data_store\":" << lmdb_source_data_manager->size()
1114        << ", \"source_id_store\":" << lmdb_source_id_manager->size()
1115        << ", \"source_name_store\":" << lmdb_source_name_manager->size()
1116        << "}";
1117     return ss.str();
1118   }
1119 
size_hashes() const1120   size_t scan_manager_t::size_hashes() const {
1121     return lmdb_hash_data_manager->size();
1122   }
1123 
size_sources() const1124   size_t scan_manager_t::size_sources() const {
1125     return lmdb_source_id_manager->size();
1126   }
1127 
1128   // ************************************************************
1129   // timestamp
1130   // ************************************************************
timestamp_t()1131   timestamp_t::timestamp_t() :
1132               t0(new timeval()), t_last_timestamp(new timeval()) {
1133     gettimeofday(t0, 0);
1134     gettimeofday(t_last_timestamp, 0);
1135   }
1136 
~timestamp_t()1137   timestamp_t::~timestamp_t() {
1138     delete t0;
1139     delete t_last_timestamp;
1140   }
1141 
1142   /**
1143    * Take a timestamp and return a JSON string in format {"name":"name",
1144    * "delta":delta, "total":total}.
1145    */
stamp(const std::string & name)1146   std::string timestamp_t::stamp(const std::string &name) {
1147     // adapted from dfxml_writer.cpp
1148     struct timeval t1;
1149     gettimeofday(&t1,0);
1150     struct timeval t;
1151 
1152     // timestamp delta against t_last_timestamp
1153     t.tv_sec = t1.tv_sec - t_last_timestamp->tv_sec;
1154     if(t1.tv_usec > t_last_timestamp->tv_usec){
1155         t.tv_usec = t1.tv_usec - t_last_timestamp->tv_usec;
1156     } else {
1157         t.tv_sec--;
1158         t.tv_usec = (t1.tv_usec+1000000) - t_last_timestamp->tv_usec;
1159     }
1160     char delta[16];
1161     snprintf(delta, 16, "%d.%06d", (int)t.tv_sec, (int)t.tv_usec);
1162 
1163     // reset t_last_timestamp for the next invocation
1164     gettimeofday(t_last_timestamp,0);
1165 
1166     // timestamp total
1167     t.tv_sec = t1.tv_sec - t0->tv_sec;
1168     if(t1.tv_usec > t0->tv_usec){
1169         t.tv_usec = t1.tv_usec - t0->tv_usec;
1170     } else {
1171         t.tv_sec--;
1172         t.tv_usec = (t1.tv_usec+1000000) - t0->tv_usec;
1173     }
1174     char total_time[16];
1175     snprintf(total_time, 16, "%d.%06d", (int)t.tv_sec, (int)t.tv_usec);
1176 
1177     // return the named timestamp
1178     // prepare JSON
1179     rapidjson::Document json_doc;
1180     rapidjson::Document::AllocatorType& allocator = json_doc.GetAllocator();
1181     json_doc.SetObject();
1182     json_doc.AddMember("name", v(name, allocator), allocator);
1183     json_doc.AddMember("delta", v(std::string(delta), allocator), allocator);
1184     json_doc.AddMember("total", v(std::string(total_time), allocator),
1185                                                                   allocator);
1186 
1187     // copy JSON text
1188     rapidjson::StringBuffer strbuf;
1189     rapidjson::Writer<rapidjson::StringBuffer> writer(strbuf);
1190     json_doc.Accept(writer);
1191     return strbuf.GetString();
1192   }
1193 }
1194 
1195