// Author: Bruce Allen
// Created: 2/25/2013
//
// The software provided here is released by the Naval Postgraduate
// School, an agency of the U.S. Department of Navy. The software
// bears no warranty, either expressed or implied. NPS does not assume
// legal liability nor responsibility for a User's use of the software
// or the results of such use.
//
// Please note that within the United States, copyright protection,
// under Section 105 of the United States Code, Title 17, is not
// available for any work of the United States Government and/or for
// any works created by United States Government employees. User
// acknowledges that this software contains work which was created by
// NPS government employees and is therefore in the public domain and
// not subject to copyright.
//
// Released into the public domain on February 25, 2013 by Bruce Allen.

/**
 * \file
 * Implementation code for the hashdb library.
 */

#include <config.h>
// this process of getting WIN32 defined was inspired
// by i686-w64-mingw32/sys-root/mingw/include/windows.h.
// All this to include winsock2.h before windows.h to avoid a warning.
#if defined(__MINGW64__) && defined(__cplusplus)
# ifndef WIN32
# define WIN32
# endif
#endif
#ifdef WIN32
// including winsock2.h now keeps an included header somewhere from
// including windows.h first, resulting in a warning.
#include <winsock2.h>
#include "fsync.h" // for simulation of linux fsync
#endif
#include "hashdb.hpp"
#include <iostream> // for std::cout, std::cerr
#include <string>
#include <sstream>
#include <vector>
#include <stdint.h>
#include <climits>
#include <cstdlib> // for std::exit
#ifndef HAVE_CXX11
#include <cassert>
#endif
#include <sys/stat.h> // for mkdir
#include <fcntl.h> // scan_stream_f
#include <time.h> // for timestamp
#include <sys/types.h> // for timestamp
#include <sys/time.h> // for timestamp
#include <unistd.h> // for pipe
#include "file_modes.h"
#include "settings_manager.hpp"
#include "lmdb_hash_data_manager.hpp"
#include "lmdb_hash_manager.hpp"
#include "lmdb_source_data_manager.hpp"
#include "lmdb_source_id_manager.hpp"
#include "lmdb_source_name_manager.hpp"
#include "logger.hpp"
#include "locked_member.hpp"
#include "lmdb_changes.hpp"
#include "rapidjson.h"
#include "writer.h"
#include "document.h"
#include "crc32.h" // for find_expanded_hash_json

// ************************************************************
// version of the hashdb library
// ************************************************************
/**
 * Version of the hashdb library, same as hashdb::version.
 */
extern "C"
const char* hashdb_version() {
  return PACKAGE_VERSION;
}

namespace hashdb {

  // ************************************************************
  // private helper functions
  // ************************************************************
  // obtain rapidjson::Value type from a std::string
  static rapidjson::Value v(const std::string& s,
                            rapidjson::Document::AllocatorType& allocator) {
    rapidjson::Value value;
    value.SetString(s.c_str(), s.size(), allocator);
    return value;
  }

  // helper for producing expanded source for a source ID
  static void provide_source_information(
                        const hashdb::scan_manager_t& manager,
                        const std::string file_hash,
                        rapidjson::Document::AllocatorType& allocator,
                        rapidjson::Value& json_source) {

    // fields to hold source information
    uint64_t filesize;
    std::string file_type;
    uint64_t zero_count;
    uint64_t nonprobative_count;
    hashdb::source_names_t* source_names(new hashdb::source_names_t);

    // read source data
    manager.find_source_data(file_hash, filesize, file_type,
                             zero_count, nonprobative_count);

    // provide source data
    const std::string hex_file_hash = hashdb::bin_to_hex(file_hash);

    // value for strings
    json_source.AddMember("file_hash", v(hex_file_hash, allocator), allocator);
    json_source.AddMember("filesize", filesize, allocator);
    json_source.AddMember("file_type", v(file_type, allocator), allocator);
    json_source.AddMember("zero_count", zero_count, allocator);
    json_source.AddMember("nonprobative_count", nonprobative_count, allocator);

    // read source names
    manager.find_source_names(file_hash, *source_names);

    // name_pairs object
    rapidjson::Value json_name_pairs(rapidjson::kArrayType);

    // provide names
    hashdb::source_names_t::const_iterator it;
    for (it = source_names->begin(); it != source_names->end(); ++it) {
      // repository name
      json_name_pairs.PushBack(v(it->first, allocator), allocator);
      // filename
      json_name_pairs.PushBack(v(it->second, allocator), allocator);
    }
    json_source.AddMember("name_pairs", json_name_pairs, allocator);

    delete source_names;
  }

  static uint32_t calculate_crc(
                  const hashdb::source_sub_counts_t& source_sub_counts) {

    // calculate the CRC for the sources
    uint32_t crc = 0;
    for (hashdb::source_sub_counts_t::const_iterator it =
         source_sub_counts.begin(); it != source_sub_counts.end(); ++it) {
      crc = hashdb::crc32(crc, static_cast<uint8_t*>(static_cast<void*>(
                          const_cast<char*>((*it).file_hash.c_str()))),
                          it->file_hash.size());
    }
    return crc;
  }

  // ************************************************************
  // version of the hashdb library
  // ************************************************************
  /**
   * Version of the hashdb library.
   */
  extern "C"
  const char* version() {
    return PACKAGE_VERSION;
  }

  // ************************************************************
  // misc support interfaces
  // ************************************************************
  /**
   * Return "" if the hashdb is created, else the reason it could not be.
   * The current implementation may abort if something worse than a simple
   * path problem happens.
   */
  std::string create_hashdb(const std::string& hashdb_dir,
                            const hashdb::settings_t& settings,
                            const std::string& command_string) {

    // path must be empty
    if (access(hashdb_dir.c_str(), F_OK) == 0) {
      return "Path '" + hashdb_dir + "' already exists.";
    }

    // create the new hashdb directory
    int status;
#ifdef WIN32
    status = mkdir(hashdb_dir.c_str());
#else
    status = mkdir(hashdb_dir.c_str(), 0777);
#endif
    if (status != 0) {
      return "Unable to create new hashdb database at path '"
             + hashdb_dir + "'.";
    }

    // create the settings file
    std::string error_message = hashdb::write_settings(hashdb_dir, settings);
    if (error_message.size() != 0) {
      return error_message;
    }

    // create new LMDB stores
    lmdb_hash_data_manager_t(hashdb_dir, RW_NEW);
    lmdb_hash_manager_t(hashdb_dir, RW_NEW);
    lmdb_source_data_manager_t(hashdb_dir, RW_NEW);
    lmdb_source_id_manager_t(hashdb_dir, RW_NEW);
    lmdb_source_name_manager_t(hashdb_dir, RW_NEW);

    // create the log
    logger_t(hashdb_dir, command_string);

    return "";
  }
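
  // Illustrative usage sketch (not part of the library): create a new
  // database with default settings. The path and command string are
  // made-up examples.
  //
  //   hashdb::settings_t settings;  // defaults, including block_size 512
  //   std::string error = hashdb::create_hashdb("sample.hdb", settings,
  //                                             "create sample.hdb");
  //   if (error != "") {
  //     std::cerr << error << "\n";  // e.g. the path already exists
  //   }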

  // ************************************************************
  // source sub_counts
  // ************************************************************
  source_sub_count_t::source_sub_count_t(const std::string& p_file_hash,
                                         const uint64_t p_sub_count) :
          file_hash(p_file_hash),
          sub_count(p_sub_count) {
  }
  bool source_sub_count_t::operator<(const source_sub_count_t& that) const {
    return (file_hash < that.file_hash);
  }

  // ************************************************************
  // settings
  // ************************************************************
  settings_t::settings_t() :
          settings_version(settings_t::CURRENT_SETTINGS_VERSION),
          block_size(512) {
  }

  std::string settings_t::settings_string() const {
    std::stringstream ss;
    ss << "{\"settings_version\":" << settings_version
       << ", \"block_size\":" << block_size
       << "}";
    return ss.str();
  }

  // ************************************************************
  // import
  // ************************************************************
  import_manager_t::import_manager_t(const std::string& hashdb_dir,
                                     const std::string& command_string) :
          // LMDB managers
          lmdb_hash_data_manager(0),
          lmdb_hash_manager(0),
          lmdb_source_data_manager(0),
          lmdb_source_id_manager(0),
          lmdb_source_name_manager(0),

          // log
          logger(new logger_t(hashdb_dir, command_string)),
          changes(new hashdb::lmdb_changes_t) {

    // open managers
    lmdb_hash_data_manager = new lmdb_hash_data_manager_t(hashdb_dir,
                                                          RW_MODIFY);
    lmdb_hash_manager = new lmdb_hash_manager_t(hashdb_dir, RW_MODIFY);
    lmdb_source_data_manager = new lmdb_source_data_manager_t(hashdb_dir,
                                                              RW_MODIFY);
    lmdb_source_id_manager = new lmdb_source_id_manager_t(hashdb_dir,
                                                          RW_MODIFY);
    lmdb_source_name_manager = new lmdb_source_name_manager_t(hashdb_dir,
                                                              RW_MODIFY);
  }

  import_manager_t::~import_manager_t() {

    // show changes
    logger->add_lmdb_changes(*changes);
    std::cout << *changes;

    // close resources
    delete lmdb_hash_data_manager;
    delete lmdb_hash_manager;
    delete lmdb_source_data_manager;
    delete lmdb_source_id_manager;
    delete lmdb_source_name_manager;
    delete logger;
    delete changes;
  }

  void import_manager_t::insert_source_name(
                        const std::string& file_hash,
                        const std::string& repository_name,
                        const std::string& filename) {
    if (file_hash.size() == 0) {
      std::cerr << "Error: insert_source_name called with empty file_hash\n";
      return;
    }
    uint64_t source_id;
    bool is_new_id = lmdb_source_id_manager->insert(file_hash, *changes,
                                                    source_id);
    lmdb_source_name_manager->insert(source_id, repository_name, filename,
                                     *changes);

    // If the source ID is new then add a blank source data record just to keep
    // from breaking the reverse look-up done in scan_manager_t.
    if (is_new_id == true) {
      lmdb_source_data_manager->insert(source_id, file_hash, 0, "", 0, 0,
                                       *changes);
    }
  }

  void import_manager_t::insert_source_data(
                        const std::string& file_hash,
                        const uint64_t filesize,
                        const std::string& file_type,
                        const uint64_t zero_count,
                        const uint64_t nonprobative_count) {
    if (file_hash.size() == 0) {
      std::cerr << "Error: insert_source_data called with empty file_hash\n";
      return;
    }
    uint64_t source_id;
    lmdb_source_id_manager->insert(file_hash, *changes, source_id);
    lmdb_source_data_manager->insert(source_id, file_hash,
                  filesize, file_type, zero_count, nonprobative_count, *changes);
  }

  // add whether or not the file hash is already present; used during ingest
  void import_manager_t::insert_hash(const std::string& block_hash,
                                     const uint64_t k_entropy,
                                     const std::string& block_label,
                                     const std::string& file_hash) {

    if (block_hash.size() == 0) {
      std::cerr << "Error: insert_hash called with empty block_hash\n";
      return;
    }
    if (file_hash.size() == 0) {
      std::cerr << "Error: insert_hash called with empty file_hash\n";
      return;
    }

    uint64_t source_id;
    bool is_new_id = lmdb_source_id_manager->insert(file_hash, *changes,
                                                    source_id);

    // insert hash into hash data manager and hash manager
    const size_t count = lmdb_hash_data_manager->insert(
                                   block_hash, k_entropy, block_label,
                                   source_id, *changes);
    lmdb_hash_manager->insert(block_hash, count, *changes);

    // If the source ID is new then add a blank source data record just to keep
    // from breaking the reverse look-up done in scan_manager_t.
    if (is_new_id == true) {
      lmdb_source_data_manager->insert(source_id, file_hash, 0, "", 0, 0,
                                       *changes);
    }
  }
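
  // Illustrative ingest sketch (not part of the library), assuming the
  // database was created as shown above; the repository name, filename,
  // and hash values are made up, and hashes are passed in binary form
  // via hashdb::hex_to_bin.
  //
  //   hashdb::import_manager_t manager("sample.hdb", "ingest example");
  //   const std::string file_hash =
  //             hashdb::hex_to_bin("00112233445566778899aabbccddeeff");
  //   const std::string block_hash =
  //             hashdb::hex_to_bin("ffeeddccbbaa99887766554433221100");
  //   manager.insert_source_data(file_hash, 4096, "txt", 0, 0);
  //   manager.insert_source_name(file_hash, "example repository", "file1.txt");
  //   manager.insert_hash(block_hash, 0, "", file_hash);
  //   // changes are logged and stores are closed when manager goes out of scope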

  // add only if the file hash is not already present; used during merge
  void import_manager_t::merge_hash(const std::string& block_hash,
                                    const uint64_t k_entropy,
                                    const std::string& block_label,
                                    const std::string& file_hash,
                                    const uint64_t sub_count) {

    if (block_hash.size() == 0) {
      std::cerr << "Error: merge_hash called with empty block_hash\n";
      return;
    }
    if (file_hash.size() == 0) {
      std::cerr << "Error: merge_hash called with empty file_hash\n";
      return;
    }

    uint64_t source_id;
    bool is_new_id = lmdb_source_id_manager->insert(file_hash, *changes,
                                                    source_id);

    // merge hash into hash data manager
    const size_t count = lmdb_hash_data_manager->merge(
                                   block_hash, k_entropy, block_label,
                                   source_id, sub_count, *changes);

    // insert hash into hash manager
    lmdb_hash_manager->insert(block_hash, count, *changes);

    // If the source ID is new then add a blank source data record just to keep
    // from breaking the reverse look-up done in scan_manager_t.
    if (is_new_id == true) {
      lmdb_source_data_manager->insert(source_id, file_hash, 0, "", 0, 0,
                                       *changes);
    }
  }

  // import JSON hash or source, return "" or error
  std::string import_manager_t::import_json(
                        const std::string& json_string) {

    // open input as a JSON DOM document
    rapidjson::Document document;
    if (document.Parse(json_string.c_str()).HasParseError() ||
        !document.IsObject()) {
      return "Invalid JSON syntax";
    }

    // block_hash or file_hash
    if (document.HasMember("block_hash")) {

      // block_hash
      if (!document["block_hash"].IsString()) {
        return "Invalid block_hash field";
      }
      const std::string block_hash = hashdb::hex_to_bin(
                                  document["block_hash"].GetString());

      // k_entropy (optional)
      uint64_t k_entropy = 0;
      if (document.HasMember("k_entropy")) {
        if (document["k_entropy"].IsUint64()) {
          k_entropy = document["k_entropy"].GetUint64();
        } else {
          return "Invalid k_entropy field";
        }
      }

      // block_label (optional)
      std::string block_label = "";
      if (document.HasMember("block_label")) {
        if (document["block_label"].IsString()) {
          block_label = document["block_label"].GetString();
        } else {
          return "Invalid block_label field";
        }
      }

      // source_sub_counts:[]
      if (!document.HasMember("source_sub_counts") ||
          !document["source_sub_counts"].IsArray()) {
        return "Invalid source_sub_counts field";
      }
      const rapidjson::Value& json_source_sub_counts =
                                  document["source_sub_counts"];
      hashdb::source_sub_counts_t* source_sub_counts =
                                  new hashdb::source_sub_counts_t;
      for (rapidjson::SizeType i = 0;
           i+1 < json_source_sub_counts.Size(); i+=2) {

        // source hash
        if (!json_source_sub_counts[i+0].IsString()) {
          delete source_sub_counts;
          return "Invalid source hash in source_sub_counts";
        }
        const std::string file_hash = hashdb::hex_to_bin(
                                  json_source_sub_counts[i].GetString());

        // sub_count
        if (!json_source_sub_counts[i+1].IsUint64()) {
          delete source_sub_counts;
          return "Invalid sub_count in source_sub_counts";
        }
        const uint64_t sub_count = json_source_sub_counts[i+1].GetUint64();

        // add hash data for this source and source sub_count
        merge_hash(block_hash, k_entropy, block_label, file_hash, sub_count);
      }

      delete source_sub_counts;
      return "";

    } else if (document.HasMember("file_hash")) {

      // parse file_hash
      if (!document["file_hash"].IsString()) {
        return "Invalid file_hash field";
      }
      const std::string file_hash = hashdb::hex_to_bin(
                                  document["file_hash"].GetString());

      // parse filesize
      if (!document.HasMember("filesize") ||
          !document["filesize"].IsUint64()) {
        return "Invalid filesize field";
      }
      const uint64_t filesize = document["filesize"].GetUint64();

      // parse file_type (optional)
      std::string file_type = "";
      if (document.HasMember("file_type")) {
        if (document["file_type"].IsString()) {
          file_type = document["file_type"].GetString();
        } else {
          return "Invalid file_type field";
        }
      }

      // zero_count (optional)
      uint64_t zero_count = 0;
      if (document.HasMember("zero_count")) {
        if (document["zero_count"].IsUint64()) {
          zero_count = document["zero_count"].GetUint64();
        } else {
          return "Invalid zero_count field";
        }
      }

      // nonprobative_count (optional)
      uint64_t nonprobative_count = 0;
      if (document.HasMember("nonprobative_count")) {
        if (document["nonprobative_count"].IsUint64()) {
          nonprobative_count = document["nonprobative_count"].GetUint64();
        } else {
          return "Invalid nonprobative_count field";
        }
      }

      // parse name_pairs:[]
      if (!document.HasMember("name_pairs") ||
          !document["name_pairs"].IsArray()) {
        return "Invalid name_pairs field";
      }
      const rapidjson::Value& json_names = document["name_pairs"];
      hashdb::source_names_t* names = new hashdb::source_names_t;
      for (rapidjson::SizeType i = 0; i+1 < json_names.Size(); i+=2) {

        // parse repository name
        if (!json_names[i].IsString()) {
          delete names;
          return "Invalid repository name in name_pairs field";
        }
        const std::string repository_name = json_names[i].GetString();

        // parse filename
        if (!json_names[i+1].IsString()) {
          delete names;
          return "Invalid filename in name_pairs field";
        }
        const std::string filename = json_names[i+1].GetString();

        // add repository name, filename pair
        names->insert(hashdb::source_name_t(repository_name, filename));
      }

      // everything worked so insert the source data and source names
      insert_source_data(file_hash,
                         filesize, file_type, zero_count, nonprobative_count);
      for (hashdb::source_names_t::const_iterator it = names->begin();
           it != names->end(); ++it) {
        insert_source_name(file_hash, it->first, it->second);
      }

      delete names;
      return "";

    } else {
      return "A block_hash or file_hash field is required";
    }
  }
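
  // Illustrative JSON records accepted by import_json, mirroring the fields
  // parsed above; the hex hash values and counts are made up for the example:
  //
  //   {"block_hash":"ffeeddccbbaa99887766554433221100", "k_entropy":8000,
  //    "block_label":"", "source_sub_counts":[
  //      "00112233445566778899aabbccddeeff", 2]}
  //
  //   {"file_hash":"00112233445566778899aabbccddeeff", "filesize":4096,
  //    "file_type":"txt", "zero_count":12, "nonprobative_count":3,
  //    "name_pairs":["example repository", "file1.txt"]}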

  bool import_manager_t::has_source(const std::string& file_hash) const {
    uint64_t source_id;
    return lmdb_source_id_manager->find(file_hash, source_id);
  }

  std::string import_manager_t::first_source() const {
    return lmdb_source_id_manager->first_source();
  }

  std::string import_manager_t::next_source(const std::string& file_hash) const {
    return lmdb_source_id_manager->next_source(file_hash);
  }

  std::string import_manager_t::size() const {
    std::stringstream ss;
    ss << "{\"hash_data_store\":" << lmdb_hash_data_manager->size()
       << ", \"hash_store\":" << lmdb_hash_manager->size()
       << ", \"source_data_store\":" << lmdb_source_data_manager->size()
       << ", \"source_id_store\":" << lmdb_source_id_manager->size()
       << ", \"source_name_store\":" << lmdb_source_name_manager->size()
       << "}";
    return ss.str();
  }

  size_t import_manager_t::size_hashes() const {
    return lmdb_hash_data_manager->size();
  }

  size_t import_manager_t::size_sources() const {
    return lmdb_source_id_manager->size();
  }

  // ************************************************************
  // scan
  // ************************************************************
  scan_manager_t::scan_manager_t(const std::string& hashdb_dir) :
          // LMDB managers
          lmdb_hash_data_manager(0),
          lmdb_hash_manager(0),
          lmdb_source_data_manager(0),
          lmdb_source_id_manager(0),
          lmdb_source_name_manager(0),

          // for find_expanded_hash_json
          hashes(new locked_member_t),
          sources(new locked_member_t) {

    // open managers
    lmdb_hash_data_manager = new lmdb_hash_data_manager_t(hashdb_dir,
                                                          READ_ONLY);
    lmdb_hash_manager = new lmdb_hash_manager_t(hashdb_dir, READ_ONLY);
    lmdb_source_data_manager = new lmdb_source_data_manager_t(hashdb_dir,
                                                              READ_ONLY);
    lmdb_source_id_manager = new lmdb_source_id_manager_t(hashdb_dir,
                                                          READ_ONLY);
    lmdb_source_name_manager = new lmdb_source_name_manager_t(hashdb_dir,
                                                              READ_ONLY);
  }

  scan_manager_t::~scan_manager_t() {
    delete lmdb_hash_data_manager;
    delete lmdb_hash_manager;
    delete lmdb_source_data_manager;
    delete lmdb_source_id_manager;
    delete lmdb_source_name_manager;

    // for find_expanded_hash_json
    delete hashes;
    delete sources;
  }

  std::string scan_manager_t::find_hash_json(
                        const hashdb::scan_mode_t scan_mode,
                        const std::string& block_hash) {

    // delegate to low-level handler
    switch(scan_mode) {

      // EXPANDED
      case hashdb::scan_mode_t::EXPANDED:
        return find_expanded_hash_json(false, block_hash);

      // EXPANDED_OPTIMIZED
      case hashdb::scan_mode_t::EXPANDED_OPTIMIZED:
        return find_expanded_hash_json(true, block_hash);

      // COUNT
      case hashdb::scan_mode_t::COUNT:
        return find_hash_count_json(block_hash);

      // APPROXIMATE_COUNT
      case hashdb::scan_mode_t::APPROXIMATE_COUNT:
        return find_approximate_hash_count_json(block_hash);

      default: assert(0); std::exit(1);
    }
  }
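
  // Illustrative scan sketch (not part of the library): open an existing
  // database read-only and scan one block hash; the path and hash value
  // are made up for the example.
  //
  //   hashdb::scan_manager_t manager("sample.hdb");
  //   const std::string block_hash =
  //             hashdb::hex_to_bin("ffeeddccbbaa99887766554433221100");
  //   std::string json = manager.find_hash_json(
  //             hashdb::scan_mode_t::EXPANDED_OPTIMIZED, block_hash);
  //   if (json == "") {
  //     // the hash is not in the database
  //   }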

  // Find expanded hash, optimized with caching, return JSON.
  // If optimizing, cache hashes and sources.
  std::string scan_manager_t::find_expanded_hash_json(
                        const bool optimizing, const std::string& block_hash) {

    // fields to hold the scan
    uint64_t k_entropy;
    std::string block_label;
    uint64_t count;
    hashdb::source_sub_counts_t* source_sub_counts =
                                  new hashdb::source_sub_counts_t;

    // scan
    bool matched = scan_manager_t::find_hash(block_hash,
                      k_entropy, block_label, count, *source_sub_counts);

    // done if no match
    if (matched == false) {
      delete source_sub_counts;
      return "";
    }

    // prepare JSON
    rapidjson::Document json_doc;
    rapidjson::Document::AllocatorType& allocator = json_doc.GetAllocator();
    json_doc.SetObject();

    // block_hash
    std::string hex_block_hash = hashdb::bin_to_hex(block_hash);
    json_doc.AddMember("block_hash", v(hex_block_hash, allocator), allocator);

    // report hash if not caching or this is the first time for the hash
    if (!optimizing || hashes->locked_insert(block_hash)) {

      // add entropy
      json_doc.AddMember("k_entropy", k_entropy, allocator);

      // add block_label
      json_doc.AddMember("block_label", v(block_label, allocator), allocator);

      // add count
      json_doc.AddMember("count", count, allocator);

      // add source_list_id
      uint32_t crc = calculate_crc(*source_sub_counts);
      json_doc.AddMember("source_list_id", crc, allocator);

      // the sources array
      rapidjson::Value json_sources(rapidjson::kArrayType);

      // add each source object
      for (hashdb::source_sub_counts_t::const_iterator it =
           source_sub_counts->begin(); it != source_sub_counts->end(); ++it) {
        if (!optimizing || sources->locked_insert(it->file_hash)) {

          // create a json_source object for the json_sources array
          rapidjson::Value json_source(rapidjson::kObjectType);

          // provide the complete source information for this source
          provide_source_information(*this, it->file_hash, allocator,
                                     json_source);
          json_sources.PushBack(json_source, allocator);
        }
      }
      json_doc.AddMember("sources", json_sources, allocator);

      // add source_sub_counts as pairs of file hash, sub_count
      rapidjson::Value json_source_sub_counts(rapidjson::kArrayType);

      for (hashdb::source_sub_counts_t::const_iterator it =
           source_sub_counts->begin(); it != source_sub_counts->end(); ++it) {

        // file hash
        json_source_sub_counts.PushBack(
                  v(hashdb::bin_to_hex(it->file_hash), allocator), allocator);

        // sub_count
        json_source_sub_counts.PushBack(it->sub_count, allocator);
      }
      json_doc.AddMember("source_sub_counts", json_source_sub_counts,
                         allocator);
    }

    delete source_sub_counts;

    // return JSON text
    rapidjson::StringBuffer strbuf;
    rapidjson::Writer<rapidjson::StringBuffer> writer(strbuf);
    json_doc.Accept(writer);
    return strbuf.GetString();
  }

  // find hash, return associated hash and source data
  bool scan_manager_t::find_hash(
                        const std::string& block_hash,
                        uint64_t& k_entropy,
                        std::string& block_label,
                        uint64_t& count,
                        source_sub_counts_t& source_sub_counts) const {

    // clear fields
    k_entropy = 0;
    block_label = "";
    count = 0;
    source_sub_counts.clear();

    if (block_hash.size() == 0) {
      std::cerr << "Error: find_hash called with empty block_hash\n";
      return false;
    }

    // first check hash store
    if (lmdb_hash_manager->find(block_hash) == 0) {
      // hash is not present so return false
      return false;
    }

    // hash may be present so read hash using hash data manager
    hashdb::source_id_sub_counts_t* source_id_sub_counts =
                                  new hashdb::source_id_sub_counts_t;
    bool has_hash = lmdb_hash_data_manager->find(block_hash, k_entropy,
                                block_label, count, *source_id_sub_counts);
    if (has_hash) {
      // build source_sub_count from source_id_sub_count
      for (hashdb::source_id_sub_counts_t::const_iterator it =
           source_id_sub_counts->begin(); it != source_id_sub_counts->end();
           ++it) {

        // space for unused returned source variables
        std::string file_hash;
        uint64_t filesize;
        std::string file_type;
        uint64_t zero_count;
        uint64_t nonprobative_count;

        // get file_hash from source_id
        bool source_data_found = lmdb_source_data_manager->find(
                                    it->source_id, file_hash,
                                    filesize, file_type,
                                    zero_count, nonprobative_count);

        // source_data must have a source_id to match the source_id in hash_data
        if (source_data_found == false) {
          assert(0);
        }

        // add the source sub_counts
        source_sub_counts.insert(hashdb::source_sub_count_t(file_hash,
                                                            it->sub_count));
      }
      delete source_id_sub_counts;
      return true;

    } else {
      // no action, lmdb_hash_data_manager.find clears out fields
      delete source_id_sub_counts;
      return false;
    }
  }

  // export hash, return result as JSON string
  std::string scan_manager_t::export_hash_json(
                        const std::string& block_hash) const {

    // hash fields
    uint64_t k_entropy;
    std::string block_label;
    uint64_t unused_count;
    hashdb::source_sub_counts_t* source_sub_counts =
                                  new hashdb::source_sub_counts_t;

    // scan
    bool found_hash = find_hash(block_hash, k_entropy, block_label,
                                unused_count, *source_sub_counts);

    std::string json_hash_string;
    if (found_hash) {

      // prepare JSON
      rapidjson::Document json_doc;
      rapidjson::Document::AllocatorType& allocator = json_doc.GetAllocator();
      json_doc.SetObject();

      // put in hash data
      std::string hex_block_hash = hashdb::bin_to_hex(block_hash);
      json_doc.AddMember("block_hash", v(hex_block_hash, allocator), allocator);
      json_doc.AddMember("k_entropy", k_entropy, allocator);
      json_doc.AddMember("block_label", v(block_label, allocator), allocator);

      // put in source_sub_counts as pairs of file hash, sub_count
      rapidjson::Value json_source_sub_counts(rapidjson::kArrayType);

      for (hashdb::source_sub_counts_t::const_iterator it =
           source_sub_counts->begin(); it != source_sub_counts->end(); ++it) {

        // file hash
        json_source_sub_counts.PushBack(
                  v(hashdb::bin_to_hex(it->file_hash), allocator), allocator);

        // sub_count
        json_source_sub_counts.PushBack(it->sub_count, allocator);
      }
      json_doc.AddMember("source_sub_counts", json_source_sub_counts,
                         allocator);

      // write JSON text
      rapidjson::StringBuffer strbuf;
      rapidjson::Writer<rapidjson::StringBuffer> writer(strbuf);
      json_doc.Accept(writer);
      json_hash_string = strbuf.GetString();

    } else {
      // not found
      json_hash_string = "";
    }

    delete source_sub_counts;
    return json_hash_string;
  }
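
  // For reference, an exported hash record has the same shape as the
  // block_hash record accepted by import_json above, for example
  // (hex values made up):
  //
  //   {"block_hash":"ffeeddccbbaa99887766554433221100", "k_entropy":8000,
  //    "block_label":"", "source_sub_counts":[
  //      "00112233445566778899aabbccddeeff", 2]}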

  // find hash count
  size_t scan_manager_t::find_hash_count(
                        const std::string& block_hash) const {

    if (block_hash.size() == 0) {
      std::cerr << "Error: find_hash_count called with empty block_hash\n";
      return 0;
    }

    return lmdb_hash_data_manager->find_count(block_hash);
  }

  // find hash count JSON
  std::string scan_manager_t::find_hash_count_json(
                        const std::string& block_hash) const {

    // get count
    size_t count = find_hash_count(block_hash);

    // no match
    if (count == 0) {
      return "";
    }

    // return JSON with count
    // prepare JSON
    rapidjson::Document json_doc;
    rapidjson::Document::AllocatorType& allocator = json_doc.GetAllocator();
    json_doc.SetObject();

    // block hash
    std::string hex_block_hash = hashdb::bin_to_hex(block_hash);
    json_doc.AddMember("block_hash", v(hex_block_hash, allocator), allocator);

    // count
    json_doc.AddMember("count", (uint64_t)count, allocator);

    // write JSON text
    rapidjson::StringBuffer strbuf;
    rapidjson::Writer<rapidjson::StringBuffer> writer(strbuf);
    json_doc.Accept(writer);
    return strbuf.GetString();
  }

  size_t scan_manager_t::find_approximate_hash_count(
                        const std::string& block_hash) const {
    if (block_hash.size() == 0) {
      std::cerr << "Error: find_approximate_hash_count called with empty block_hash\n";
      return 0;
    }

    return lmdb_hash_manager->find(block_hash);
  }

  // find approximate hash count JSON
  std::string scan_manager_t::find_approximate_hash_count_json(
                        const std::string& block_hash) const {

    // get approximate count
    size_t approximate_count =
                        find_approximate_hash_count(block_hash);

    // no match
    if (approximate_count == 0) {
      return "";
    }

    // return JSON with approximate count
    // prepare JSON
    rapidjson::Document json_doc;
    rapidjson::Document::AllocatorType& allocator = json_doc.GetAllocator();
    json_doc.SetObject();

    // block hash
    std::string hex_block_hash = hashdb::bin_to_hex(block_hash);
    json_doc.AddMember("block_hash", v(hex_block_hash, allocator), allocator);

    // approximate count
    json_doc.AddMember("approximate_count",
                       (uint64_t)approximate_count, allocator);

    // write JSON text
    rapidjson::StringBuffer strbuf;
    rapidjson::Writer<rapidjson::StringBuffer> writer(strbuf);
    json_doc.Accept(writer);
    return strbuf.GetString();
  }

  bool scan_manager_t::find_source_data(
                        const std::string& file_hash,
                        uint64_t& filesize,
                        std::string& file_type,
                        uint64_t& zero_count,
                        uint64_t& nonprobative_count) const {

    if (file_hash.size() == 0) {
      std::cerr << "Error: find_source_data called with empty file_hash\n";
      return false;
    }

    // read source_id
    uint64_t source_id;
    bool has_id = lmdb_source_id_manager->find(file_hash, source_id);
    if (has_id == false) {
      // no source ID for this file_hash
      filesize = 0;
      file_type = "";
      zero_count = 0;
      nonprobative_count = 0;
      return false;
    } else {

      // read source data associated with this source ID
      std::string returned_file_hash;
      bool source_data_found = lmdb_source_data_manager->find(source_id,
                              returned_file_hash, filesize, file_type,
                              zero_count, nonprobative_count);

      // if source data is found, make sure the file binary hash is right
      if (source_data_found == true &&
          file_hash != returned_file_hash) {
        assert(0);
      }
    }
    return true;
  }

  bool scan_manager_t::find_source_names(const std::string& file_hash,
                                         source_names_t& source_names) const {

    if (file_hash.size() == 0) {
      std::cerr << "Error: find_source_names called with empty file_hash\n";
      return false;
    }

    // read source_id
    uint64_t source_id;
    bool has_id = lmdb_source_id_manager->find(file_hash, source_id);
    if (has_id == false) {
      // no source ID for this file_hash
      source_names.clear();
      return false;
    } else {
      // source
      return lmdb_source_name_manager->find(source_id, source_names);
    }
  }

  // export source, return result as JSON string
  std::string scan_manager_t::export_source_json(
                        const std::string& file_hash) const {

    // source fields
    uint64_t filesize;
    std::string file_type;
    uint64_t zero_count;
    uint64_t nonprobative_count;

    // prepare JSON
    rapidjson::Document json_doc;
    rapidjson::Document::AllocatorType& allocator = json_doc.GetAllocator();
    json_doc.SetObject();

    // get source data
    bool has_source_data = find_source_data(file_hash, filesize,
                                  file_type, zero_count, nonprobative_count);
    if (!has_source_data) {
      return "";
    }

    // source found

    // set source data
    std::string hex_file_hash = hashdb::bin_to_hex(file_hash);
    json_doc.AddMember("file_hash", v(hex_file_hash, allocator), allocator);
    json_doc.AddMember("filesize", filesize, allocator);
    json_doc.AddMember("file_type", v(file_type, allocator), allocator);
    json_doc.AddMember("zero_count", zero_count, allocator);
    json_doc.AddMember("nonprobative_count", nonprobative_count, allocator);

    // get source names
    hashdb::source_names_t* source_names = new hashdb::source_names_t;
    find_source_names(file_hash, *source_names);

    // name_pairs object
    rapidjson::Value json_name_pairs(rapidjson::kArrayType);

    // provide names
    for (hashdb::source_names_t::const_iterator it = source_names->begin();
         it != source_names->end(); ++it) {
      // repository name
      json_name_pairs.PushBack(v(it->first, allocator), allocator);
      // filename
      json_name_pairs.PushBack(v(it->second, allocator), allocator);
    }
    json_doc.AddMember("name_pairs", json_name_pairs, allocator);

    // done with source names
    delete source_names;

    // write JSON text
    rapidjson::StringBuffer strbuf;
    rapidjson::Writer<rapidjson::StringBuffer> writer(strbuf);
    json_doc.Accept(writer);
    return strbuf.GetString();
  }

  std::string scan_manager_t::first_hash() const {
    return lmdb_hash_data_manager->first_hash();
  }

  std::string scan_manager_t::next_hash(const std::string& block_hash) const {
    if (block_hash.size() == 0) {
      std::cerr << "Error: next_hash called with empty block_hash\n";
      return "";
    }
    return lmdb_hash_data_manager->next_hash(block_hash);
  }

  std::string scan_manager_t::first_source() const {
    return lmdb_source_id_manager->first_source();
  }

  std::string scan_manager_t::next_source(const std::string& file_hash) const {
    if (file_hash.size() == 0) {
      std::cerr << "Error: next_source called with empty file_hash\n";
      return "";
    }
    return lmdb_source_id_manager->next_source(file_hash);
  }

  std::string scan_manager_t::size() const {
    std::stringstream ss;
    ss << "{\"hash_data_store\":" << lmdb_hash_data_manager->size()
       << ", \"hash_store\":" << lmdb_hash_manager->size()
       << ", \"source_data_store\":" << lmdb_source_data_manager->size()
       << ", \"source_id_store\":" << lmdb_source_id_manager->size()
       << ", \"source_name_store\":" << lmdb_source_name_manager->size()
       << "}";
    return ss.str();
  }

  size_t scan_manager_t::size_hashes() const {
    return lmdb_hash_data_manager->size();
  }

  size_t scan_manager_t::size_sources() const {
    return lmdb_source_id_manager->size();
  }

  // ************************************************************
  // timestamp
  // ************************************************************
  timestamp_t::timestamp_t() :
              t0(new timeval()), t_last_timestamp(new timeval()) {
    gettimeofday(t0, 0);
    gettimeofday(t_last_timestamp, 0);
  }

  timestamp_t::~timestamp_t() {
    delete t0;
    delete t_last_timestamp;
  }

  /**
   * Take a timestamp and return a JSON string in format {"name":"name",
   * "delta":delta, "total":total}.
   */
  std::string timestamp_t::stamp(const std::string &name) {
    // adapted from dfxml_writer.cpp
    struct timeval t1;
    gettimeofday(&t1, 0);
    struct timeval t;

    // timestamp delta against t_last_timestamp
    // use >= so the microseconds field stays below 1000000
    t.tv_sec = t1.tv_sec - t_last_timestamp->tv_sec;
    if (t1.tv_usec >= t_last_timestamp->tv_usec) {
      t.tv_usec = t1.tv_usec - t_last_timestamp->tv_usec;
    } else {
      t.tv_sec--;
      t.tv_usec = (t1.tv_usec + 1000000) - t_last_timestamp->tv_usec;
    }
    char delta[16];
    snprintf(delta, 16, "%d.%06d", (int)t.tv_sec, (int)t.tv_usec);

    // reset t_last_timestamp for the next invocation
    gettimeofday(t_last_timestamp, 0);

    // timestamp total
    t.tv_sec = t1.tv_sec - t0->tv_sec;
    if (t1.tv_usec >= t0->tv_usec) {
      t.tv_usec = t1.tv_usec - t0->tv_usec;
    } else {
      t.tv_sec--;
      t.tv_usec = (t1.tv_usec + 1000000) - t0->tv_usec;
    }
    char total_time[16];
    snprintf(total_time, 16, "%d.%06d", (int)t.tv_sec, (int)t.tv_usec);

    // return the named timestamp
    // prepare JSON
    rapidjson::Document json_doc;
    rapidjson::Document::AllocatorType& allocator = json_doc.GetAllocator();
    json_doc.SetObject();
    json_doc.AddMember("name", v(name, allocator), allocator);
    json_doc.AddMember("delta", v(std::string(delta), allocator), allocator);
    json_doc.AddMember("total", v(std::string(total_time), allocator),
                       allocator);

    // copy JSON text
    rapidjson::StringBuffer strbuf;
    rapidjson::Writer<rapidjson::StringBuffer> writer(strbuf);
    json_doc.Accept(writer);
    return strbuf.GetString();
  }
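
  // Illustrative use of timestamp_t (not part of the library); the timing
  // values shown are made up:
  //
  //   hashdb::timestamp_t timestamp;
  //   // ... do some work ...
  //   std::cout << timestamp.stamp("step 1") << "\n";
  //   // e.g. {"name":"step 1","delta":"1.234567","total":"1.234567"}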
}