1 #ifndef BLOOM_IO_H
2 #define BLOOM_IO_H 1
3 
4 #include "BloomDBG/RollingHash.h"
5 #include "BloomDBG/RollingHashIterator.h"
6 #include "DataLayer/FastaReader.h"
7 #include "vendor/btl_bloomfilter/BloomFilter.hpp"
8 
9 namespace BloomDBG {
10 
11 /**
12  * Round up `num` to the nearest multiple of `base`.
13  */
14 template<typename T>
15 inline static T
roundUpToMultiple(T num,T base)16 roundUpToMultiple(T num, T base)
17 {
18 	if (base == 0)
19 		return num;
20 	T remainder = num % base;
21 	if (remainder == 0)
22 		return num;
23 	return num + base - remainder;
24 }
25 
26 /**
27  * Load DNA sequence into Bloom filter using rolling hash.
28  *
29  * @param bloom target Bloom filter
30  * @param seq DNA sequence
31  */
32 template<typename BF>
33 inline static void
loadSeq(BF & bloom,const std::string & seq)34 loadSeq(BF& bloom, const std::string& seq)
35 {
36 	const unsigned k = bloom.getKmerSize();
37 	const unsigned numHashes = bloom.getHashNum();
38 	for (RollingHashIterator it(seq, numHashes, k); it != RollingHashIterator::end(); ++it) {
39 		bloom.insert(*it);
40 	}
41 }
42 
43 /**
44  * Load sequences contents of FASTA file into Bloom filter using
45  * rolling hash.
46  * @param bloom target Bloom filter
47  * @param path path to FASTA file
48  * @param verbose if true, print progress messages to STDERR
49  */
50 template<typename BF>
51 inline static void
52 loadFile(BF& bloom, const std::string& path, bool verbose = false)
53 {
54 	const size_t BUFFER_SIZE = 1000000;
55 	const size_t LOAD_PROGRESS_STEP = 10000;
56 
57 	assert(!path.empty());
58 	if (verbose)
59 		std::cerr << "Reading `" << path << "'..." << std::endl;
60 
61 	FastaReader in(path.c_str(), FastaReader::FOLD_CASE);
62 	uint64_t readCount = 0;
63 #pragma omp parallel
64 	for (std::vector<std::string> buffer(BUFFER_SIZE);;) {
65 		buffer.clear();
66 		size_t bufferSize = 0;
67 		bool good = true;
68 #pragma omp critical(in)
69 		for (; good && bufferSize < BUFFER_SIZE;) {
70 			std::string seq;
71 			good = in >> seq;
72 			if (good) {
73 				buffer.push_back(seq);
74 				bufferSize += seq.length();
75 			}
76 		}
77 		if (buffer.size() == 0)
78 			break;
79 		for (size_t j = 0; j < buffer.size(); j++) {
80 			loadSeq(bloom, buffer.at(j));
81 			if (verbose)
82 #pragma omp critical(cerr)
83 			{
84 				readCount++;
85 				if (readCount % LOAD_PROGRESS_STEP == 0)
86 					std::cerr << "Loaded " << readCount << " reads into Bloom filter\n";
87 			}
88 		}
89 	}
90 	assert(in.eof());
91 	if (verbose) {
92 		std::cerr << "Loaded " << readCount << " reads from `" << path << "` into Bloom filter\n";
93 	}
94 }
95 
96 /** Load FASTQ/FASTA/SAM/BAM files from command line into a Bloom filter */
97 template<typename BloomFilterT>
98 static inline void
99 loadBloomFilter(int argc, char** argv, BloomFilterT& bloom, bool verbose = false)
100 {
101 	/* load reads into Bloom filter */
102 	for (int i = optind; i < argc; ++i) {
103 		/*
104 		 * Debugging feature: If there is a ':'
105 		 * separating the list of input read files into
106 		 * two parts, use the first set of files
107 		 * to load the Bloom filter and the second
108 		 * set of files for the assembly (read extension).
109 		 */
110 		if (strcmp(argv[i], ":") == 0) {
111 			optind = i + 1;
112 			break;
113 		}
114 		BloomDBG::loadFile(bloom, argv[i], verbose);
115 	}
116 	if (verbose)
117 		cerr << "Bloom filter FPR: " << setprecision(3) << bloom.FPR() * 100 << "%" << endl;
118 }
119 
120 } // end namespace 'BloomDBG'
121 
122 #endif
123