1 #ifndef BLOOM_IO_H
2 #define BLOOM_IO_H 1
3
4 #include "BloomDBG/RollingHash.h"
5 #include "BloomDBG/RollingHashIterator.h"
6 #include "DataLayer/FastaReader.h"
7 #include "vendor/btl_bloomfilter/BloomFilter.hpp"
8
9 namespace BloomDBG {
10
11 /**
12 * Round up `num` to the nearest multiple of `base`.
13 */
14 template<typename T>
15 inline static T
roundUpToMultiple(T num,T base)16 roundUpToMultiple(T num, T base)
17 {
18 if (base == 0)
19 return num;
20 T remainder = num % base;
21 if (remainder == 0)
22 return num;
23 return num + base - remainder;
24 }
25
26 /**
27 * Load DNA sequence into Bloom filter using rolling hash.
28 *
29 * @param bloom target Bloom filter
30 * @param seq DNA sequence
31 */
32 template<typename BF>
33 inline static void
loadSeq(BF & bloom,const std::string & seq)34 loadSeq(BF& bloom, const std::string& seq)
35 {
36 const unsigned k = bloom.getKmerSize();
37 const unsigned numHashes = bloom.getHashNum();
38 for (RollingHashIterator it(seq, numHashes, k); it != RollingHashIterator::end(); ++it) {
39 bloom.insert(*it);
40 }
41 }
42
43 /**
44 * Load sequences contents of FASTA file into Bloom filter using
45 * rolling hash.
46 * @param bloom target Bloom filter
47 * @param path path to FASTA file
48 * @param verbose if true, print progress messages to STDERR
49 */
50 template<typename BF>
51 inline static void
52 loadFile(BF& bloom, const std::string& path, bool verbose = false)
53 {
54 const size_t BUFFER_SIZE = 1000000;
55 const size_t LOAD_PROGRESS_STEP = 10000;
56
57 assert(!path.empty());
58 if (verbose)
59 std::cerr << "Reading `" << path << "'..." << std::endl;
60
61 FastaReader in(path.c_str(), FastaReader::FOLD_CASE);
62 uint64_t readCount = 0;
63 #pragma omp parallel
64 for (std::vector<std::string> buffer(BUFFER_SIZE);;) {
65 buffer.clear();
66 size_t bufferSize = 0;
67 bool good = true;
68 #pragma omp critical(in)
69 for (; good && bufferSize < BUFFER_SIZE;) {
70 std::string seq;
71 good = in >> seq;
72 if (good) {
73 buffer.push_back(seq);
74 bufferSize += seq.length();
75 }
76 }
77 if (buffer.size() == 0)
78 break;
79 for (size_t j = 0; j < buffer.size(); j++) {
80 loadSeq(bloom, buffer.at(j));
81 if (verbose)
82 #pragma omp critical(cerr)
83 {
84 readCount++;
85 if (readCount % LOAD_PROGRESS_STEP == 0)
86 std::cerr << "Loaded " << readCount << " reads into Bloom filter\n";
87 }
88 }
89 }
90 assert(in.eof());
91 if (verbose) {
92 std::cerr << "Loaded " << readCount << " reads from `" << path << "` into Bloom filter\n";
93 }
94 }
95
96 /** Load FASTQ/FASTA/SAM/BAM files from command line into a Bloom filter */
97 template<typename BloomFilterT>
98 static inline void
99 loadBloomFilter(int argc, char** argv, BloomFilterT& bloom, bool verbose = false)
100 {
101 /* load reads into Bloom filter */
102 for (int i = optind; i < argc; ++i) {
103 /*
104 * Debugging feature: If there is a ':'
105 * separating the list of input read files into
106 * two parts, use the first set of files
107 * to load the Bloom filter and the second
108 * set of files for the assembly (read extension).
109 */
110 if (strcmp(argv[i], ":") == 0) {
111 optind = i + 1;
112 break;
113 }
114 BloomDBG::loadFile(bloom, argv[i], verbose);
115 }
116 if (verbose)
117 cerr << "Bloom filter FPR: " << setprecision(3) << bloom.FPR() * 100 << "%" << endl;
118 }
119
120 } // end namespace 'BloomDBG'
121
122 #endif
123