1 /*
2 *
3 * BloomFilter.hpp
4 * Author: Hamid Mohamadi
5 * Genome Sciences Centre,
6 * British Columbia Cancer Agency
7 */
8
9
10 #ifndef BLOOMFILTER_H_
11 #define BLOOMFILTER_H_
12 #include <string>
13 #include <stdint.h>
14 #include <math.h>
15 #include <fstream>
16 #include <iostream>
17 #include <sys/stat.h>
18 #include <cstring>
19 #include <cassert>
20 #include <cstdlib>
21 #include <stdio.h>
22 #include <cstring>
23 #include "nthash.hpp"
24 #include "city.h"
25 #include "xxhash.h"
26 #include "murmur.hpp"
27
28 using namespace std;
29
popCnt(unsigned char x)30 inline unsigned popCnt(unsigned char x) {
31 return ((0x876543210 >>
32 (((0x4332322132212110 >> ((x & 0xF) << 2)) & 0xF) << 2)) >>
33 ((0x4332322132212110 >> (((x & 0xF0) >> 2)) & 0xF) << 2))
34 & 0xf;
35 }
36
37 class BloomFilter {
38 public:
39
BloomFilter(size_t filterSize,unsigned hashNum,unsigned kmerSize,const char * fPath)40 BloomFilter(size_t filterSize, unsigned hashNum, unsigned kmerSize, const char * fPath):
41 m_size(filterSize), m_hashNum(hashNum), m_kmerSize(kmerSize) {
42 m_filter = new unsigned char [(m_size + 7)/8];
43 ifstream myFile(fPath, ios::in | ios::binary);
44 myFile.seekg (0, ios::beg);
45 myFile.read ((char *)m_filter, (m_size + 7)/8);
46 myFile.close();
47 }
48
BloomFilter(size_t filterSize,unsigned hashNum,unsigned kmerSize)49 BloomFilter(size_t filterSize, unsigned hashNum, unsigned kmerSize):
50 m_size(filterSize), m_hashNum(hashNum), m_kmerSize(kmerSize) {
51 m_filter = new unsigned char [(m_size + 7)/8];
52 for(size_t i = 0; i < (m_size + 7)/8; i++)
53 m_filter[i]=0;
54 }
55
insertF(const char * kmer)56 void insertF(const char* kmer) {
57 uint64_t hVal = NTF64(kmer, m_kmerSize);
58 size_t hLoc = hVal % m_size;
59 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
60 for (unsigned i = 1; i < m_hashNum; i++) {
61 uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
62 mhVal ^= mhVal >> multiShift;
63 size_t hLoc = mhVal % m_size;
64 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
65 }
66 }
67
insertF(const char * kmer,uint64_t & hVal)68 void insertF(const char * kmer, uint64_t& hVal) {
69 hVal = NTF64(kmer, m_kmerSize);
70 size_t hLoc = hVal % m_size;
71 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
72 for (unsigned i = 1; i < m_hashNum; i++) {
73 uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
74 mhVal ^= mhVal >> multiShift;
75 size_t hLoc = mhVal % m_size;
76 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
77 }
78 }
79
insertF(uint64_t & hVal,const char charOut,const char charIn)80 void insertF(uint64_t& hVal, const char charOut, const char charIn) {
81 hVal = NTF64(hVal, m_kmerSize, charOut, charIn);
82 size_t hLoc = hVal % m_size;
83 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
84 for (unsigned i = 1; i < m_hashNum; i++) {
85 uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
86 mhVal ^= mhVal >> multiShift;
87 size_t hLoc = mhVal % m_size;
88 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
89 }
90 }
91
insert(const char * kmer)92 void insert(const char* kmer) {
93 uint64_t hVal = NTC64(kmer, m_kmerSize);
94 size_t hLoc = hVal % m_size;
95 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
96 for (unsigned i = 1; i < m_hashNum; i++) {
97 uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
98 mhVal ^= mhVal >> multiShift;
99 size_t hLoc = mhVal % m_size;
100 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
101 }
102 }
103
insert(const char * kmer,uint64_t & fhVal,uint64_t & rhVal)104 void insert(const char * kmer, uint64_t& fhVal, uint64_t& rhVal) {
105 uint64_t hVal = NTC64(kmer, m_kmerSize, fhVal, rhVal);
106 size_t hLoc = hVal % m_size;
107 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
108 for (unsigned i = 1; i < m_hashNum; i++) {
109 uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
110 mhVal ^= mhVal >> multiShift;
111 size_t hLoc = mhVal % m_size;
112 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
113 }
114 }
115
insert(uint64_t & fhVal,uint64_t & rhVal,const char charOut,const char charIn)116 void insert(uint64_t& fhVal, uint64_t& rhVal, const char charOut, const char charIn) {
117 uint64_t hVal = NTC64(charOut, charIn, m_kmerSize, fhVal, rhVal);
118 size_t hLoc = hVal % m_size;
119 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
120 for (unsigned i = 1; i < m_hashNum; i++) {
121 uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
122 mhVal ^= mhVal >> multiShift;
123 size_t hLoc = mhVal % m_size;
124 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
125 }
126 }
127
insertMur(const char * kmer)128 void insertMur(const char* kmer) {
129 for (unsigned i = 0; i < m_hashNum; i++) {
130 size_t hLoc = MurmurHash64A(kmer, m_kmerSize, i) % m_size;
131 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
132 }
133 }
134
insertCit(const char * kmer)135 void insertCit(const char* kmer) {
136 for (unsigned i = 0; i < m_hashNum; i++) {
137 size_t hLoc = CityHash64WithSeed(kmer, m_kmerSize, i) % m_size;
138 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
139 }
140 }
141
insertXxh(const char * kmer)142 void insertXxh(const char* kmer) {
143 for (unsigned i = 0; i < m_hashNum; i++) {
144 size_t hLoc = XXH64(kmer, m_kmerSize, i) % m_size;
145 __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
146 }
147 }
148
containsF(const char * kmer) const149 bool containsF(const char* kmer) const {
150 uint64_t hVal = NTF64(kmer, m_kmerSize);
151 size_t hLoc = hVal % m_size;
152 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0) return false;
153 for (unsigned i = 1; i < m_hashNum; i++) {
154 uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
155 mhVal ^= mhVal >> multiShift;
156 size_t hLoc = mhVal % m_size;
157 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
158 return false;
159 }
160 return true;
161 }
162
containsF(const char * kmer,uint64_t & hVal)163 bool containsF(const char * kmer, uint64_t& hVal) {
164 hVal = NTF64(kmer, m_kmerSize);
165 size_t hLoc = hVal % m_size;
166 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0) return false;
167 for (unsigned i = 1; i < m_hashNum; i++) {
168 uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
169 mhVal ^= mhVal >> multiShift;
170 size_t hLoc = mhVal % m_size;
171 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
172 return false;
173 }
174 return true;
175 }
176
containsF(uint64_t & hVal,const char charOut,const char charIn)177 bool containsF(uint64_t& hVal, const char charOut, const char charIn) {
178 hVal = NTF64(hVal, m_kmerSize, charOut, charIn);
179 size_t hLoc = hVal % m_size;
180 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0) return false;
181 for (unsigned i = 1; i < m_hashNum; i++) {
182 uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
183 mhVal ^= mhVal >> multiShift;
184 size_t hLoc = mhVal % m_size;
185 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
186 return false;
187 }
188 return true;
189 }
190
contains(const char * kmer) const191 bool contains(const char* kmer) const {
192 uint64_t hVal = NTC64(kmer, m_kmerSize);
193 size_t hLoc = hVal % m_size;
194 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0) return false;
195 for (unsigned i = 1; i < m_hashNum; i++) {
196 uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
197 mhVal ^= mhVal >> multiShift;
198 size_t hLoc = mhVal % m_size;
199 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
200 return false;
201 }
202 return true;
203 }
204
contains(const char * kmer,uint64_t & fhVal,uint64_t & rhVal)205 bool contains(const char * kmer, uint64_t& fhVal, uint64_t& rhVal) {
206 uint64_t hVal = NTC64(kmer, m_kmerSize, fhVal, rhVal);
207 size_t hLoc = hVal % m_size;
208 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0) return false;
209 for (unsigned i = 1; i < m_hashNum; i++) {
210 uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
211 mhVal ^= mhVal >> multiShift;
212 size_t hLoc = mhVal % m_size;
213 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
214 return false;
215 }
216 return true;
217 }
218
contains(uint64_t & fhVal,uint64_t & rhVal,const char charOut,const char charIn)219 bool contains(uint64_t& fhVal, uint64_t& rhVal, const char charOut, const char charIn) {
220 uint64_t hVal = NTC64(charOut, charIn, m_kmerSize, fhVal, rhVal);
221 size_t hLoc = hVal % m_size;
222 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0) return false;
223 for (unsigned i = 1; i < m_hashNum; i++) {
224 uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
225 mhVal ^= mhVal >> multiShift;
226 size_t hLoc = mhVal % m_size;
227 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
228 return false;
229 }
230 return true;
231 }
232
containsMur(const char * kmer) const233 bool containsMur(const char* kmer) const {
234 for (unsigned i = 0; i < m_hashNum; i++) {
235 size_t hLoc = MurmurHash64A(kmer, m_kmerSize, i) % m_size;
236 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
237 return false;
238 }
239 return true;
240 }
241
containsCit(const char * kmer) const242 bool containsCit(const char* kmer) const {
243 for (unsigned i = 0; i < m_hashNum; i++) {
244 size_t hLoc = CityHash64WithSeed(kmer, m_kmerSize, i) % m_size;
245 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
246 return false;
247 }
248 return true;
249 }
250
containsXxh(const char * kmer) const251 bool containsXxh(const char* kmer) const {
252 for (unsigned i = 0; i < m_hashNum; i++) {
253 size_t hLoc = XXH64(kmer, m_kmerSize, i) % m_size;
254 if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
255 return false;
256 }
257 return true;
258 }
259
storeFilter(const char * fPath) const260 void storeFilter(const char * fPath) const {
261 ofstream myFile(fPath, ios::out | ios::binary);
262 myFile.write(reinterpret_cast<char*>(m_filter), (m_size + 7)/8);
263 myFile.close();
264 }
265
getPop() const266 size_t getPop() const {
267 size_t i, popBF=0;
268 #ifdef _OPENMP
269 #pragma omp parallel for reduction(+:popBF)
270 #endif
271 for(i=0; i<(m_size + 7)/8; i++)
272 popBF = popBF + popCnt(m_filter[i]);
273 return popBF;
274 }
275
getHashNum() const276 unsigned getHashNum() const {
277 return m_hashNum;
278 }
279
getKmerSize() const280 unsigned getKmerSize() const {
281 return m_kmerSize;
282 }
283
~BloomFilter()284 ~BloomFilter() {
285 delete[] m_filter;
286 }
287
288 private:
289 BloomFilter(const BloomFilter& that); //to prevent copy construction
290 unsigned char * m_filter;
291 size_t m_size;
292 unsigned m_hashNum;
293 unsigned m_kmerSize;
294 };
295
296 #endif /* BLOOMFILTER_H_ */
297