1 /*
2  *
3  * BloomFilter.hpp
4  * Author: Hamid Mohamadi
5  * Genome Sciences Centre,
6  * British Columbia Cancer Agency
7  */
8 
9 
10 #ifndef BLOOMFILTER_H_
11 #define BLOOMFILTER_H_
12 #include <string>
13 #include <stdint.h>
14 #include <math.h>
15 #include <fstream>
16 #include <iostream>
17 #include <sys/stat.h>
18 #include <cstring>
19 #include <cassert>
20 #include <cstdlib>
21 #include <stdio.h>
22 #include <cstring>
23 #include "nthash.hpp"
24 #include "city.h"
25 #include "xxhash.h"
26 #include "murmur.hpp"
27 
28 using namespace std;
29 
popCnt(unsigned char x)30 inline unsigned popCnt(unsigned char x) {
31     return ((0x876543210 >>
32              (((0x4332322132212110 >> ((x & 0xF) << 2)) & 0xF) << 2)) >>
33             ((0x4332322132212110 >> (((x & 0xF0) >> 2)) & 0xF) << 2))
34            & 0xf;
35 }
36 
37 class BloomFilter {
38 public:
39 
BloomFilter(size_t filterSize,unsigned hashNum,unsigned kmerSize,const char * fPath)40     BloomFilter(size_t filterSize, unsigned hashNum, unsigned kmerSize, const char * fPath):
41         m_size(filterSize), m_hashNum(hashNum), m_kmerSize(kmerSize) {
42         m_filter = new unsigned char [(m_size + 7)/8];
43         ifstream myFile(fPath, ios::in | ios::binary);
44         myFile.seekg (0, ios::beg);
45         myFile.read ((char *)m_filter, (m_size + 7)/8);
46         myFile.close();
47     }
48 
BloomFilter(size_t filterSize,unsigned hashNum,unsigned kmerSize)49     BloomFilter(size_t filterSize, unsigned hashNum, unsigned kmerSize):
50         m_size(filterSize), m_hashNum(hashNum), m_kmerSize(kmerSize) {
51         m_filter = new unsigned char [(m_size + 7)/8];
52         for(size_t i = 0; i < (m_size + 7)/8; i++)
53             m_filter[i]=0;
54     }
55 
insertF(const char * kmer)56     void insertF(const char* kmer) {
57         uint64_t hVal = NTF64(kmer, m_kmerSize);
58         size_t hLoc = hVal % m_size;
59         __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
60         for (unsigned i = 1; i < m_hashNum; i++) {
61             uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
62             mhVal ^= mhVal >> multiShift;
63             size_t hLoc = mhVal % m_size;
64             __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
65         }
66     }
67 
insertF(const char * kmer,uint64_t & hVal)68     void insertF(const char * kmer, uint64_t& hVal) {
69         hVal = NTF64(kmer, m_kmerSize);
70         size_t hLoc = hVal % m_size;
71         __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
72         for (unsigned i = 1; i < m_hashNum; i++) {
73             uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
74             mhVal ^= mhVal >> multiShift;
75             size_t hLoc = mhVal % m_size;
76             __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
77         }
78     }
79 
insertF(uint64_t & hVal,const char charOut,const char charIn)80     void insertF(uint64_t& hVal, const char charOut, const char charIn) {
81         hVal = NTF64(hVal, m_kmerSize, charOut, charIn);
82         size_t hLoc = hVal % m_size;
83         __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
84         for (unsigned i = 1; i < m_hashNum; i++) {
85             uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
86             mhVal ^= mhVal >> multiShift;
87             size_t hLoc = mhVal % m_size;
88             __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
89         }
90     }
91 
insert(const char * kmer)92     void insert(const char* kmer) {
93         uint64_t hVal = NTC64(kmer, m_kmerSize);
94         size_t hLoc = hVal % m_size;
95         __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
96         for (unsigned i = 1; i < m_hashNum; i++) {
97             uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
98             mhVal ^= mhVal >> multiShift;
99             size_t hLoc = mhVal % m_size;
100             __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
101         }
102     }
103 
insert(const char * kmer,uint64_t & fhVal,uint64_t & rhVal)104     void insert(const char * kmer, uint64_t& fhVal, uint64_t& rhVal) {
105         uint64_t hVal = NTC64(kmer, m_kmerSize, fhVal, rhVal);
106         size_t hLoc = hVal % m_size;
107         __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
108         for (unsigned i = 1; i < m_hashNum; i++) {
109             uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
110             mhVal ^= mhVal >> multiShift;
111             size_t hLoc = mhVal % m_size;
112             __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
113         }
114     }
115 
insert(uint64_t & fhVal,uint64_t & rhVal,const char charOut,const char charIn)116     void insert(uint64_t& fhVal, uint64_t& rhVal, const char charOut, const char charIn) {
117         uint64_t hVal = NTC64(charOut, charIn, m_kmerSize, fhVal, rhVal);
118         size_t hLoc = hVal % m_size;
119         __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
120         for (unsigned i = 1; i < m_hashNum; i++) {
121             uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
122             mhVal ^= mhVal >> multiShift;
123             size_t hLoc = mhVal % m_size;
124             __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
125         }
126     }
127 
insertMur(const char * kmer)128     void insertMur(const char* kmer) {
129         for (unsigned i = 0; i < m_hashNum; i++) {
130             size_t hLoc = MurmurHash64A(kmer, m_kmerSize, i) % m_size;
131             __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
132         }
133     }
134 
insertCit(const char * kmer)135     void insertCit(const char* kmer) {
136         for (unsigned i = 0; i < m_hashNum; i++) {
137             size_t hLoc = CityHash64WithSeed(kmer, m_kmerSize, i) % m_size;
138             __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
139         }
140     }
141 
insertXxh(const char * kmer)142     void insertXxh(const char* kmer) {
143         for (unsigned i = 0; i < m_hashNum; i++) {
144             size_t hLoc = XXH64(kmer, m_kmerSize, i) % m_size;
145             __sync_or_and_fetch(&m_filter[hLoc / 8], (1 << (7 - hLoc % 8)));
146         }
147     }
148 
containsF(const char * kmer) const149     bool containsF(const char* kmer) const {
150         uint64_t hVal = NTF64(kmer, m_kmerSize);
151         size_t hLoc = hVal % m_size;
152         if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0) return false;
153         for (unsigned i = 1; i < m_hashNum; i++) {
154             uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
155             mhVal ^= mhVal >> multiShift;
156             size_t hLoc = mhVal % m_size;
157             if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
158                 return false;
159         }
160         return true;
161     }
162 
containsF(const char * kmer,uint64_t & hVal)163     bool containsF(const char * kmer, uint64_t& hVal) {
164         hVal = NTF64(kmer, m_kmerSize);
165         size_t hLoc = hVal % m_size;
166         if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0) return false;
167         for (unsigned i = 1; i < m_hashNum; i++) {
168             uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
169             mhVal ^= mhVal >> multiShift;
170             size_t hLoc = mhVal % m_size;
171             if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
172                 return false;
173         }
174         return true;
175     }
176 
containsF(uint64_t & hVal,const char charOut,const char charIn)177     bool containsF(uint64_t& hVal, const char charOut, const char charIn) {
178         hVal = NTF64(hVal, m_kmerSize, charOut, charIn);
179         size_t hLoc = hVal % m_size;
180         if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0) return false;
181         for (unsigned i = 1; i < m_hashNum; i++) {
182             uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
183             mhVal ^= mhVal >> multiShift;
184             size_t hLoc = mhVal % m_size;
185             if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
186                 return false;
187         }
188         return true;
189     }
190 
contains(const char * kmer) const191     bool contains(const char* kmer) const {
192         uint64_t hVal = NTC64(kmer, m_kmerSize);
193         size_t hLoc = hVal % m_size;
194         if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0) return false;
195         for (unsigned i = 1; i < m_hashNum; i++) {
196             uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
197             mhVal ^= mhVal >> multiShift;
198             size_t hLoc = mhVal % m_size;
199             if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
200                 return false;
201         }
202         return true;
203     }
204 
contains(const char * kmer,uint64_t & fhVal,uint64_t & rhVal)205     bool contains(const char * kmer, uint64_t& fhVal, uint64_t& rhVal) {
206         uint64_t hVal = NTC64(kmer, m_kmerSize, fhVal, rhVal);
207         size_t hLoc = hVal % m_size;
208         if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0) return false;
209         for (unsigned i = 1; i < m_hashNum; i++) {
210             uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
211             mhVal ^= mhVal >> multiShift;
212             size_t hLoc = mhVal % m_size;
213             if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
214                 return false;
215         }
216         return true;
217     }
218 
contains(uint64_t & fhVal,uint64_t & rhVal,const char charOut,const char charIn)219     bool contains(uint64_t& fhVal, uint64_t& rhVal, const char charOut, const char charIn) {
220         uint64_t hVal = NTC64(charOut, charIn, m_kmerSize, fhVal, rhVal);
221         size_t hLoc = hVal % m_size;
222         if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0) return false;
223         for (unsigned i = 1; i < m_hashNum; i++) {
224             uint64_t mhVal = hVal * (i ^ m_kmerSize * multiSeed);
225             mhVal ^= mhVal >> multiShift;
226             size_t hLoc = mhVal % m_size;
227             if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
228                 return false;
229         }
230         return true;
231     }
232 
containsMur(const char * kmer) const233     bool containsMur(const char* kmer) const {
234         for (unsigned i = 0; i < m_hashNum; i++) {
235             size_t hLoc = MurmurHash64A(kmer, m_kmerSize, i) % m_size;
236             if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
237                 return false;
238         }
239         return true;
240     }
241 
containsCit(const char * kmer) const242     bool containsCit(const char* kmer) const {
243         for (unsigned i = 0; i < m_hashNum; i++) {
244             size_t hLoc = CityHash64WithSeed(kmer, m_kmerSize, i) % m_size;
245             if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
246                 return false;
247         }
248         return true;
249     }
250 
containsXxh(const char * kmer) const251     bool containsXxh(const char* kmer) const {
252         for (unsigned i = 0; i < m_hashNum; i++) {
253             size_t hLoc = XXH64(kmer, m_kmerSize, i) % m_size;
254             if ((m_filter[hLoc / 8] & (1 << (7 - hLoc % 8))) == 0)
255                 return false;
256         }
257         return true;
258     }
259 
storeFilter(const char * fPath) const260     void storeFilter(const char * fPath) const {
261         ofstream myFile(fPath, ios::out | ios::binary);
262         myFile.write(reinterpret_cast<char*>(m_filter), (m_size + 7)/8);
263         myFile.close();
264     }
265 
getPop() const266     size_t getPop() const {
267         size_t i, popBF=0;
268 	#ifdef _OPENMP
269         #pragma omp parallel for reduction(+:popBF)
270 	#endif
271         for(i=0; i<(m_size + 7)/8; i++)
272             popBF = popBF + popCnt(m_filter[i]);
273         return popBF;
274     }
275 
getHashNum() const276     unsigned getHashNum() const {
277         return m_hashNum;
278     }
279 
getKmerSize() const280     unsigned getKmerSize() const {
281         return m_kmerSize;
282     }
283 
~BloomFilter()284     ~BloomFilter() {
285         delete[] m_filter;
286     }
287 
288 private:
289     BloomFilter(const BloomFilter& that); //to prevent copy construction
290     unsigned char * m_filter;
291     size_t m_size;
292     unsigned m_hashNum;
293     unsigned m_kmerSize;
294 };
295 
296 #endif /* BLOOMFILTER_H_ */
297