1 /**
2  * A counting Bloom filter
3  * Copyright 2014 bcgsc
4  */
5 #ifndef COUNTINGBLOOMFILTER_H
6 #define COUNTINGBLOOMFILTER_H 1
7 
8 #include "Bloom/Bloom.h"
9 #include <cassert>
10 #include <cmath>
11 #include <vector>
12 
13 /** A counting Bloom filter. */
14 template<typename NumericType>
15 class CountingBloomFilter {
16 public:
17 
18 	/** Constructor */
19 	CountingBloomFilter(unsigned hashnum = 1) :
20 			m_data(0), hashNum(hashnum), uniqueEntries(0), replicateEntries(0)
21 	{
22 	}
23 
24 	/** Constructor */
25 	CountingBloomFilter(size_t n, unsigned hashnum = 1) :
m_data(n)26 			m_data(n), hashNum(hashnum), uniqueEntries(0), replicateEntries(0)
27 	{
28 	}
29 
30 	/** Destructor */
~CountingBloomFilter()31 	virtual ~CountingBloomFilter() {}
32 
33 	/** Return the size (in discrete elements) of the bit array. */
size()34 	size_t size() const
35 	{
36 		return m_data.size();
37 	}
38 
39 	/** Return the number of elements with count >= MAX_COUNT. */
popcount()40 	size_t popcount() const
41 	{
42 		return uniqueEntries;
43 	}
44 
45 	/** Return the estimated false positive rate */
FPR()46 	double FPR() const
47 	{
48 		return pow(1.0 - pow(1.0 - 1.0 / double(m_data.size()),
49 				double(uniqueEntries) * hashNum), double(hashNum));
50 	}
51 
52 	/** Return the count of the single element (debugging purposes)
53 	 */
54 	NumericType operator[](size_t i) const
55 	{
56 		return m_data[i];
57 	}
58 
59 	/** Return the count of this element. */
60 	NumericType operator[](const Bloom::key_type& key) const
61 	{
62 		NumericType currentMin = m_data[Bloom::hash(key, 0) % m_data.size()];
63 		for (unsigned int i = 1; i < hashNum; ++i) {
64 			NumericType min = m_data[Bloom::hash(key, i) % m_data.size()];
65 			if (min < currentMin) {
66 				currentMin = min;
67 			}
68 			if (0 == currentMin) {
69 				break;
70 			}
71 		}
72 		return currentMin;
73 	}
74 
75 	/** Add the object with the specified index (debugging purposes). */
insert(size_t index)76 	void insert(size_t index)
77 	{
78 		++m_data[index];
79 	}
80 
81 	/** Add the object to this counting multiset.
82 	 *  If all values are the same update all
83 	 *  If some values are larger only update smallest counts*/
insert(const Bloom::key_type & key)84 	void insert(const Bloom::key_type& key)
85 	{
86 		//check for which elements to update
87 		NumericType minEle = (*this)[key];
88 
89 		//update only those elements
90 		for (unsigned int i = 1; i < hashNum; ++i) {
91 			size_t hashVal = Bloom::hash(key, i) % m_data.size();
92 			NumericType val = m_data[hashVal];
93 			if (minEle == val) {
94 				insert(hashVal);
95 			}
96 		}
97 		if (minEle)
98 			++uniqueEntries;
99 		else
100 			++replicateEntries;
101 	}
102 
write(std::ostream & out)103 	void write(std::ostream& out) const
104 	{
105 		assert(!m_data.empty());
106 		out.write(reinterpret_cast<const char *>(&m_data), sizeof(NumericType));
107 	}
108 
109 	//TODO: need to implement tracking of directionality
loadSeq(unsigned k,const std::string & seq)110 	void loadSeq(unsigned k, const std::string& seq)
111 	{
112 		if (seq.size() < k)
113 			return;
114 		for (size_t i = 0; i < seq.size() - k + 1; ++i) {
115 			std::string kmer = seq.substr(i, k);
116 			size_t pos = kmer.find_last_not_of("ACGTacgt");
117 			if (pos == std::string::npos) {
118 				insert(Kmer(kmer));
119 			} else
120 				i += pos;
121 		}
122 	}
123 
124 protected:
125 	std::vector<NumericType> m_data;
126 	unsigned hashNum;
127 	size_t uniqueEntries;
128 	size_t replicateEntries;
129 
130 };
131 
132 #endif
133