1 /** 2 * A counting Bloom filter 3 * Copyright 2014 bcgsc 4 */ 5 #ifndef COUNTINGBLOOMFILTER_H 6 #define COUNTINGBLOOMFILTER_H 1 7 8 #include "Bloom/Bloom.h" 9 #include <cassert> 10 #include <cmath> 11 #include <vector> 12 13 /** A counting Bloom filter. */ 14 template<typename NumericType> 15 class CountingBloomFilter { 16 public: 17 18 /** Constructor */ 19 CountingBloomFilter(unsigned hashnum = 1) : 20 m_data(0), hashNum(hashnum), uniqueEntries(0), replicateEntries(0) 21 { 22 } 23 24 /** Constructor */ 25 CountingBloomFilter(size_t n, unsigned hashnum = 1) : m_data(n)26 m_data(n), hashNum(hashnum), uniqueEntries(0), replicateEntries(0) 27 { 28 } 29 30 /** Destructor */ ~CountingBloomFilter()31 virtual ~CountingBloomFilter() {} 32 33 /** Return the size (in discrete elements) of the bit array. */ size()34 size_t size() const 35 { 36 return m_data.size(); 37 } 38 39 /** Return the number of elements with count >= MAX_COUNT. */ popcount()40 size_t popcount() const 41 { 42 return uniqueEntries; 43 } 44 45 /** Return the estimated false positive rate */ FPR()46 double FPR() const 47 { 48 return pow(1.0 - pow(1.0 - 1.0 / double(m_data.size()), 49 double(uniqueEntries) * hashNum), double(hashNum)); 50 } 51 52 /** Return the count of the single element (debugging purposes) 53 */ 54 NumericType operator[](size_t i) const 55 { 56 return m_data[i]; 57 } 58 59 /** Return the count of this element. */ 60 NumericType operator[](const Bloom::key_type& key) const 61 { 62 NumericType currentMin = m_data[Bloom::hash(key, 0) % m_data.size()]; 63 for (unsigned int i = 1; i < hashNum; ++i) { 64 NumericType min = m_data[Bloom::hash(key, i) % m_data.size()]; 65 if (min < currentMin) { 66 currentMin = min; 67 } 68 if (0 == currentMin) { 69 break; 70 } 71 } 72 return currentMin; 73 } 74 75 /** Add the object with the specified index (debugging purposes). */ insert(size_t index)76 void insert(size_t index) 77 { 78 ++m_data[index]; 79 } 80 81 /** Add the object to this counting multiset. 82 * If all values are the same update all 83 * If some values are larger only update smallest counts*/ insert(const Bloom::key_type & key)84 void insert(const Bloom::key_type& key) 85 { 86 //check for which elements to update 87 NumericType minEle = (*this)[key]; 88 89 //update only those elements 90 for (unsigned int i = 1; i < hashNum; ++i) { 91 size_t hashVal = Bloom::hash(key, i) % m_data.size(); 92 NumericType val = m_data[hashVal]; 93 if (minEle == val) { 94 insert(hashVal); 95 } 96 } 97 if (minEle) 98 ++uniqueEntries; 99 else 100 ++replicateEntries; 101 } 102 write(std::ostream & out)103 void write(std::ostream& out) const 104 { 105 assert(!m_data.empty()); 106 out.write(reinterpret_cast<const char *>(&m_data), sizeof(NumericType)); 107 } 108 109 //TODO: need to implement tracking of directionality loadSeq(unsigned k,const std::string & seq)110 void loadSeq(unsigned k, const std::string& seq) 111 { 112 if (seq.size() < k) 113 return; 114 for (size_t i = 0; i < seq.size() - k + 1; ++i) { 115 std::string kmer = seq.substr(i, k); 116 size_t pos = kmer.find_last_not_of("ACGTacgt"); 117 if (pos == std::string::npos) { 118 insert(Kmer(kmer)); 119 } else 120 i += pos; 121 } 122 } 123 124 protected: 125 std::vector<NumericType> m_data; 126 unsigned hashNum; 127 size_t uniqueEntries; 128 size_t replicateEntries; 129 130 }; 131 132 #endif 133