1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 /*
8  * A counting Bloom filter implementation.  This allows consumers to
9  * do fast probabilistic "is item X in set Y?" testing which will
10  * never answer "no" when the correct answer is "yes" (but might
11  * incorrectly answer "yes" when the correct answer is "no").
12  */
13 
14 #ifndef mozilla_BloomFilter_h
15 #define mozilla_BloomFilter_h
16 
17 #include "mozilla/Assertions.h"
18 #include "mozilla/Likely.h"
19 
20 #include <stdint.h>
21 #include <string.h>
22 
23 namespace mozilla {
24 
25 /*
26  * This class implements a counting Bloom filter as described at
27  * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
28  * 8-bit counters.  This allows quick probabilistic answers to the
29  * question "is object X in set Y?" where the contents of Y might not
30  * be time-invariant.  The probabilistic nature of the test means that
31  * sometimes the answer will be "yes" when it should be "no".  If the
32  * answer is "no", then X is guaranteed not to be in Y.
33  *
34  * The filter is parametrized on KeySize, which is the size of the key
35  * generated by each of hash functions used by the filter, in bits,
36  * and the type of object T being added and removed.  T must implement
37  * a |uint32_t hash() const| method which returns a uint32_t hash key
38  * that will be used to generate the two separate hash functions for
39  * the Bloom filter.  This hash key MUST be well-distributed for good
40  * results!  KeySize is not allowed to be larger than 16.
41  *
42  * The filter uses exactly 2**KeySize bytes of memory.  From now on we
43  * will refer to the memory used by the filter as M.
44  *
45  * The expected rate of incorrect "yes" answers depends on M and on
46  * the number N of objects in set Y.  As long as N is small compared
47  * to M, the rate of such answers is expected to be approximately
48  * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred
49  * elements then using a KeySize of 12 gives a reasonably low
50  * incorrect answer rate.  A KeySize of 12 has the additional benefit
51  * of using exactly one page for the filter in typical hardware
52  * configurations.
53  */
54 
55 template<unsigned KeySize, class T>
56 class BloomFilter
57 {
58   /*
59    * A counting Bloom filter with 8-bit counters.  For now we assume
60    * that having two hash functions is enough, but we may revisit that
61    * decision later.
62    *
63    * The filter uses an array with 2**KeySize entries.
64    *
65    * Assuming a well-distributed hash function, a Bloom filter with
66    * array size M containing N elements and
67    * using k hash function has expected false positive rate exactly
68    *
69    * $  (1 - (1 - 1/M)^{kN})^k  $
70    *
71    * because each array slot has a
72    *
73    * $  (1 - 1/M)^{kN}  $
74    *
75    * chance of being 0, and the expected false positive rate is the
76    * probability that all of the k hash functions will hit a nonzero
77    * slot.
78    *
79    * For reasonable assumptions (M large, kN large, which should both
80    * hold if we're worried about false positives) about M and kN this
81    * becomes approximately
82    *
83    * $$  (1 - \exp(-kN/M))^k   $$
84    *
85    * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
86    * or in other words
87    *
88    * $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
89    *
90    * where r is the false positive rate.  This can be used to compute
91    * the desired KeySize for a given load N and false positive rate r.
92    *
93    * If N/M is assumed small, then the false positive rate can
94    * further be approximated as 4*N^2/M^2.  So increasing KeySize by
95    * 1, which doubles M, reduces the false positive rate by about a
96    * factor of 4, and a false positive rate of 1% corresponds to
97    * about M/N == 20.
98    *
99    * What this means in practice is that for a few hundred keys using a
100    * KeySize of 12 gives false positive rates on the order of 0.25-4%.
101    *
102    * Similarly, using a KeySize of 10 would lead to a 4% false
103    * positive rate for N == 100 and to quite bad false positive
104    * rates for larger N.
105    */
106 public:
BloomFilter()107   BloomFilter()
108   {
109     static_assert(KeySize <= kKeyShift, "KeySize too big");
110 
111     // Should we have a custom operator new using calloc instead and
112     // require that we're allocated via the operator?
113     clear();
114   }
115 
116   /*
117    * Clear the filter.  This should be done before reusing it, because
118    * just removing all items doesn't clear counters that hit the upper
119    * bound.
120    */
121   void clear();
122 
123   /*
124    * Add an item to the filter.
125    */
126   void add(const T* aValue);
127 
128   /*
129    * Remove an item from the filter.
130    */
131   void remove(const T* aValue);
132 
133   /*
134    * Check whether the filter might contain an item.  This can
135    * sometimes return true even if the item is not in the filter,
136    * but will never return false for items that are actually in the
137    * filter.
138    */
139   bool mightContain(const T* aValue) const;
140 
141   /*
142    * Methods for add/remove/contain when we already have a hash computed
143    */
144   void add(uint32_t aHash);
145   void remove(uint32_t aHash);
146   bool mightContain(uint32_t aHash) const;
147 
148 private:
149   static const size_t kArraySize = (1 << KeySize);
150   static const uint32_t kKeyMask = (1 << KeySize) - 1;
151   static const uint32_t kKeyShift = 16;
152 
hash1(uint32_t aHash)153   static uint32_t hash1(uint32_t aHash)
154   {
155     return aHash & kKeyMask;
156   }
hash2(uint32_t aHash)157   static uint32_t hash2(uint32_t aHash)
158   {
159     return (aHash >> kKeyShift) & kKeyMask;
160   }
161 
firstSlot(uint32_t aHash)162   uint8_t& firstSlot(uint32_t aHash)
163   {
164     return mCounters[hash1(aHash)];
165   }
secondSlot(uint32_t aHash)166   uint8_t& secondSlot(uint32_t aHash)
167   {
168     return mCounters[hash2(aHash)];
169   }
170 
firstSlot(uint32_t aHash)171   const uint8_t& firstSlot(uint32_t aHash) const
172   {
173     return mCounters[hash1(aHash)];
174   }
secondSlot(uint32_t aHash)175   const uint8_t& secondSlot(uint32_t aHash) const
176   {
177     return mCounters[hash2(aHash)];
178   }
179 
full(const uint8_t & aSlot)180   static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; }
181 
182   uint8_t mCounters[kArraySize];
183 };
184 
185 template<unsigned KeySize, class T>
186 inline void
clear()187 BloomFilter<KeySize, T>::clear()
188 {
189   memset(mCounters, 0, kArraySize);
190 }
191 
192 template<unsigned KeySize, class T>
193 inline void
add(uint32_t aHash)194 BloomFilter<KeySize, T>::add(uint32_t aHash)
195 {
196   uint8_t& slot1 = firstSlot(aHash);
197   if (MOZ_LIKELY(!full(slot1))) {
198     ++slot1;
199   }
200   uint8_t& slot2 = secondSlot(aHash);
201   if (MOZ_LIKELY(!full(slot2))) {
202     ++slot2;
203   }
204 }
205 
206 template<unsigned KeySize, class T>
207 MOZ_ALWAYS_INLINE void
add(const T * aValue)208 BloomFilter<KeySize, T>::add(const T* aValue)
209 {
210   uint32_t hash = aValue->hash();
211   return add(hash);
212 }
213 
214 template<unsigned KeySize, class T>
215 inline void
remove(uint32_t aHash)216 BloomFilter<KeySize, T>::remove(uint32_t aHash)
217 {
218   // If the slots are full, we don't know whether we bumped them to be
219   // there when we added or not, so just leave them full.
220   uint8_t& slot1 = firstSlot(aHash);
221   if (MOZ_LIKELY(!full(slot1))) {
222     --slot1;
223   }
224   uint8_t& slot2 = secondSlot(aHash);
225   if (MOZ_LIKELY(!full(slot2))) {
226     --slot2;
227   }
228 }
229 
230 template<unsigned KeySize, class T>
231 MOZ_ALWAYS_INLINE void
remove(const T * aValue)232 BloomFilter<KeySize, T>::remove(const T* aValue)
233 {
234   uint32_t hash = aValue->hash();
235   remove(hash);
236 }
237 
238 template<unsigned KeySize, class T>
239 MOZ_ALWAYS_INLINE bool
mightContain(uint32_t aHash)240 BloomFilter<KeySize, T>::mightContain(uint32_t aHash) const
241 {
242   // Check that all the slots for this hash contain something
243   return firstSlot(aHash) && secondSlot(aHash);
244 }
245 
246 template<unsigned KeySize, class T>
247 MOZ_ALWAYS_INLINE bool
mightContain(const T * aValue)248 BloomFilter<KeySize, T>::mightContain(const T* aValue) const
249 {
250   uint32_t hash = aValue->hash();
251   return mightContain(hash);
252 }
253 
254 } // namespace mozilla
255 
256 #endif /* mozilla_BloomFilter_h */
257