1 /* This Source Code Form is subject to the terms of the Mozilla Public
2  * License, v. 2.0. If a copy of the MPL was not distributed with this
3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4 
5 //! Counting and non-counting Bloom filters tuned for use as ancestor filters
6 //! for selector matching.
7 
8 use fnv::FnvHasher;
9 use std::fmt::{self, Debug};
10 use std::hash::{Hash, Hasher};
11 
12 // The top 8 bits of the 32-bit hash value are not used by the bloom filter.
13 // Consumers may rely on this to pack hashes more efficiently.
14 pub const BLOOM_HASH_MASK: u32 = 0x00ffffff;
15 const KEY_SIZE: usize = 12;
16 
17 const ARRAY_SIZE: usize = 1 << KEY_SIZE;
18 const KEY_MASK: u32 = (1 << KEY_SIZE) - 1;
19 
20 /// A counting Bloom filter with 8-bit counters.
21 pub type BloomFilter = CountingBloomFilter<BloomStorageU8>;
22 
23 /// A non-counting Bloom filter.
24 ///
25 /// Effectively a counting Bloom filter with 1-bit counters.
26 pub type NonCountingBloomFilter = CountingBloomFilter<BloomStorageBool>;
27 
28 /// A counting Bloom filter with parameterized storage to handle
29 /// counters of different sizes.  For now we assume that having two hash
30 /// functions is enough, but we may revisit that decision later.
31 ///
32 /// The filter uses an array with 2**KeySize entries.
33 ///
34 /// Assuming a well-distributed hash function, a Bloom filter with
35 /// array size M containing N elements and
36 /// using k hash function has expected false positive rate exactly
37 ///
38 /// $  (1 - (1 - 1/M)^{kN})^k  $
39 ///
40 /// because each array slot has a
41 ///
42 /// $  (1 - 1/M)^{kN}  $
43 ///
44 /// chance of being 0, and the expected false positive rate is the
45 /// probability that all of the k hash functions will hit a nonzero
46 /// slot.
47 ///
48 /// For reasonable assumptions (M large, kN large, which should both
49 /// hold if we're worried about false positives) about M and kN this
50 /// becomes approximately
51 ///
52 /// $$  (1 - \exp(-kN/M))^k   $$
53 ///
54 /// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
55 /// or in other words
56 ///
57 /// $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
58 ///
59 /// where r is the false positive rate.  This can be used to compute
60 /// the desired KeySize for a given load N and false positive rate r.
61 ///
62 /// If N/M is assumed small, then the false positive rate can
63 /// further be approximated as 4*N^2/M^2.  So increasing KeySize by
64 /// 1, which doubles M, reduces the false positive rate by about a
65 /// factor of 4, and a false positive rate of 1% corresponds to
66 /// about M/N == 20.
67 ///
68 /// What this means in practice is that for a few hundred keys using a
69 /// KeySize of 12 gives false positive rates on the order of 0.25-4%.
70 ///
71 /// Similarly, using a KeySize of 10 would lead to a 4% false
72 /// positive rate for N == 100 and to quite bad false positive
73 /// rates for larger N.
74 #[derive(Clone)]
75 pub struct CountingBloomFilter<S> where S: BloomStorage {
76     storage: S,
77 }
78 
79 impl<S> CountingBloomFilter<S> where S: BloomStorage {
80     /// Creates a new bloom filter.
81     #[inline]
new() -> Self82     pub fn new() -> Self {
83         CountingBloomFilter {
84             storage: Default::default(),
85         }
86     }
87 
88     #[inline]
clear(&mut self)89     pub fn clear(&mut self) {
90         self.storage = Default::default();
91     }
92 
93     // Slow linear accessor to make sure the bloom filter is zeroed. This should
94     // never be used in release builds.
95     #[cfg(debug_assertions)]
is_zeroed(&self) -> bool96     pub fn is_zeroed(&self) -> bool {
97         self.storage.is_zeroed()
98     }
99 
100     #[cfg(not(debug_assertions))]
is_zeroed(&self) -> bool101     pub fn is_zeroed(&self) -> bool {
102         unreachable!()
103     }
104 
105     #[inline]
insert_hash(&mut self, hash: u32)106     pub fn insert_hash(&mut self, hash: u32) {
107         self.storage.adjust_first_slot(hash, true);
108         self.storage.adjust_second_slot(hash, true);
109     }
110 
111     /// Inserts an item into the bloom filter.
112     #[inline]
insert<T: Hash>(&mut self, elem: &T)113     pub fn insert<T: Hash>(&mut self, elem: &T) {
114         self.insert_hash(hash(elem))
115     }
116 
117     #[inline]
remove_hash(&mut self, hash: u32)118     pub fn remove_hash(&mut self, hash: u32) {
119         self.storage.adjust_first_slot(hash, false);
120         self.storage.adjust_second_slot(hash, false);
121     }
122 
123     /// Removes an item from the bloom filter.
124     #[inline]
remove<T: Hash>(&mut self, elem: &T)125     pub fn remove<T: Hash>(&mut self, elem: &T) {
126         self.remove_hash(hash(elem))
127     }
128 
129     #[inline]
might_contain_hash(&self, hash: u32) -> bool130     pub fn might_contain_hash(&self, hash: u32) -> bool {
131         !self.storage.first_slot_is_empty(hash) &&
132             !self.storage.second_slot_is_empty(hash)
133     }
134 
135     /// Check whether the filter might contain an item.  This can
136     /// sometimes return true even if the item is not in the filter,
137     /// but will never return false for items that are actually in the
138     /// filter.
139     #[inline]
might_contain<T: Hash>(&self, elem: &T) -> bool140     pub fn might_contain<T: Hash>(&self, elem: &T) -> bool {
141         self.might_contain_hash(hash(elem))
142     }
143 }
144 
145 impl<S> Debug for CountingBloomFilter<S> where S: BloomStorage {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result146     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
147         let mut slots_used = 0;
148         for i in 0..ARRAY_SIZE {
149             if !self.storage.slot_is_empty(i) {
150                 slots_used += 1;
151             }
152         }
153         write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE)
154     }
155 }
156 
157 pub trait BloomStorage : Clone + Default {
slot_is_empty(&self, index: usize) -> bool158     fn slot_is_empty(&self, index: usize) -> bool;
adjust_slot(&mut self, index: usize, increment: bool)159     fn adjust_slot(&mut self, index: usize, increment: bool);
is_zeroed(&self) -> bool160     fn is_zeroed(&self) -> bool;
161 
162     #[inline]
first_slot_is_empty(&self, hash: u32) -> bool163     fn first_slot_is_empty(&self, hash: u32) -> bool {
164         self.slot_is_empty(Self::first_slot_index(hash))
165     }
166 
167     #[inline]
second_slot_is_empty(&self, hash: u32) -> bool168     fn second_slot_is_empty(&self, hash: u32) -> bool {
169         self.slot_is_empty(Self::second_slot_index(hash))
170     }
171 
172     #[inline]
adjust_first_slot(&mut self, hash: u32, increment: bool)173     fn adjust_first_slot(&mut self, hash: u32, increment: bool) {
174         self.adjust_slot(Self::first_slot_index(hash), increment)
175     }
176 
177     #[inline]
adjust_second_slot(&mut self, hash: u32, increment: bool)178     fn adjust_second_slot(&mut self, hash: u32, increment: bool) {
179         self.adjust_slot(Self::second_slot_index(hash), increment)
180     }
181 
182     #[inline]
first_slot_index(hash: u32) -> usize183     fn first_slot_index(hash: u32) -> usize {
184         hash1(hash) as usize
185     }
186 
187     #[inline]
second_slot_index(hash: u32) -> usize188     fn second_slot_index(hash: u32) -> usize {
189         hash2(hash) as usize
190     }
191 }
192 
193 /// Storage class for a CountingBloomFilter that has 8-bit counters.
194 pub struct BloomStorageU8 {
195     counters: [u8; ARRAY_SIZE],
196 }
197 
198 impl BloomStorage for BloomStorageU8 {
199     #[inline]
adjust_slot(&mut self, index: usize, increment: bool)200     fn adjust_slot(&mut self, index: usize, increment: bool) {
201         let slot = &mut self.counters[index];
202         if *slot != 0xff {  // full
203             if increment {
204                 *slot += 1;
205             } else {
206                 *slot -= 1;
207             }
208         }
209     }
210 
211     #[inline]
slot_is_empty(&self, index: usize) -> bool212     fn slot_is_empty(&self, index: usize) -> bool {
213         self.counters[index] == 0
214     }
215 
216     #[inline]
is_zeroed(&self) -> bool217     fn is_zeroed(&self) -> bool {
218         self.counters.iter().all(|x| *x == 0)
219     }
220 }
221 
222 impl Default for BloomStorageU8 {
default() -> Self223     fn default() -> Self {
224         BloomStorageU8 {
225             counters: [0; ARRAY_SIZE],
226         }
227     }
228 }
229 
230 impl Clone for BloomStorageU8 {
clone(&self) -> Self231     fn clone(&self) -> Self {
232         BloomStorageU8 {
233             counters: self.counters,
234         }
235     }
236 }
237 
238 /// Storage class for a CountingBloomFilter that has 1-bit counters.
239 pub struct BloomStorageBool {
240     counters: [u8; ARRAY_SIZE / 8],
241 }
242 
243 impl BloomStorage for BloomStorageBool {
244     #[inline]
adjust_slot(&mut self, index: usize, increment: bool)245     fn adjust_slot(&mut self, index: usize, increment: bool) {
246         let bit = 1 << (index % 8);
247         let byte = &mut self.counters[index / 8];
248 
249         // Since we have only one bit for storage, decrementing it
250         // should never do anything.  Assert against an accidental
251         // decrementing of a bit that was never set.
252         assert!(increment || (*byte & bit) != 0,
253                 "should not decrement if slot is already false");
254 
255         if increment {
256             *byte |= bit;
257         }
258     }
259 
260     #[inline]
slot_is_empty(&self, index: usize) -> bool261     fn slot_is_empty(&self, index: usize) -> bool {
262         let bit = 1 << (index % 8);
263         (self.counters[index / 8] & bit) == 0
264     }
265 
266     #[inline]
is_zeroed(&self) -> bool267     fn is_zeroed(&self) -> bool {
268         self.counters.iter().all(|x| *x == 0)
269     }
270 }
271 
272 impl Default for BloomStorageBool {
default() -> Self273     fn default() -> Self {
274         BloomStorageBool {
275             counters: [0; ARRAY_SIZE / 8],
276         }
277     }
278 }
279 
280 impl Clone for BloomStorageBool {
clone(&self) -> Self281     fn clone(&self) -> Self {
282         BloomStorageBool {
283             counters: self.counters,
284         }
285     }
286 }
287 
hash<T: Hash>(elem: &T) -> u32288 fn hash<T: Hash>(elem: &T) -> u32 {
289     let mut hasher = FnvHasher::default();
290     elem.hash(&mut hasher);
291     let hash: u64 = hasher.finish();
292     (hash >> 32) as u32 ^ (hash as u32)
293 }
294 
295 #[inline]
hash1(hash: u32) -> u32296 fn hash1(hash: u32) -> u32 {
297     hash & KEY_MASK
298 }
299 
300 #[inline]
hash2(hash: u32) -> u32301 fn hash2(hash: u32) -> u32 {
302     (hash >> KEY_SIZE) & KEY_MASK
303 }
304 
305 #[test]
create_and_insert_some_stuff()306 fn create_and_insert_some_stuff() {
307     use std::mem::transmute;
308 
309     let mut bf = BloomFilter::new();
310 
311     // Statically assert that ARRAY_SIZE is a multiple of 8, which
312     // BloomStorageBool relies on.
313     unsafe {
314         transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]);
315     }
316 
317     for i in 0_usize .. 1000 {
318         bf.insert(&i);
319     }
320 
321     for i in 0_usize .. 1000 {
322         assert!(bf.might_contain(&i));
323     }
324 
325     let false_positives =
326         (1001_usize .. 2000).filter(|i| bf.might_contain(i)).count();
327 
328     assert!(false_positives < 150, "{} is not < 150", false_positives); // 15%.
329 
330     for i in 0_usize .. 100 {
331         bf.remove(&i);
332     }
333 
334     for i in 100_usize .. 1000 {
335         assert!(bf.might_contain(&i));
336     }
337 
338     let false_positives = (0_usize .. 100).filter(|i| bf.might_contain(i)).count();
339 
340     assert!(false_positives < 20, "{} is not < 20", false_positives); // 20%.
341 
342     bf.clear();
343 
344     for i in 0_usize .. 2000 {
345         assert!(!bf.might_contain(&i));
346     }
347 }
348 
349 #[cfg(feature = "bench")]
350 #[cfg(test)]
351 mod bench {
352     extern crate test;
353     use super::BloomFilter;
354 
355     #[derive(Default)]
356     struct HashGenerator(u32);
357 
358     impl HashGenerator {
next(&mut self) -> u32359         fn next(&mut self) -> u32 {
360             // Each hash is split into two twelve-bit segments, which are used
361             // as an index into an array. We increment each by 64 so that we
362             // hit the next cache line, and then another 1 so that our wrapping
363             // behavior leads us to different entries each time.
364             //
365             // Trying to simulate cold caches is rather difficult with the cargo
366             // benchmarking setup, so it may all be moot depending on the number
367             // of iterations that end up being run. But we might as well.
368             self.0 += (65) + (65 << super::KEY_SIZE);
369             self.0
370         }
371     }
372 
373     #[bench]
create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher)374     fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) {
375         b.iter(|| {
376             let mut gen1 = HashGenerator::default();
377             let mut gen2 = HashGenerator::default();
378             let mut bf = BloomFilter::new();
379             for _ in 0_usize .. 1000 {
380                 bf.insert_hash(gen1.next());
381             }
382             for _ in 0_usize .. 100 {
383                 bf.remove_hash(gen2.next());
384             }
385             for _ in 100_usize .. 200 {
386                 test::black_box(bf.might_contain_hash(gen2.next()));
387             }
388         });
389     }
390 
391     #[bench]
might_contain_10(b: &mut test::Bencher)392     fn might_contain_10(b: &mut test::Bencher) {
393         let bf = BloomFilter::new();
394         let mut gen = HashGenerator::default();
395         b.iter(|| for _ in 0..10 {
396             test::black_box(bf.might_contain_hash(gen.next()));
397         });
398     }
399 
400     #[bench]
clear(b: &mut test::Bencher)401     fn clear(b: &mut test::Bencher) {
402         let mut bf = Box::new(BloomFilter::new());
403         b.iter(|| test::black_box(&mut bf).clear());
404     }
405 
406     #[bench]
insert_10(b: &mut test::Bencher)407     fn insert_10(b: &mut test::Bencher) {
408         let mut bf = BloomFilter::new();
409         let mut gen = HashGenerator::default();
410         b.iter(|| for _ in 0..10 {
411             test::black_box(bf.insert_hash(gen.next()));
412         });
413     }
414 
415     #[bench]
remove_10(b: &mut test::Bencher)416     fn remove_10(b: &mut test::Bencher) {
417         let mut bf = BloomFilter::new();
418         let mut gen = HashGenerator::default();
419         // Note: this will underflow, and that's ok.
420         b.iter(|| for _ in 0..10 {
421             bf.remove_hash(gen.next())
422         });
423     }
424 }
425