1 /* This Source Code Form is subject to the terms of the Mozilla Public
2  * License, v. 2.0. If a copy of the MPL was not distributed with this
3  * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
4 
5 //! Counting and non-counting Bloom filters tuned for use as ancestor filters
6 //! for selector matching.
7 
8 use std::fmt::{self, Debug};
9 
10 // The top 8 bits of the 32-bit hash value are not used by the bloom filter.
11 // Consumers may rely on this to pack hashes more efficiently.
12 pub const BLOOM_HASH_MASK: u32 = 0x00ffffff;
13 const KEY_SIZE: usize = 12;
14 
15 const ARRAY_SIZE: usize = 1 << KEY_SIZE;
16 const KEY_MASK: u32 = (1 << KEY_SIZE) - 1;
17 
18 /// A counting Bloom filter with 8-bit counters.
19 pub type BloomFilter = CountingBloomFilter<BloomStorageU8>;
20 
21 /// A counting Bloom filter with parameterized storage to handle
22 /// counters of different sizes.  For now we assume that having two hash
23 /// functions is enough, but we may revisit that decision later.
24 ///
25 /// The filter uses an array with 2**KeySize entries.
26 ///
27 /// Assuming a well-distributed hash function, a Bloom filter with
28 /// array size M containing N elements and
29 /// using k hash function has expected false positive rate exactly
30 ///
31 /// $  (1 - (1 - 1/M)^{kN})^k  $
32 ///
33 /// because each array slot has a
34 ///
35 /// $  (1 - 1/M)^{kN}  $
36 ///
37 /// chance of being 0, and the expected false positive rate is the
38 /// probability that all of the k hash functions will hit a nonzero
39 /// slot.
40 ///
41 /// For reasonable assumptions (M large, kN large, which should both
42 /// hold if we're worried about false positives) about M and kN this
43 /// becomes approximately
44 ///
45 /// $$  (1 - \exp(-kN/M))^k   $$
46 ///
47 /// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
48 /// or in other words
49 ///
50 /// $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
51 ///
52 /// where r is the false positive rate.  This can be used to compute
53 /// the desired KeySize for a given load N and false positive rate r.
54 ///
55 /// If N/M is assumed small, then the false positive rate can
56 /// further be approximated as 4*N^2/M^2.  So increasing KeySize by
57 /// 1, which doubles M, reduces the false positive rate by about a
58 /// factor of 4, and a false positive rate of 1% corresponds to
59 /// about M/N == 20.
60 ///
61 /// What this means in practice is that for a few hundred keys using a
62 /// KeySize of 12 gives false positive rates on the order of 0.25-4%.
63 ///
64 /// Similarly, using a KeySize of 10 would lead to a 4% false
65 /// positive rate for N == 100 and to quite bad false positive
66 /// rates for larger N.
67 #[derive(Clone, Default)]
68 pub struct CountingBloomFilter<S>
69 where
70     S: BloomStorage,
71 {
72     storage: S,
73 }
74 
75 impl<S> CountingBloomFilter<S>
76 where
77     S: BloomStorage,
78 {
79     /// Creates a new bloom filter.
80     #[inline]
new() -> Self81     pub fn new() -> Self {
82         Default::default()
83     }
84 
85     #[inline]
clear(&mut self)86     pub fn clear(&mut self) {
87         self.storage = Default::default();
88     }
89 
90     // Slow linear accessor to make sure the bloom filter is zeroed. This should
91     // never be used in release builds.
92     #[cfg(debug_assertions)]
is_zeroed(&self) -> bool93     pub fn is_zeroed(&self) -> bool {
94         self.storage.is_zeroed()
95     }
96 
97     #[cfg(not(debug_assertions))]
is_zeroed(&self) -> bool98     pub fn is_zeroed(&self) -> bool {
99         unreachable!()
100     }
101 
102     /// Inserts an item with a particular hash into the bloom filter.
103     #[inline]
insert_hash(&mut self, hash: u32)104     pub fn insert_hash(&mut self, hash: u32) {
105         self.storage.adjust_first_slot(hash, true);
106         self.storage.adjust_second_slot(hash, true);
107     }
108 
109     /// Removes an item with a particular hash from the bloom filter.
110     #[inline]
remove_hash(&mut self, hash: u32)111     pub fn remove_hash(&mut self, hash: u32) {
112         self.storage.adjust_first_slot(hash, false);
113         self.storage.adjust_second_slot(hash, false);
114     }
115 
116     /// Check whether the filter might contain an item with the given hash.
117     /// This can sometimes return true even if the item is not in the filter,
118     /// but will never return false for items that are actually in the filter.
119     #[inline]
might_contain_hash(&self, hash: u32) -> bool120     pub fn might_contain_hash(&self, hash: u32) -> bool {
121         !self.storage.first_slot_is_empty(hash) && !self.storage.second_slot_is_empty(hash)
122     }
123 }
124 
125 impl<S> Debug for CountingBloomFilter<S>
126 where
127     S: BloomStorage,
128 {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result129     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
130         let mut slots_used = 0;
131         for i in 0..ARRAY_SIZE {
132             if !self.storage.slot_is_empty(i) {
133                 slots_used += 1;
134             }
135         }
136         write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE)
137     }
138 }
139 
140 pub trait BloomStorage: Clone + Default {
slot_is_empty(&self, index: usize) -> bool141     fn slot_is_empty(&self, index: usize) -> bool;
adjust_slot(&mut self, index: usize, increment: bool)142     fn adjust_slot(&mut self, index: usize, increment: bool);
is_zeroed(&self) -> bool143     fn is_zeroed(&self) -> bool;
144 
145     #[inline]
first_slot_is_empty(&self, hash: u32) -> bool146     fn first_slot_is_empty(&self, hash: u32) -> bool {
147         self.slot_is_empty(Self::first_slot_index(hash))
148     }
149 
150     #[inline]
second_slot_is_empty(&self, hash: u32) -> bool151     fn second_slot_is_empty(&self, hash: u32) -> bool {
152         self.slot_is_empty(Self::second_slot_index(hash))
153     }
154 
155     #[inline]
adjust_first_slot(&mut self, hash: u32, increment: bool)156     fn adjust_first_slot(&mut self, hash: u32, increment: bool) {
157         self.adjust_slot(Self::first_slot_index(hash), increment)
158     }
159 
160     #[inline]
adjust_second_slot(&mut self, hash: u32, increment: bool)161     fn adjust_second_slot(&mut self, hash: u32, increment: bool) {
162         self.adjust_slot(Self::second_slot_index(hash), increment)
163     }
164 
165     #[inline]
first_slot_index(hash: u32) -> usize166     fn first_slot_index(hash: u32) -> usize {
167         hash1(hash) as usize
168     }
169 
170     #[inline]
second_slot_index(hash: u32) -> usize171     fn second_slot_index(hash: u32) -> usize {
172         hash2(hash) as usize
173     }
174 }
175 
176 /// Storage class for a CountingBloomFilter that has 8-bit counters.
177 pub struct BloomStorageU8 {
178     counters: [u8; ARRAY_SIZE],
179 }
180 
181 impl BloomStorage for BloomStorageU8 {
182     #[inline]
adjust_slot(&mut self, index: usize, increment: bool)183     fn adjust_slot(&mut self, index: usize, increment: bool) {
184         let slot = &mut self.counters[index];
185         if *slot != 0xff {
186             // full
187             if increment {
188                 *slot += 1;
189             } else {
190                 *slot -= 1;
191             }
192         }
193     }
194 
195     #[inline]
slot_is_empty(&self, index: usize) -> bool196     fn slot_is_empty(&self, index: usize) -> bool {
197         self.counters[index] == 0
198     }
199 
200     #[inline]
is_zeroed(&self) -> bool201     fn is_zeroed(&self) -> bool {
202         self.counters.iter().all(|x| *x == 0)
203     }
204 }
205 
206 impl Default for BloomStorageU8 {
default() -> Self207     fn default() -> Self {
208         BloomStorageU8 {
209             counters: [0; ARRAY_SIZE],
210         }
211     }
212 }
213 
214 impl Clone for BloomStorageU8 {
clone(&self) -> Self215     fn clone(&self) -> Self {
216         BloomStorageU8 {
217             counters: self.counters,
218         }
219     }
220 }
221 
222 /// Storage class for a CountingBloomFilter that has 1-bit counters.
223 pub struct BloomStorageBool {
224     counters: [u8; ARRAY_SIZE / 8],
225 }
226 
227 impl BloomStorage for BloomStorageBool {
228     #[inline]
adjust_slot(&mut self, index: usize, increment: bool)229     fn adjust_slot(&mut self, index: usize, increment: bool) {
230         let bit = 1 << (index % 8);
231         let byte = &mut self.counters[index / 8];
232 
233         // Since we have only one bit for storage, decrementing it
234         // should never do anything.  Assert against an accidental
235         // decrementing of a bit that was never set.
236         assert!(
237             increment || (*byte & bit) != 0,
238             "should not decrement if slot is already false"
239         );
240 
241         if increment {
242             *byte |= bit;
243         }
244     }
245 
246     #[inline]
slot_is_empty(&self, index: usize) -> bool247     fn slot_is_empty(&self, index: usize) -> bool {
248         let bit = 1 << (index % 8);
249         (self.counters[index / 8] & bit) == 0
250     }
251 
252     #[inline]
is_zeroed(&self) -> bool253     fn is_zeroed(&self) -> bool {
254         self.counters.iter().all(|x| *x == 0)
255     }
256 }
257 
258 impl Default for BloomStorageBool {
default() -> Self259     fn default() -> Self {
260         BloomStorageBool {
261             counters: [0; ARRAY_SIZE / 8],
262         }
263     }
264 }
265 
266 impl Clone for BloomStorageBool {
clone(&self) -> Self267     fn clone(&self) -> Self {
268         BloomStorageBool {
269             counters: self.counters,
270         }
271     }
272 }
273 
274 #[inline]
hash1(hash: u32) -> u32275 fn hash1(hash: u32) -> u32 {
276     hash & KEY_MASK
277 }
278 
279 #[inline]
hash2(hash: u32) -> u32280 fn hash2(hash: u32) -> u32 {
281     (hash >> KEY_SIZE) & KEY_MASK
282 }
283 
284 #[test]
create_and_insert_some_stuff()285 fn create_and_insert_some_stuff() {
286     use fxhash::FxHasher;
287     use std::hash::{Hash, Hasher};
288     use std::mem::transmute;
289 
290     fn hash_as_str(i: usize) -> u32 {
291         let mut hasher = FxHasher::default();
292         let s = i.to_string();
293         s.hash(&mut hasher);
294         let hash: u64 = hasher.finish();
295         (hash >> 32) as u32 ^ (hash as u32)
296     }
297 
298     let mut bf = BloomFilter::new();
299 
300     // Statically assert that ARRAY_SIZE is a multiple of 8, which
301     // BloomStorageBool relies on.
302     unsafe {
303         transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]);
304     }
305 
306     for i in 0_usize..1000 {
307         bf.insert_hash(hash_as_str(i));
308     }
309 
310     for i in 0_usize..1000 {
311         assert!(bf.might_contain_hash(hash_as_str(i)));
312     }
313 
314     let false_positives = (1001_usize..2000)
315         .filter(|i| bf.might_contain_hash(hash_as_str(*i)))
316         .count();
317 
318     assert!(false_positives < 190, "{} is not < 190", false_positives); // 19%.
319 
320     for i in 0_usize..100 {
321         bf.remove_hash(hash_as_str(i));
322     }
323 
324     for i in 100_usize..1000 {
325         assert!(bf.might_contain_hash(hash_as_str(i)));
326     }
327 
328     let false_positives = (0_usize..100)
329         .filter(|i| bf.might_contain_hash(hash_as_str(*i)))
330         .count();
331 
332     assert!(false_positives < 20, "{} is not < 20", false_positives); // 20%.
333 
334     bf.clear();
335 
336     for i in 0_usize..2000 {
337         assert!(!bf.might_contain_hash(hash_as_str(i)));
338     }
339 }
340 
341 #[cfg(feature = "bench")]
342 #[cfg(test)]
343 mod bench {
344     extern crate test;
345     use super::BloomFilter;
346 
347     #[derive(Default)]
348     struct HashGenerator(u32);
349 
350     impl HashGenerator {
next(&mut self) -> u32351         fn next(&mut self) -> u32 {
352             // Each hash is split into two twelve-bit segments, which are used
353             // as an index into an array. We increment each by 64 so that we
354             // hit the next cache line, and then another 1 so that our wrapping
355             // behavior leads us to different entries each time.
356             //
357             // Trying to simulate cold caches is rather difficult with the cargo
358             // benchmarking setup, so it may all be moot depending on the number
359             // of iterations that end up being run. But we might as well.
360             self.0 += (65) + (65 << super::KEY_SIZE);
361             self.0
362         }
363     }
364 
365     #[bench]
create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher)366     fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) {
367         b.iter(|| {
368             let mut gen1 = HashGenerator::default();
369             let mut gen2 = HashGenerator::default();
370             let mut bf = BloomFilter::new();
371             for _ in 0_usize..1000 {
372                 bf.insert_hash(gen1.next());
373             }
374             for _ in 0_usize..100 {
375                 bf.remove_hash(gen2.next());
376             }
377             for _ in 100_usize..200 {
378                 test::black_box(bf.might_contain_hash(gen2.next()));
379             }
380         });
381     }
382 
383     #[bench]
might_contain_10(b: &mut test::Bencher)384     fn might_contain_10(b: &mut test::Bencher) {
385         let bf = BloomFilter::new();
386         let mut gen = HashGenerator::default();
387         b.iter(|| {
388             for _ in 0..10 {
389                 test::black_box(bf.might_contain_hash(gen.next()));
390             }
391         });
392     }
393 
394     #[bench]
clear(b: &mut test::Bencher)395     fn clear(b: &mut test::Bencher) {
396         let mut bf = Box::new(BloomFilter::new());
397         b.iter(|| test::black_box(&mut bf).clear());
398     }
399 
400     #[bench]
insert_10(b: &mut test::Bencher)401     fn insert_10(b: &mut test::Bencher) {
402         let mut bf = BloomFilter::new();
403         let mut gen = HashGenerator::default();
404         b.iter(|| {
405             for _ in 0..10 {
406                 test::black_box(bf.insert_hash(gen.next()));
407             }
408         });
409     }
410 
411     #[bench]
remove_10(b: &mut test::Bencher)412     fn remove_10(b: &mut test::Bencher) {
413         let mut bf = BloomFilter::new();
414         let mut gen = HashGenerator::default();
415         // Note: this will underflow, and that's ok.
416         b.iter(|| {
417             for _ in 0..10 {
418                 bf.remove_hash(gen.next())
419             }
420         });
421     }
422 }
423