1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5 //! Counting and non-counting Bloom filters tuned for use as ancestor filters
6 //! for selector matching.
7
8 use fnv::FnvHasher;
9 use std::fmt::{self, Debug};
10 use std::hash::{Hash, Hasher};
11
12 // The top 8 bits of the 32-bit hash value are not used by the bloom filter.
13 // Consumers may rely on this to pack hashes more efficiently.
14 pub const BLOOM_HASH_MASK: u32 = 0x00ffffff;
15 const KEY_SIZE: usize = 12;
16
17 const ARRAY_SIZE: usize = 1 << KEY_SIZE;
18 const KEY_MASK: u32 = (1 << KEY_SIZE) - 1;
19
20 /// A counting Bloom filter with 8-bit counters.
21 pub type BloomFilter = CountingBloomFilter<BloomStorageU8>;
22
23 /// A non-counting Bloom filter.
24 ///
25 /// Effectively a counting Bloom filter with 1-bit counters.
26 pub type NonCountingBloomFilter = CountingBloomFilter<BloomStorageBool>;
27
28 /// A counting Bloom filter with parameterized storage to handle
29 /// counters of different sizes. For now we assume that having two hash
30 /// functions is enough, but we may revisit that decision later.
31 ///
32 /// The filter uses an array with 2**KeySize entries.
33 ///
34 /// Assuming a well-distributed hash function, a Bloom filter with
35 /// array size M containing N elements and
36 /// using k hash function has expected false positive rate exactly
37 ///
38 /// $ (1 - (1 - 1/M)^{kN})^k $
39 ///
40 /// because each array slot has a
41 ///
42 /// $ (1 - 1/M)^{kN} $
43 ///
44 /// chance of being 0, and the expected false positive rate is the
45 /// probability that all of the k hash functions will hit a nonzero
46 /// slot.
47 ///
48 /// For reasonable assumptions (M large, kN large, which should both
49 /// hold if we're worried about false positives) about M and kN this
50 /// becomes approximately
51 ///
52 /// $$ (1 - \exp(-kN/M))^k $$
53 ///
54 /// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
55 /// or in other words
56 ///
57 /// $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
58 ///
59 /// where r is the false positive rate. This can be used to compute
60 /// the desired KeySize for a given load N and false positive rate r.
61 ///
62 /// If N/M is assumed small, then the false positive rate can
63 /// further be approximated as 4*N^2/M^2. So increasing KeySize by
64 /// 1, which doubles M, reduces the false positive rate by about a
65 /// factor of 4, and a false positive rate of 1% corresponds to
66 /// about M/N == 20.
67 ///
68 /// What this means in practice is that for a few hundred keys using a
69 /// KeySize of 12 gives false positive rates on the order of 0.25-4%.
70 ///
71 /// Similarly, using a KeySize of 10 would lead to a 4% false
72 /// positive rate for N == 100 and to quite bad false positive
73 /// rates for larger N.
74 #[derive(Clone)]
75 pub struct CountingBloomFilter<S> where S: BloomStorage {
76 storage: S,
77 }
78
79 impl<S> CountingBloomFilter<S> where S: BloomStorage {
80 /// Creates a new bloom filter.
81 #[inline]
new() -> Self82 pub fn new() -> Self {
83 CountingBloomFilter {
84 storage: Default::default(),
85 }
86 }
87
88 #[inline]
clear(&mut self)89 pub fn clear(&mut self) {
90 self.storage = Default::default();
91 }
92
93 // Slow linear accessor to make sure the bloom filter is zeroed. This should
94 // never be used in release builds.
95 #[cfg(debug_assertions)]
is_zeroed(&self) -> bool96 pub fn is_zeroed(&self) -> bool {
97 self.storage.is_zeroed()
98 }
99
100 #[cfg(not(debug_assertions))]
is_zeroed(&self) -> bool101 pub fn is_zeroed(&self) -> bool {
102 unreachable!()
103 }
104
105 #[inline]
insert_hash(&mut self, hash: u32)106 pub fn insert_hash(&mut self, hash: u32) {
107 self.storage.adjust_first_slot(hash, true);
108 self.storage.adjust_second_slot(hash, true);
109 }
110
111 /// Inserts an item into the bloom filter.
112 #[inline]
insert<T: Hash>(&mut self, elem: &T)113 pub fn insert<T: Hash>(&mut self, elem: &T) {
114 self.insert_hash(hash(elem))
115 }
116
117 #[inline]
remove_hash(&mut self, hash: u32)118 pub fn remove_hash(&mut self, hash: u32) {
119 self.storage.adjust_first_slot(hash, false);
120 self.storage.adjust_second_slot(hash, false);
121 }
122
123 /// Removes an item from the bloom filter.
124 #[inline]
remove<T: Hash>(&mut self, elem: &T)125 pub fn remove<T: Hash>(&mut self, elem: &T) {
126 self.remove_hash(hash(elem))
127 }
128
129 #[inline]
might_contain_hash(&self, hash: u32) -> bool130 pub fn might_contain_hash(&self, hash: u32) -> bool {
131 !self.storage.first_slot_is_empty(hash) &&
132 !self.storage.second_slot_is_empty(hash)
133 }
134
135 /// Check whether the filter might contain an item. This can
136 /// sometimes return true even if the item is not in the filter,
137 /// but will never return false for items that are actually in the
138 /// filter.
139 #[inline]
might_contain<T: Hash>(&self, elem: &T) -> bool140 pub fn might_contain<T: Hash>(&self, elem: &T) -> bool {
141 self.might_contain_hash(hash(elem))
142 }
143 }
144
145 impl<S> Debug for CountingBloomFilter<S> where S: BloomStorage {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result146 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
147 let mut slots_used = 0;
148 for i in 0..ARRAY_SIZE {
149 if !self.storage.slot_is_empty(i) {
150 slots_used += 1;
151 }
152 }
153 write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE)
154 }
155 }
156
157 pub trait BloomStorage : Clone + Default {
slot_is_empty(&self, index: usize) -> bool158 fn slot_is_empty(&self, index: usize) -> bool;
adjust_slot(&mut self, index: usize, increment: bool)159 fn adjust_slot(&mut self, index: usize, increment: bool);
is_zeroed(&self) -> bool160 fn is_zeroed(&self) -> bool;
161
162 #[inline]
first_slot_is_empty(&self, hash: u32) -> bool163 fn first_slot_is_empty(&self, hash: u32) -> bool {
164 self.slot_is_empty(Self::first_slot_index(hash))
165 }
166
167 #[inline]
second_slot_is_empty(&self, hash: u32) -> bool168 fn second_slot_is_empty(&self, hash: u32) -> bool {
169 self.slot_is_empty(Self::second_slot_index(hash))
170 }
171
172 #[inline]
adjust_first_slot(&mut self, hash: u32, increment: bool)173 fn adjust_first_slot(&mut self, hash: u32, increment: bool) {
174 self.adjust_slot(Self::first_slot_index(hash), increment)
175 }
176
177 #[inline]
adjust_second_slot(&mut self, hash: u32, increment: bool)178 fn adjust_second_slot(&mut self, hash: u32, increment: bool) {
179 self.adjust_slot(Self::second_slot_index(hash), increment)
180 }
181
182 #[inline]
first_slot_index(hash: u32) -> usize183 fn first_slot_index(hash: u32) -> usize {
184 hash1(hash) as usize
185 }
186
187 #[inline]
second_slot_index(hash: u32) -> usize188 fn second_slot_index(hash: u32) -> usize {
189 hash2(hash) as usize
190 }
191 }
192
193 /// Storage class for a CountingBloomFilter that has 8-bit counters.
194 pub struct BloomStorageU8 {
195 counters: [u8; ARRAY_SIZE],
196 }
197
198 impl BloomStorage for BloomStorageU8 {
199 #[inline]
adjust_slot(&mut self, index: usize, increment: bool)200 fn adjust_slot(&mut self, index: usize, increment: bool) {
201 let slot = &mut self.counters[index];
202 if *slot != 0xff { // full
203 if increment {
204 *slot += 1;
205 } else {
206 *slot -= 1;
207 }
208 }
209 }
210
211 #[inline]
slot_is_empty(&self, index: usize) -> bool212 fn slot_is_empty(&self, index: usize) -> bool {
213 self.counters[index] == 0
214 }
215
216 #[inline]
is_zeroed(&self) -> bool217 fn is_zeroed(&self) -> bool {
218 self.counters.iter().all(|x| *x == 0)
219 }
220 }
221
222 impl Default for BloomStorageU8 {
default() -> Self223 fn default() -> Self {
224 BloomStorageU8 {
225 counters: [0; ARRAY_SIZE],
226 }
227 }
228 }
229
230 impl Clone for BloomStorageU8 {
clone(&self) -> Self231 fn clone(&self) -> Self {
232 BloomStorageU8 {
233 counters: self.counters,
234 }
235 }
236 }
237
238 /// Storage class for a CountingBloomFilter that has 1-bit counters.
239 pub struct BloomStorageBool {
240 counters: [u8; ARRAY_SIZE / 8],
241 }
242
243 impl BloomStorage for BloomStorageBool {
244 #[inline]
adjust_slot(&mut self, index: usize, increment: bool)245 fn adjust_slot(&mut self, index: usize, increment: bool) {
246 let bit = 1 << (index % 8);
247 let byte = &mut self.counters[index / 8];
248
249 // Since we have only one bit for storage, decrementing it
250 // should never do anything. Assert against an accidental
251 // decrementing of a bit that was never set.
252 assert!(increment || (*byte & bit) != 0,
253 "should not decrement if slot is already false");
254
255 if increment {
256 *byte |= bit;
257 }
258 }
259
260 #[inline]
slot_is_empty(&self, index: usize) -> bool261 fn slot_is_empty(&self, index: usize) -> bool {
262 let bit = 1 << (index % 8);
263 (self.counters[index / 8] & bit) == 0
264 }
265
266 #[inline]
is_zeroed(&self) -> bool267 fn is_zeroed(&self) -> bool {
268 self.counters.iter().all(|x| *x == 0)
269 }
270 }
271
272 impl Default for BloomStorageBool {
default() -> Self273 fn default() -> Self {
274 BloomStorageBool {
275 counters: [0; ARRAY_SIZE / 8],
276 }
277 }
278 }
279
280 impl Clone for BloomStorageBool {
clone(&self) -> Self281 fn clone(&self) -> Self {
282 BloomStorageBool {
283 counters: self.counters,
284 }
285 }
286 }
287
hash<T: Hash>(elem: &T) -> u32288 fn hash<T: Hash>(elem: &T) -> u32 {
289 let mut hasher = FnvHasher::default();
290 elem.hash(&mut hasher);
291 let hash: u64 = hasher.finish();
292 (hash >> 32) as u32 ^ (hash as u32)
293 }
294
295 #[inline]
hash1(hash: u32) -> u32296 fn hash1(hash: u32) -> u32 {
297 hash & KEY_MASK
298 }
299
300 #[inline]
hash2(hash: u32) -> u32301 fn hash2(hash: u32) -> u32 {
302 (hash >> KEY_SIZE) & KEY_MASK
303 }
304
305 #[test]
create_and_insert_some_stuff()306 fn create_and_insert_some_stuff() {
307 use std::mem::transmute;
308
309 let mut bf = BloomFilter::new();
310
311 // Statically assert that ARRAY_SIZE is a multiple of 8, which
312 // BloomStorageBool relies on.
313 unsafe {
314 transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]);
315 }
316
317 for i in 0_usize .. 1000 {
318 bf.insert(&i);
319 }
320
321 for i in 0_usize .. 1000 {
322 assert!(bf.might_contain(&i));
323 }
324
325 let false_positives =
326 (1001_usize .. 2000).filter(|i| bf.might_contain(i)).count();
327
328 assert!(false_positives < 150, "{} is not < 150", false_positives); // 15%.
329
330 for i in 0_usize .. 100 {
331 bf.remove(&i);
332 }
333
334 for i in 100_usize .. 1000 {
335 assert!(bf.might_contain(&i));
336 }
337
338 let false_positives = (0_usize .. 100).filter(|i| bf.might_contain(i)).count();
339
340 assert!(false_positives < 20, "{} is not < 20", false_positives); // 20%.
341
342 bf.clear();
343
344 for i in 0_usize .. 2000 {
345 assert!(!bf.might_contain(&i));
346 }
347 }
348
349 #[cfg(feature = "bench")]
350 #[cfg(test)]
351 mod bench {
352 extern crate test;
353 use super::BloomFilter;
354
355 #[derive(Default)]
356 struct HashGenerator(u32);
357
358 impl HashGenerator {
next(&mut self) -> u32359 fn next(&mut self) -> u32 {
360 // Each hash is split into two twelve-bit segments, which are used
361 // as an index into an array. We increment each by 64 so that we
362 // hit the next cache line, and then another 1 so that our wrapping
363 // behavior leads us to different entries each time.
364 //
365 // Trying to simulate cold caches is rather difficult with the cargo
366 // benchmarking setup, so it may all be moot depending on the number
367 // of iterations that end up being run. But we might as well.
368 self.0 += (65) + (65 << super::KEY_SIZE);
369 self.0
370 }
371 }
372
373 #[bench]
create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher)374 fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) {
375 b.iter(|| {
376 let mut gen1 = HashGenerator::default();
377 let mut gen2 = HashGenerator::default();
378 let mut bf = BloomFilter::new();
379 for _ in 0_usize .. 1000 {
380 bf.insert_hash(gen1.next());
381 }
382 for _ in 0_usize .. 100 {
383 bf.remove_hash(gen2.next());
384 }
385 for _ in 100_usize .. 200 {
386 test::black_box(bf.might_contain_hash(gen2.next()));
387 }
388 });
389 }
390
391 #[bench]
might_contain_10(b: &mut test::Bencher)392 fn might_contain_10(b: &mut test::Bencher) {
393 let bf = BloomFilter::new();
394 let mut gen = HashGenerator::default();
395 b.iter(|| for _ in 0..10 {
396 test::black_box(bf.might_contain_hash(gen.next()));
397 });
398 }
399
400 #[bench]
clear(b: &mut test::Bencher)401 fn clear(b: &mut test::Bencher) {
402 let mut bf = Box::new(BloomFilter::new());
403 b.iter(|| test::black_box(&mut bf).clear());
404 }
405
406 #[bench]
insert_10(b: &mut test::Bencher)407 fn insert_10(b: &mut test::Bencher) {
408 let mut bf = BloomFilter::new();
409 let mut gen = HashGenerator::default();
410 b.iter(|| for _ in 0..10 {
411 test::black_box(bf.insert_hash(gen.next()));
412 });
413 }
414
415 #[bench]
remove_10(b: &mut test::Bencher)416 fn remove_10(b: &mut test::Bencher) {
417 let mut bf = BloomFilter::new();
418 let mut gen = HashGenerator::default();
419 // Note: this will underflow, and that's ok.
420 b.iter(|| for _ in 0..10 {
421 bf.remove_hash(gen.next())
422 });
423 }
424 }
425