1 extern crate byteorder;
2 extern crate digest;
3 extern crate murmurhash3;
4 extern crate sha2;
5 
6 use byteorder::ReadBytesExt;
7 use murmurhash3::murmurhash3_x86_32;
8 use sha2::{Digest, Sha256};
9 use std::convert::{TryFrom, TryInto};
10 use std::fmt;
11 use std::io::{Error, ErrorKind};
12 
13 /// Helper struct to provide read-only bit access to a slice of bytes.
14 struct BitSlice<'a> {
15     /// The slice of bytes we're interested in.
16     bytes: &'a [u8],
17     /// The number of bits that are valid to access in the slice.
18     /// Not necessarily equal to `bytes.len() * 8`, but it will not be greater than that.
19     bit_len: usize,
20 }
21 
22 impl<'a> BitSlice<'a> {
23     /// Creates a new `BitSlice` of the given bit length over the given slice of data.
24     /// Panics if the indicated bit length is larger than fits in the slice.
25     ///
26     /// # Arguments
27     /// * `bytes` - The slice of bytes we need bit-access to
28     /// * `bit_len` - The number of bits that are valid to access in the slice
new(bytes: &'a [u8], bit_len: usize) -> BitSlice<'a>29     fn new(bytes: &'a [u8], bit_len: usize) -> BitSlice<'a> {
30         if bit_len > bytes.len() * 8 {
31             panic!(
32                 "bit_len too large for given data: {} > {} * 8",
33                 bit_len,
34                 bytes.len()
35             );
36         }
37         BitSlice { bytes, bit_len }
38     }
39 
40     /// Get the value of the specified bit.
41     /// Panics if the specified bit is out of range for the number of bits in this instance.
42     ///
43     /// # Arguments
44     /// * `bit_index` - The bit index to access
get(&self, bit_index: usize) -> bool45     fn get(&self, bit_index: usize) -> bool {
46         if bit_index >= self.bit_len {
47             panic!(
48                 "bit index out of range for bit slice: {} >= {}",
49                 bit_index, self.bit_len
50             );
51         }
52         let byte_index = bit_index / 8;
53         let final_bit_index = bit_index % 8;
54         let byte = self.bytes[byte_index];
55         let test_value = match final_bit_index {
56             0 => byte & 0b0000_0001u8,
57             1 => byte & 0b0000_0010u8,
58             2 => byte & 0b0000_0100u8,
59             3 => byte & 0b0000_1000u8,
60             4 => byte & 0b0001_0000u8,
61             5 => byte & 0b0010_0000u8,
62             6 => byte & 0b0100_0000u8,
63             7 => byte & 0b1000_0000u8,
64             _ => panic!("impossible final_bit_index value: {}", final_bit_index),
65         };
66         test_value > 0
67     }
68 }
69 
70 /// A Bloom filter representing a specific level in a multi-level cascading Bloom filter.
71 struct Bloom<'a> {
72     /// What level this filter is in
73     level: u8,
74     /// How many hash functions this filter uses
75     n_hash_funcs: u32,
76     /// The bit length of the filter
77     size: u32,
78     /// The data of the filter
79     bit_slice: BitSlice<'a>,
80     /// The hash algorithm enumeration in use
81     hash_algorithm: HashAlgorithm,
82 }
83 
84 #[repr(u8)]
85 #[derive(Copy, Clone)]
86 /// These enumerations need to match the python filter-cascade project:
87 /// https://github.com/mozilla/filter-cascade/blob/v0.3.0/filtercascade/fileformats.py
88 enum HashAlgorithm {
89     MurmurHash3 = 1,
90     Sha256 = 2,
91 }
92 
93 impl fmt::Display for HashAlgorithm {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result94     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
95         write!(f, "{}", *self as u8)
96     }
97 }
98 
99 impl TryFrom<u8> for HashAlgorithm {
100     type Error = ();
try_from(value: u8) -> Result<HashAlgorithm, ()>101     fn try_from(value: u8) -> Result<HashAlgorithm, ()> {
102         match value {
103             // Naturally, these need to match the enum declaration
104             1 => Ok(Self::MurmurHash3),
105             2 => Ok(Self::Sha256),
106             _ => Err(()),
107         }
108     }
109 }
110 
111 impl<'a> Bloom<'a> {
112     /// Attempts to decode and return a pair that consists of the Bloom filter represented by the
113     /// given bytes and any remaining unprocessed bytes in the given bytes.
114     ///
115     /// # Arguments
116     /// * `bytes` - The encoded representation of this Bloom filter. May include additional data
117     /// describing further Bloom filters. Any additional data is returned unconsumed.
118     /// The format of an encoded Bloom filter is:
119     /// [1 byte] - the hash algorithm to use in the filter
120     /// [4 little endian bytes] - the length in bits of the filter
121     /// [4 little endian bytes] - the number of hash functions to use in the filter
122     /// [1 byte] - which level in the cascade this filter is
123     /// [variable length bytes] - the filter itself (the length is determined by Ceiling(bit length
124     /// / 8)
from_bytes(bytes: &'a [u8]) -> Result<(Bloom<'a>, &'a [u8]), Error>125     pub fn from_bytes(bytes: &'a [u8]) -> Result<(Bloom<'a>, &'a [u8]), Error> {
126         let mut cursor = bytes;
127         // Load the layer metadata. bloomer.py writes size, nHashFuncs and level as little-endian
128         // unsigned ints.
129         let hash_algorithm_val = cursor.read_u8()?;
130         let hash_algorithm = match HashAlgorithm::try_from(hash_algorithm_val) {
131             Ok(algo) => algo,
132             Err(()) => {
133                 return Err(Error::new(
134                     ErrorKind::InvalidData,
135                     "Unexpected hash algorithm",
136                 ))
137             }
138         };
139 
140         let size = cursor.read_u32::<byteorder::LittleEndian>()?;
141         let n_hash_funcs = cursor.read_u32::<byteorder::LittleEndian>()?;
142         let level = cursor.read_u8()?;
143 
144         let shifted_size = size.wrapping_shr(3) as usize;
145         let byte_count = if size % 8 != 0 {
146             shifted_size + 1
147         } else {
148             shifted_size
149         };
150         if byte_count > cursor.len() {
151             return Err(Error::new(
152                 ErrorKind::InvalidData,
153                 "Invalid Bloom filter: too short",
154             ));
155         }
156         let (bits_bytes, rest_of_bytes) = cursor.split_at(byte_count);
157         let bloom = Bloom {
158             level,
159             n_hash_funcs,
160             size,
161             bit_slice: BitSlice::new(bits_bytes, size as usize),
162             hash_algorithm,
163         };
164         Ok((bloom, rest_of_bytes))
165     }
166 
hash(&self, n_fn: u32, key: &[u8], salt: Option<&[u8]>) -> u32167     fn hash(&self, n_fn: u32, key: &[u8], salt: Option<&[u8]>) -> u32 {
168         match self.hash_algorithm {
169             HashAlgorithm::MurmurHash3 => {
170                 if salt.is_some() {
171                     panic!("murmur does not support salts")
172                 }
173                 let hash_seed = (n_fn << 16) + self.level as u32;
174                 murmurhash3_x86_32(key, hash_seed) % self.size
175             }
176             HashAlgorithm::Sha256 => {
177                 let mut hasher = Sha256::new();
178                 if let Some(salt_bytes) = salt {
179                     hasher.input(salt_bytes)
180                 }
181                 hasher.input(n_fn.to_le_bytes());
182                 hasher.input(self.level.to_le_bytes());
183                 hasher.input(key);
184 
185                 u32::from_le_bytes(
186                     hasher.result()[0..4]
187                         .try_into()
188                         .expect("sha256 should have given enough bytes"),
189                 ) % self.size
190             }
191         }
192     }
193 
194     /// Test for the presence of a given sequence of bytes in this Bloom filter.
195     ///
196     /// # Arguments
197     /// `item` - The slice of bytes to test for
has(&self, item: &[u8], salt: Option<&[u8]>) -> bool198     pub fn has(&self, item: &[u8], salt: Option<&[u8]>) -> bool {
199         for i in 0..self.n_hash_funcs {
200             if !self.bit_slice.get(self.hash(i, item, salt) as usize) {
201                 return false;
202             }
203         }
204         true
205     }
206 }
207 
208 impl<'a> fmt::Display for Bloom<'a> {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result209     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
210         write!(
211             f,
212             "level={} n_hash_funcs={} hash_algorithm={} size={}",
213             self.level, self.n_hash_funcs, self.hash_algorithm, self.size
214         )
215     }
216 }
217 
218 /// A multi-level cascading Bloom filter.
219 pub struct Cascade<'a> {
220     /// The Bloom filter for this level in the cascade
221     filter: Bloom<'a>,
222     /// The next (lower) level in the cascade
223     child_layer: Option<Box<Cascade<'a>>>,
224     /// The salt in use, if any
225     salt: Option<&'a [u8]>,
226     /// Whether the logic should be inverted
227     inverted: bool,
228 }
229 
230 impl<'a> Cascade<'a> {
231     /// Attempts to decode and return a multi-level cascading Bloom filter. NB: `Cascade` does not
232     /// take ownership of the given data. This is to facilitate decoding cascading filters
233     /// backed by memory-mapped files.
234     ///
235     /// # Arguments
236     /// `bytes` - The encoded representation of the Bloom filters in this cascade. Starts with 2
237     /// little endian bytes indicating the version. The current version is 2. The Python
238     /// filter-cascade project defines the formats, see
239     /// https://github.com/mozilla/filter-cascade/blob/v0.3.0/filtercascade/fileformats.py
240     ///
241     /// May be of length 0, in which case `None` is returned.
from_bytes(bytes: &'a [u8]) -> Result<Option<Box<Cascade<'a>>>, Error>242     pub fn from_bytes(bytes: &'a [u8]) -> Result<Option<Box<Cascade<'a>>>, Error> {
243         if bytes.is_empty() {
244             return Ok(None);
245         }
246         let mut cursor = bytes;
247         let version = cursor.read_u16::<byteorder::LittleEndian>()?;
248         let mut salt = None;
249         let mut inverted = false;
250 
251         if version >= 2 {
252             inverted = cursor.read_u8()? != 0;
253             let salt_len = cursor.read_u8()? as usize;
254 
255             if salt_len > cursor.len() {
256                 return Err(Error::new(
257                     ErrorKind::InvalidData,
258                     "Invalid Bloom filter: too short",
259                 ));
260             }
261 
262             let (salt_bytes, remaining_bytes) = cursor.split_at(salt_len);
263             if salt_len > 0 {
264                 salt = Some(salt_bytes)
265             }
266             cursor = remaining_bytes;
267         }
268 
269         if version > 2 {
270             return Err(Error::new(
271                 ErrorKind::InvalidData,
272                 format!("Invalid version: {}", version),
273             ));
274         }
275 
276         Cascade::child_layer_from_bytes(cursor, salt, inverted)
277     }
278 
child_layer_from_bytes( bytes: &'a [u8], salt: Option<&'a [u8]>, inverted: bool, ) -> Result<Option<Box<Cascade<'a>>>, Error>279     fn child_layer_from_bytes(
280         bytes: &'a [u8],
281         salt: Option<&'a [u8]>,
282         inverted: bool,
283     ) -> Result<Option<Box<Cascade<'a>>>, Error> {
284         if bytes.is_empty() {
285             return Ok(None);
286         }
287         let (filter, rest_of_bytes) = Bloom::from_bytes(bytes)?;
288         Ok(Some(Box::new(Cascade {
289             filter,
290             child_layer: Cascade::child_layer_from_bytes(rest_of_bytes, salt, inverted)?,
291             salt,
292             inverted,
293         })))
294     }
295 
296     /// Determine if the given sequence of bytes is in the cascade.
297     ///
298     /// # Arguments
299     /// `entry` - The slice of bytes to test for
has(&self, entry: &[u8]) -> bool300     pub fn has(&self, entry: &[u8]) -> bool {
301         let result = self.has_internal(entry);
302         if self.inverted {
303             return !result;
304         }
305         result
306     }
307 
has_internal(&self, entry: &[u8]) -> bool308     pub fn has_internal(&self, entry: &[u8]) -> bool {
309         if self.filter.has(&entry, self.salt) {
310             match self.child_layer {
311                 Some(ref child) => {
312                     let child_value = !child.has_internal(entry);
313                     return child_value;
314                 }
315                 None => {
316                     return true;
317                 }
318             }
319         }
320         false
321     }
322 }
323 
324 impl<'a> fmt::Display for Cascade<'a> {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result325     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
326         write!(
327             f,
328             "salt={:?} inverted={} filter=[{}] ",
329             self.salt, self.inverted, self.filter
330         )?;
331         match &self.child_layer {
332             Some(layer) => write!(f, "[child={}]", layer),
333             None => Ok(()),
334         }
335     }
336 }
337 
338 #[cfg(test)]
339 mod tests {
340     use Bloom;
341     use Cascade;
342 
343     #[test]
bloom_v1_test_from_bytes()344     fn bloom_v1_test_from_bytes() {
345         let src: Vec<u8> = vec![
346             0x01, 0x09, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x41, 0x00,
347         ];
348 
349         match Bloom::from_bytes(&src) {
350             Ok((bloom, rest_of_bytes)) => {
351                 assert!(rest_of_bytes.len() == 0);
352                 assert!(bloom.has(b"this", None) == true);
353                 assert!(bloom.has(b"that", None) == true);
354                 assert!(bloom.has(b"other", None) == false);
355             }
356             Err(_) => {
357                 panic!("Parsing failed");
358             }
359         };
360 
361         let short: Vec<u8> = vec![
362             0x01, 0x09, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x41,
363         ];
364         assert!(Bloom::from_bytes(&short).is_err());
365     }
366 
367     #[test]
bloom_v3_unsupported()368     fn bloom_v3_unsupported() {
369         let src: Vec<u8> = vec![0x03, 0x01, 0x00];
370         assert!(Bloom::from_bytes(&src).is_err());
371     }
372 
373     #[test]
cascade_v1_murmur_from_file_bytes_test()374     fn cascade_v1_murmur_from_file_bytes_test() {
375         let v = include_bytes!("../test_data/test_v1_murmur_mlbf");
376         let cascade = Cascade::from_bytes(v)
377             .expect("parsing Cascade should succeed")
378             .expect("Cascade should be Some");
379         // Key format is SHA256(issuer SPKI) + serial number
380         #[rustfmt::skip]
381         let key_for_revoked_cert_1 =
382             [ 0x2e, 0xb2, 0xd5, 0xa8, 0x60, 0xfe, 0x50, 0xe9, 0xc2, 0x42, 0x36, 0x85, 0x52, 0x98,
383               0x01, 0x50, 0xe4, 0x5d, 0xb5, 0x32, 0x1a, 0x5b, 0x00, 0x5e, 0x26, 0xd6, 0x76, 0x25,
384               0x3a, 0x40, 0x9b, 0xf5,
385               0x06, 0x2d, 0xf5, 0x68, 0xa0, 0x51, 0x31, 0x08, 0x20, 0xd7, 0xec, 0x43, 0x27, 0xe1,
386               0xba, 0xfd ];
387         assert!(cascade.has(&key_for_revoked_cert_1));
388         #[rustfmt::skip]
389         let key_for_revoked_cert_2 =
390             [ 0xf1, 0x1c, 0x3d, 0xd0, 0x48, 0xf7, 0x4e, 0xdb, 0x7c, 0x45, 0x19, 0x2b, 0x83, 0xe5,
391               0x98, 0x0d, 0x2f, 0x67, 0xec, 0x84, 0xb4, 0xdd, 0xb9, 0x39, 0x6e, 0x33, 0xff, 0x51,
392               0x73, 0xed, 0x69, 0x8f,
393               0x00, 0xd2, 0xe8, 0xf6, 0xaa, 0x80, 0x48, 0x1c, 0xd4 ];
394         assert!(cascade.has(&key_for_revoked_cert_2));
395         #[rustfmt::skip]
396         let key_for_valid_cert =
397             [ 0x99, 0xfc, 0x9d, 0x40, 0xf1, 0xad, 0xb1, 0x63, 0x65, 0x61, 0xa6, 0x1d, 0x68, 0x3d,
398               0x9e, 0xa6, 0xb4, 0x60, 0xc5, 0x7d, 0x0c, 0x75, 0xea, 0x00, 0xc3, 0x41, 0xb9, 0xdf,
399               0xb9, 0x0b, 0x5f, 0x39,
400               0x0b, 0x77, 0x75, 0xf7, 0xaf, 0x9a, 0xe5, 0x42, 0x65, 0xc9, 0xcd, 0x32, 0x57, 0x10,
401               0x77, 0x8e ];
402         assert!(!cascade.has(&key_for_valid_cert));
403 
404         let v = include_bytes!("../test_data/test_v1_murmur_short_mlbf");
405         assert!(Cascade::from_bytes(v).is_err());
406     }
407 
408     #[test]
cascade_v2_sha256_from_file_bytes_test()409     fn cascade_v2_sha256_from_file_bytes_test() {
410         let v = include_bytes!("../test_data/test_v2_sha256_mlbf");
411         let cascade = Cascade::from_bytes(v)
412             .expect("parsing Cascade should succeed")
413             .expect("Cascade should be Some");
414 
415         assert!(cascade.salt == None);
416         assert!(cascade.inverted == false);
417         assert!(cascade.has(b"this") == true);
418         assert!(cascade.has(b"that") == true);
419         assert!(cascade.has(b"other") == false);
420     }
421 
422     #[test]
cascade_v2_sha256_with_salt_from_file_bytes_test()423     fn cascade_v2_sha256_with_salt_from_file_bytes_test() {
424         let v = include_bytes!("../test_data/test_v2_sha256_salt_mlbf");
425         let cascade = Cascade::from_bytes(v)
426             .expect("parsing Cascade should succeed")
427             .expect("Cascade should be Some");
428 
429         assert!(cascade.salt == Some(b"nacl"));
430         assert!(cascade.inverted == false);
431         assert!(cascade.has(b"this") == true);
432         assert!(cascade.has(b"that") == true);
433         assert!(cascade.has(b"other") == false);
434     }
435 
436     #[test]
cascade_v2_murmur_from_file_bytes_test()437     fn cascade_v2_murmur_from_file_bytes_test() {
438         let v = include_bytes!("../test_data/test_v2_murmur_mlbf");
439         let cascade = Cascade::from_bytes(v)
440             .expect("parsing Cascade should succeed")
441             .expect("Cascade should be Some");
442 
443         assert!(cascade.salt == None);
444         assert!(cascade.inverted == false);
445         assert!(cascade.has(b"this") == true);
446         assert!(cascade.has(b"that") == true);
447         assert!(cascade.has(b"other") == false);
448     }
449 
450     #[test]
cascade_v2_murmur_inverted_from_file_bytes_test()451     fn cascade_v2_murmur_inverted_from_file_bytes_test() {
452         let v = include_bytes!("../test_data/test_v2_murmur_inverted_mlbf");
453         let cascade = Cascade::from_bytes(v)
454             .expect("parsing Cascade should succeed")
455             .expect("Cascade should be Some");
456 
457         assert!(cascade.salt == None);
458         assert!(cascade.inverted == true);
459         assert!(cascade.has(b"this") == true);
460         assert!(cascade.has(b"that") == true);
461         assert!(cascade.has(b"other") == false);
462     }
463 
464     #[test]
cascade_v2_sha256_inverted_from_file_bytes_test()465     fn cascade_v2_sha256_inverted_from_file_bytes_test() {
466         let v = include_bytes!("../test_data/test_v2_sha256_inverted_mlbf");
467         let cascade = Cascade::from_bytes(v)
468             .expect("parsing Cascade should succeed")
469             .expect("Cascade should be Some");
470 
471         assert!(cascade.salt == None);
472         assert!(cascade.inverted == true);
473         assert!(cascade.has(b"this") == true);
474         assert!(cascade.has(b"that") == true);
475         assert!(cascade.has(b"other") == false);
476     }
477 }
478