use std::mem;

use packed::pattern::{PatternID, Patterns};
use Match;

/// The type of the rolling hash used in the Rabin-Karp algorithm.
type Hash = usize;

/// The number of buckets to store our patterns in. We don't want this to be
/// too big in order to avoid wasting memory, but we don't want it to be too
/// small either to avoid spending too much time confirming literals.
///
/// The number of buckets MUST be a power of two. Otherwise, determining the
/// bucket from a hash will slow down the code considerably. Using a power
/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
/// instruction.
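///
/// For example, with `NUM_BUCKETS == 64`, `hash % NUM_BUCKETS` is equivalent
/// to `hash & 63`.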
const NUM_BUCKETS: usize = 64;

/// An implementation of the Rabin-Karp algorithm. The main idea of this
/// algorithm is to maintain a rolling hash as it moves through the input, and
/// then check whether that hash corresponds to the same hash for any of the
/// patterns we're looking for.
///
/// A drawback of naively scaling Rabin-Karp to multiple patterns is that
/// it requires all of the patterns to be the same length, which in turn
/// corresponds to the number of bytes to hash. We adapt this to work for
/// multiple patterns of varying size by fixing the number of bytes to hash
/// to be the length of the smallest pattern. We also split the patterns into
/// several buckets to hopefully make the confirmation step faster.
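///
/// As a rough sketch of how the rolling hash works (using the `h -> 2*h + b`
/// hash defined below, and ignoring wrapping), consider a three byte window:
///
/// ```text
/// hash("abc") = 4*a + 2*b + c
/// // slide right by one byte: remove `a`, append `d`
/// hash("bcd") = (hash("abc") - 4*a) * 2 + d
///             = 4*b + 2*c + d
/// ```
///
/// The `4` above is `hash_2pow`, i.e., `2^(hash_len - 1)` with `hash_len`
/// equal to `3`.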
///
/// Wikipedia has a decent explanation, if a bit heavy on the theory:
/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
///
/// But ESMAJ provides something a bit more concrete:
/// http://www-igm.univ-mlv.fr/~lecroq/string/node5.html
#[derive(Clone, Debug)]
pub struct RabinKarp {
    /// The order of patterns in each bucket is significant. Namely, they are
    /// arranged such that the first one to match is the correct match. This
    /// may not necessarily correspond to the order provided by the caller.
    /// For example, if leftmost-longest semantics are used, then the patterns
    /// are sorted by their length in descending order. If leftmost-first
    /// semantics are used, then the patterns are sorted by their pattern ID
    /// in ascending order (which corresponds to the caller's order).
    buckets: Vec<Vec<(Hash, PatternID)>>,
    /// The length of the hashing window. Generally, this corresponds to the
    /// length of the smallest pattern.
    hash_len: usize,
    /// The factor to subtract out of a hash before updating it with a new
    /// byte. This is always `2^(hash_len - 1)`, i.e., the weight of the byte
    /// that falls out of the hashing window.
    hash_2pow: usize,
    /// The maximum identifier of a pattern. This is used as a sanity check
    /// to ensure that the patterns provided by the caller are the same as
    /// the patterns that were used to compile the matcher. This sanity check
    /// possibly permits safely eliminating bounds checks regardless of what
    /// patterns are provided by the caller.
    ///
    /// (Currently, we don't use this to elide bounds checks since it doesn't
    /// result in a measurable performance improvement, but we do use it for
    /// better failure modes.)
    max_pattern_id: PatternID,
}

impl RabinKarp {
    /// Compile a new Rabin-Karp matcher from the patterns given.
    ///
    /// This panics if any of the patterns in the collection are empty, or if
    /// the collection is itself empty.
    pub fn new(patterns: &Patterns) -> RabinKarp {
        assert!(patterns.len() >= 1);
        let hash_len = patterns.minimum_len();
        assert!(hash_len >= 1);

        let mut hash_2pow = 1usize;
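        // Shifting left `hash_len - 1` times makes `hash_2pow` equal to
        // `2^(hash_len - 1)`: the weight of the oldest byte in the hashing
        // window, which `update_hash` subtracts out on each step.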
        for _ in 1..hash_len {
            hash_2pow = hash_2pow.wrapping_shl(1);
        }

        let mut rk = RabinKarp {
            buckets: vec![vec![]; NUM_BUCKETS],
            hash_len,
            hash_2pow,
            max_pattern_id: patterns.max_pattern_id(),
        };
        for (id, pat) in patterns.iter() {
            let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
            let bucket = hash % NUM_BUCKETS;
            rk.buckets[bucket].push((hash, id));
        }
        rk
    }

    /// Return the first matching pattern in the given haystack, beginning the
    /// search at `at`.
    pub fn find_at(
        &self,
        patterns: &Patterns,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        assert_eq!(NUM_BUCKETS, self.buckets.len());
        assert_eq!(
            self.max_pattern_id,
            patterns.max_pattern_id(),
            "Rabin-Karp must be called with same patterns it was built with",
        );

        if at + self.hash_len > haystack.len() {
            return None;
        }
        let mut hash = self.hash(&haystack[at..at + self.hash_len]);
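        // Scan one window at a time: check the bucket selected by the current
        // hash for candidate patterns, then roll the hash forward by one byte
        // until the window would run past the end of the haystack.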
        loop {
            let bucket = &self.buckets[hash % NUM_BUCKETS];
            for &(phash, pid) in bucket {
                if phash == hash {
                    if let Some(c) = self.verify(patterns, pid, haystack, at) {
                        return Some(c);
                    }
                }
            }
            if at + self.hash_len >= haystack.len() {
                return None;
            }
            hash = self.update_hash(
                hash,
                haystack[at],
                haystack[at + self.hash_len],
            );
            at += 1;
        }
    }

    /// Returns the approximate total amount of heap used by this searcher, in
    /// units of bytes.
    pub fn heap_bytes(&self) -> usize {
        let num_patterns = self.max_pattern_id as usize + 1;
        self.buckets.len() * mem::size_of::<Vec<(Hash, PatternID)>>()
            + num_patterns * mem::size_of::<(Hash, PatternID)>()
    }

    /// Verify whether the pattern with the given id matches at
    /// `haystack[at..]`.
    ///
    /// We tag this function as `cold` because it helps improve codegen.
    /// Intuitively, it would seem like inlining it would be better. However,
    /// the only time this is called and a match is not found is when there
    /// is a hash collision, or when a prefix of a pattern matches but the
    /// entire pattern doesn't match. This is hopefully fairly rare, and if
    /// it does occur a lot, it's going to be slow no matter what we do.
    #[cold]
    fn verify(
        &self,
        patterns: &Patterns,
        id: PatternID,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        let pat = patterns.get(id);
        if pat.is_prefix(&haystack[at..]) {
            Some(Match::from_span(id as usize, at, at + pat.len()))
        } else {
            None
        }
    }

    /// Hash the given bytes.
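    ///
    /// The hash of a window of `n` bytes is `b[0]*2^(n-1) + b[1]*2^(n-2)
    /// + ... + b[n-1]` (with wrapping arithmetic), built by folding
    /// `h -> 2*h + b` over the bytes.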
    fn hash(&self, bytes: &[u8]) -> Hash {
        assert_eq!(self.hash_len, bytes.len());

        let mut hash = 0usize;
        for &b in bytes {
            hash = hash.wrapping_shl(1).wrapping_add(b as usize);
        }
        hash
    }

    /// Update the hash given based on removing `old_byte` at the beginning
    /// of some byte string, and appending `new_byte` to the end of that same
    /// byte string.
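    ///
    /// If the previous hash is `old_byte * 2^(n-1) + rest`, then subtracting
    /// `old_byte * hash_2pow` leaves `rest`, and shifting left once and
    /// adding `new_byte` yields the hash of the new window.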
    fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
        prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
            .wrapping_shl(1)
            .wrapping_add(new_byte as usize)
    }
}