1 // This module contains a couple simple and purpose built hash maps. The key
2 // trade off they make is that they serve as caches rather than true maps. That
3 // is, inserting a new entry may cause eviction of another entry. This gives
4 // us two things. First, there's less overhead associated with inserts and
5 // lookups. Secondly, it lets us control our memory usage.
6 //
7 // These maps are used in some fairly hot code when generating NFA states for
8 // large Unicode character classes.
9 //
10 // Instead of exposing a rich hashmap entry API, we just permit the caller
11 // to produce a hash of the key directly. The hash can then be reused for both
12 // lookups and insertions at the cost of leaking things a bit. But these are
13 // for internal use only, so it's fine.
14 //
15 // The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
16 // (almost) minimal DFA for large Unicode character classes in linear time.
17 // (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
18 // NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
19 // since there's a bit more expense in the reverse direction.)
20 //
21 // The Utf8SuffixMap is used when compiling large Unicode character classes for
22 // reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
23 // construction of UTF-8 automata by caching common suffixes. This doesn't
24 // get the same space savings as Daciuk's algorithm, but it's basically as
25 // fast as the naive approach and typically winds up using less memory (since
26 // it generates smaller NFAs) despite the presence of the cache.
27 //
28 // These maps effectively represent caching mechanisms for CState::Sparse and
29 // CState::Range, respectively. The former represents a single NFA state with
30 // many transitions of equivalent priority while the latter represents a single
31 // NFA state with a single transition. (Neither state ever has or is an
32 // epsilon transition.) Thus, they have different key types. It's likely we
33 // could make one generic map, but the machinery didn't seem worth it. They
34 // are simple enough.
35 
36 use nfa::{StateID, Transition};
37 
38 // Basic FNV-1a hash constants as described in:
39 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
40 const PRIME: u64 = 1099511628211;
41 const INIT: u64 = 14695981039346656037;
42 
43 /// A bounded hash map where the key is a sequence of NFA transitions and the
44 /// value is a pre-existing NFA state ID.
45 ///
46 /// std's hashmap can be used for this, however, this map has two important
47 /// advantages. Firstly, it has lower overhead. Secondly, it permits us to
48 /// control our memory usage by limited the number of slots. In general, the
49 /// cost here is that this map acts as a cache. That is, inserting a new entry
50 /// may remove an old entry. We are okay with this, since it does not impact
51 /// correctness in the cases where it is used. The only effect that dropping
52 /// states from the cache has is that the resulting NFA generated may be bigger
53 /// than it otherwise would be.
54 ///
55 /// This improves benchmarks that compile large Unicode character classes,
56 /// since it makes the generation of (almost) minimal UTF-8 automaton faster.
57 /// Specifically, one could observe the difference with std's hashmap via
58 /// something like the following benchmark:
59 ///
60 ///   hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
61 ///
62 /// But to observe that difference, you'd have to modify the code to use
63 /// std's hashmap.
64 ///
65 /// It is quite possible that there is a better way to approach this problem.
66 /// For example, if there happens to be a very common state that collides with
67 /// a lot of less frequent states, then we could wind up with very poor caching
68 /// behavior. Alas, the effectiveness of this cache has not been measured.
69 /// Instead, ad hoc experiments suggest that it is "good enough." Additional
70 /// smarts (such as an LRU eviction policy) have to be weighed against the
71 /// amount of extra time they cost.
72 #[derive(Clone, Debug)]
73 pub struct Utf8BoundedMap {
74     /// The current version of this map. Only entries with matching versions
75     /// are considered during lookups. If an entry is found with a mismatched
76     /// version, then the map behaves as if the entry does not exist.
77     version: u16,
78     /// The total number of entries this map can store.
79     capacity: usize,
80     /// The actual entries, keyed by hash. Collisions between different states
81     /// result in the old state being dropped.
82     map: Vec<Utf8BoundedEntry>,
83 }
84 
85 /// An entry in this map.
86 #[derive(Clone, Debug, Default)]
87 struct Utf8BoundedEntry {
88     /// The version of the map used to produce this entry. If this entry's
89     /// version does not match the current version of the map, then the map
90     /// should behave as if this entry does not exist.
91     version: u16,
92     /// The key, which is a sorted sequence of non-overlapping NFA transitions.
93     key: Vec<Transition>,
94     /// The state ID corresponding to the state containing the transitions in
95     /// this entry.
96     val: StateID,
97 }
98 
99 impl Utf8BoundedMap {
100     /// Create a new bounded map with the given capacity. The map will never
101     /// grow beyond the given size.
102     ///
103     /// Note that this does not allocate. Instead, callers must call `clear`
104     /// before using this map. `clear` will allocate space if necessary.
105     ///
106     /// This avoids the need to pay for the allocation of this map when
107     /// compiling regexes that lack large Unicode character classes.
new(capacity: usize) -> Utf8BoundedMap108     pub fn new(capacity: usize) -> Utf8BoundedMap {
109         assert!(capacity > 0);
110         Utf8BoundedMap { version: 0, capacity, map: vec![] }
111     }
112 
113     /// Clear this map of all entries, but permit the reuse of allocation
114     /// if possible.
115     ///
116     /// This must be called before the map can be used.
clear(&mut self)117     pub fn clear(&mut self) {
118         if self.map.is_empty() {
119             self.map = vec![Utf8BoundedEntry::default(); self.capacity];
120         } else {
121             self.version = self.version.wrapping_add(1);
122             if self.version == 0 {
123                 self.map = vec![Utf8BoundedEntry::default(); self.capacity];
124             }
125         }
126     }
127 
128     /// Return a hash of the given transitions.
hash(&self, key: &[Transition]) -> usize129     pub fn hash(&self, key: &[Transition]) -> usize {
130         let mut h = INIT;
131         for t in key {
132             h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
133             h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
134             h = (h ^ (t.next as u64)).wrapping_mul(PRIME);
135         }
136         (h as usize) % self.map.len()
137     }
138 
139     /// Retrieve the cached state ID corresponding to the given key. The hash
140     /// given must have been computed with `hash` using the same key value.
141     ///
142     /// If there is no cached state with the given transitions, then None is
143     /// returned.
get(&mut self, key: &[Transition], hash: usize) -> Option<StateID>144     pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
145         let entry = &self.map[hash];
146         if entry.version != self.version {
147             return None;
148         }
149         // There may be a hash collision, so we need to confirm real equality.
150         if entry.key != key {
151             return None;
152         }
153         Some(entry.val)
154     }
155 
156     /// Add a cached state to this map with the given key. Callers should
157     /// ensure that `state_id` points to a state that contains precisely the
158     /// NFA transitions given.
159     ///
160     /// `hash` must have been computed using the `hash` method with the same
161     /// key.
set( &mut self, key: Vec<Transition>, hash: usize, state_id: StateID, )162     pub fn set(
163         &mut self,
164         key: Vec<Transition>,
165         hash: usize,
166         state_id: StateID,
167     ) {
168         self.map[hash] =
169             Utf8BoundedEntry { version: self.version, key, val: state_id };
170     }
171 }
172 
173 /// A cache of suffixes used to modestly compress UTF-8 automata for large
174 /// Unicode character classes.
175 #[derive(Clone, Debug)]
176 pub struct Utf8SuffixMap {
177     /// The current version of this map. Only entries with matching versions
178     /// are considered during lookups. If an entry is found with a mismatched
179     /// version, then the map behaves as if the entry does not exist.
180     version: u16,
181     /// The total number of entries this map can store.
182     capacity: usize,
183     /// The actual entries, keyed by hash. Collisions between different states
184     /// result in the old state being dropped.
185     map: Vec<Utf8SuffixEntry>,
186 }
187 
188 /// A key that uniquely identifies an NFA state. It is a triple that represents
189 /// a transition from one state for a particular byte range.
190 #[derive(Clone, Debug, Default, Eq, PartialEq)]
191 pub struct Utf8SuffixKey {
192     pub from: StateID,
193     pub start: u8,
194     pub end: u8,
195 }
196 
197 /// An entry in this map.
198 #[derive(Clone, Debug, Default)]
199 struct Utf8SuffixEntry {
200     /// The version of the map used to produce this entry. If this entry's
201     /// version does not match the current version of the map, then the map
202     /// should behave as if this entry does not exist.
203     version: u16,
204     /// The key, which consists of a transition in a particular state.
205     key: Utf8SuffixKey,
206     /// The identifier that the transition in the key maps to.
207     val: StateID,
208 }
209 
210 impl Utf8SuffixMap {
211     /// Create a new bounded map with the given capacity. The map will never
212     /// grow beyond the given size.
213     ///
214     /// Note that this does not allocate. Instead, callers must call `clear`
215     /// before using this map. `clear` will allocate space if necessary.
216     ///
217     /// This avoids the need to pay for the allocation of this map when
218     /// compiling regexes that lack large Unicode character classes.
new(capacity: usize) -> Utf8SuffixMap219     pub fn new(capacity: usize) -> Utf8SuffixMap {
220         assert!(capacity > 0);
221         Utf8SuffixMap { version: 0, capacity, map: vec![] }
222     }
223 
224     /// Clear this map of all entries, but permit the reuse of allocation
225     /// if possible.
226     ///
227     /// This must be called before the map can be used.
clear(&mut self)228     pub fn clear(&mut self) {
229         if self.map.is_empty() {
230             self.map = vec![Utf8SuffixEntry::default(); self.capacity];
231         } else {
232             self.version = self.version.wrapping_add(1);
233             if self.version == 0 {
234                 self.map = vec![Utf8SuffixEntry::default(); self.capacity];
235             }
236         }
237     }
238 
239     /// Return a hash of the given transition.
hash(&self, key: &Utf8SuffixKey) -> usize240     pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
241         // Basic FNV-1a hash as described:
242         // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
243         const PRIME: u64 = 1099511628211;
244         const INIT: u64 = 14695981039346656037;
245 
246         let mut h = INIT;
247         h = (h ^ (key.from as u64)).wrapping_mul(PRIME);
248         h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
249         h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
250         (h as usize) % self.map.len()
251     }
252 
253     /// Retrieve the cached state ID corresponding to the given key. The hash
254     /// given must have been computed with `hash` using the same key value.
255     ///
256     /// If there is no cached state with the given key, then None is returned.
get( &mut self, key: &Utf8SuffixKey, hash: usize, ) -> Option<StateID>257     pub fn get(
258         &mut self,
259         key: &Utf8SuffixKey,
260         hash: usize,
261     ) -> Option<StateID> {
262         let entry = &self.map[hash];
263         if entry.version != self.version {
264             return None;
265         }
266         if key != &entry.key {
267             return None;
268         }
269         Some(entry.val)
270     }
271 
272     /// Add a cached state to this map with the given key. Callers should
273     /// ensure that `state_id` points to a state that contains precisely the
274     /// NFA transition given.
275     ///
276     /// `hash` must have been computed using the `hash` method with the same
277     /// key.
set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID)278     pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
279         self.map[hash] =
280             Utf8SuffixEntry { version: self.version, key, val: state_id };
281     }
282 }
283