1 pub const WINDOW_SIZE: usize = 32768;
2 pub const WINDOW_MASK: usize = WINDOW_SIZE - 1;
3 #[cfg(test)]
4 pub const HASH_BYTES: usize = 3;
5 const HASH_SHIFT: u16 = 5;
6 const HASH_MASK: u16 = WINDOW_MASK as u16;
7 
8 /// Helper struct to let us allocate both head and prev in the same block.
9 struct Tables {
10     /// Starts of hash chains (in prev)
11     pub head: [u16; WINDOW_SIZE],
12     /// Link to previous occurence of this hash value
13     pub prev: [u16; WINDOW_SIZE],
14 }
15 
16 impl Default for Tables {
17     #[inline]
default() -> Tables18     fn default() -> Tables {
19         Tables {
20             head: [0;WINDOW_SIZE],
21             prev: [0;WINDOW_SIZE],
22         }
23     }
24 }
25 
26 impl Tables {
27     #[inline]
fill_prev(&mut self)28     fn fill_prev(&mut self) {
29         self.prev.copy_from_slice(&self.head);
30     }
31 }
32 
33 /// Create and box the hash chains.
create_tables() -> Box<Tables>34 fn create_tables() -> Box<Tables> {
35     // Using default here is a trick to get around the lack of box syntax on stable rust.
36     //
37     // Box::new([0u16,n]) ends up creating an temporary array on the stack which is not optimised
38     // but using default, which simply calls `box value` internally allows us to get around this.
39     //
40     // We could use vec instead, but using a boxed array helps the compiler optimise
41     // away bounds checks as `n & WINDOW_MASK < WINDOW_SIZE` will always be true.
42     let mut t: Box<Tables> = Box::default();
43 
44     for (n, b) in t.head.iter_mut().enumerate() {
45         *b = n as u16;
46     }
47 
48     t.fill_prev();
49 
50     t
51 }
52 
53 /// Returns a new hash value based on the previous value and the next byte
54 #[inline]
update_hash(current_hash: u16, to_insert: u8) -> u1655 pub fn update_hash(current_hash: u16, to_insert: u8) -> u16 {
56     update_hash_conf(current_hash, to_insert, HASH_SHIFT, HASH_MASK)
57 }
58 
59 #[inline]
update_hash_conf(current_hash: u16, to_insert: u8, shift: u16, mask: u16) -> u1660 fn update_hash_conf(current_hash: u16, to_insert: u8, shift: u16, mask: u16) -> u16 {
61     ((current_hash << shift) ^ (to_insert as u16)) & mask
62 }
63 
64 #[inline]
reset_array(arr: &mut [u16; WINDOW_SIZE])65 fn reset_array(arr: &mut [u16; WINDOW_SIZE]) {
66     for (n, b) in arr.iter_mut().enumerate() {
67         *b = n as u16;
68     }
69 }
70 
71 pub struct ChainedHashTable {
72     // Current running hash value of the last 3 bytes
73     current_hash: u16,
74     // Hash chains.
75     c: Box<Tables>,
76     // Used for testing
77     // count: DebugCounter,
78 }
79 
80 impl ChainedHashTable {
new() -> ChainedHashTable81     pub fn new() -> ChainedHashTable {
82         ChainedHashTable {
83             current_hash: 0,
84             c: create_tables(),
85             //count: DebugCounter::default(),
86         }
87     }
88 
89     #[cfg(test)]
from_starting_values(v1: u8, v2: u8) -> ChainedHashTable90     pub fn from_starting_values(v1: u8, v2: u8) -> ChainedHashTable {
91         let mut t = ChainedHashTable::new();
92         t.current_hash = update_hash(t.current_hash, v1);
93         t.current_hash = update_hash(t.current_hash, v2);
94         t
95     }
96 
97     /// Resets the hash value and hash chains
reset(&mut self)98     pub fn reset(&mut self) {
99         self.current_hash = 0;
100         reset_array(&mut self.c.head);
101         {
102             let h = self.c.head;
103             let mut c = self.c.prev;
104             c[..].copy_from_slice(&h[..]);
105         }
106         /*if cfg!(debug_assertions) {
107             self.count.reset();
108         }*/
109     }
110 
add_initial_hash_values(&mut self, v1: u8, v2: u8)111     pub fn add_initial_hash_values(&mut self, v1: u8, v2: u8) {
112         self.current_hash = update_hash(self.current_hash, v1);
113         self.current_hash = update_hash(self.current_hash, v2);
114     }
115 
116     /// Insert a byte into the hash table
117     #[inline]
add_hash_value(&mut self, position: usize, value: u8)118     pub fn add_hash_value(&mut self, position: usize, value: u8) {
119         // Check that all bytes are input in order and at the correct positions.
120         // Disabled for now as it breaks when sync flushing.
121         /*debug_assert_eq!(
122             position & WINDOW_MASK,
123             self.count.get() as usize & WINDOW_MASK
124         );*/
125         debug_assert!(
126             position < WINDOW_SIZE * 2,
127             "Position is larger than 2 * window size! {}",
128             position
129         );
130         // Storing the hash in a temporary variable here makes the compiler avoid the
131         // bounds checks in this function.
132         let new_hash = update_hash(self.current_hash, value);
133 
134         self.add_with_hash(position, new_hash);
135 
136         // Update the stored hash value with the new hash.
137         self.current_hash = new_hash;
138     }
139 
140     /// Directly set the current hash value
141     #[inline]
set_hash(&mut self, hash: u16)142     pub fn set_hash(&mut self, hash: u16) {
143         self.current_hash = hash;
144     }
145 
146     /// Update the tables directly, providing the hash.
147     #[inline]
add_with_hash(&mut self, position: usize, hash: u16)148     pub fn add_with_hash(&mut self, position: usize, hash: u16) {
149         /*if cfg!(debug_assertions) {
150             self.count.add(1);
151         }*/
152 
153         self.c.prev[position & WINDOW_MASK] = self.c.head[hash as usize];
154 
155         // Ignoring any bits over 16 here is deliberate, as we only concern ourselves about
156         // where in the buffer (which is 64k bytes) we are referring to.
157         self.c.head[hash as usize] = position as u16;
158     }
159 
160     // Get the head of the hash chain for the current hash value
161     #[cfg(test)]
162     #[inline]
current_head(&self) -> u16163     pub fn current_head(&self) -> u16 {
164         self.c.head[self.current_hash as usize]
165     }
166 
167     #[inline]
current_hash(&self) -> u16168     pub fn current_hash(&self) -> u16 {
169         self.current_hash
170     }
171 
172     #[inline]
get_prev(&self, bytes: usize) -> u16173     pub fn get_prev(&self, bytes: usize) -> u16 {
174         self.c.prev[bytes & WINDOW_MASK]
175     }
176 
177     #[cfg(test)]
178     #[inline]
farthest_next(&self, match_pos: usize, match_len: usize) -> usize179     pub fn farthest_next(&self, match_pos: usize, match_len: usize) -> usize {
180         let to_check = match_len.saturating_sub(2);
181 
182         let mut n = 0;
183         let mut smallest_prev =
184             self.get_prev(match_pos);
185         let mut smallest_pos = 0;
186         while n < to_check {
187             let prev =
188                 self.get_prev(match_pos + n);
189             if prev < smallest_prev {
190                 smallest_prev = prev;
191                 smallest_pos = n;
192             }
193             n += 1;
194         }
195         smallest_pos
196     }
197 
198     #[inline]
slide_value(b: u16, pos: u16, bytes: u16) -> u16199     fn slide_value(b: u16, pos: u16, bytes: u16) -> u16 {
200         if b >= bytes {
201             b - bytes
202         } else {
203             pos
204         }
205     }
206 
207     #[inline]
slide_table(table: &mut [u16; WINDOW_SIZE], bytes: u16)208     fn slide_table(table: &mut [u16; WINDOW_SIZE], bytes: u16) {
209         for (n, b) in table.iter_mut().enumerate() {
210             *b = ChainedHashTable::slide_value(*b, n as u16, bytes);
211         }
212     }
213 
slide(&mut self, bytes: usize)214     pub fn slide(&mut self, bytes: usize) {
215         /*if cfg!(debug_assertions) && bytes != WINDOW_SIZE {
216             // This should only happen in tests in this file.
217             self.count.reset();
218         }*/
219         ChainedHashTable::slide_table(&mut self.c.head, bytes as u16);
220         ChainedHashTable::slide_table(&mut self.c.prev, bytes as u16);
221     }
222 }
223 
224 #[cfg(test)]
filled_hash_table(data: &[u8]) -> ChainedHashTable225 pub fn filled_hash_table(data: &[u8]) -> ChainedHashTable {
226     assert!(data.len() <= (WINDOW_SIZE * 2) + 2);
227     let mut hash_table = ChainedHashTable::from_starting_values(data[0], data[1]);
228     for (n, b) in data[2..].iter().enumerate() {
229         hash_table.add_hash_value(n, *b);
230     }
231     hash_table
232 }
233 
234 #[cfg(test)]
235 mod test {
236     use super::{filled_hash_table, ChainedHashTable};
237 
238     #[test]
chained_hash()239     fn chained_hash() {
240         use std::str;
241 
242         let test_string = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
243                            eiusmod tempor. rum. incididunt ut labore et dolore magna aliqua. Ut \
244                            enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi \
245                            ut aliquip ex ea commodo consequat. rum. Duis aute irure dolor in \
246                            reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla \
247                            pariatur. Excepteur sint occaecat cupidatat non proident, sunt in \
248                            culpa qui officia deserunt mollit anim id est laborum.";
249 
250         let test_data = test_string.as_bytes();
251 
252         let current_bytes = &test_data[test_data.len() - super::HASH_BYTES..test_data.len()];
253 
254         let num_iters = test_string
255             .matches(str::from_utf8(current_bytes).unwrap())
256             .count();
257 
258         let hash_table = filled_hash_table(test_data);
259 
260         // Test that the positions in the chain are valid
261         let mut prev_value = hash_table.get_prev(hash_table.current_head() as usize) as usize;
262         let mut count = 0;
263         let mut current = hash_table.current_head() as usize;
264         while current != prev_value {
265             count += 1;
266             current = prev_value;
267             prev_value = hash_table.get_prev(prev_value) as usize;
268         }
269         // There should be at least as many occurences of the hash of the checked bytes as the
270         // numbers of occurences of the checked bytes themselves. As the hashes are not large enough
271         // to store 8 * 3 = 24 bits, there could be more with different input data.
272         assert!(count >= num_iters);
273     }
274 
275     #[test]
table_unique()276     fn table_unique() {
277         let mut test_data = Vec::new();
278         test_data.extend(0u8..255);
279         test_data.extend(255u8..0);
280         let hash_table = filled_hash_table(&test_data);
281         let prev_pos = hash_table.get_prev(hash_table.current_head() as usize);
282         // Since all sequences in the input are unique, there shouldn't be any previous values.
283         assert_eq!(prev_pos, hash_table.current_hash());
284     }
285 
286     #[test]
table_slide()287     fn table_slide() {
288         use std::fs::File;
289         use std::io::Read;
290 
291         let window_size = super::WINDOW_SIZE;
292         let window_size16 = super::WINDOW_SIZE as u16;
293 
294         let mut input = Vec::new();
295 
296         let mut f = File::open("tests/pg11.txt").unwrap();
297 
298         f.read_to_end(&mut input).unwrap();
299 
300         let mut hash_table = filled_hash_table(&input[..window_size + 2]);
301 
302         for (n, b) in input[2..window_size + 2].iter().enumerate() {
303             hash_table.add_hash_value(n + window_size, *b);
304         }
305 
306         hash_table.slide(window_size);
307 
308         {
309             let max_head = hash_table.c.head.iter().max().unwrap();
310             // After sliding there should be no hashes referring to values
311             // higher than the window size
312             assert!(*max_head < window_size16);
313             assert!(*max_head > 0);
314             let pos = hash_table.get_prev(hash_table.current_head() as usize);
315             // There should be a previous occurence since we inserted the data 3 times
316             assert!(pos < window_size16);
317             assert!(pos > 0);
318         }
319 
320         for (n, b) in input[2..(window_size / 2)].iter().enumerate() {
321             hash_table.add_hash_value(n + window_size, *b);
322         }
323 
324         // There should hashes referring to values in the upper part of the input window
325         // at this point
326         let max_prev = hash_table.c.prev.iter().max().unwrap();
327         assert!(*max_prev > window_size16);
328 
329         let mut pos = hash_table.current_head();
330         // There should be a previous occurence since we inserted the data 3 times
331         assert!(pos > window_size16);
332         let end_byte = input[(window_size / 2) - 1 - 2];
333         let mut iterations = 0;
334         while pos > window_size16 && iterations < 5000 {
335             assert_eq!(input[pos as usize & window_size - 1], end_byte);
336 
337             pos = hash_table.get_prev(pos as usize);
338             iterations += 1;
339         }
340     }
341 
342     #[test]
343     /// Ensure that the initial hash values are correct.
initial_chains()344     fn initial_chains() {
345         let t = ChainedHashTable::new();
346         for (n, &b) in t.c.head.iter().enumerate() {
347             assert_eq!(n, b as usize);
348         }
349         for (n, &b) in t.c.prev.iter().enumerate() {
350             assert_eq!(n, b as usize);
351         }
352     }
353 }
354