1 pub const WINDOW_SIZE: usize = 32768;
2 pub const WINDOW_MASK: usize = WINDOW_SIZE - 1;
3 #[cfg(test)]
4 pub const HASH_BYTES: usize = 3;
5 const HASH_SHIFT: u16 = 5;
6 const HASH_MASK: u16 = WINDOW_MASK as u16;
7
8 /// Helper struct to let us allocate both head and prev in the same block.
9 struct Tables {
10 /// Starts of hash chains (in prev)
11 pub head: [u16; WINDOW_SIZE],
12 /// Link to previous occurence of this hash value
13 pub prev: [u16; WINDOW_SIZE],
14 }
15
16 impl Default for Tables {
17 #[inline]
default() -> Tables18 fn default() -> Tables {
19 Tables {
20 head: [0; WINDOW_SIZE],
21 prev: [0; WINDOW_SIZE],
22 }
23 }
24 }
25
26 impl Tables {
27 #[inline]
fill_prev(&mut self)28 fn fill_prev(&mut self) {
29 self.prev.copy_from_slice(&self.head);
30 }
31 }
32
33 /// Create and box the hash chains.
create_tables() -> Box<Tables>34 fn create_tables() -> Box<Tables> {
35 // Using default here is a trick to get around the lack of box syntax on stable rust.
36 //
37 // Box::new([0u16,n]) ends up creating an temporary array on the stack which is not optimised
38 // but using default, which simply calls `box value` internally allows us to get around this.
39 //
40 // We could use vec instead, but using a boxed array helps the compiler optimise
41 // away bounds checks as `n & WINDOW_MASK < WINDOW_SIZE` will always be true.
42 let mut t: Box<Tables> = Box::default();
43
44 for (n, b) in t.head.iter_mut().enumerate() {
45 *b = n as u16;
46 }
47
48 t.fill_prev();
49
50 t
51 }
52
53 /// Returns a new hash value based on the previous value and the next byte
54 #[inline]
update_hash(current_hash: u16, to_insert: u8) -> u1655 pub fn update_hash(current_hash: u16, to_insert: u8) -> u16 {
56 update_hash_conf(current_hash, to_insert, HASH_SHIFT, HASH_MASK)
57 }
58
59 #[inline]
update_hash_conf(current_hash: u16, to_insert: u8, shift: u16, mask: u16) -> u1660 fn update_hash_conf(current_hash: u16, to_insert: u8, shift: u16, mask: u16) -> u16 {
61 ((current_hash << shift) ^ (u16::from(to_insert))) & mask
62 }
63
64 #[inline]
reset_array(arr: &mut [u16; WINDOW_SIZE])65 fn reset_array(arr: &mut [u16; WINDOW_SIZE]) {
66 for (n, b) in arr.iter_mut().enumerate() {
67 *b = n as u16;
68 }
69 }
70
71 pub struct ChainedHashTable {
72 // Current running hash value of the last 3 bytes
73 current_hash: u16,
74 // Hash chains.
75 c: Box<Tables>,
76 // Used for testing
77 // count: DebugCounter,
78 }
79
80 impl ChainedHashTable {
new() -> ChainedHashTable81 pub fn new() -> ChainedHashTable {
82 ChainedHashTable {
83 current_hash: 0,
84 c: create_tables(),
85 //count: DebugCounter::default(),
86 }
87 }
88
89 #[cfg(test)]
from_starting_values(v1: u8, v2: u8) -> ChainedHashTable90 pub fn from_starting_values(v1: u8, v2: u8) -> ChainedHashTable {
91 let mut t = ChainedHashTable::new();
92 t.current_hash = update_hash(t.current_hash, v1);
93 t.current_hash = update_hash(t.current_hash, v2);
94 t
95 }
96
97 /// Resets the hash value and hash chains
reset(&mut self)98 pub fn reset(&mut self) {
99 self.current_hash = 0;
100 reset_array(&mut self.c.head);
101 {
102 let h = self.c.head;
103 let mut c = self.c.prev;
104 c[..].copy_from_slice(&h[..]);
105 }
106 /*if cfg!(debug_assertions) {
107 self.count.reset();
108 }*/
109 }
110
add_initial_hash_values(&mut self, v1: u8, v2: u8)111 pub fn add_initial_hash_values(&mut self, v1: u8, v2: u8) {
112 self.current_hash = update_hash(self.current_hash, v1);
113 self.current_hash = update_hash(self.current_hash, v2);
114 }
115
116 /// Insert a byte into the hash table
117 #[inline]
add_hash_value(&mut self, position: usize, value: u8)118 pub fn add_hash_value(&mut self, position: usize, value: u8) {
119 // Check that all bytes are input in order and at the correct positions.
120 // Disabled for now as it breaks when sync flushing.
121 /*debug_assert_eq!(
122 position & WINDOW_MASK,
123 self.count.get() as usize & WINDOW_MASK
124 );*/
125 debug_assert!(
126 position < WINDOW_SIZE * 2,
127 "Position is larger than 2 * window size! {}",
128 position
129 );
130 // Storing the hash in a temporary variable here makes the compiler avoid the
131 // bounds checks in this function.
132 let new_hash = update_hash(self.current_hash, value);
133
134 self.add_with_hash(position, new_hash);
135
136 // Update the stored hash value with the new hash.
137 self.current_hash = new_hash;
138 }
139
140 /// Directly set the current hash value
141 #[inline]
set_hash(&mut self, hash: u16)142 pub fn set_hash(&mut self, hash: u16) {
143 self.current_hash = hash;
144 }
145
146 /// Update the tables directly, providing the hash.
147 #[inline]
add_with_hash(&mut self, position: usize, hash: u16)148 pub fn add_with_hash(&mut self, position: usize, hash: u16) {
149 /*if cfg!(debug_assertions) {
150 self.count.add(1);
151 }*/
152
153 self.c.prev[position & WINDOW_MASK] = self.c.head[hash as usize];
154
155 // Ignoring any bits over 16 here is deliberate, as we only concern ourselves about
156 // where in the buffer (which is 64k bytes) we are referring to.
157 self.c.head[hash as usize] = position as u16;
158 }
159
160 // Get the head of the hash chain for the current hash value
161 #[cfg(test)]
162 #[inline]
current_head(&self) -> u16163 pub fn current_head(&self) -> u16 {
164 self.c.head[self.current_hash as usize]
165 }
166
167 #[inline]
current_hash(&self) -> u16168 pub fn current_hash(&self) -> u16 {
169 self.current_hash
170 }
171
172 #[inline]
get_prev(&self, bytes: usize) -> u16173 pub fn get_prev(&self, bytes: usize) -> u16 {
174 self.c.prev[bytes & WINDOW_MASK]
175 }
176
177 #[cfg(test)]
178 #[inline]
farthest_next(&self, match_pos: usize, match_len: usize) -> usize179 pub fn farthest_next(&self, match_pos: usize, match_len: usize) -> usize {
180 let to_check = match_len.saturating_sub(2);
181
182 let mut n = 0;
183 let mut smallest_prev = self.get_prev(match_pos);
184 let mut smallest_pos = 0;
185 while n < to_check {
186 let prev = self.get_prev(match_pos + n);
187 if prev < smallest_prev {
188 smallest_prev = prev;
189 smallest_pos = n;
190 }
191 n += 1;
192 }
193 smallest_pos
194 }
195
196 #[inline]
slide_value(b: u16, pos: u16, bytes: u16) -> u16197 fn slide_value(b: u16, pos: u16, bytes: u16) -> u16 {
198 if b >= bytes {
199 b - bytes
200 } else {
201 pos
202 }
203 }
204
205 #[inline]
slide_table(table: &mut [u16; WINDOW_SIZE], bytes: u16)206 fn slide_table(table: &mut [u16; WINDOW_SIZE], bytes: u16) {
207 for (n, b) in table.iter_mut().enumerate() {
208 *b = ChainedHashTable::slide_value(*b, n as u16, bytes);
209 }
210 }
211
slide(&mut self, bytes: usize)212 pub fn slide(&mut self, bytes: usize) {
213 /*if cfg!(debug_assertions) && bytes != WINDOW_SIZE {
214 // This should only happen in tests in this file.
215 self.count.reset();
216 }*/
217 ChainedHashTable::slide_table(&mut self.c.head, bytes as u16);
218 ChainedHashTable::slide_table(&mut self.c.prev, bytes as u16);
219 }
220 }
221
222 #[cfg(test)]
filled_hash_table(data: &[u8]) -> ChainedHashTable223 pub fn filled_hash_table(data: &[u8]) -> ChainedHashTable {
224 assert!(data.len() <= (WINDOW_SIZE * 2) + 2);
225 let mut hash_table = ChainedHashTable::from_starting_values(data[0], data[1]);
226 for (n, b) in data[2..].iter().enumerate() {
227 hash_table.add_hash_value(n, *b);
228 }
229 hash_table
230 }
231
232 #[cfg(test)]
233 mod test {
234 use super::{filled_hash_table, ChainedHashTable};
235
236 #[test]
chained_hash()237 fn chained_hash() {
238 use std::str;
239
240 let test_string = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
241 eiusmod tempor. rum. incididunt ut labore et dolore magna aliqua. Ut \
242 enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi \
243 ut aliquip ex ea commodo consequat. rum. Duis aute irure dolor in \
244 reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla \
245 pariatur. Excepteur sint occaecat cupidatat non proident, sunt in \
246 culpa qui officia deserunt mollit anim id est laborum.";
247
248 let test_data = test_string.as_bytes();
249
250 let current_bytes = &test_data[test_data.len() - super::HASH_BYTES..test_data.len()];
251
252 let num_iters = test_string
253 .matches(str::from_utf8(current_bytes).unwrap())
254 .count();
255
256 let hash_table = filled_hash_table(test_data);
257
258 // Test that the positions in the chain are valid
259 let mut prev_value = hash_table.get_prev(hash_table.current_head() as usize) as usize;
260 let mut count = 0;
261 let mut current = hash_table.current_head() as usize;
262 while current != prev_value {
263 count += 1;
264 current = prev_value;
265 prev_value = hash_table.get_prev(prev_value) as usize;
266 }
267 // There should be at least as many occurences of the hash of the checked bytes as the
268 // numbers of occurences of the checked bytes themselves. As the hashes are not large enough
269 // to store 8 * 3 = 24 bits, there could be more with different input data.
270 assert!(count >= num_iters);
271 }
272
273 #[test]
table_unique()274 fn table_unique() {
275 let mut test_data = Vec::new();
276 test_data.extend(0u8..255);
277 test_data.extend(255u8..0);
278 let hash_table = filled_hash_table(&test_data);
279 let prev_pos = hash_table.get_prev(hash_table.current_head() as usize);
280 // Since all sequences in the input are unique, there shouldn't be any previous values.
281 assert_eq!(prev_pos, hash_table.current_hash());
282 }
283
284 #[test]
table_slide()285 fn table_slide() {
286 use std::fs::File;
287 use std::io::Read;
288
289 let window_size = super::WINDOW_SIZE;
290 let window_size16 = super::WINDOW_SIZE as u16;
291
292 let mut input = Vec::new();
293
294 let mut f = File::open("tests/pg11.txt").unwrap();
295
296 f.read_to_end(&mut input).unwrap();
297
298 let mut hash_table = filled_hash_table(&input[..window_size + 2]);
299
300 for (n, b) in input[2..window_size + 2].iter().enumerate() {
301 hash_table.add_hash_value(n + window_size, *b);
302 }
303
304 hash_table.slide(window_size);
305
306 {
307 let max_head = hash_table.c.head.iter().max().unwrap();
308 // After sliding there should be no hashes referring to values
309 // higher than the window size
310 assert!(*max_head < window_size16);
311 assert!(*max_head > 0);
312 let pos = hash_table.get_prev(hash_table.current_head() as usize);
313 // There should be a previous occurence since we inserted the data 3 times
314 assert!(pos < window_size16);
315 assert!(pos > 0);
316 }
317
318 for (n, b) in input[2..(window_size / 2)].iter().enumerate() {
319 hash_table.add_hash_value(n + window_size, *b);
320 }
321
322 // There should hashes referring to values in the upper part of the input window
323 // at this point
324 let max_prev = hash_table.c.prev.iter().max().unwrap();
325 assert!(*max_prev > window_size16);
326
327 let mut pos = hash_table.current_head();
328 // There should be a previous occurence since we inserted the data 3 times
329 assert!(pos > window_size16);
330 let end_byte = input[(window_size / 2) - 1 - 2];
331 let mut iterations = 0;
332 while pos > window_size16 && iterations < 5000 {
333 assert_eq!(input[pos as usize & window_size - 1], end_byte);
334
335 pos = hash_table.get_prev(pos as usize);
336 iterations += 1;
337 }
338 }
339
340 #[test]
341 /// Ensure that the initial hash values are correct.
initial_chains()342 fn initial_chains() {
343 let t = ChainedHashTable::new();
344 for (n, &b) in t.c.head.iter().enumerate() {
345 assert_eq!(n, b as usize);
346 }
347 for (n, &b) in t.c.prev.iter().enumerate() {
348 assert_eq!(n, b as usize);
349 }
350 }
351 }
352