1 //! This module contains functionality for doing lz77 compression of data.
2 #![macro_use]
3 use std::cmp;
4 use std::fmt;
5 use std::iter::{self, Iterator};
6 use std::ops::{Range, RangeFrom};
7 use std::slice::Iter;
8 
9 use crate::chained_hash_table::{update_hash, ChainedHashTable};
10 use crate::compress::Flush;
11 #[cfg(test)]
12 use crate::compression_options::{HIGH_LAZY_IF_LESS_THAN, HIGH_MAX_HASH_CHECKS};
13 use crate::input_buffer::InputBuffer;
14 #[cfg(test)]
15 use crate::lzvalue::{LZType, LZValue};
16 use crate::matching::longest_match;
17 use crate::output_writer::{BufferStatus, DynamicWriter};
18 use crate::rle::process_chunk_greedy_rle;
19 
/// Maximum deflate match length, re-declared as `usize` for convenience.
const MAX_MATCH: usize = crate::huffman_table::MAX_MATCH as usize;
/// Minimum deflate match length, re-declared as `usize` for convenience.
const MIN_MATCH: usize = crate::huffman_table::MIN_MATCH as usize;

/// Sentinel value for `lazy_if_less_than` used by tests to make `process_chunk`
/// skip the RLE fast path. Deliberately larger than any sensible setting;
/// values from outside the library are truncated by deflate_state first.
const NO_RLE: u16 = 43212;
24 
/// An enum describing whether we use lazy or greedy matching.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum MatchingType {
    /// Use greedy matching: the matching algorithm simply uses a match right away
    /// if found.
    Greedy,
    /// Use lazy matching: after finding a match, the next input byte is checked, to see
    /// if there is a better match starting at that byte.
    ///
    /// As a special case, if `max_hash_checks` is set to 0, compression using only run-length
    /// encoding (i.e. a maximum match distance of 1) is performed instead.
    Lazy,
}
38 
39 impl fmt::Display for MatchingType {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result40     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
41         match *self {
42             MatchingType::Greedy => write!(f, "Greedy matching"),
43             MatchingType::Lazy => write!(f, "Lazy matching"),
44         }
45     }
46 }
47 
48 /// A struct that contains the hash table, and keeps track of where we are in the input data
/// A struct that contains the hash table, and keeps track of where we are in the input data
pub struct LZ77State {
    /// Struct containing hash chains that will be used to find matches.
    hash_table: ChainedHashTable,
    /// True if this is the first window that is being processed.
    is_first_window: bool,
    /// Set to true when the last block has been processed.
    is_last_block: bool,
    /// How many bytes the last match in the previous window extended into the current one.
    /// These bytes don't need to be checked for matches again in the next processing call.
    overlap: usize,
    /// How many bytes of input the current block contains.
    current_block_input_bytes: u64,
    /// The maximum number of hash entries to search when looking for a match.
    max_hash_checks: u16,
    /// Only lazy match if we have a match length less than this.
    lazy_if_less_than: u16,
    /// Whether to use greedy or lazy parsing.
    matching_type: MatchingType,
    /// Keep track of the previous match and byte in case the buffer is full when lazy matching.
    match_state: ChunkState,
    /// Keep track of how many bytes in the lookahead that was part of a match, but has not been
    /// added to the hash chain yet.
    bytes_to_hash: usize,
    /// Keep track of if sync flush was used. If this is the case, the two first bytes needs to be
    /// hashed.
    was_synced: bool,
}
75 
76 impl LZ77State {
77     /// Creates a new LZ77 state
new( max_hash_checks: u16, lazy_if_less_than: u16, matching_type: MatchingType, ) -> LZ77State78     pub fn new(
79         max_hash_checks: u16,
80         lazy_if_less_than: u16,
81         matching_type: MatchingType,
82     ) -> LZ77State {
83         LZ77State {
84             hash_table: ChainedHashTable::new(),
85             is_first_window: true,
86             is_last_block: false,
87             overlap: 0,
88             current_block_input_bytes: 0,
89             max_hash_checks,
90             lazy_if_less_than,
91             matching_type,
92             match_state: ChunkState::new(),
93             bytes_to_hash: 0,
94             was_synced: false,
95         }
96     }
97 
98     /// Resets the state excluding max_hash_checks and lazy_if_less_than
reset(&mut self)99     pub fn reset(&mut self) {
100         self.hash_table.reset();
101         self.is_first_window = true;
102         self.is_last_block = false;
103         self.overlap = 0;
104         self.current_block_input_bytes = 0;
105         self.match_state = ChunkState::new();
106         self.bytes_to_hash = 0
107     }
108 
set_last(&mut self)109     pub fn set_last(&mut self) {
110         self.is_last_block = true;
111     }
112 
113     /// Is this the last block we are outputting?
is_last_block(&self) -> bool114     pub fn is_last_block(&self) -> bool {
115         self.is_last_block
116     }
117 
118     /// How many bytes of input the current block contains.
current_block_input_bytes(&self) -> u64119     pub fn current_block_input_bytes(&self) -> u64 {
120         self.current_block_input_bytes
121     }
122 
123     /// Sets the number of input bytes for the current block to 0.
reset_input_bytes(&mut self)124     pub fn reset_input_bytes(&mut self) {
125         self.current_block_input_bytes = 0;
126     }
127 
128     /// Is there a buffered byte that has not been output yet?
pending_byte(&self) -> bool129     pub fn pending_byte(&self) -> bool {
130         self.match_state.add
131     }
132 
133     /// Returns 1 if pending_byte is true, 0 otherwise.
pending_byte_as_num(&self) -> usize134     pub fn pending_byte_as_num(&self) -> usize {
135         // This could be implemented by using `as usize` as the documentation states this would give
136         // the same result, but not sure if that should be relied upon.
137         if self.match_state.add {
138             1
139         } else {
140             0
141         }
142     }
143 }
144 
/// The deflate window size (32 KiB); currently the only window size supported.
const DEFAULT_WINDOW_SIZE: usize = 32768;
146 
#[derive(Debug)]
/// Status after calling `process_chunk`.
pub enum ProcessStatus {
    /// All the input data in the given range was processed.
    Ok,
    /// The output buffer was full.
    ///
    /// The argument is the position of the next byte to be checked by `process_chunk`
    /// once the buffer has been flushed.
    BufferFull(usize),
}
157 
#[derive(Debug)]
/// A struct to keep track of status between calls of `process_chunk_lazy`
///
/// This is needed as the output buffer might become full before having output all pending data.
pub struct ChunkState {
    /// Length of the last match that was found, if any.
    current_length: u16,
    /// Distance of the last match that was found, if any.
    current_distance: u16,
    /// The previous byte checked in process_chunk.
    prev_byte: u8,
    /// The current byte being checked in process_chunk.
    cur_byte: u8,
    /// Whether prev_byte still needs to be output (it will be, unless a better
    /// match is found starting at the current byte).
    add: bool,
}
174 
175 impl ChunkState {
new() -> ChunkState176     pub fn new() -> ChunkState {
177         ChunkState {
178             current_length: 0,
179             current_distance: 0,
180             prev_byte: 0,
181             cur_byte: 0,
182             add: false,
183         }
184     }
185 }
186 
buffer_full(position: usize) -> ProcessStatus187 pub fn buffer_full(position: usize) -> ProcessStatus {
188     ProcessStatus::BufferFull(position)
189 }
190 
191 #[allow(clippy::too_many_arguments)]
process_chunk( data: &[u8], iterated_data: &Range<usize>, mut match_state: &mut ChunkState, hash_table: &mut ChainedHashTable, writer: &mut DynamicWriter, max_hash_checks: u16, lazy_if_less_than: usize, matching_type: MatchingType, ) -> (usize, ProcessStatus)192 fn process_chunk(
193     data: &[u8],
194     iterated_data: &Range<usize>,
195     mut match_state: &mut ChunkState,
196     hash_table: &mut ChainedHashTable,
197     writer: &mut DynamicWriter,
198     max_hash_checks: u16,
199     lazy_if_less_than: usize,
200     matching_type: MatchingType,
201 ) -> (usize, ProcessStatus) {
202     let avoid_rle = if cfg!(test) {
203         // Avoid RLE if lazy_if_less than is a specific value.
204         // This is used in some tests, ideally we should probably do this in a less clunky way,
205         // but we use a value here that is higher than the maximum sensible one anyhow, and will
206         // be truncated by deflate_state for calls from outside the library.
207         lazy_if_less_than == NO_RLE as usize
208     } else {
209         false
210     };
211     match matching_type {
212         MatchingType::Greedy => {
213             process_chunk_greedy(data, iterated_data, hash_table, writer, max_hash_checks)
214         }
215         MatchingType::Lazy => {
216             if max_hash_checks > 0 || avoid_rle {
217                 process_chunk_lazy(
218                     data,
219                     iterated_data,
220                     &mut match_state,
221                     hash_table,
222                     writer,
223                     max_hash_checks,
224                     lazy_if_less_than,
225                 )
226             } else {
227                 // Use the RLE method if max_hash_checks is set to 0.
228                 process_chunk_greedy_rle(data, iterated_data, writer)
229             }
230         }
231     }
232 }
233 
/// Add the specified number of bytes to the hash table from the iterators
/// adding `start` to the position supplied to the hash table.
///
/// NOTE: `insert_it` is always advanced by `bytes_to_add` positions so the
/// caller skips past the bytes covered by a match, while hash values are only
/// added for as long as `hash_it` has bytes left (it runs two bytes ahead of
/// `insert_it` and can run out first near the end of the data). Rewriting the
/// loop with e.g. `zip` would stop advancing `insert_it` early and change
/// behaviour.
fn add_to_hash_table(
    bytes_to_add: usize,
    insert_it: &mut iter::Zip<RangeFrom<usize>, Iter<u8>>,
    hash_it: &mut Iter<u8>,
    hash_table: &mut ChainedHashTable,
) {
    let taker = insert_it.by_ref().take(bytes_to_add);
    let mut hash_taker = hash_it.by_ref().take(bytes_to_add);
    // Update the hash manually here to keep it in a register.
    let mut hash = hash_table.current_hash();
    // Advance the iterators and add the bytes we jump over to the hash table and
    // checksum
    for (ipos, _) in taker {
        if let Some(&i_hash_byte) = hash_taker.next() {
            hash = update_hash(hash, i_hash_byte);
            hash_table.add_with_hash(ipos, hash);
        }
    }
    // Write the hash back once we are done.
    hash_table.set_hash(hash);
}
257 
/// Write the specified literal `byte` to the writer `w`, and return
/// `ProcessStatus::BufferFull($pos)` if the buffer is full after writing.
///
/// `pos` should indicate the byte to start at in the next call to `process_chunk`.
///
/// NOTE: this macro expands to an early `return` from the enclosing function
/// (with an overlap of 0) when the output buffer is full.
macro_rules! write_literal {
    ($w:ident, $byte:expr, $pos:expr) => {
        let b_status = $w.write_literal($byte);

        if let BufferStatus::Full = b_status {
            return (0, buffer_full($pos));
        }
    };
}
271 
272 /// If the match is only 3 bytes long and the distance is more than 8 * 1024, it's likely to take
273 /// up more space than it would save.
274 #[inline]
match_too_far(match_len: usize, match_dist: usize) -> bool275 fn match_too_far(match_len: usize, match_dist: usize) -> bool {
276     const TOO_FAR: usize = 8 * 1024;
277     match_len == MIN_MATCH && match_dist > TOO_FAR
278 }
279 
/// Create the iterators used when processing through a chunk of data.
///
/// Returns the (clamped) end position of the chunk, an iterator of
/// `(position, &byte)` pairs over the chunk, and a byte iterator starting two
/// bytes ahead which supplies the trailing byte for each hash value.
fn create_iterators<'a>(
    data: &'a [u8],
    iterated_data: &Range<usize>,
) -> (
    usize,
    iter::Zip<RangeFrom<usize>, Iter<'a, u8>>,
    Iter<'a, u8>,
) {
    // Clamp the end of the range to the available data.
    let end = cmp::min(iterated_data.end, data.len());
    let start = iterated_data.start;

    // Pairs each chunk byte with its absolute position in `data`.
    let insert_it = (start..).zip(data[start..end].iter());

    // The hash iterator runs two bytes ahead of the insert iterator since a
    // hash covers three consecutive bytes; if fewer than three bytes remain
    // after `start` it is simply empty.
    let hash_start = if data.len() - start > 2 {
        start + 2
    } else {
        data.len()
    };
    let hash_it = data[hash_start..].iter();

    (end, insert_it, hash_it)
}
304 
process_chunk_lazy( data: &[u8], iterated_data: &Range<usize>, state: &mut ChunkState, mut hash_table: &mut ChainedHashTable, writer: &mut DynamicWriter, max_hash_checks: u16, lazy_if_less_than: usize, ) -> (usize, ProcessStatus)305 fn process_chunk_lazy(
306     data: &[u8],
307     iterated_data: &Range<usize>,
308     state: &mut ChunkState,
309     mut hash_table: &mut ChainedHashTable,
310     writer: &mut DynamicWriter,
311     max_hash_checks: u16,
312     lazy_if_less_than: usize,
313 ) -> (usize, ProcessStatus) {
314     let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data);
315 
316     const NO_LENGTH: u16 = 0;
317 
318     // The previous match length, if any.
319     let mut prev_length = state.current_length;
320     // The distance of the previous match if any.
321     let mut prev_distance = state.current_distance;
322 
323     state.current_length = 0;
324     state.current_distance = 0;
325 
326     // The number of bytes past end that was added due to finding a match that extends into
327     // the lookahead window.
328     let mut overlap = 0;
329 
330     // Set to true if we found a match that is equal to or longer than `lazy_if_less_than`,
331     // indicating that we won't lazy match (check for a better match at the next byte).
332     // If we had a good match, carry this over from the previous call.
333     let mut ignore_next = prev_length as usize >= lazy_if_less_than;
334 
335     // This is to output the correct byte in case there is one pending to be output
336     // from the previous call.
337     state.prev_byte = state.cur_byte;
338 
339     // Iterate through the slice, adding literals or length/distance pairs
340     while let Some((position, &b)) = insert_it.next() {
341         state.cur_byte = b;
342         if let Some(&hash_byte) = hash_it.next() {
343             hash_table.add_hash_value(position, hash_byte);
344 
345             // Only lazy match if we have a match shorter than a set value
346             // TODO: This should be cleaned up a bit
347             if !ignore_next {
348                 let (mut match_len, match_dist) = {
349                     // If there already was a decent match at the previous byte
350                     // and we are lazy matching, do less match checks in this step.
351                     let max_hash_checks = if prev_length >= 32 {
352                         max_hash_checks >> 2
353                     } else {
354                         max_hash_checks
355                     };
356 
357                     // Check if we can find a better match here than the one we had at
358                     // the previous byte.
359                     longest_match(
360                         data,
361                         hash_table,
362                         position,
363                         prev_length as usize,
364                         max_hash_checks,
365                     )
366                 };
367 
368                 // If the match is only 3 bytes long and very far back, it's probably not worth
369                 // outputting.
370                 if match_too_far(match_len, match_dist) {
371                     match_len = NO_LENGTH as usize;
372                 };
373 
374                 if match_len >= lazy_if_less_than {
375                     // We found a decent match, so we won't check for a better one at the next byte.
376                     ignore_next = true;
377                 }
378                 state.current_length = match_len as u16;
379                 state.current_distance = match_dist as u16;
380             } else {
381                 // We already had a decent match, so we don't bother checking for another one.
382                 state.current_length = NO_LENGTH;
383                 state.current_distance = 0;
384                 // Make sure we check again next time.
385                 ignore_next = false;
386             };
387 
388             if prev_length >= state.current_length && prev_length >= MIN_MATCH as u16 {
389                 // The previous match was better so we add it.
390                 // Casting note: length and distance is already bounded by the longest match
391                 // function. Usize is just used for convenience.
392                 let b_status =
393                     writer.write_length_distance(prev_length as u16, prev_distance as u16);
394 
395                 // We add the bytes to the hash table and checksum.
396                 // Since we've already added two of them, we need to add two less than
397                 // the length.
398                 let bytes_to_add = prev_length - 2;
399 
400                 add_to_hash_table(
401                     bytes_to_add as usize,
402                     &mut insert_it,
403                     &mut hash_it,
404                     &mut hash_table,
405                 );
406 
407                 // If the match is longer than the current window, we have note how many
408                 // bytes we overlap, since we don't need to do any matching on these bytes
409                 // in the next call of this function.
410                 // We don't have to worry about setting overlap to 0 if this is false, as the
411                 // function will stop after this condition is true, and overlap is not altered
412                 // elsewhere.
413                 if position + prev_length as usize > end {
414                     // We need to subtract 1 since the byte at pos is also included.
415                     overlap = position + prev_length as usize - end - 1;
416                 };
417 
418                 state.add = false;
419 
420                 // Note that there is no current match.
421                 state.current_length = 0;
422                 state.current_distance = 0;
423 
424                 if let BufferStatus::Full = b_status {
425                     // MATCH(lazy)
426                     return (overlap, buffer_full(position + prev_length as usize - 1));
427                 }
428 
429                 ignore_next = false;
430             } else if state.add {
431                 // We found a better match (or there was no previous match)
432                 // so output the previous byte.
433                 // BETTER OR NO MATCH
434                 write_literal!(writer, state.prev_byte, position + 1);
435             } else {
436                 state.add = true
437             }
438 
439             prev_length = state.current_length;
440             prev_distance = state.current_distance;
441             state.prev_byte = b;
442         } else {
443             // If there is a match at this point, it will not have been added, so we need to add it.
444             if prev_length >= MIN_MATCH as u16 {
445                 let b_status =
446                     writer.write_length_distance(prev_length as u16, prev_distance as u16);
447 
448                 state.current_length = 0;
449                 state.current_distance = 0;
450                 state.add = false;
451 
452                 debug_assert!((position + prev_length as usize) >= end - 1);
453                 // Needed to note overlap as we can end up with the last window containing the rest
454                 // of the match.
455                 overlap = (position + prev_length as usize)
456                     .saturating_sub(end)
457                     .saturating_sub(1);
458 
459                 // TODO: Not sure if we need to signal that the buffer is full here.
460                 // It's only needed in the case of syncing.
461                 if let BufferStatus::Full = b_status {
462                     // TODO: These bytes should be hashed when doing a sync flush.
463                     // This can't be done here as the new input data does not exist yet.
464                     return (overlap, buffer_full(end));
465                 } else {
466                     return (overlap, ProcessStatus::Ok);
467                 }
468             };
469 
470             if state.add {
471                 // We may still have a leftover byte at this point, so we add it here if needed.
472                 state.add = false;
473 
474                 // ADD
475                 write_literal!(writer, state.prev_byte, position);
476             };
477 
478             // We are at the last two bytes we want to add, so there is no point
479             // searching for matches here.
480 
481             // AFTER ADD
482             write_literal!(writer, b, position + 1);
483         }
484     }
485     (overlap, ProcessStatus::Ok)
486 }
487 
process_chunk_greedy( data: &[u8], iterated_data: &Range<usize>, mut hash_table: &mut ChainedHashTable, writer: &mut DynamicWriter, max_hash_checks: u16, ) -> (usize, ProcessStatus)488 fn process_chunk_greedy(
489     data: &[u8],
490     iterated_data: &Range<usize>,
491     mut hash_table: &mut ChainedHashTable,
492     writer: &mut DynamicWriter,
493     max_hash_checks: u16,
494 ) -> (usize, ProcessStatus) {
495     let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data);
496 
497     const NO_LENGTH: usize = 0;
498 
499     // The number of bytes past end that was added due to finding a match that extends into
500     // the lookahead window.
501     let mut overlap = 0;
502 
503     // Iterate through the slice, adding literals or length/distance pairs.
504     while let Some((position, &b)) = insert_it.next() {
505         if let Some(&hash_byte) = hash_it.next() {
506             hash_table.add_hash_value(position, hash_byte);
507 
508             // TODO: This should be cleaned up a bit.
509             let (match_len, match_dist) =
510                 { longest_match(data, hash_table, position, NO_LENGTH, max_hash_checks) };
511 
512             if match_len >= MIN_MATCH as usize && !match_too_far(match_len, match_dist) {
513                 // Casting note: length and distance is already bounded by the longest match
514                 // function. Usize is just used for convenience.
515                 let b_status = writer.write_length_distance(match_len as u16, match_dist as u16);
516 
517                 // We add the bytes to the hash table and checksum.
518                 // Since we've already added one of them, we need to add one less than
519                 // the length.
520                 let bytes_to_add = match_len - 1;
521                 add_to_hash_table(bytes_to_add, &mut insert_it, &mut hash_it, &mut hash_table);
522 
523                 // If the match is longer than the current window, we have note how many
524                 // bytes we overlap, since we don't need to do any matching on these bytes
525                 // in the next call of this function.
526                 if position + match_len > end {
527                     // We need to subtract 1 since the byte at pos is also included.
528                     overlap = position + match_len - end;
529                 };
530 
531                 if let BufferStatus::Full = b_status {
532                     // MATCH
533                     return (overlap, buffer_full(position + match_len));
534                 }
535             } else {
536                 // NO MATCH
537                 write_literal!(writer, b, position + 1);
538             }
539         } else {
540             // We are at the last two bytes we want to add, so there is no point
541             // searching for matches here.
542             // END
543             write_literal!(writer, b, position + 1);
544         }
545     }
546     (overlap, ProcessStatus::Ok)
547 }
548 
/// Overall status of LZ77 compression, as returned by `lz77_compress_block`.
#[derive(Eq, PartialEq, Clone, Copy, Debug)]
pub enum LZ77Status {
    /// Waiting for more input before doing any processing
    NeedInput,
    /// The output buffer is full, so the current block needs to be ended so the
    /// buffer can be flushed.
    EndBlock,
    /// All pending data has been processed.
    Finished,
}
559 
/// Test helper: compress `data` with `Flush::Finish`, discarding the final
/// buffer position and returning only (bytes consumed, status).
#[cfg(test)]
pub fn lz77_compress_block_finish(
    data: &[u8],
    state: &mut LZ77State,
    buffer: &mut InputBuffer,
    // The extra `mut` binding and `&mut writer` reborrow were redundant;
    // the `&mut` reference is simply passed through.
    writer: &mut DynamicWriter,
) -> (usize, LZ77Status) {
    let (consumed, status, _) = lz77_compress_block(data, state, buffer, writer, Flush::Finish);
    (consumed, status)
}
571 
572 /// Compress a slice with lz77 compression.
573 ///
574 /// This function processes one window at a time, and returns when there is no input left,
575 /// or it determines it's time to end a block.
576 ///
577 /// Returns the number of bytes of the input that were consumed, a status describing
578 /// whether there is no input, it's time to finish, or it's time to end the block, and the position
579 /// of the first byte in the input buffer that has not been output (but may have been checked for
580 /// matches).
lz77_compress_block( data: &[u8], state: &mut LZ77State, buffer: &mut InputBuffer, mut writer: &mut DynamicWriter, flush: Flush, ) -> (usize, LZ77Status, usize)581 pub fn lz77_compress_block(
582     data: &[u8],
583     state: &mut LZ77State,
584     buffer: &mut InputBuffer,
585     mut writer: &mut DynamicWriter,
586     flush: Flush,
587 ) -> (usize, LZ77Status, usize) {
588     // Currently we only support the maximum window size
589     let window_size = DEFAULT_WINDOW_SIZE;
590 
591     // Indicates whether we should try to process all the data including the lookahead, or if we
592     // should wait until we have at least one window size of data before doing anything.
593     let finish = flush == Flush::Finish || flush == Flush::Sync;
594     let sync = flush == Flush::Sync;
595 
596     let mut current_position = 0;
597 
598     // The current status of the encoding.
599     let mut status = LZ77Status::EndBlock;
600 
601     // Whether warm up the hash chain with the two first values.
602     let mut add_initial = true;
603 
604     // If we have synced, add the two first bytes to the hash as they couldn't be added before.
605     if state.was_synced {
606         if buffer.current_end() > 2 {
607             let pos_add = buffer.current_end() - 2;
608             for (n, &b) in data.iter().take(2).enumerate() {
609                 state.hash_table.add_hash_value(n + pos_add, b);
610             }
611             add_initial = false;
612         }
613         state.was_synced = false;
614     }
615 
616     // Add data to the input buffer and keep a reference to the slice of data not added yet.
617     let mut remaining_data = buffer.add_data(data);
618 
619     loop {
620         // Note if there is a pending byte from the previous call to process_chunk,
621         // so we get the block input size right.
622         let pending_previous = state.pending_byte_as_num();
623 
624         assert!(writer.buffer_length() <= (window_size * 2));
625         // Don't do anything until we are either flushing, or we have at least one window of
626         // data.
627         if buffer.current_end() >= (window_size * 2) + MAX_MATCH || finish {
628             if state.is_first_window {
629                 if buffer.get_buffer().len() >= 2
630                     && add_initial
631                     && state.current_block_input_bytes == 0
632                 {
633                     let b = buffer.get_buffer();
634                     // Warm up the hash with the two first values, so we can find  matches at
635                     // index 0.
636                     state.hash_table.add_initial_hash_values(b[0], b[1]);
637                     add_initial = false;
638                 }
639             } else {
640                 if buffer.current_end() >= window_size + 2 {
641                     for (n, &h) in buffer.get_buffer()[window_size + 2..]
642                         .iter()
643                         .enumerate()
644                         .take(state.bytes_to_hash)
645                     {
646                         state.hash_table.add_hash_value(window_size + n, h);
647                     }
648                     state.bytes_to_hash = 0;
649                 }
650             }
651 
652             let window_start = if state.is_first_window {
653                 0
654             } else {
655                 window_size
656             };
657             let start = state.overlap + window_start;
658             let end = cmp::min(window_size + window_start, buffer.current_end());
659 
660             let (overlap, p_status) = process_chunk(
661                 buffer.get_buffer(),
662                 &(start..end),
663                 &mut state.match_state,
664                 &mut state.hash_table,
665                 &mut writer,
666                 state.max_hash_checks,
667                 state.lazy_if_less_than as usize,
668                 state.matching_type,
669             );
670 
671             state.bytes_to_hash = overlap;
672 
673             if let ProcessStatus::BufferFull(written) = p_status {
674                 state.current_block_input_bytes +=
675                     (written - start + pending_previous - state.pending_byte_as_num()) as u64;
676 
677                 // If the buffer is full, return and end the block.
678                 // If overlap is non-zero, the buffer was full after outputting the last byte,
679                 // otherwise we have to skip to the point in the buffer where we stopped in the
680                 // next call.
681                 state.overlap = if overlap > 0 {
682                     if !state.is_first_window {
683                         // If we are at the end of the window, make sure we slide the buffer and the
684                         // hash table.
685                         if state.max_hash_checks > 0 {
686                             state.hash_table.slide(window_size);
687                         }
688                         remaining_data = buffer.slide(remaining_data.unwrap_or(&[]));
689                     } else {
690                         state.is_first_window = false;
691                     }
692                     overlap
693                 } else {
694                     written - window_start
695                 };
696 
697                 current_position = written - state.pending_byte_as_num();
698 
699                 // Status is already EndBlock at this point.
700                 // status = LZ77Status::EndBlock;
701                 break;
702             }
703 
704             state.current_block_input_bytes +=
705                 (end - start + overlap + pending_previous - state.pending_byte_as_num()) as u64;
706 
707             // The buffer is not full, but we still need to note if there is any overlap into the
708             // next window.
709             state.overlap = overlap;
710 
711             if (state.is_first_window || remaining_data.is_none())
712                 && finish
713                 && end >= buffer.current_end()
714             {
715                 current_position = if state.is_first_window {
716                     end - state.pending_byte_as_num()
717                 } else {
718                     debug_assert!(
719                         !state.pending_byte(),
720                         "Bug! Ended compression with a pending byte!"
721                     );
722                     buffer.current_end()
723                 };
724 
725                 // We stopped before or at the window size, so we are at the end.
726                 if !sync {
727                     // If we are finishing and not syncing, we simply indicate that we are done.
728                     state.set_last();
729                     state.is_first_window = false;
730                 } else {
731                     // For sync flushing we need to slide the buffer and the hash chains so that the
732                     // next call to this function starts at the right place.
733 
734                     // There won't be any overlap, since when syncing, we process to the end of the
735                     // pending data.
736                     state.overlap = if state.is_first_window {
737                         end
738                     } else {
739                         buffer.current_end() - window_size
740                     };
741                     state.was_synced = true;
742                 }
743                 status = LZ77Status::Finished;
744                 break;
745             } else {
746                 if state.is_first_window {
747                     state.is_first_window = false;
748                 } else {
749                     // We are not at the end, so slide and continue.
750                     // We slide the hash table back to make space for new hash values
751                     // We only need to remember 2^15 bytes back (the maximum distance allowed by the
752                     // deflate spec).
753                     if state.max_hash_checks > 0 {
754                         state.hash_table.slide(window_size);
755                     }
756 
757                     // Also slide the buffer, discarding data we no longer need and adding new data.
758                     remaining_data = buffer.slide(remaining_data.unwrap_or(&[]));
759                 }
760             }
761         } else {
762             // The caller has not indicated that they want to finish or flush, and there is less
763             // than a window + lookahead of new data, so we wait for more.
764             status = LZ77Status::NeedInput;
765             break;
766         }
767     }
768 
769     (
770         data.len() - remaining_data.unwrap_or(&[]).len(),
771         status,
772         current_position,
773     )
774 }
775 
/// Decompress the given lz77-encoded values into bytes.
///
/// Convenience wrapper around [`decompress_lz77_with_backbuffer`] with an
/// empty back buffer, so matches may only refer to data within `input` itself.
#[cfg(test)]
pub fn decompress_lz77(input: &[LZValue]) -> Vec<u8> {
    decompress_lz77_with_backbuffer(input, &[])
}
780 
/// Decompress the given lz77-encoded values into bytes, resolving matches that
/// reach behind the start of `input` against the provided `back_buffer`.
///
/// Panics if a match refers further back than the back buffer covers.
#[cfg(test)]
pub fn decompress_lz77_with_backbuffer(input: &[LZValue], back_buffer: &[u8]) -> Vec<u8> {
    let mut output = Vec::new();
    for value in input {
        match value.value() {
            LZType::Literal(byte) => output.push(byte),
            LZType::StoredLengthDistance(length, distance) => {
                // A match was encoded; copy the bytes that the match refers to.
                let distance = distance as usize;
                let len_before_match = output.len();

                // If the distance reaches behind the start of `output`, the
                // leading part of the match comes from the back buffer.
                let copied_from_back = if distance > len_before_match {
                    let into_back_buffer = distance - len_before_match;

                    assert!(
                        into_back_buffer <= back_buffer.len(),
                        "ERROR: Attempted to refer to a match in non-existing data!\
                         into_back_buffer: {}, back_buffer len {}, d {}, l {:?}",
                        into_back_buffer,
                        back_buffer.len(),
                        distance,
                        length
                    );

                    let from = back_buffer.len() - into_back_buffer;
                    let to = cmp::min(back_buffer.len(), from + length.actual_length() as usize);
                    output.extend_from_slice(&back_buffer[from..to]);

                    to - from
                } else {
                    0
                };

                // Copy the remainder from the data decompressed so far.
                // Done one byte at a time since a match may overlap its own
                // output (distance shorter than length).
                let src = len_before_match.saturating_sub(distance);
                let remaining = (length.actual_length() as usize).saturating_sub(copied_from_back);
                for offset in 0..remaining {
                    let byte = output[src + offset];
                    output.push(byte);
                }
            }
        }
    }
    output
}
826 
/// Test helper bundling the pieces needed to run lz77 compression.
#[cfg(test)]
pub struct TestStruct {
    /// Compressor state (hash chains and position tracking within the input).
    state: LZ77State,
    /// Buffer holding the input data being compressed.
    buffer: InputBuffer,
    /// Writer that collects the produced lz77 values.
    writer: DynamicWriter,
}
833 
#[cfg(test)]
impl TestStruct {
    /// Create a test compressor using the default high-compression settings.
    fn new() -> TestStruct {
        TestStruct::with_config(
            HIGH_MAX_HASH_CHECKS,
            HIGH_LAZY_IF_LESS_THAN,
            MatchingType::Lazy,
        )
    }

    /// Create a test compressor with the given compression parameters.
    fn with_config(
        max_hash_checks: u16,
        lazy_if_less_than: u16,
        matching_type: MatchingType,
    ) -> TestStruct {
        TestStruct {
            writer: DynamicWriter::new(),
            buffer: InputBuffer::empty(),
            state: LZ77State::new(max_hash_checks, lazy_if_less_than, matching_type),
        }
    }

    /// Compress a block of `data`, finishing the stream if `flush` is set.
    fn compress_block(&mut self, data: &[u8], flush: bool) -> (usize, LZ77Status, usize) {
        let flush_mode = if flush { Flush::Finish } else { Flush::None };
        lz77_compress_block(
            data,
            &mut self.state,
            &mut self.buffer,
            &mut self.writer,
            flush_mode,
        )
    }
}
866 
/// Compress `data` using the default high-compression test settings.
///
/// Convenience wrapper around [`lz77_compress_conf`]; only used in tests.
#[cfg(test)]
pub fn lz77_compress(data: &[u8]) -> Option<Vec<LZValue>> {
    lz77_compress_conf(
        data,
        HIGH_MAX_HASH_CHECKS,
        HIGH_LAZY_IF_LESS_THAN,
        MatchingType::Lazy,
    )
}
876 
/// Compress a slice, not storing frequency information
///
/// This is a convenience function for compression with fixed huffman values
/// Only used in tests for now
#[cfg(test)]
pub fn lz77_compress_conf(
    data: &[u8],
    max_hash_checks: u16,
    lazy_if_less_than: u16,
    matching_type: MatchingType,
) -> Option<Vec<LZValue>> {
    // NOTE(review): the compressor is boxed, presumably to keep the fairly
    // large compression state off the stack — kept as-is.
    let mut compressor = Box::new(TestStruct::with_config(
        max_hash_checks,
        lazy_if_less_than,
        matching_type,
    ));
    let mut output = Vec::<LZValue>::with_capacity(data.len() / 3);
    let mut remaining = data;

    // Feed the input block by block until the compressor reports that the
    // final block has been produced, draining the writer after each call.
    while !compressor.state.is_last_block {
        let (bytes_written, _status, _position) = lz77_compress_block_finish(
            remaining,
            &mut compressor.state,
            &mut compressor.buffer,
            &mut compressor.writer,
        );
        remaining = &remaining[bytes_written..];
        output.extend(compressor.writer.get_buffer());
        compressor.writer.clear();
    }

    Some(output)
}
914 
#[cfg(test)]
mod test {
    use super::*;

    use crate::chained_hash_table::WINDOW_SIZE;
    use crate::compression_options::DEFAULT_LAZY_IF_LESS_THAN;
    use crate::lzvalue::{ld, lit, LZType, LZValue};
    use crate::output_writer::MAX_BUFFER_LENGTH;
    use crate::test_utils::get_test_data;

    /// Helper function to print the output from the lz77 compression function
    fn print_output(input: &[LZValue]) {
        let mut output = vec![];
        for l in input {
            match l.value() {
                LZType::Literal(l) => output.push(l),
                LZType::StoredLengthDistance(l, d) => {
                    output.extend(format!("<L {}>", l.actual_length()).into_bytes());
                    output.extend(format!("<D {}>", d).into_bytes())
                }
            }
        }

        println!("\"{}\"", String::from_utf8(output).unwrap());
    }

    /// Test that a short string from an example on SO compresses correctly
    #[test]
    fn compress_short() {
        let test_bytes = String::from("Deflate late").into_bytes();
        let res = lz77_compress(&test_bytes).unwrap();

        let decompressed = decompress_lz77(&res);

        assert_eq!(test_bytes, decompressed);
        assert_eq!(*res.last().unwrap(), LZValue::length_distance(4, 5));
    }

    /// Test that compression is working for a longer file
    #[test]
    fn compress_long() {
        let input = get_test_data();
        let compressed = lz77_compress(&input).unwrap();
        assert!(compressed.len() < input.len());

        let decompressed = decompress_lz77(&compressed);
        println!("compress_long length: {}", input.len());

        // This is to check where the compression fails, if it were to
        for (n, (&a, &b)) in input.iter().zip(decompressed.iter()).enumerate() {
            if a != b {
                println!("First difference at {}, input: {}, output: {}", n, a, b);
                break;
            }
        }
        assert_eq!(input.len(), decompressed.len());
        assert!(&decompressed == &input);
    }

    /// Check that lazy matching is working as intended
    #[test]
    fn lazy() {
        // We want to match on `badger` rather than `nba` as it is longer
        // let data = b" nba nbadg badger nbadger";
        let data = b"nba badger nbadger";
        let compressed = lz77_compress(data).unwrap();
        let test = compressed[compressed.len() - 1];
        if let LZType::StoredLengthDistance(l, _) = test.value() {
            assert_eq!(l.actual_length(), 6);
        } else {
            print_output(&compressed);
            panic!();
        }
    }

    /// Compress and decompress `data`, asserting that the roundtrip is lossless.
    fn roundtrip(data: &[u8]) {
        let compressed = super::lz77_compress(data).unwrap();
        let decompressed = decompress_lz77(&compressed);
        assert!(decompressed == data);
    }

    // Check that data with the exact window size is working properly
    #[test]
    fn exact_window_size() {
        let mut data = vec![0; WINDOW_SIZE];
        roundtrip(&data);
        data.extend_from_slice(&[22; WINDOW_SIZE]);
        roundtrip(&data);
        data.extend_from_slice(&[55; WINDOW_SIZE]);
        roundtrip(&data);
    }

    /// Test that matches at the window border are working correctly
    #[test]
    fn border() {
        use crate::chained_hash_table::WINDOW_SIZE;
        let mut data = vec![35; WINDOW_SIZE];
        data.extend(b"Test");
        let compressed = super::lz77_compress(&data).unwrap();
        assert!(compressed.len() < data.len());
        let decompressed = decompress_lz77(&compressed);

        assert_eq!(decompressed.len(), data.len());
        assert!(decompressed == data);
    }

    #[test]
    fn border_multiple_blocks() {
        use crate::chained_hash_table::WINDOW_SIZE;
        let mut data = vec![0; (WINDOW_SIZE * 2) + 50];
        data.push(1);
        let compressed = super::lz77_compress(&data).unwrap();
        assert!(compressed.len() < data.len());
        let decompressed = decompress_lz77(&compressed);
        assert_eq!(decompressed.len(), data.len());
        assert!(decompressed == data);
    }

    #[test]
    fn compress_block_status() {
        use crate::input_buffer::InputBuffer;

        let data = b"Test data data";
        let mut writer = DynamicWriter::new();

        let mut buffer = InputBuffer::empty();
        let mut state = LZ77State::new(4096, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy);
        let status = lz77_compress_block_finish(data, &mut state, &mut buffer, &mut writer);
        assert_eq!(status.1, LZ77Status::Finished);
        assert!(&buffer.get_buffer()[..data.len()] == data);
        assert_eq!(buffer.current_end(), data.len());
    }

    #[test]
    fn compress_block_multiple_windows() {
        use crate::input_buffer::InputBuffer;
        use crate::output_writer::DynamicWriter;

        let data = get_test_data();
        assert!(data.len() > (WINDOW_SIZE * 2) + super::MAX_MATCH);
        let mut writer = DynamicWriter::new();

        let mut buffer = InputBuffer::empty();
        let mut state = LZ77State::new(0, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy);
        let (bytes_consumed, status) =
            lz77_compress_block_finish(&data, &mut state, &mut buffer, &mut writer);
        assert_eq!(
            buffer.get_buffer().len(),
            (WINDOW_SIZE * 2) + super::MAX_MATCH
        );
        assert_eq!(status, LZ77Status::EndBlock);
        let buf_len = buffer.get_buffer().len();
        assert!(buffer.get_buffer()[..] == data[..buf_len]);

        writer.clear();
        let (_, status) = lz77_compress_block_finish(
            &data[bytes_consumed..],
            &mut state,
            &mut buffer,
            &mut writer,
        );
        assert_eq!(status, LZ77Status::EndBlock);
    }

    #[test]
    fn multiple_inputs() {
        let data = b"Badger badger bababa test data 25 asfgestghresjkgh";
        let comp1 = lz77_compress(data).unwrap();
        let comp2 = {
            const SPLIT: usize = 25;
            let first_part = &data[..SPLIT];
            let second_part = &data[SPLIT..];
            let mut state = TestStruct::new();
            let (bytes_written, status, _) = state.compress_block(first_part, false);
            assert_eq!(bytes_written, first_part.len());
            assert_eq!(status, LZ77Status::NeedInput);
            let (bytes_written, status, _) = state.compress_block(second_part, true);
            assert_eq!(bytes_written, second_part.len());
            assert_eq!(status, LZ77Status::Finished);
            Vec::from(state.writer.get_buffer())
        };
        assert!(comp1 == comp2);
    }

    #[test]
    /// Test that the exit from process_chunk when buffer is full is working correctly.
    fn buffer_fill() {
        let data = get_test_data();
        // The comments above these calls refers the positions with the
        // corresponding comments in process_chunk_{greedy/lazy}.
        // POS BETTER OR NO MATCH
        buffer_test_literals(&data);
        // POS END
        // POS NO MATCH
        buffer_test_last_bytes(MatchingType::Greedy, &data);
        // POS ADD
        // POS AFTER ADD
        buffer_test_last_bytes(MatchingType::Lazy, &data);

        // POS MATCH
        buffer_test_match(MatchingType::Greedy);
        // POS MATCH(lazy)
        buffer_test_match(MatchingType::Lazy);

        // POS END
        buffer_test_add_end(&data);
    }

    /// Test buffer fill when a byte is added due to no match being found.
    fn buffer_test_literals(data: &[u8]) {
        let mut state = TestStruct::with_config(0, NO_RLE, MatchingType::Lazy);
        let (bytes_consumed, status, position) = state.compress_block(data, false);

        // There should be enough data for the block to have ended.
        assert_eq!(status, LZ77Status::EndBlock);
        assert!(bytes_consumed <= (WINDOW_SIZE * 2) + MAX_MATCH);

        // The buffer should be full.
        assert_eq!(state.writer.get_buffer().len(), MAX_BUFFER_LENGTH);
        assert_eq!(position, state.writer.get_buffer().len());
        // Since all literals have been input, the block should have the exact number of litlens
        // as there were input bytes.
        assert_eq!(
            state.state.current_block_input_bytes() as usize,
            MAX_BUFFER_LENGTH
        );
        state.state.reset_input_bytes();

        let mut out = decompress_lz77(state.writer.get_buffer());

        state.writer.clear();
        // The buffer should now be cleared.
        assert_eq!(state.writer.get_buffer().len(), 0);

        assert!(data[..out.len()] == out[..]);

        let _ = state.compress_block(&data[bytes_consumed..], false);
        // We should have some new data in the buffer at this point.
        assert!(!state.writer.get_buffer().is_empty());
        assert_eq!(
            state.state.current_block_input_bytes() as usize,
            MAX_BUFFER_LENGTH
        );

        out.extend_from_slice(&decompress_lz77(state.writer.get_buffer()));
        assert!(data[..out.len()] == out[..]);
    }

    /// Test buffer fill at the last two bytes that are not hashed.
    fn buffer_test_last_bytes(matching_type: MatchingType, data: &[u8]) {
        const BYTES_USED: usize = MAX_BUFFER_LENGTH;
        assert!(
            &data[..BYTES_USED]
                == &decompress_lz77(
                    &lz77_compress_conf(&data[..BYTES_USED], 0, NO_RLE, matching_type).unwrap()
                )[..]
        );
        assert!(
            &data[..BYTES_USED + 1]
                == &decompress_lz77(
                    &lz77_compress_conf(&data[..BYTES_USED + 1], 0, NO_RLE, matching_type)
                        .unwrap()
                )[..]
        );
    }

    /// Test buffer fill when buffer is full at a match.
    fn buffer_test_match(matching_type: MatchingType) {
        // TODO: Also test this for the second block to make sure
        // buffer is slid.
        let mut state = TestStruct::with_config(1, 0, matching_type);
        for _ in 0..MAX_BUFFER_LENGTH - 4 {
            assert!(state.writer.write_literal(0) == BufferStatus::NotFull);
        }
        state.compress_block(&[1, 2, 3, 1, 2, 3, 4], true);
        assert!(*state.writer.get_buffer().last().unwrap() == LZValue::length_distance(3, 3));
    }

    /// Test buffer fill for the lazy match algorithm when adding a pending byte at the end.
    fn buffer_test_add_end(_data: &[u8]) {
        // This is disabled while the buffer size has not been stabilized.
        /*
        let mut state = TestStruct::with_config(DEFAULT_MAX_HASH_CHECKS,
                                                DEFAULT_LAZY_IF_LESS_THAN,
                                                MatchingType::Lazy);
        // For the test file, this is how much data needs to be added to get the buffer
        // full at the right spot to test that this buffer full exit is working correctly.
        for i in 0..31743 {
            assert!(state.writer.write_literal(0) == BufferStatus::NotFull, "Buffer pos: {}", i);
        }

        let dec = decompress_lz77(&state.writer.get_buffer()[pos..]);
        assert!(dec.len() > 0);
        assert!(dec[..] == data[..dec.len()]);
         */
    }

    /// Check that decompressing lz77-data that refers to the back-buffer works.
    #[test]
    fn test_decompress_with_backbuffer() {
        let bb = vec![5; WINDOW_SIZE];
        let lz_compressed = [lit(2), lit(4), ld(4, 20), lit(1), lit(1), ld(5, 10)];
        let dec = decompress_lz77_with_backbuffer(&lz_compressed, &bb);

        // ------------l2 l4  <-ld4,20-> l1 l1  <---ld5,10-->
        assert!(dec == [2, 4, 5, 5, 5, 5, 1, 1, 5, 5, 2, 4, 5]);
    }
}
1229 
// Benchmarks are only compiled when the `benchmarks` feature is enabled.
// NOTE(review): `test_std` and the non-`crate::` `test_utils` path presumably
// rely on `extern crate` declarations at the crate root — confirm they still
// resolve on the current toolchain/edition.
#[cfg(all(test, feature = "benchmarks"))]
mod bench {
    use super::lz77_compress;
    use test_std::Bencher;
    use test_utils::get_test_data;

    /// Benchmark lz77 compression (without huffman coding) of the test data.
    #[bench]
    fn test_file_zlib_lz77_only(b: &mut Bencher) {
        let test_data = get_test_data();
        b.iter(|| lz77_compress(&test_data));
    }
}
1241