1 //! This module contains functionality for doing lz77 compression of data.
2 #![macro_use]
3 use std::cmp;
4 use std::fmt;
5 use std::iter::{self, Iterator};
6 use std::ops::{Range, RangeFrom};
7 use std::slice::Iter;
8
9 use crate::chained_hash_table::{update_hash, ChainedHashTable};
10 use crate::compress::Flush;
11 #[cfg(test)]
12 use crate::compression_options::{HIGH_LAZY_IF_LESS_THAN, HIGH_MAX_HASH_CHECKS};
13 use crate::input_buffer::InputBuffer;
14 #[cfg(test)]
15 use crate::lzvalue::{LZType, LZValue};
16 use crate::matching::longest_match;
17 use crate::output_writer::{BufferStatus, DynamicWriter};
18 use crate::rle::process_chunk_greedy_rle;
19
// Convenience aliases for the huffman-table match-length limits, converted to `usize`
// so they can be compared against positions/lengths without casts at every use site.
const MAX_MATCH: usize = crate::huffman_table::MAX_MATCH as usize;
const MIN_MATCH: usize = crate::huffman_table::MIN_MATCH as usize;

// Sentinel value for `lazy_if_less_than` used by tests to disable the RLE fast path.
// It is deliberately larger than any sensible lazy-matching threshold, and values this
// large are truncated by deflate_state for calls from outside the library.
const NO_RLE: u16 = 43212;
24
/// An enum describing whether we use lazy or greedy matching.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum MatchingType {
    /// Use greedy matching: the matching algorithm simply uses a match right away
    /// if found.
    Greedy,
    /// Use lazy matching: after finding a match, the next input byte is checked, to see
    /// if there is a better match starting at that byte.
    ///
    /// As a special case, if max_hash_checks is set to 0, compression using only run-length
    /// (i.e maximum match distance of 1) is performed instead.
    Lazy,
}
38
39 impl fmt::Display for MatchingType {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result40 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
41 match *self {
42 MatchingType::Greedy => write!(f, "Greedy matching"),
43 MatchingType::Lazy => write!(f, "Lazy matching"),
44 }
45 }
46 }
47
/// A struct that contains the hash table, and keeps track of where we are in the input data
pub struct LZ77State {
    /// Struct containing hash chains that will be used to find matches.
    hash_table: ChainedHashTable,
    /// True if this is the first window that is being processed.
    is_first_window: bool,
    /// Set to true when the last block has been processed.
    is_last_block: bool,
    /// How many bytes the last match in the previous window extended into the current one.
    overlap: usize,
    /// How many bytes of input the current block contains.
    current_block_input_bytes: u64,
    /// The maximum number of hash entries to search.
    max_hash_checks: u16,
    /// Only lazy match if we have a match length less than this.
    lazy_if_less_than: u16,
    /// Whether to use greedy or lazy parsing
    matching_type: MatchingType,
    /// Keep track of the previous match and byte in case the buffer is full when lazy matching.
    match_state: ChunkState,
    /// Keep track of how many bytes in the lookahead that was part of a match, but has not been
    /// added to the hash chain yet.
    bytes_to_hash: usize,
    /// Whether a sync flush was used. If this is the case, the two first bytes need to be
    /// hashed on the next call since they could not be hashed at flush time.
    was_synced: bool,
}
75
76 impl LZ77State {
77 /// Creates a new LZ77 state
new( max_hash_checks: u16, lazy_if_less_than: u16, matching_type: MatchingType, ) -> LZ77State78 pub fn new(
79 max_hash_checks: u16,
80 lazy_if_less_than: u16,
81 matching_type: MatchingType,
82 ) -> LZ77State {
83 LZ77State {
84 hash_table: ChainedHashTable::new(),
85 is_first_window: true,
86 is_last_block: false,
87 overlap: 0,
88 current_block_input_bytes: 0,
89 max_hash_checks,
90 lazy_if_less_than,
91 matching_type,
92 match_state: ChunkState::new(),
93 bytes_to_hash: 0,
94 was_synced: false,
95 }
96 }
97
98 /// Resets the state excluding max_hash_checks and lazy_if_less_than
reset(&mut self)99 pub fn reset(&mut self) {
100 self.hash_table.reset();
101 self.is_first_window = true;
102 self.is_last_block = false;
103 self.overlap = 0;
104 self.current_block_input_bytes = 0;
105 self.match_state = ChunkState::new();
106 self.bytes_to_hash = 0
107 }
108
set_last(&mut self)109 pub fn set_last(&mut self) {
110 self.is_last_block = true;
111 }
112
113 /// Is this the last block we are outputting?
is_last_block(&self) -> bool114 pub fn is_last_block(&self) -> bool {
115 self.is_last_block
116 }
117
118 /// How many bytes of input the current block contains.
current_block_input_bytes(&self) -> u64119 pub fn current_block_input_bytes(&self) -> u64 {
120 self.current_block_input_bytes
121 }
122
123 /// Sets the number of input bytes for the current block to 0.
reset_input_bytes(&mut self)124 pub fn reset_input_bytes(&mut self) {
125 self.current_block_input_bytes = 0;
126 }
127
128 /// Is there a buffered byte that has not been output yet?
pending_byte(&self) -> bool129 pub fn pending_byte(&self) -> bool {
130 self.match_state.add
131 }
132
133 /// Returns 1 if pending_byte is true, 0 otherwise.
pending_byte_as_num(&self) -> usize134 pub fn pending_byte_as_num(&self) -> usize {
135 // This could be implemented by using `as usize` as the documentation states this would give
136 // the same result, but not sure if that should be relied upon.
137 if self.match_state.add {
138 1
139 } else {
140 0
141 }
142 }
143 }
144
/// The deflate window size this implementation operates with: 2^15 bytes, the maximum
/// back-reference distance allowed by the deflate specification.
const DEFAULT_WINDOW_SIZE: usize = 32768;
146
#[derive(Debug)]
/// Status after calling `process_chunk`.
pub enum ProcessStatus {
    /// All the input data was processed.
    Ok,
    /// The output buffer was full.
    ///
    /// The argument is the position of the next byte to be checked by `process_chunk`.
    BufferFull(usize),
}
157
#[derive(Debug)]
/// A struct to keep track of status between calls of `process_chunk_lazy`
///
/// This is needed as the output buffer might become full before having output all pending data.
pub struct ChunkState {
    /// Length of the last match that was found, if any.
    current_length: u16,
    /// Distance of the last match that was found, if any.
    current_distance: u16,
    /// The previous byte checked in process_chunk.
    prev_byte: u8,
    /// The current byte being checked in process_chunk.
    cur_byte: u8,
    /// Whether prev_byte still needs to be output.
    add: bool,
}
174
175 impl ChunkState {
new() -> ChunkState176 pub fn new() -> ChunkState {
177 ChunkState {
178 current_length: 0,
179 current_distance: 0,
180 prev_byte: 0,
181 cur_byte: 0,
182 add: false,
183 }
184 }
185 }
186
buffer_full(position: usize) -> ProcessStatus187 pub fn buffer_full(position: usize) -> ProcessStatus {
188 ProcessStatus::BufferFull(position)
189 }
190
191 #[allow(clippy::too_many_arguments)]
process_chunk( data: &[u8], iterated_data: &Range<usize>, mut match_state: &mut ChunkState, hash_table: &mut ChainedHashTable, writer: &mut DynamicWriter, max_hash_checks: u16, lazy_if_less_than: usize, matching_type: MatchingType, ) -> (usize, ProcessStatus)192 fn process_chunk(
193 data: &[u8],
194 iterated_data: &Range<usize>,
195 mut match_state: &mut ChunkState,
196 hash_table: &mut ChainedHashTable,
197 writer: &mut DynamicWriter,
198 max_hash_checks: u16,
199 lazy_if_less_than: usize,
200 matching_type: MatchingType,
201 ) -> (usize, ProcessStatus) {
202 let avoid_rle = if cfg!(test) {
203 // Avoid RLE if lazy_if_less than is a specific value.
204 // This is used in some tests, ideally we should probably do this in a less clunky way,
205 // but we use a value here that is higher than the maximum sensible one anyhow, and will
206 // be truncated by deflate_state for calls from outside the library.
207 lazy_if_less_than == NO_RLE as usize
208 } else {
209 false
210 };
211 match matching_type {
212 MatchingType::Greedy => {
213 process_chunk_greedy(data, iterated_data, hash_table, writer, max_hash_checks)
214 }
215 MatchingType::Lazy => {
216 if max_hash_checks > 0 || avoid_rle {
217 process_chunk_lazy(
218 data,
219 iterated_data,
220 &mut match_state,
221 hash_table,
222 writer,
223 max_hash_checks,
224 lazy_if_less_than,
225 )
226 } else {
227 // Use the RLE method if max_hash_checks is set to 0.
228 process_chunk_greedy_rle(data, iterated_data, writer)
229 }
230 }
231 }
232 }
233
234 /// Add the specified number of bytes to the hash table from the iterators
235 /// adding `start` to the position supplied to the hash table.
add_to_hash_table( bytes_to_add: usize, insert_it: &mut iter::Zip<RangeFrom<usize>, Iter<u8>>, hash_it: &mut Iter<u8>, hash_table: &mut ChainedHashTable, )236 fn add_to_hash_table(
237 bytes_to_add: usize,
238 insert_it: &mut iter::Zip<RangeFrom<usize>, Iter<u8>>,
239 hash_it: &mut Iter<u8>,
240 hash_table: &mut ChainedHashTable,
241 ) {
242 let taker = insert_it.by_ref().take(bytes_to_add);
243 let mut hash_taker = hash_it.by_ref().take(bytes_to_add);
244 // Update the hash manually here to keep it in a register.
245 let mut hash = hash_table.current_hash();
246 // Advance the iterators and add the bytes we jump over to the hash table and
247 // checksum
248 for (ipos, _) in taker {
249 if let Some(&i_hash_byte) = hash_taker.next() {
250 hash = update_hash(hash, i_hash_byte);
251 hash_table.add_with_hash(ipos, hash);
252 }
253 }
254 // Write the hash back once we are done.
255 hash_table.set_hash(hash);
256 }
257
/// Write the specified literal `byte` to the writer `w`, and return
/// `ProcessStatus::BufferFull($pos)` if the buffer is full after writing.
///
/// `pos` should indicate the byte to start at in the next call to `process_chunk`,
macro_rules! write_literal {
    ($w:ident, $byte:expr, $pos:expr) => {
        let b_status = $w.write_literal($byte);

        // Bail out of the enclosing function early when the writer is saturated;
        // the 0 means no overlap bytes were produced by this literal.
        if let BufferStatus::Full = b_status {
            return (0, buffer_full($pos));
        }
    };
}
271
272 /// If the match is only 3 bytes long and the distance is more than 8 * 1024, it's likely to take
273 /// up more space than it would save.
274 #[inline]
match_too_far(match_len: usize, match_dist: usize) -> bool275 fn match_too_far(match_len: usize, match_dist: usize) -> bool {
276 const TOO_FAR: usize = 8 * 1024;
277 match_len == MIN_MATCH && match_dist > TOO_FAR
278 }
279
/// Create the iterators used when processing through a chunk of data.
///
/// Returns the clamped end position, an iterator of `(absolute_position, &byte)`
/// pairs over the chunk, and a byte iterator running two positions ahead for hashing.
fn create_iterators<'a>(
    data: &'a [u8],
    iterated_data: &Range<usize>,
) -> (
    usize,
    iter::Zip<RangeFrom<usize>, Iter<'a, u8>>,
    Iter<'a, u8>,
) {
    // Clamp the requested range to the data we actually have.
    let end = cmp::min(data.len(), iterated_data.end);
    let start = iterated_data.start;

    // Pair every byte in the chunk with its absolute position in `data`.
    let insert_it = (start..).zip(data[start..end].iter());

    // The hash iterator runs two bytes ahead of the insert iterator; when fewer than
    // three bytes remain it starts at the end and yields nothing.
    let hash_start = if data.len() - start > 2 {
        start + 2
    } else {
        data.len()
    };
    let hash_it = data[hash_start..].iter();

    (end, insert_it, hash_it)
}
304
/// Process a chunk using lazy matching: after finding a match, the next byte is also
/// checked to see whether a longer match starts there, and only the better of the two
/// is emitted.
///
/// Returns the number of bytes the last match overlapped into the lookahead window,
/// and whether the output buffer filled up. State that must survive a full buffer
/// (pending byte, in-flight match) is carried in `state`.
fn process_chunk_lazy(
    data: &[u8],
    iterated_data: &Range<usize>,
    state: &mut ChunkState,
    mut hash_table: &mut ChainedHashTable,
    writer: &mut DynamicWriter,
    max_hash_checks: u16,
    lazy_if_less_than: usize,
) -> (usize, ProcessStatus) {
    let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data);

    const NO_LENGTH: u16 = 0;

    // The previous match length, if any.
    let mut prev_length = state.current_length;
    // The distance of the previous match if any.
    let mut prev_distance = state.current_distance;

    state.current_length = 0;
    state.current_distance = 0;

    // The number of bytes past end that was added due to finding a match that extends into
    // the lookahead window.
    let mut overlap = 0;

    // Set to true if we found a match that is equal to or longer than `lazy_if_less_than`,
    // indicating that we won't lazy match (check for a better match at the next byte).
    // If we had a good match, carry this over from the previous call.
    let mut ignore_next = prev_length as usize >= lazy_if_less_than;

    // This is to output the correct byte in case there is one pending to be output
    // from the previous call.
    state.prev_byte = state.cur_byte;

    // Iterate through the slice, adding literals or length/distance pairs
    while let Some((position, &b)) = insert_it.next() {
        state.cur_byte = b;
        if let Some(&hash_byte) = hash_it.next() {
            hash_table.add_hash_value(position, hash_byte);

            // Only lazy match if we have a match shorter than a set value
            // TODO: This should be cleaned up a bit
            if !ignore_next {
                let (mut match_len, match_dist) = {
                    // If there already was a decent match at the previous byte
                    // and we are lazy matching, do less match checks in this step.
                    let max_hash_checks = if prev_length >= 32 {
                        max_hash_checks >> 2
                    } else {
                        max_hash_checks
                    };

                    // Check if we can find a better match here than the one we had at
                    // the previous byte.
                    longest_match(
                        data,
                        hash_table,
                        position,
                        prev_length as usize,
                        max_hash_checks,
                    )
                };

                // If the match is only 3 bytes long and very far back, it's probably not worth
                // outputting.
                if match_too_far(match_len, match_dist) {
                    match_len = NO_LENGTH as usize;
                };

                if match_len >= lazy_if_less_than {
                    // We found a decent match, so we won't check for a better one at the next byte.
                    ignore_next = true;
                }
                state.current_length = match_len as u16;
                state.current_distance = match_dist as u16;
            } else {
                // We already had a decent match, so we don't bother checking for another one.
                state.current_length = NO_LENGTH;
                state.current_distance = 0;
                // Make sure we check again next time.
                ignore_next = false;
            };

            if prev_length >= state.current_length && prev_length >= MIN_MATCH as u16 {
                // The previous match was better so we add it.
                // Casting note: length and distance is already bounded by the longest match
                // function. Usize is just used for convenience.
                let b_status =
                    writer.write_length_distance(prev_length as u16, prev_distance as u16);

                // We add the bytes to the hash table and checksum.
                // Since we've already added two of them, we need to add two less than
                // the length.
                let bytes_to_add = prev_length - 2;

                add_to_hash_table(
                    bytes_to_add as usize,
                    &mut insert_it,
                    &mut hash_it,
                    &mut hash_table,
                );

                // If the match is longer than the current window, we have to note how many
                // bytes we overlap, since we don't need to do any matching on these bytes
                // in the next call of this function.
                // We don't have to worry about setting overlap to 0 if this is false, as the
                // function will stop after this condition is true, and overlap is not altered
                // elsewhere.
                if position + prev_length as usize > end {
                    // We need to subtract 1 since the byte at pos is also included.
                    overlap = position + prev_length as usize - end - 1;
                };

                state.add = false;

                // Note that there is no current match.
                state.current_length = 0;
                state.current_distance = 0;

                if let BufferStatus::Full = b_status {
                    // MATCH(lazy)
                    return (overlap, buffer_full(position + prev_length as usize - 1));
                }

                ignore_next = false;
            } else if state.add {
                // We found a better match (or there was no previous match)
                // so output the previous byte.
                // BETTER OR NO MATCH
                write_literal!(writer, state.prev_byte, position + 1);
            } else {
                state.add = true
            }

            prev_length = state.current_length;
            prev_distance = state.current_distance;
            state.prev_byte = b;
        } else {
            // If there is a match at this point, it will not have been added, so we need to add it.
            if prev_length >= MIN_MATCH as u16 {
                let b_status =
                    writer.write_length_distance(prev_length as u16, prev_distance as u16);

                state.current_length = 0;
                state.current_distance = 0;
                state.add = false;

                debug_assert!((position + prev_length as usize) >= end - 1);
                // Needed to note overlap as we can end up with the last window containing the rest
                // of the match.
                overlap = (position + prev_length as usize)
                    .saturating_sub(end)
                    .saturating_sub(1);

                // TODO: Not sure if we need to signal that the buffer is full here.
                // It's only needed in the case of syncing.
                if let BufferStatus::Full = b_status {
                    // TODO: These bytes should be hashed when doing a sync flush.
                    // This can't be done here as the new input data does not exist yet.
                    return (overlap, buffer_full(end));
                } else {
                    return (overlap, ProcessStatus::Ok);
                }
            };

            if state.add {
                // We may still have a leftover byte at this point, so we add it here if needed.
                state.add = false;

                // ADD
                write_literal!(writer, state.prev_byte, position);
            };

            // We are at the last two bytes we want to add, so there is no point
            // searching for matches here.

            // AFTER ADD
            write_literal!(writer, b, position + 1);
        }
    }
    (overlap, ProcessStatus::Ok)
}
487
process_chunk_greedy( data: &[u8], iterated_data: &Range<usize>, mut hash_table: &mut ChainedHashTable, writer: &mut DynamicWriter, max_hash_checks: u16, ) -> (usize, ProcessStatus)488 fn process_chunk_greedy(
489 data: &[u8],
490 iterated_data: &Range<usize>,
491 mut hash_table: &mut ChainedHashTable,
492 writer: &mut DynamicWriter,
493 max_hash_checks: u16,
494 ) -> (usize, ProcessStatus) {
495 let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data);
496
497 const NO_LENGTH: usize = 0;
498
499 // The number of bytes past end that was added due to finding a match that extends into
500 // the lookahead window.
501 let mut overlap = 0;
502
503 // Iterate through the slice, adding literals or length/distance pairs.
504 while let Some((position, &b)) = insert_it.next() {
505 if let Some(&hash_byte) = hash_it.next() {
506 hash_table.add_hash_value(position, hash_byte);
507
508 // TODO: This should be cleaned up a bit.
509 let (match_len, match_dist) =
510 { longest_match(data, hash_table, position, NO_LENGTH, max_hash_checks) };
511
512 if match_len >= MIN_MATCH as usize && !match_too_far(match_len, match_dist) {
513 // Casting note: length and distance is already bounded by the longest match
514 // function. Usize is just used for convenience.
515 let b_status = writer.write_length_distance(match_len as u16, match_dist as u16);
516
517 // We add the bytes to the hash table and checksum.
518 // Since we've already added one of them, we need to add one less than
519 // the length.
520 let bytes_to_add = match_len - 1;
521 add_to_hash_table(bytes_to_add, &mut insert_it, &mut hash_it, &mut hash_table);
522
523 // If the match is longer than the current window, we have note how many
524 // bytes we overlap, since we don't need to do any matching on these bytes
525 // in the next call of this function.
526 if position + match_len > end {
527 // We need to subtract 1 since the byte at pos is also included.
528 overlap = position + match_len - end;
529 };
530
531 if let BufferStatus::Full = b_status {
532 // MATCH
533 return (overlap, buffer_full(position + match_len));
534 }
535 } else {
536 // NO MATCH
537 write_literal!(writer, b, position + 1);
538 }
539 } else {
540 // We are at the last two bytes we want to add, so there is no point
541 // searching for matches here.
542 // END
543 write_literal!(writer, b, position + 1);
544 }
545 }
546 (overlap, ProcessStatus::Ok)
547 }
548
#[derive(Eq, PartialEq, Clone, Copy, Debug)]
/// The outcome of one call to `lz77_compress_block`.
pub enum LZ77Status {
    /// Waiting for more input before doing any processing
    NeedInput,
    /// The output buffer is full, so the current block needs to be ended so the
    /// buffer can be flushed.
    EndBlock,
    /// All pending data has been processed.
    Finished,
}
559
/// Test helper: run `lz77_compress_block` with `Flush::Finish`, dropping the
/// current-position element of the result tuple.
#[cfg(test)]
pub fn lz77_compress_block_finish(
    data: &[u8],
    state: &mut LZ77State,
    buffer: &mut InputBuffer,
    // Previously `mut writer` with a needless `&mut writer` reborrow at the call site.
    writer: &mut DynamicWriter,
) -> (usize, LZ77Status) {
    let (consumed, status, _) = lz77_compress_block(data, state, buffer, writer, Flush::Finish);
    (consumed, status)
}
571
/// Compress a slice with lz77 compression.
///
/// This function processes one window at a time, and returns when there is no input left,
/// or it determines it's time to end a block.
///
/// Returns the number of bytes of the input that were consumed, a status describing
/// whether there is no input, it's time to finish, or it's time to end the block, and the position
/// of the first byte in the input buffer that has not been output (but may have been checked for
/// matches).
pub fn lz77_compress_block(
    data: &[u8],
    state: &mut LZ77State,
    buffer: &mut InputBuffer,
    mut writer: &mut DynamicWriter,
    flush: Flush,
) -> (usize, LZ77Status, usize) {
    // Currently we only support the maximum window size
    let window_size = DEFAULT_WINDOW_SIZE;

    // Indicates whether we should try to process all the data including the lookahead, or if we
    // should wait until we have at least one window size of data before doing anything.
    let finish = flush == Flush::Finish || flush == Flush::Sync;
    let sync = flush == Flush::Sync;

    let mut current_position = 0;

    // The current status of the encoding.
    let mut status = LZ77Status::EndBlock;

    // Whether warm up the hash chain with the two first values.
    let mut add_initial = true;

    // If we have synced, add the two first bytes to the hash as they couldn't be added before.
    if state.was_synced {
        if buffer.current_end() > 2 {
            let pos_add = buffer.current_end() - 2;
            for (n, &b) in data.iter().take(2).enumerate() {
                state.hash_table.add_hash_value(n + pos_add, b);
            }
            add_initial = false;
        }
        state.was_synced = false;
    }

    // Add data to the input buffer and keep a reference to the slice of data not added yet.
    let mut remaining_data = buffer.add_data(data);

    loop {
        // Note if there is a pending byte from the previous call to process_chunk,
        // so we get the block input size right.
        let pending_previous = state.pending_byte_as_num();

        assert!(writer.buffer_length() <= (window_size * 2));
        // Don't do anything until we are either flushing, or we have at least one window of
        // data.
        if buffer.current_end() >= (window_size * 2) + MAX_MATCH || finish {
            if state.is_first_window {
                if buffer.get_buffer().len() >= 2
                    && add_initial
                    && state.current_block_input_bytes == 0
                {
                    let b = buffer.get_buffer();
                    // Warm up the hash with the two first values, so we can find matches at
                    // index 0.
                    state.hash_table.add_initial_hash_values(b[0], b[1]);
                    add_initial = false;
                }
            } else {
                // Catch up on hashing any lookahead bytes covered by a match from the
                // previous window that have not been added to the hash chains yet.
                if buffer.current_end() >= window_size + 2 {
                    for (n, &h) in buffer.get_buffer()[window_size + 2..]
                        .iter()
                        .enumerate()
                        .take(state.bytes_to_hash)
                    {
                        state.hash_table.add_hash_value(window_size + n, h);
                    }
                    state.bytes_to_hash = 0;
                }
            }

            let window_start = if state.is_first_window {
                0
            } else {
                window_size
            };
            let start = state.overlap + window_start;
            let end = cmp::min(window_size + window_start, buffer.current_end());

            let (overlap, p_status) = process_chunk(
                buffer.get_buffer(),
                &(start..end),
                &mut state.match_state,
                &mut state.hash_table,
                &mut writer,
                state.max_hash_checks,
                state.lazy_if_less_than as usize,
                state.matching_type,
            );

            state.bytes_to_hash = overlap;

            if let ProcessStatus::BufferFull(written) = p_status {
                state.current_block_input_bytes +=
                    (written - start + pending_previous - state.pending_byte_as_num()) as u64;

                // If the buffer is full, return and end the block.
                // If overlap is non-zero, the buffer was full after outputting the last byte,
                // otherwise we have to skip to the point in the buffer where we stopped in the
                // next call.
                state.overlap = if overlap > 0 {
                    if !state.is_first_window {
                        // If we are at the end of the window, make sure we slide the buffer and the
                        // hash table.
                        if state.max_hash_checks > 0 {
                            state.hash_table.slide(window_size);
                        }
                        remaining_data = buffer.slide(remaining_data.unwrap_or(&[]));
                    } else {
                        state.is_first_window = false;
                    }
                    overlap
                } else {
                    written - window_start
                };

                current_position = written - state.pending_byte_as_num();

                // Status is already EndBlock at this point.
                // status = LZ77Status::EndBlock;
                break;
            }

            state.current_block_input_bytes +=
                (end - start + overlap + pending_previous - state.pending_byte_as_num()) as u64;

            // The buffer is not full, but we still need to note if there is any overlap into the
            // next window.
            state.overlap = overlap;

            if (state.is_first_window || remaining_data.is_none())
                && finish
                && end >= buffer.current_end()
            {
                current_position = if state.is_first_window {
                    end - state.pending_byte_as_num()
                } else {
                    debug_assert!(
                        !state.pending_byte(),
                        "Bug! Ended compression with a pending byte!"
                    );
                    buffer.current_end()
                };

                // We stopped before or at the window size, so we are at the end.
                if !sync {
                    // If we are finishing and not syncing, we simply indicate that we are done.
                    state.set_last();
                    state.is_first_window = false;
                } else {
                    // For sync flushing we need to slide the buffer and the hash chains so that the
                    // next call to this function starts at the right place.

                    // There won't be any overlap, since when syncing, we process to the end of the
                    // pending data.
                    state.overlap = if state.is_first_window {
                        end
                    } else {
                        buffer.current_end() - window_size
                    };
                    state.was_synced = true;
                }
                status = LZ77Status::Finished;
                break;
            } else {
                if state.is_first_window {
                    state.is_first_window = false;
                } else {
                    // We are not at the end, so slide and continue.
                    // We slide the hash table back to make space for new hash values
                    // We only need to remember 2^15 bytes back (the maximum distance allowed by the
                    // deflate spec).
                    if state.max_hash_checks > 0 {
                        state.hash_table.slide(window_size);
                    }

                    // Also slide the buffer, discarding data we no longer need and adding new data.
                    remaining_data = buffer.slide(remaining_data.unwrap_or(&[]));
                }
            }
        } else {
            // The caller has not indicated that they want to finish or flush, and there is less
            // than a window + lookahead of new data, so we wait for more.
            status = LZ77Status::NeedInput;
            break;
        }
    }

    (
        data.len() - remaining_data.unwrap_or(&[]).len(),
        status,
        current_position,
    )
}
775
#[cfg(test)]
/// Decompress a sequence of LZ values that does not reference any earlier data.
pub fn decompress_lz77(input: &[LZValue]) -> Vec<u8> {
    // No history available: decode against an empty back buffer.
    const NO_HISTORY: &[u8] = &[];
    decompress_lz77_with_backbuffer(input, NO_HISTORY)
}
780
#[cfg(test)]
/// Decompress a sequence of LZ values, resolving back-references that reach behind the
/// start of `input` against the supplied `back_buffer`.
pub fn decompress_lz77_with_backbuffer(input: &[LZValue], back_buffer: &[u8]) -> Vec<u8> {
    let mut output = Vec::new();
    for value in input {
        match value.value() {
            LZType::Literal(l) => output.push(l),
            LZType::StoredLengthDistance(l, d) => {
                // A match: copy `l` bytes starting `d` bytes behind the current position.
                let d = d as usize;
                let prev_output_len = output.len();

                // If the distance reaches behind the start of `output`, the first part of
                // the match data comes from the supplied back buffer.
                let consumed = if d > prev_output_len {
                    let into_back_buffer = d - prev_output_len;

                    assert!(
                        into_back_buffer <= back_buffer.len(),
                        "ERROR: Attempted to refer to a match in non-existing data!\
                         into_back_buffer: {}, back_buffer len {}, d {}, l {:?}",
                        into_back_buffer,
                        back_buffer.len(),
                        d,
                        l
                    );
                    let start = back_buffer.len() - into_back_buffer;
                    let end = cmp::min(back_buffer.len(), start + l.actual_length() as usize);
                    output.extend_from_slice(&back_buffer[start..end]);

                    end - start
                } else {
                    0
                };

                // Copy the remainder byte by byte from the already-decompressed data;
                // byte-wise copying is required since the match may overlap itself.
                let start = prev_output_len.saturating_sub(d);
                let remaining = (l.actual_length() as usize).saturating_sub(consumed);
                for n in 0..remaining {
                    let b = output[start + n];
                    output.push(b);
                }
            }
        }
    }
    output
}
826
#[cfg(test)]
/// Bundles the pieces needed to drive `lz77_compress_block` from tests.
pub struct TestStruct {
    state: LZ77State,
    buffer: InputBuffer,
    writer: DynamicWriter,
}
833
#[cfg(test)]
impl TestStruct {
    /// Create a test helper using the default high-effort lazy-matching configuration.
    fn new() -> TestStruct {
        TestStruct::with_config(
            HIGH_MAX_HASH_CHECKS,
            HIGH_LAZY_IF_LESS_THAN,
            MatchingType::Lazy,
        )
    }

    /// Create a test helper with an explicit configuration.
    fn with_config(
        max_hash_checks: u16,
        lazy_if_less_than: u16,
        matching_type: MatchingType,
    ) -> TestStruct {
        TestStruct {
            state: LZ77State::new(max_hash_checks, lazy_if_less_than, matching_type),
            buffer: InputBuffer::empty(),
            writer: DynamicWriter::new(),
        }
    }

    /// Run one compression pass over `data`, finishing the stream when `flush` is set.
    fn compress_block(&mut self, data: &[u8], flush: bool) -> (usize, LZ77Status, usize) {
        let mode = if flush { Flush::Finish } else { Flush::None };
        lz77_compress_block(data, &mut self.state, &mut self.buffer, &mut self.writer, mode)
    }
}
866
#[cfg(test)]
/// Compress `data` using the default high-effort settings with lazy matching.
///
/// Test-only convenience wrapper around `lz77_compress_conf`.
pub fn lz77_compress(data: &[u8]) -> Option<Vec<LZValue>> {
    lz77_compress_conf(
        data,
        HIGH_MAX_HASH_CHECKS,
        HIGH_LAZY_IF_LESS_THAN,
        MatchingType::Lazy,
    )
}
876
/// Compress a slice, not storing frequency information
///
/// This is a convenience function for compression with fixed huffman values
/// Only used in tests for now
#[cfg(test)]
pub fn lz77_compress_conf(
    data: &[u8],
    max_hash_checks: u16,
    lazy_if_less_than: u16,
    matching_type: MatchingType,
) -> Option<Vec<LZValue>> {
    // Box the state bundle; it is fairly large.
    let mut test_boxed = Box::new(TestStruct::with_config(
        max_hash_checks,
        lazy_if_less_than,
        matching_type,
    ));
    let mut out = Vec::<LZValue>::with_capacity(data.len() / 3);
    {
        let test = test_boxed.as_mut();
        let mut slice = data;

        // Keep feeding windows until the state reports the final block.
        while !test.state.is_last_block() {
            let (bytes_written, _) = lz77_compress_block_finish(
                slice,
                &mut test.state,
                &mut test.buffer,
                &mut test.writer,
            );
            slice = &slice[bytes_written..];
            out.extend(test.writer.get_buffer());
            test.writer.clear();
        }
    }

    Some(out)
}
914
#[cfg(test)]
mod test {
    use super::*;

    use crate::chained_hash_table::WINDOW_SIZE;
    use crate::compression_options::DEFAULT_LAZY_IF_LESS_THAN;
    use crate::lzvalue::{ld, lit, LZType, LZValue};
    use crate::output_writer::MAX_BUFFER_LENGTH;
    use crate::test_utils::get_test_data;

    /// Helper function to print the output from the lz77 compression function
    fn print_output(input: &[LZValue]) {
        let mut output = vec![];
        for l in input {
            match l.value() {
                LZType::Literal(l) => output.push(l),
                LZType::StoredLengthDistance(l, d) => {
                    output.extend(format!("<L {}>", l.actual_length()).into_bytes());
                    output.extend(format!("<D {}>", d).into_bytes())
                }
            }
        }

        println!("\"{}\"", String::from_utf8(output).unwrap());
    }

    /// Test that a short string from an example on SO compresses correctly
    #[test]
    fn compress_short() {
        let test_bytes = String::from("Deflate late").into_bytes();
        let res = lz77_compress(&test_bytes).unwrap();

        let decompressed = decompress_lz77(&res);

        assert_eq!(test_bytes, decompressed);
        assert_eq!(*res.last().unwrap(), LZValue::length_distance(4, 5));
    }

    /// Test that compression is working for a longer file
    #[test]
    fn compress_long() {
        let input = get_test_data();
        let compressed = lz77_compress(&input).unwrap();
        assert!(compressed.len() < input.len());

        let decompressed = decompress_lz77(&compressed);
        println!("compress_long length: {}", input.len());

        // This is to check where the compression fails, if it were to
        for (n, (&a, &b)) in input.iter().zip(decompressed.iter()).enumerate() {
            if a != b {
                println!("First difference at {}, input: {}, output: {}", n, a, b);
                break;
            }
        }
        assert_eq!(input.len(), decompressed.len());
        assert!(&decompressed == &input);
    }

    /// Check that lazy matching is working as intended
    #[test]
    fn lazy() {
        // We want to match on `badger` rather than `nba` as it is longer
        // let data = b" nba nbadg badger nbadger";
        let data = b"nba badger nbadger";
        let compressed = lz77_compress(data).unwrap();
        let test = compressed[compressed.len() - 1];
        if let LZType::StoredLengthDistance(l, _) = test.value() {
            assert_eq!(l.actual_length(), 6);
        } else {
            print_output(&compressed);
            panic!();
        }
    }

    /// Compress and decompress `data`, asserting the roundtrip is lossless.
    fn roundtrip(data: &[u8]) {
        let compressed = super::lz77_compress(data).unwrap();
        let decompressed = decompress_lz77(&compressed);
        assert!(decompressed == data);
    }

    // Check that data with the exact window size is working properly
    #[test]
    fn exact_window_size() {
        let mut data = vec![0; WINDOW_SIZE];
        roundtrip(&data);
        // Use extend_from_slice rather than io::Write: it is infallible for
        // Vec, so no Result is silently dropped and no #[allow] is needed.
        data.extend_from_slice(&[22; WINDOW_SIZE]);
        roundtrip(&data);
        data.extend_from_slice(&[55; WINDOW_SIZE]);
        roundtrip(&data);
    }

    /// Test that matches at the window border are working correctly
    #[test]
    fn border() {
        use crate::chained_hash_table::WINDOW_SIZE;
        let mut data = vec![35; WINDOW_SIZE];
        data.extend(b"Test");
        let compressed = super::lz77_compress(&data).unwrap();
        assert!(compressed.len() < data.len());
        let decompressed = decompress_lz77(&compressed);

        assert_eq!(decompressed.len(), data.len());
        assert!(decompressed == data);
    }

    #[test]
    fn border_multiple_blocks() {
        use crate::chained_hash_table::WINDOW_SIZE;
        let mut data = vec![0; (WINDOW_SIZE * 2) + 50];
        data.push(1);
        let compressed = super::lz77_compress(&data).unwrap();
        assert!(compressed.len() < data.len());
        let decompressed = decompress_lz77(&compressed);
        assert_eq!(decompressed.len(), data.len());
        assert!(decompressed == data);
    }

    #[test]
    fn compress_block_status() {
        use crate::input_buffer::InputBuffer;

        let data = b"Test data data";
        let mut writer = DynamicWriter::new();

        let mut buffer = InputBuffer::empty();
        let mut state = LZ77State::new(4096, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy);
        let status = lz77_compress_block_finish(data, &mut state, &mut buffer, &mut writer);
        assert_eq!(status.1, LZ77Status::Finished);
        assert!(&buffer.get_buffer()[..data.len()] == data);
        assert_eq!(buffer.current_end(), data.len());
    }

    #[test]
    fn compress_block_multiple_windows() {
        use crate::input_buffer::InputBuffer;
        use crate::output_writer::DynamicWriter;

        let data = get_test_data();
        assert!(data.len() > (WINDOW_SIZE * 2) + super::MAX_MATCH);
        let mut writer = DynamicWriter::new();

        let mut buffer = InputBuffer::empty();
        let mut state = LZ77State::new(0, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy);
        let (bytes_consumed, status) =
            lz77_compress_block_finish(&data, &mut state, &mut buffer, &mut writer);
        assert_eq!(
            buffer.get_buffer().len(),
            (WINDOW_SIZE * 2) + super::MAX_MATCH
        );
        assert_eq!(status, LZ77Status::EndBlock);
        let buf_len = buffer.get_buffer().len();
        assert!(buffer.get_buffer()[..] == data[..buf_len]);

        writer.clear();
        let (_, status) = lz77_compress_block_finish(
            &data[bytes_consumed..],
            &mut state,
            &mut buffer,
            &mut writer,
        );
        assert_eq!(status, LZ77Status::EndBlock);
    }

    #[test]
    fn multiple_inputs() {
        let data = b"Badger badger bababa test data 25 asfgestghresjkgh";
        let comp1 = lz77_compress(data).unwrap();
        let comp2 = {
            const SPLIT: usize = 25;
            let first_part = &data[..SPLIT];
            let second_part = &data[SPLIT..];
            let mut state = TestStruct::new();
            let (bytes_written, status, _) = state.compress_block(first_part, false);
            assert_eq!(bytes_written, first_part.len());
            assert_eq!(status, LZ77Status::NeedInput);
            let (bytes_written, status, _) = state.compress_block(second_part, true);
            assert_eq!(bytes_written, second_part.len());
            assert_eq!(status, LZ77Status::Finished);
            Vec::from(state.writer.get_buffer())
        };
        assert!(comp1 == comp2);
    }

    #[test]
    /// Test that the exit from process_chunk when buffer is full is working correctly.
    fn buffer_fill() {
        let data = get_test_data();
        // The comments above these calls refers the positions with the
        // corresponding comments in process_chunk_{greedy/lazy}.
        // POS BETTER OR NO MATCH
        buffer_test_literals(&data);
        // POS END
        // POS NO MATCH
        buffer_test_last_bytes(MatchingType::Greedy, &data);
        // POS ADD
        // POS AFTER ADD
        buffer_test_last_bytes(MatchingType::Lazy, &data);

        // POS MATCH
        buffer_test_match(MatchingType::Greedy);
        // POS MATCH(lazy)
        buffer_test_match(MatchingType::Lazy);

        // POS END
        buffer_test_add_end(&data);
    }

    /// Test buffer fill when a byte is added due to no match being found.
    fn buffer_test_literals(data: &[u8]) {
        let mut state = TestStruct::with_config(0, NO_RLE, MatchingType::Lazy);
        let (bytes_consumed, status, position) = state.compress_block(&data, false);

        // There should be enough data for the block to have ended.
        assert_eq!(status, LZ77Status::EndBlock);
        assert!(bytes_consumed <= (WINDOW_SIZE * 2) + MAX_MATCH);

        // The buffer should be full.
        assert_eq!(state.writer.get_buffer().len(), MAX_BUFFER_LENGTH);
        assert_eq!(position, state.writer.get_buffer().len());
        // Since all literals have been input, the block should have the exact number of litlens
        // as there were input bytes.
        assert_eq!(
            state.state.current_block_input_bytes() as usize,
            MAX_BUFFER_LENGTH
        );
        state.state.reset_input_bytes();

        let mut out = decompress_lz77(state.writer.get_buffer());

        state.writer.clear();
        // The buffer should now be cleared.
        assert_eq!(state.writer.get_buffer().len(), 0);

        assert!(data[..out.len()] == out[..]);

        let _ = state.compress_block(&data[bytes_consumed..], false);
        // We should have some new data in the buffer at this point.
        assert!(!state.writer.get_buffer().is_empty());
        assert_eq!(
            state.state.current_block_input_bytes() as usize,
            MAX_BUFFER_LENGTH
        );

        out.extend_from_slice(&decompress_lz77(state.writer.get_buffer()));
        assert!(data[..out.len()] == out[..]);
    }

    /// Test buffer fill at the last two bytes that are not hashed.
    fn buffer_test_last_bytes(matching_type: MatchingType, data: &[u8]) {
        const BYTES_USED: usize = MAX_BUFFER_LENGTH;
        assert!(
            &data[..BYTES_USED]
                == &decompress_lz77(
                    &lz77_compress_conf(&data[..BYTES_USED], 0, NO_RLE, matching_type,).unwrap()
                )[..]
        );
        assert!(
            &data[..BYTES_USED + 1]
                == &decompress_lz77(
                    &lz77_compress_conf(&data[..BYTES_USED + 1], 0, NO_RLE, matching_type,)
                        .unwrap()
                )[..]
        );
    }

    /// Test buffer fill when buffer is full at a match.
    fn buffer_test_match(matching_type: MatchingType) {
        // TODO: Also test this for the second block to make sure
        // buffer is slid.
        let mut state = TestStruct::with_config(1, 0, matching_type);
        for _ in 0..MAX_BUFFER_LENGTH - 4 {
            assert!(state.writer.write_literal(0) == BufferStatus::NotFull);
        }
        state.compress_block(&[1, 2, 3, 1, 2, 3, 4], true);
        assert!(*state.writer.get_buffer().last().unwrap() == LZValue::length_distance(3, 3));
    }

    /// Test buffer fill for the lazy match algorithm when adding a pending byte at the end.
    fn buffer_test_add_end(_data: &[u8]) {
        // This is disabled while the buffer size has not been stabilized.
        /*
        let mut state = TestStruct::with_config(DEFAULT_MAX_HASH_CHECKS,
                                                DEFAULT_LAZY_IF_LESS_THAN,
                                                MatchingType::Lazy);
        // For the test file, this is how much data needs to be added to get the buffer
        // full at the right spot to test that this buffer full exit is working correctly.
        for i in 0..31743 {
            assert!(state.writer.write_literal(0) == BufferStatus::NotFull, "Buffer pos: {}", i);
        }

        let dec = decompress_lz77(&state.writer.get_buffer()[pos..]);
        assert!(dec.len() > 0);
        assert!(dec[..] == data[..dec.len()]);
        */
    }

    /// Check that decompressing lz77-data that refers to the back-buffer works.
    #[test]
    fn test_decompress_with_backbuffer() {
        let bb = vec![5; WINDOW_SIZE];
        let lz_compressed = [lit(2), lit(4), ld(4, 20), lit(1), lit(1), ld(5, 10)];
        let dec = decompress_lz77_with_backbuffer(&lz_compressed, &bb);

        // ------------l2 l4 <-ld4,20-> l1 l1 <---ld5,10-->
        assert!(dec == [2, 4, 5, 5, 5, 5, 1, 1, 5, 5, 2, 4, 5]);
    }
}
1229
#[cfg(all(test, feature = "benchmarks"))]
mod bench {
    use super::lz77_compress;
    use test_std::Bencher;
    use test_utils::get_test_data;

    /// Benchmark the lz77 stage alone (no huffman coding) on the test file.
    #[bench]
    fn test_file_zlib_lz77_only(b: &mut Bencher) {
        let data = get_test_data();
        b.iter(|| lz77_compress(&data));
    }
}
1241