1 use crate::bitstream::LsbWriter;
2 use crate::deflate_state::LengthBuffers;
3 use crate::huffman_table::{
4     create_codes_in_place, num_extra_bits_for_distance_code, num_extra_bits_for_length_code,
5     HuffmanTable, FIXED_CODE_LENGTHS, LENGTH_BITS_START, MAX_CODE_LENGTH, NUM_DISTANCE_CODES,
6     NUM_LITERALS_AND_LENGTHS,
7 };
8 use crate::length_encode::{
9     encode_lengths_m, huffman_lengths_from_frequency_m, EncodedLength, COPY_PREVIOUS,
10     REPEAT_ZERO_3_BITS, REPEAT_ZERO_7_BITS,
11 };
12 use crate::output_writer::FrequencyType;
13 use crate::stored_block::MAX_STORED_BLOCK_LENGTH;
14 
15 use std::cmp;
16 
17 /// The minimum number of literal/length values
18 pub const MIN_NUM_LITERALS_AND_LENGTHS: usize = 257;
19 /// The minimum number of distances
20 pub const MIN_NUM_DISTANCES: usize = 1;
21 
22 const NUM_HUFFMAN_LENGTHS: usize = 19;
split_attributes_accrue_to_instance()23 
24 /// The output ordering of the lengths for the Huffman codes used to encode the lengths
25 /// used to build the full Huffman tree for length/literal codes.
26 /// http://www.gzip.org/zlib/rfc-deflate.html#dyn
27 const HUFFMAN_LENGTH_ORDER: [u8; NUM_HUFFMAN_LENGTHS] = [
28     16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15,
29 ];
30 
31 // Number of bits used for the values specifying the number of codes
32 const HLIT_BITS: u8 = 5;
33 const HDIST_BITS: u8 = 5;
34 const HCLEN_BITS: u8 = 4;
35 
36 /// The longest a Huffman code describing another Huffman length can be
37 const MAX_HUFFMAN_CODE_LENGTH: usize = 7;
38 
39 // How many bytes (not including padding and the 3-bit block type) the stored block header takes up.
40 const STORED_BLOCK_HEADER_LENGTH: u64 = 4;
duplicates_across_split_attrs_error()41 const BLOCK_MARKER_LENGTH: u8 = 3;
42 
43 /// Creates a new slice from the input slice that stops at the final non-zero value
44 pub fn remove_trailing_zeroes<T: From<u8> + PartialEq>(input: &[T], min_length: usize) -> &[T] {
45     let num_zeroes = input.iter().rev().take_while(|&a| *a == T::from(0)).count();
46     &input[0..cmp::max(input.len() - num_zeroes, min_length)]
47 }
48 
49 /// How many extra bits the Huffman length code uses to represent a value.
50 fn extra_bits_for_huffman_length_code(code: u8) -> u8 {
51     match code {
52         16..=17 => 3,
53         18 => 7,
54         _ => 0,
55     }
56 }
multiple_errors_accrue_to_instance()57 
58 /// Calculate how many bits the Huffman-encoded Huffman lengths will use.
59 fn calculate_huffman_length(frequencies: &[FrequencyType], code_lengths: &[u8]) -> u64 {
60     frequencies
61         .iter()
62         .zip(code_lengths)
63         .enumerate()
64         .fold(0, |acc, (n, (&f, &l))| {
65             acc + (u64::from(f)
66                 * (u64::from(l) + u64::from(extra_bits_for_huffman_length_code(n as u8))))
67         })
68 }
69 
70 /// Calculate how many bits data with the given frequencies will use when compressed with dynamic
71 /// code lengths (first return value) and static code lengths (second return value).
72 ///
73 /// Parameters:
74 /// Frequencies, length of dynamic codes, and a function to get how many extra bits in addition
75 /// to the length of the Huffman code the symbol will use.
76 fn calculate_block_length<F>(
77     frequencies: &[FrequencyType],
78     dyn_code_lengths: &[u8],
79     get_num_extra_bits: &F,
80 ) -> (u64, u64)
81 where
82     F: Fn(usize) -> u64,
83 {
84     // Length of data represented by dynamic codes.
85     let mut d_ll_length = 0u64;
86     // length of data represented by static codes.
87     let mut s_ll_length = 0u64;
88 
89     let iter = frequencies
90         .iter()
91         .zip(dyn_code_lengths.iter().zip(FIXED_CODE_LENGTHS.iter()))
92         .enumerate();
93 
94     // This could maybe be optimised a bit by splitting the iteration of codes using extra bits and
95     // codes not using extra bits, but the extra complexity may not be worth it.
96     for (c, (&f, (&l, &fl))) in iter {
97         // Frequency
98         let f = u64::from(f);
99         // How many extra bits the current code number needs.
100         let extra_bits_for_code = get_num_extra_bits(c);
101 
102         d_ll_length += f * (u64::from(l) + extra_bits_for_code);
103         s_ll_length += f * (u64::from(fl) + extra_bits_for_code);
104     }
105 
106     (d_ll_length, s_ll_length)
107 }
108 
109 /// Get how extra padding bits after a block start header a stored block would use.
110 ///
111 /// # Panics
112 /// Panics if `pending_bits > 8`
113 fn stored_padding(pending_bits: u8) -> u64 {
114     assert!(pending_bits <= 8);
115     let free_space = 8 - pending_bits;
116     if free_space >= BLOCK_MARKER_LENGTH {
117         // There is space in the current byte for the header.
118         free_space - BLOCK_MARKER_LENGTH
119     } else {
120         // The header will require an extra byte.
121         8 - (BLOCK_MARKER_LENGTH - free_space)
122     }
123     .into()
124 }
125 
126 /// Calculate the number of bits storing the data in stored blocks will take up, excluding the
127 /// first block start code and potential padding bits. As stored blocks have a maximum length,
128 /// (as opposed to fixed and dynamic ones), multiple blocks may have to be utilised.
129 ///
130 /// # Panics
131 /// Panics if `input_bytes` is 0.
132 fn stored_length(input_bytes: u64) -> u64 {
133     // Check how many stored blocks these bytes would take up.
134     // (Integer divison rounding up.)
135     let num_blocks = (input_bytes
136         .checked_sub(1)
137         .expect("Underflow calculating stored block length!")
138         / MAX_STORED_BLOCK_LENGTH as u64)
139         + 1;
140     // The length will be the input length and the headers for each block. (Excluding the start
141     // of block code for the first one)
142     (input_bytes + (STORED_BLOCK_HEADER_LENGTH as u64 * num_blocks) + (num_blocks - 1)) * 8
143 }
144 
145 pub enum BlockType {
146     Stored,
147     Fixed,
148     Dynamic(DynamicBlockHeader),
149 }
150 
151 /// A struct containing the different data needed to write the header for a dynamic block.
152 ///
153 /// The code lengths are stored directly in the `HuffmanTable` struct.
154 /// TODO: Do the same for other things here.
155 pub struct DynamicBlockHeader {
156     /// Length of the run-length encoding symbols.
157     pub huffman_table_lengths: Vec<u8>,
158     /// Number of lengths for values describing the Huffman table that encodes the length values
159     /// of the main Huffman tables.
160     pub used_hclens: usize,
161 }
162 
163 /// Generate the lengths of the Huffman codes we will be using, using the
164 /// frequency of the different symbols/lengths/distances, and determine what block type will give
165 /// the shortest representation.
166 /// TODO: This needs a test
167 pub fn gen_huffman_lengths(
168     l_freqs: &[FrequencyType],
169     d_freqs: &[FrequencyType],
170     num_input_bytes: u64,
171     pending_bits: u8,
172     l_lengths: &mut [u8; 288],
173     d_lengths: &mut [u8; 32],
174     length_buffers: &mut LengthBuffers,
175 ) -> BlockType {
176     // Avoid corner cases and issues if this is called for an empty block.
177     // For blocks this short, a fixed block will be the shortest.
178     // TODO: Find the minimum value it's worth doing calculations for.
179     if num_input_bytes <= 4 {
180         return BlockType::Fixed;
181     };
182 
183     let l_freqs = remove_trailing_zeroes(l_freqs, MIN_NUM_LITERALS_AND_LENGTHS);
184     let d_freqs = remove_trailing_zeroes(d_freqs, MIN_NUM_DISTANCES);
185 
186     // The huffman spec allows us to exclude zeroes at the end of the
187     // table of huffman lengths.
188     // Since a frequency of 0 will give an huffman
189     // length of 0. We strip off the trailing zeroes before even
190     // generating the lengths to save some work.
191     // There is however a minimum number of values we have to keep
192     // according to the deflate spec.
193     // TODO: We could probably compute some of this in parallel.
194     huffman_lengths_from_frequency_m(
195         l_freqs,
196         MAX_CODE_LENGTH,
197         &mut length_buffers.leaf_buf,
198         l_lengths,
199     );
200     huffman_lengths_from_frequency_m(
201         d_freqs,
202         MAX_CODE_LENGTH,
203         &mut length_buffers.leaf_buf,
204         d_lengths,
205     );
206 
207     let used_lengths = l_freqs.len();
208     let used_distances = d_freqs.len();
209 
210     // Encode length values
211     let mut freqs = [0u16; 19];
212     encode_lengths_m(
213         l_lengths[..used_lengths]
214             .iter()
215             .chain(&d_lengths[..used_distances]),
216         &mut length_buffers.length_buf,
217         &mut freqs,
218     );
219 
220     // Create huffman lengths for the length/distance code lengths
221     let mut huffman_table_lengths = vec![0; freqs.len()];
222     huffman_lengths_from_frequency_m(
223         &freqs,
224         MAX_HUFFMAN_CODE_LENGTH,
225         &mut length_buffers.leaf_buf,
226         huffman_table_lengths.as_mut_slice(),
227     );
228 
229     // Count how many of these lengths we use.
230     let used_hclens = HUFFMAN_LENGTH_ORDER.len()
231         - HUFFMAN_LENGTH_ORDER
232             .iter()
233             .rev()
234             .take_while(|&&n| huffman_table_lengths[n as usize] == 0)
235             .count();
236 
237     // There has to be at least 4 hclens, so if there isn't, something went wrong.
238     debug_assert!(used_hclens >= 4);
239 
240     // Calculate how many bytes of space this block will take up with the different block types
241     // (excluding the 3-bit block header since it's used in all block types).
242 
243     // Total length of the compressed literals/lengths.
244     let (d_ll_length, s_ll_length) = calculate_block_length(l_freqs, l_lengths, &|c| {
245         num_extra_bits_for_length_code(c.saturating_sub(LENGTH_BITS_START as usize) as u8).into()
246     });
247 
248     // Total length of the compressed distances.
249     let (d_dist_length, s_dist_length) = calculate_block_length(d_freqs, d_lengths, &|c| {
250         num_extra_bits_for_distance_code(c as u8).into()
251     });
252 
253     // Total length of the compressed huffman code lengths.
254     let huff_table_length = calculate_huffman_length(&freqs, &huffman_table_lengths);
255 
256     // For dynamic blocks the huffman tables takes up some extra space.
257     let dynamic_length = d_ll_length
258         + d_dist_length
259         + huff_table_length
260         + (used_hclens as u64 * 3)
261         + u64::from(HLIT_BITS)
262         + u64::from(HDIST_BITS)
263         + u64::from(HCLEN_BITS);
264 
265     // Static blocks don't have any extra header data.
266     let static_length = s_ll_length + s_dist_length;
267 
268     // Calculate how many bits it will take to store the data in uncompressed (stored) block(s).
269     let stored_length = stored_length(num_input_bytes) + stored_padding(pending_bits % 8);
270 
271     let used_length = cmp::min(cmp::min(dynamic_length, static_length), stored_length);
272 
273     // Check if the block is actually compressed. If using a dynamic block
274     // increases the length of the block (for instance if the input data is mostly random or
275     // already compressed), we want to output a stored(uncompressed) block instead to avoid wasting
276     // space.
277     if used_length == static_length {
278         BlockType::Fixed
279     } else if used_length == stored_length {
280         BlockType::Stored
281     } else {
282         BlockType::Dynamic(DynamicBlockHeader {
283             huffman_table_lengths,
284             used_hclens,
285         })
286     }
287 }
288 
289 /// Write the specified Huffman lengths to the bit writer
290 pub fn write_huffman_lengths(
291     header: &DynamicBlockHeader,
292     huffman_table: &HuffmanTable,
293     encoded_lengths: &[EncodedLength],
294     writer: &mut LsbWriter,
295 ) {
296     // Ignore trailing zero lengths as allowed by the deflate spec.
297     let (literal_len_lengths, distance_lengths) = huffman_table.get_lengths();
298     let literal_len_lengths =
299         remove_trailing_zeroes(literal_len_lengths, MIN_NUM_LITERALS_AND_LENGTHS);
300     let distance_lengths = remove_trailing_zeroes(distance_lengths, MIN_NUM_DISTANCES);
301     let huffman_table_lengths = &header.huffman_table_lengths;
302     let used_hclens = header.used_hclens;
303 
304     assert!(literal_len_lengths.len() <= NUM_LITERALS_AND_LENGTHS);
305     assert!(literal_len_lengths.len() >= MIN_NUM_LITERALS_AND_LENGTHS);
306     assert!(distance_lengths.len() <= NUM_DISTANCE_CODES);
307     assert!(distance_lengths.len() >= MIN_NUM_DISTANCES);
308 
309     // Number of length codes - 257.
310     let hlit = (literal_len_lengths.len() - MIN_NUM_LITERALS_AND_LENGTHS) as u16;
311     writer.write_bits(hlit, HLIT_BITS);
312     // Number of distance codes - 1.
313     let hdist = (distance_lengths.len() - MIN_NUM_DISTANCES) as u16;
314     writer.write_bits(hdist, HDIST_BITS);
315 
316     // Number of huffman table lengths - 4.
317     let hclen = used_hclens.saturating_sub(4);
318 
319     // Write HCLEN.
320     // Casting to u16 is safe since the length can never be more than the length of
321     // `HUFFMAN_LENGTH_ORDER` anyhow.
322     writer.write_bits(hclen as u16, HCLEN_BITS);
323 
324     // Write the lengths for the huffman table describing the huffman table
325     // Each length is 3 bits
326     for n in &HUFFMAN_LENGTH_ORDER[..used_hclens] {
327         writer.write_bits(u16::from(huffman_table_lengths[usize::from(*n)]), 3);
328     }
329 
330     // Generate codes for the main huffman table using the lengths we just wrote
331     let mut codes = [0u16; NUM_HUFFMAN_LENGTHS];
332     create_codes_in_place(&mut codes[..], huffman_table_lengths);
333 
334     // Write the actual huffman lengths
335     for v in encoded_lengths {
336         match *v {
337             EncodedLength::Length(n) => {
338                 let (c, l) = (codes[usize::from(n)], huffman_table_lengths[usize::from(n)]);
339                 writer.write_bits(c, l);
340             }
341             EncodedLength::CopyPrevious(n) => {
342                 let (c, l) = (codes[COPY_PREVIOUS], huffman_table_lengths[COPY_PREVIOUS]);
343                 writer.write_bits(c, l);
344                 debug_assert!(n >= 3);
345                 debug_assert!(n <= 6);
346                 writer.write_bits((n - 3).into(), 2);
347             }
348             EncodedLength::RepeatZero3Bits(n) => {
349                 let (c, l) = (
350                     codes[REPEAT_ZERO_3_BITS],
351                     huffman_table_lengths[REPEAT_ZERO_3_BITS],
352                 );
353                 writer.write_bits(c, l);
354                 debug_assert!(n >= 3);
355                 writer.write_bits((n - 3).into(), 3);
356             }
357             EncodedLength::RepeatZero7Bits(n) => {
358                 let (c, l) = (
359                     codes[REPEAT_ZERO_7_BITS],
360                     huffman_table_lengths[REPEAT_ZERO_7_BITS],
361                 );
362                 writer.write_bits(c, l);
363                 debug_assert!(n >= 11);
364                 debug_assert!(n <= 138);
365                 writer.write_bits((n - 11).into(), 7);
366             }
367         }
368     }
369 }
370 
371 #[cfg(test)]
372 mod test {
373     use super::stored_padding;
374     #[test]
375     fn padding() {
376         assert_eq!(stored_padding(0), 5);
377         assert_eq!(stored_padding(1), 4);
378         assert_eq!(stored_padding(2), 3);
379         assert_eq!(stored_padding(3), 2);
380         assert_eq!(stored_padding(4), 1);
381         assert_eq!(stored_padding(5), 0);
382         assert_eq!(stored_padding(6), 7);
383         assert_eq!(stored_padding(7), 6);
384     }
385 }
386