1 use length_encode::{EncodedLength, encode_lengths_m, huffman_lengths_from_frequency_m,
2                     COPY_PREVIOUS, REPEAT_ZERO_3_BITS, REPEAT_ZERO_7_BITS};
3 use huffman_table::{HuffmanTable, create_codes_in_place, num_extra_bits_for_length_code,
4                     num_extra_bits_for_distance_code, NUM_LITERALS_AND_LENGTHS,
5                     NUM_DISTANCE_CODES, MAX_CODE_LENGTH, FIXED_CODE_LENGTHS, LENGTH_BITS_START};
6 use bitstream::LsbWriter;
7 use output_writer::FrequencyType;
8 use stored_block::MAX_STORED_BLOCK_LENGTH;
9 use deflate_state::LengthBuffers;
10 
11 use std::cmp;
12 
13 // The minimum number of literal/length values
14 pub const MIN_NUM_LITERALS_AND_LENGTHS: usize = 257;
15 // The minimum number of distances
16 pub const MIN_NUM_DISTANCES: usize = 1;
17 
18 const NUM_HUFFMAN_LENGTHS: usize = 19;
19 
20 // The output ordering of the lengths for the huffman codes used to encode the lengths
21 // used to build the full huffman tree for length/literal codes.
22 // http://www.gzip.org/zlib/rfc-deflate.html#dyn
23 const HUFFMAN_LENGTH_ORDER: [u8; NUM_HUFFMAN_LENGTHS] = [
24     16,
25     17,
26     18,
27     0,
28     8,
29     7,
30     9,
31     6,
32     10,
33     5,
34     11,
35     4,
36     12,
37     3,
38     13,
39     2,
40     14,
41     1,
42     15,
43 ];
44 
45 // Number of bits used for the values specifying the number of codes
46 const HLIT_BITS: u8 = 5;
47 const HDIST_BITS: u8 = 5;
48 const HCLEN_BITS: u8 = 4;
49 
50 // The longest a huffman code describing another huffman length can be
51 const MAX_HUFFMAN_CODE_LENGTH: usize = 7;
52 
53 // How many bytes (not including padding and the 3-bit block type) the stored block header takes up.
54 const STORED_BLOCK_HEADER_LENGTH: u64 = 4;
55 const BLOCK_MARKER_LENGTH: u8 = 3;
56 
57 /// Creates a new slice from the input slice that stops at the final non-zero value
remove_trailing_zeroes<T: From<u8> + PartialEq>(input: &[T], min_length: usize) -> &[T]58 pub fn remove_trailing_zeroes<T: From<u8> + PartialEq>(input: &[T], min_length: usize) -> &[T] {
59     let num_zeroes = input.iter().rev().take_while(|&a| *a == T::from(0)).count();
60     &input[0..cmp::max(input.len() - num_zeroes, min_length)]
61 }
62 
63 /// How many extra bits the huffman length code uses to represent a value.
extra_bits_for_huffman_length_code(code: u8) -> u864 fn extra_bits_for_huffman_length_code(code: u8) -> u8 {
65     match code {
66         16...17 => 3,
67         18 => 7,
68         _ => 0,
69     }
70 }
71 
72 /// Calculate how many bits the huffman-encoded huffman lengths will use.
calculate_huffman_length(frequencies: &[FrequencyType], code_lengths: &[u8]) -> u6473 fn calculate_huffman_length(frequencies: &[FrequencyType], code_lengths: &[u8]) -> u64 {
74     frequencies.iter().zip(code_lengths).enumerate().fold(
75         0,
76         |acc, (n, (&f, &l))| {
77             acc +
78                 (u64::from(f) *
79                      (u64::from(l) + u64::from(extra_bits_for_huffman_length_code(n as u8))))
80         },
81     )
82 }
83 
84 /// Calculate how many bits data with the given frequencies will use when compressed with dynamic
85 /// code lengths (first return value) and static code lengths (second return value).
86 ///
87 /// Parameters:
88 /// Frequencies, length of dynamic codes, and a function to get how many extra bits in addition
89 /// to the length of the huffman code the symbol will use.
calculate_block_length<F>( frequencies: &[FrequencyType], dyn_code_lengths: &[u8], get_num_extra_bits: &F, ) -> (u64, u64) where F: Fn(usize) -> u64,90 fn calculate_block_length<F>(
91     frequencies: &[FrequencyType],
92     dyn_code_lengths: &[u8],
93     get_num_extra_bits: &F,
94 ) -> (u64, u64)
95 where
96     F: Fn(usize) -> u64,
97 {
98     // Length of data represented by dynamic codes.
99     let mut d_ll_length = 0u64;
100     // length of data represented by static codes.
101     let mut s_ll_length = 0u64;
102 
103     let iter = frequencies
104         .iter()
105         .zip(dyn_code_lengths.iter().zip(FIXED_CODE_LENGTHS.iter()))
106         .enumerate();
107 
108     // This could maybe be optimised a bit by splitting the iteration of codes using extra bits and
109     // codes not using extra bits, but the extra complexity may not be worth it.
110     for (c, (&f, (&l, &fl))) in iter {
111         // Frequency
112         let f = u64::from(f);
113         // How many extra bits the current code number needs.
114         let extra_bits_for_code = get_num_extra_bits(c);
115 
116         d_ll_length += f * (u64::from(l) + extra_bits_for_code);
117         s_ll_length += f * (u64::from(fl) + extra_bits_for_code);
118 
119     }
120 
121     (d_ll_length, s_ll_length)
122 }
123 
124 /// Get how extra padding bits after a block start header a stored block would use.
125 ///
126 /// # Panics
127 /// Panics if `pending_bits > 8`
stored_padding(pending_bits: u8) -> u64128 fn stored_padding(pending_bits: u8) -> u64 {
129     assert!(pending_bits <= 8);
130     let free_space = 8 - pending_bits;
131     if free_space >= BLOCK_MARKER_LENGTH {
132         // There is space in the current byte for the header.
133         free_space - BLOCK_MARKER_LENGTH
134     } else {
135         // The header will require an extra byte.
136         8 - (BLOCK_MARKER_LENGTH - free_space)
137     }.into()
138 }
139 
140 /// Calculate the number of bits storing the data in stored blocks will take up, excluding the
141 /// first block start code and potential padding bits. As stored blocks have a maximum length,
142 /// (as opposed to fixed and dynamic ones), multiple blocks may have to be utilised.
143 ///
144 /// # Panics
145 /// Panics if `input_bytes` is 0.
stored_length(input_bytes: u64) -> u64146 fn stored_length(input_bytes: u64) -> u64 {
147     // Check how many stored blocks these bytes would take up.
148     // (Integer divison rounding up.)
149     let num_blocks = (input_bytes
150                           .checked_sub(1)
151                           .expect("Underflow calculating stored block length!") /
152                           MAX_STORED_BLOCK_LENGTH as u64) + 1;
153     // The length will be the input length and the headers for each block. (Excluding the start
154     // of block code for the first one)
155     (input_bytes + (STORED_BLOCK_HEADER_LENGTH as u64 * num_blocks) + (num_blocks - 1)) * 8
156 }
157 
158 pub enum BlockType {
159     Stored,
160     Fixed,
161     Dynamic(DynamicBlockHeader),
162 }
163 
164 /// A struct containing the different data needed to write the header for a dynamic block.
165 ///
166 /// The code lengths are stored directly in the `HuffmanTable` struct.
167 /// TODO: Do the same for other things here.
168 pub struct DynamicBlockHeader {
169     /// Length of the run-length encoding symbols.
170     pub huffman_table_lengths: Vec<u8>,
171     /// Number of lengths for values describing the huffman table that encodes the length values
172     /// of the main huffman tables.
173     pub used_hclens: usize,
174 }
175 
176 /// Generate the lengths of the huffman codes we will be using, using the
177 /// frequency of the different symbols/lengths/distances, and determine what block type will give
178 /// the shortest representation.
179 /// TODO: This needs a test
gen_huffman_lengths( l_freqs: &[FrequencyType], d_freqs: &[FrequencyType], num_input_bytes: u64, pending_bits: u8, l_lengths: &mut [u8; 288], d_lengths: &mut [u8; 32], length_buffers: &mut LengthBuffers, ) -> BlockType180 pub fn gen_huffman_lengths(
181     l_freqs: &[FrequencyType],
182     d_freqs: &[FrequencyType],
183     num_input_bytes: u64,
184     pending_bits: u8,
185     l_lengths: &mut [u8; 288],
186     d_lengths: &mut [u8; 32],
187     length_buffers: &mut LengthBuffers,
188 ) -> BlockType {
189     // Avoid corner cases and issues if this is called for an empty block.
190     // For blocks this short, a fixed block will be the shortest.
191     // TODO: Find the minimum value it's worth doing calculations for.
192     if num_input_bytes <= 4 {
193         return BlockType::Fixed;
194     };
195 
196     let l_freqs = remove_trailing_zeroes(l_freqs, MIN_NUM_LITERALS_AND_LENGTHS);
197     let d_freqs = remove_trailing_zeroes(d_freqs, MIN_NUM_DISTANCES);
198 
199     // The huffman spec allows us to exclude zeroes at the end of the
200     // table of huffman lengths.
201     // Since a frequency of 0 will give an huffman
202     // length of 0. We strip off the trailing zeroes before even
203     // generating the lengths to save some work.
204     // There is however a minimum number of values we have to keep
205     // according to the deflate spec.
206     // TODO: We could probably compute some of this in parallel.
207     huffman_lengths_from_frequency_m(
208         l_freqs,
209         MAX_CODE_LENGTH,
210         &mut length_buffers.leaf_buf,
211         l_lengths,
212     );
213     huffman_lengths_from_frequency_m(
214         d_freqs,
215         MAX_CODE_LENGTH,
216         &mut length_buffers.leaf_buf,
217         d_lengths,
218     );
219 
220 
221     let used_lengths = l_freqs.len();
222     let used_distances = d_freqs.len();
223 
224     // Encode length values
225     let mut freqs = [0u16; 19];
226     encode_lengths_m(
227         l_lengths[..used_lengths]
228             .iter()
229             .chain(&d_lengths[..used_distances]),
230         &mut length_buffers.length_buf,
231         &mut freqs,
232     );
233 
234     // Create huffman lengths for the length/distance code lengths
235     let mut huffman_table_lengths = vec![0; freqs.len()];
236     huffman_lengths_from_frequency_m(
237         &freqs,
238         MAX_HUFFMAN_CODE_LENGTH,
239         &mut length_buffers.leaf_buf,
240         huffman_table_lengths.as_mut_slice(),
241     );
242 
243     // Count how many of these lengths we use.
244     let used_hclens = HUFFMAN_LENGTH_ORDER.len() -
245         HUFFMAN_LENGTH_ORDER
246             .iter()
247             .rev()
248             .take_while(|&&n| huffman_table_lengths[n as usize] == 0)
249             .count();
250 
251     // There has to be at least 4 hclens, so if there isn't, something went wrong.
252     debug_assert!(used_hclens >= 4);
253 
254     // Calculate how many bytes of space this block will take up with the different block types
255     // (excluding the 3-bit block header since it's used in all block types).
256 
257     // Total length of the compressed literals/lengths.
258     let (d_ll_length, s_ll_length) = calculate_block_length(l_freqs, l_lengths, &|c| {
259         num_extra_bits_for_length_code(c.saturating_sub(LENGTH_BITS_START as usize) as u8).into()
260     });
261 
262     // Total length of the compressed distances.
263     let (d_dist_length, s_dist_length) = calculate_block_length(d_freqs, d_lengths, &|c| {
264         num_extra_bits_for_distance_code(c as u8).into()
265     });
266 
267     // Total length of the compressed huffman code lengths.
268     let huff_table_length = calculate_huffman_length(&freqs, &huffman_table_lengths);
269 
270     // For dynamic blocks the huffman tables takes up some extra space.
271     let dynamic_length = d_ll_length + d_dist_length + huff_table_length +
272         (used_hclens as u64 * 3) + u64::from(HLIT_BITS) +
273         u64::from(HDIST_BITS) + u64::from(HCLEN_BITS);
274 
275     // Static blocks don't have any extra header data.
276     let static_length = s_ll_length + s_dist_length;
277 
278     // Calculate how many bits it will take to store the data in uncompressed (stored) block(s).
279     let stored_length = stored_length(num_input_bytes) + stored_padding(pending_bits % 8);
280 
281     let used_length = cmp::min(cmp::min(dynamic_length, static_length), stored_length);
282 
283     // Check if the block is actually compressed. If using a dynamic block
284     // increases the length of the block (for instance if the input data is mostly random or
285     // already compressed), we want to output a stored(uncompressed) block instead to avoid wasting
286     // space.
287     if used_length == static_length {
288         BlockType::Fixed
289     } else if used_length == stored_length {
290         BlockType::Stored
291     } else {
292         BlockType::Dynamic(DynamicBlockHeader {
293             huffman_table_lengths: huffman_table_lengths,
294             used_hclens: used_hclens,
295         })
296     }
297 }
298 
299 /// Write the specified huffman lengths to the bit writer
write_huffman_lengths( header: &DynamicBlockHeader, huffman_table: &HuffmanTable, encoded_lengths: &[EncodedLength], writer: &mut LsbWriter, )300 pub fn write_huffman_lengths(
301     header: &DynamicBlockHeader,
302     huffman_table: &HuffmanTable,
303     encoded_lengths: &[EncodedLength],
304     writer: &mut LsbWriter,
305 ) {
306     // Ignore trailing zero lengths as allowed by the deflate spec.
307     let (literal_len_lengths, distance_lengths) = huffman_table.get_lengths();
308     let literal_len_lengths =
309         remove_trailing_zeroes(literal_len_lengths, MIN_NUM_LITERALS_AND_LENGTHS);
310     let distance_lengths = remove_trailing_zeroes(distance_lengths, MIN_NUM_DISTANCES);
311     let huffman_table_lengths = &header.huffman_table_lengths;
312     let used_hclens = header.used_hclens;
313 
314     assert!(literal_len_lengths.len() <= NUM_LITERALS_AND_LENGTHS);
315     assert!(literal_len_lengths.len() >= MIN_NUM_LITERALS_AND_LENGTHS);
316     assert!(distance_lengths.len() <= NUM_DISTANCE_CODES);
317     assert!(distance_lengths.len() >= MIN_NUM_DISTANCES);
318 
319     // Number of length codes - 257.
320     let hlit = (literal_len_lengths.len() - MIN_NUM_LITERALS_AND_LENGTHS) as u16;
321     writer.write_bits(hlit, HLIT_BITS);
322     // Number of distance codes - 1.
323     let hdist = (distance_lengths.len() - MIN_NUM_DISTANCES) as u16;
324     writer.write_bits(hdist, HDIST_BITS);
325 
326     // Number of huffman table lengths - 4.
327     let hclen = used_hclens.saturating_sub(4);
328 
329     // Write HCLEN.
330     // Casting to u16 is safe since the length can never be more than the length of
331     // `HUFFMAN_LENGTH_ORDER` anyhow.
332     writer.write_bits(hclen as u16, HCLEN_BITS);
333 
334     // Write the lengths for the huffman table describing the huffman table
335     // Each length is 3 bits
336     for n in &HUFFMAN_LENGTH_ORDER[..used_hclens] {
337         writer.write_bits(huffman_table_lengths[usize::from(*n)] as u16, 3);
338     }
339 
340     // Generate codes for the main huffman table using the lengths we just wrote
341     let mut codes = [0u16; NUM_HUFFMAN_LENGTHS];
342     create_codes_in_place(&mut codes[..], huffman_table_lengths);
343 
344     // Write the actual huffman lengths
345     for v in encoded_lengths {
346         match *v {
347             EncodedLength::Length(n) => {
348                 let (c, l) = (codes[usize::from(n)], huffman_table_lengths[usize::from(n)]);
349                 writer.write_bits(c, l);
350             }
351             EncodedLength::CopyPrevious(n) => {
352                 let (c, l) = (codes[COPY_PREVIOUS], huffman_table_lengths[COPY_PREVIOUS]);
353                 writer.write_bits(c, l);
354                 debug_assert!(n >= 3);
355                 debug_assert!(n <= 6);
356                 writer.write_bits((n - 3).into(), 2);
357             }
358             EncodedLength::RepeatZero3Bits(n) => {
359                 let (c, l) = (
360                     codes[REPEAT_ZERO_3_BITS],
361                     huffman_table_lengths[REPEAT_ZERO_3_BITS],
362                 );
363                 writer.write_bits(c, l);
364                 debug_assert!(n >= 3);
365                 writer.write_bits((n - 3).into(), 3);
366             }
367             EncodedLength::RepeatZero7Bits(n) => {
368                 let (c, l) = (
369                     codes[REPEAT_ZERO_7_BITS],
370                     huffman_table_lengths[REPEAT_ZERO_7_BITS],
371                 );
372                 writer.write_bits(c, l);
373                 debug_assert!(n >= 11);
374                 debug_assert!(n <= 138);
375                 writer.write_bits((n - 11).into(), 7);
376             }
377         }
378     }
379 }
380 
381 
382 #[cfg(test)]
383 mod test {
384     use super::stored_padding;
385     #[test]
padding()386     fn padding() {
387         assert_eq!(stored_padding(0), 5);
388         assert_eq!(stored_padding(1), 4);
389         assert_eq!(stored_padding(2), 3);
390         assert_eq!(stored_padding(3), 2);
391         assert_eq!(stored_padding(4), 1);
392         assert_eq!(stored_padding(5), 0);
393         assert_eq!(stored_padding(6), 7);
394         assert_eq!(stored_padding(7), 6);
395     }
396 }
397