use length_encode::{EncodedLength, encode_lengths_m, huffman_lengths_from_frequency_m, COPY_PREVIOUS, REPEAT_ZERO_3_BITS, REPEAT_ZERO_7_BITS}; use huffman_table::{HuffmanTable, create_codes_in_place, num_extra_bits_for_length_code, num_extra_bits_for_distance_code, NUM_LITERALS_AND_LENGTHS, NUM_DISTANCE_CODES, MAX_CODE_LENGTH, FIXED_CODE_LENGTHS, LENGTH_BITS_START}; use bitstream::LsbWriter; use output_writer::FrequencyType; use stored_block::MAX_STORED_BLOCK_LENGTH; use deflate_state::LengthBuffers; use std::cmp; // The minimum number of literal/length values pub const MIN_NUM_LITERALS_AND_LENGTHS: usize = 257; // The minimum number of distances pub const MIN_NUM_DISTANCES: usize = 1; const NUM_HUFFMAN_LENGTHS: usize = 19; // The output ordering of the lengths for the huffman codes used to encode the lengths // used to build the full huffman tree for length/literal codes. // http://www.gzip.org/zlib/rfc-deflate.html#dyn const HUFFMAN_LENGTH_ORDER: [u8; NUM_HUFFMAN_LENGTHS] = [ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15, ]; // Number of bits used for the values specifying the number of codes const HLIT_BITS: u8 = 5; const HDIST_BITS: u8 = 5; const HCLEN_BITS: u8 = 4; // The longest a huffman code describing another huffman length can be const MAX_HUFFMAN_CODE_LENGTH: usize = 7; // How many bytes (not including padding and the 3-bit block type) the stored block header takes up. const STORED_BLOCK_HEADER_LENGTH: u64 = 4; const BLOCK_MARKER_LENGTH: u8 = 3; /// Creates a new slice from the input slice that stops at the final non-zero value pub fn remove_trailing_zeroes + PartialEq>(input: &[T], min_length: usize) -> &[T] { let num_zeroes = input.iter().rev().take_while(|&a| *a == T::from(0)).count(); &input[0..cmp::max(input.len() - num_zeroes, min_length)] } /// How many extra bits the huffman length code uses to represent a value. fn extra_bits_for_huffman_length_code(code: u8) -> u8 { match code { 16...17 => 3, 18 => 7, _ => 0, } } /// Calculate how many bits the huffman-encoded huffman lengths will use. fn calculate_huffman_length(frequencies: &[FrequencyType], code_lengths: &[u8]) -> u64 { frequencies.iter().zip(code_lengths).enumerate().fold( 0, |acc, (n, (&f, &l))| { acc + (u64::from(f) * (u64::from(l) + u64::from(extra_bits_for_huffman_length_code(n as u8)))) }, ) } /// Calculate how many bits data with the given frequencies will use when compressed with dynamic /// code lengths (first return value) and static code lengths (second return value). /// /// Parameters: /// Frequencies, length of dynamic codes, and a function to get how many extra bits in addition /// to the length of the huffman code the symbol will use. fn calculate_block_length( frequencies: &[FrequencyType], dyn_code_lengths: &[u8], get_num_extra_bits: &F, ) -> (u64, u64) where F: Fn(usize) -> u64, { // Length of data represented by dynamic codes. let mut d_ll_length = 0u64; // length of data represented by static codes. let mut s_ll_length = 0u64; let iter = frequencies .iter() .zip(dyn_code_lengths.iter().zip(FIXED_CODE_LENGTHS.iter())) .enumerate(); // This could maybe be optimised a bit by splitting the iteration of codes using extra bits and // codes not using extra bits, but the extra complexity may not be worth it. for (c, (&f, (&l, &fl))) in iter { // Frequency let f = u64::from(f); // How many extra bits the current code number needs. let extra_bits_for_code = get_num_extra_bits(c); d_ll_length += f * (u64::from(l) + extra_bits_for_code); s_ll_length += f * (u64::from(fl) + extra_bits_for_code); } (d_ll_length, s_ll_length) } /// Get how extra padding bits after a block start header a stored block would use. /// /// # Panics /// Panics if `pending_bits > 8` fn stored_padding(pending_bits: u8) -> u64 { assert!(pending_bits <= 8); let free_space = 8 - pending_bits; if free_space >= BLOCK_MARKER_LENGTH { // There is space in the current byte for the header. free_space - BLOCK_MARKER_LENGTH } else { // The header will require an extra byte. 8 - (BLOCK_MARKER_LENGTH - free_space) }.into() } /// Calculate the number of bits storing the data in stored blocks will take up, excluding the /// first block start code and potential padding bits. As stored blocks have a maximum length, /// (as opposed to fixed and dynamic ones), multiple blocks may have to be utilised. /// /// # Panics /// Panics if `input_bytes` is 0. fn stored_length(input_bytes: u64) -> u64 { // Check how many stored blocks these bytes would take up. // (Integer divison rounding up.) let num_blocks = (input_bytes .checked_sub(1) .expect("Underflow calculating stored block length!") / MAX_STORED_BLOCK_LENGTH as u64) + 1; // The length will be the input length and the headers for each block. (Excluding the start // of block code for the first one) (input_bytes + (STORED_BLOCK_HEADER_LENGTH as u64 * num_blocks) + (num_blocks - 1)) * 8 } pub enum BlockType { Stored, Fixed, Dynamic(DynamicBlockHeader), } /// A struct containing the different data needed to write the header for a dynamic block. /// /// The code lengths are stored directly in the `HuffmanTable` struct. /// TODO: Do the same for other things here. pub struct DynamicBlockHeader { /// Length of the run-length encoding symbols. pub huffman_table_lengths: Vec, /// Number of lengths for values describing the huffman table that encodes the length values /// of the main huffman tables. pub used_hclens: usize, } /// Generate the lengths of the huffman codes we will be using, using the /// frequency of the different symbols/lengths/distances, and determine what block type will give /// the shortest representation. /// TODO: This needs a test pub fn gen_huffman_lengths( l_freqs: &[FrequencyType], d_freqs: &[FrequencyType], num_input_bytes: u64, pending_bits: u8, l_lengths: &mut [u8; 288], d_lengths: &mut [u8; 32], length_buffers: &mut LengthBuffers, ) -> BlockType { // Avoid corner cases and issues if this is called for an empty block. // For blocks this short, a fixed block will be the shortest. // TODO: Find the minimum value it's worth doing calculations for. if num_input_bytes <= 4 { return BlockType::Fixed; }; let l_freqs = remove_trailing_zeroes(l_freqs, MIN_NUM_LITERALS_AND_LENGTHS); let d_freqs = remove_trailing_zeroes(d_freqs, MIN_NUM_DISTANCES); // The huffman spec allows us to exclude zeroes at the end of the // table of huffman lengths. // Since a frequency of 0 will give an huffman // length of 0. We strip off the trailing zeroes before even // generating the lengths to save some work. // There is however a minimum number of values we have to keep // according to the deflate spec. // TODO: We could probably compute some of this in parallel. huffman_lengths_from_frequency_m( l_freqs, MAX_CODE_LENGTH, &mut length_buffers.leaf_buf, l_lengths, ); huffman_lengths_from_frequency_m( d_freqs, MAX_CODE_LENGTH, &mut length_buffers.leaf_buf, d_lengths, ); let used_lengths = l_freqs.len(); let used_distances = d_freqs.len(); // Encode length values let mut freqs = [0u16; 19]; encode_lengths_m( l_lengths[..used_lengths] .iter() .chain(&d_lengths[..used_distances]), &mut length_buffers.length_buf, &mut freqs, ); // Create huffman lengths for the length/distance code lengths let mut huffman_table_lengths = vec![0; freqs.len()]; huffman_lengths_from_frequency_m( &freqs, MAX_HUFFMAN_CODE_LENGTH, &mut length_buffers.leaf_buf, huffman_table_lengths.as_mut_slice(), ); // Count how many of these lengths we use. let used_hclens = HUFFMAN_LENGTH_ORDER.len() - HUFFMAN_LENGTH_ORDER .iter() .rev() .take_while(|&&n| huffman_table_lengths[n as usize] == 0) .count(); // There has to be at least 4 hclens, so if there isn't, something went wrong. debug_assert!(used_hclens >= 4); // Calculate how many bytes of space this block will take up with the different block types // (excluding the 3-bit block header since it's used in all block types). // Total length of the compressed literals/lengths. let (d_ll_length, s_ll_length) = calculate_block_length(l_freqs, l_lengths, &|c| { num_extra_bits_for_length_code(c.saturating_sub(LENGTH_BITS_START as usize) as u8).into() }); // Total length of the compressed distances. let (d_dist_length, s_dist_length) = calculate_block_length(d_freqs, d_lengths, &|c| { num_extra_bits_for_distance_code(c as u8).into() }); // Total length of the compressed huffman code lengths. let huff_table_length = calculate_huffman_length(&freqs, &huffman_table_lengths); // For dynamic blocks the huffman tables takes up some extra space. let dynamic_length = d_ll_length + d_dist_length + huff_table_length + (used_hclens as u64 * 3) + u64::from(HLIT_BITS) + u64::from(HDIST_BITS) + u64::from(HCLEN_BITS); // Static blocks don't have any extra header data. let static_length = s_ll_length + s_dist_length; // Calculate how many bits it will take to store the data in uncompressed (stored) block(s). let stored_length = stored_length(num_input_bytes) + stored_padding(pending_bits % 8); let used_length = cmp::min(cmp::min(dynamic_length, static_length), stored_length); // Check if the block is actually compressed. If using a dynamic block // increases the length of the block (for instance if the input data is mostly random or // already compressed), we want to output a stored(uncompressed) block instead to avoid wasting // space. if used_length == static_length { BlockType::Fixed } else if used_length == stored_length { BlockType::Stored } else { BlockType::Dynamic(DynamicBlockHeader { huffman_table_lengths: huffman_table_lengths, used_hclens: used_hclens, }) } } /// Write the specified huffman lengths to the bit writer pub fn write_huffman_lengths( header: &DynamicBlockHeader, huffman_table: &HuffmanTable, encoded_lengths: &[EncodedLength], writer: &mut LsbWriter, ) { // Ignore trailing zero lengths as allowed by the deflate spec. let (literal_len_lengths, distance_lengths) = huffman_table.get_lengths(); let literal_len_lengths = remove_trailing_zeroes(literal_len_lengths, MIN_NUM_LITERALS_AND_LENGTHS); let distance_lengths = remove_trailing_zeroes(distance_lengths, MIN_NUM_DISTANCES); let huffman_table_lengths = &header.huffman_table_lengths; let used_hclens = header.used_hclens; assert!(literal_len_lengths.len() <= NUM_LITERALS_AND_LENGTHS); assert!(literal_len_lengths.len() >= MIN_NUM_LITERALS_AND_LENGTHS); assert!(distance_lengths.len() <= NUM_DISTANCE_CODES); assert!(distance_lengths.len() >= MIN_NUM_DISTANCES); // Number of length codes - 257. let hlit = (literal_len_lengths.len() - MIN_NUM_LITERALS_AND_LENGTHS) as u16; writer.write_bits(hlit, HLIT_BITS); // Number of distance codes - 1. let hdist = (distance_lengths.len() - MIN_NUM_DISTANCES) as u16; writer.write_bits(hdist, HDIST_BITS); // Number of huffman table lengths - 4. let hclen = used_hclens.saturating_sub(4); // Write HCLEN. // Casting to u16 is safe since the length can never be more than the length of // `HUFFMAN_LENGTH_ORDER` anyhow. writer.write_bits(hclen as u16, HCLEN_BITS); // Write the lengths for the huffman table describing the huffman table // Each length is 3 bits for n in &HUFFMAN_LENGTH_ORDER[..used_hclens] { writer.write_bits(huffman_table_lengths[usize::from(*n)] as u16, 3); } // Generate codes for the main huffman table using the lengths we just wrote let mut codes = [0u16; NUM_HUFFMAN_LENGTHS]; create_codes_in_place(&mut codes[..], huffman_table_lengths); // Write the actual huffman lengths for v in encoded_lengths { match *v { EncodedLength::Length(n) => { let (c, l) = (codes[usize::from(n)], huffman_table_lengths[usize::from(n)]); writer.write_bits(c, l); } EncodedLength::CopyPrevious(n) => { let (c, l) = (codes[COPY_PREVIOUS], huffman_table_lengths[COPY_PREVIOUS]); writer.write_bits(c, l); debug_assert!(n >= 3); debug_assert!(n <= 6); writer.write_bits((n - 3).into(), 2); } EncodedLength::RepeatZero3Bits(n) => { let (c, l) = ( codes[REPEAT_ZERO_3_BITS], huffman_table_lengths[REPEAT_ZERO_3_BITS], ); writer.write_bits(c, l); debug_assert!(n >= 3); writer.write_bits((n - 3).into(), 3); } EncodedLength::RepeatZero7Bits(n) => { let (c, l) = ( codes[REPEAT_ZERO_7_BITS], huffman_table_lengths[REPEAT_ZERO_7_BITS], ); writer.write_bits(c, l); debug_assert!(n >= 11); debug_assert!(n <= 138); writer.write_bits((n - 11).into(), 7); } } } } #[cfg(test)] mod test { use super::stored_padding; #[test] fn padding() { assert_eq!(stored_padding(0), 5); assert_eq!(stored_padding(1), 4); assert_eq!(stored_padding(2), 3); assert_eq!(stored_padding(3), 2); assert_eq!(stored_padding(4), 1); assert_eq!(stored_padding(5), 0); assert_eq!(stored_padding(6), 7); assert_eq!(stored_padding(7), 6); } }