1 // Copyright 2017 Brian Langenberger
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8 
9 //! Traits and implementations for reading or writing Huffman codes
10 //! from or to a stream.
11 
12 #![warn(missing_docs)]
13 
14 use super::BitQueue;
15 use super::Endianness;
16 use std::collections::BTreeMap;
17 use std::fmt;
18 use std::marker::PhantomData;
19 
20 /// A compiled Huffman tree element for use with the `read_huffman` method.
21 /// Returned by `compile_read_tree`.
22 ///
23 /// Compiled read trees are optimized for faster lookup
24 /// and are therefore endian-specific.
25 ///
26 /// In addition, each symbol in the source tree may occur many times
27 /// in the compiled tree.  If symbols require a nontrivial amount of space,
28 /// consider using reference counting so that they may be cloned
29 /// more efficiently.
30 pub enum ReadHuffmanTree<E: Endianness, T: Clone> {
31     /// The final value and new reader state
32     Done(T, u8, u32, PhantomData<E>),
33     /// Another byte is necessary to determine final value
34     Continue(Box<[ReadHuffmanTree<E, T>]>),
35     /// An invalid reader state has been used
36     InvalidState,
37 }
38 
39 /// Given a vector of symbol/code pairs, compiles a Huffman tree
40 /// for reading.
41 ///
42 /// Code must be 0 or 1 bits and are always read from the stream
43 /// from least-significant in the list to most signficant
44 /// (which makes them easier to read for humans).
45 ///
46 /// All possible codes must be assigned some symbol,
47 /// and it is acceptable for the same symbol to occur multiple times.
48 ///
49 /// ## Examples
50 /// ```
51 /// use bitstream_io::huffman::compile_read_tree;
52 /// use bitstream_io::BigEndian;
53 /// assert!(compile_read_tree::<BigEndian,i32>(
54 ///     vec![(1, vec![0]),
55 ///          (2, vec![1, 0]),
56 ///          (3, vec![1, 1])]).is_ok());
57 /// ```
58 ///
59 /// ```
60 /// use std::io::{Read, Cursor};
61 /// use bitstream_io::{BigEndian, BitReader, HuffmanRead};
62 /// use bitstream_io::huffman::compile_read_tree;
63 /// let tree = compile_read_tree(
64 ///     vec![('a', vec![0]),
65 ///          ('b', vec![1, 0]),
66 ///          ('c', vec![1, 1, 0]),
67 ///          ('d', vec![1, 1, 1])]).unwrap();
68 /// let data = [0b10110111];
69 /// let mut cursor = Cursor::new(&data);
70 /// let mut reader = BitReader::endian(&mut cursor, BigEndian);
71 /// assert_eq!(reader.read_huffman(&tree).unwrap(), 'b');
72 /// assert_eq!(reader.read_huffman(&tree).unwrap(), 'c');
73 /// assert_eq!(reader.read_huffman(&tree).unwrap(), 'd');
74 /// ```
compile_read_tree<E, T>( values: Vec<(T, Vec<u8>)>, ) -> Result<Box<[ReadHuffmanTree<E, T>]>, HuffmanTreeError> where E: Endianness, T: Clone,75 pub fn compile_read_tree<E, T>(
76     values: Vec<(T, Vec<u8>)>,
77 ) -> Result<Box<[ReadHuffmanTree<E, T>]>, HuffmanTreeError>
78 where
79     E: Endianness,
80     T: Clone,
81 {
82     let tree = FinalHuffmanTree::new(values)?;
83 
84     let mut result = Vec::with_capacity(256);
85     result.extend((0..256).map(|_| ReadHuffmanTree::InvalidState));
86     let queue = BitQueue::from_value(0, 0);
87     let i = queue.to_state();
88     result[i] = compile_queue(queue, &tree);
89     for bits in 1..8 {
90         for value in 0..(1 << bits) {
91             let queue = BitQueue::from_value(value, bits);
92             let i = queue.to_state();
93             result[i] = compile_queue(queue, &tree);
94         }
95     }
96     assert_eq!(result.len(), 256);
97     Ok(result.into_boxed_slice())
98 }
99 
compile_queue<E, T>( mut queue: BitQueue<E, u8>, tree: &FinalHuffmanTree<T>, ) -> ReadHuffmanTree<E, T> where E: Endianness, T: Clone,100 fn compile_queue<E, T>(
101     mut queue: BitQueue<E, u8>,
102     tree: &FinalHuffmanTree<T>,
103 ) -> ReadHuffmanTree<E, T>
104 where
105     E: Endianness,
106     T: Clone,
107 {
108     match tree {
109         FinalHuffmanTree::Leaf(ref value) => {
110             let len = queue.len();
111             ReadHuffmanTree::Done(value.clone(), queue.value(), len, PhantomData)
112         }
113         FinalHuffmanTree::Tree(ref bit0, ref bit1) => {
114             if queue.is_empty() {
115                 ReadHuffmanTree::Continue(
116                     (0..256)
117                         .map(|byte| compile_queue(BitQueue::from_value(byte as u8, 8), tree))
118                         .collect::<Vec<ReadHuffmanTree<E, T>>>()
119                         .into_boxed_slice(),
120                 )
121             } else if queue.pop(1) == 0 {
122                 compile_queue(queue, bit0)
123             } else {
124                 compile_queue(queue, bit1)
125             }
126         }
127     }
128 }
129 
130 // A complete Huffman tree with no empty nodes
131 enum FinalHuffmanTree<T: Clone> {
132     Leaf(T),
133     Tree(Box<FinalHuffmanTree<T>>, Box<FinalHuffmanTree<T>>),
134 }
135 
136 impl<T: Clone> FinalHuffmanTree<T> {
new(values: Vec<(T, Vec<u8>)>) -> Result<FinalHuffmanTree<T>, HuffmanTreeError>137     fn new(values: Vec<(T, Vec<u8>)>) -> Result<FinalHuffmanTree<T>, HuffmanTreeError> {
138         let mut tree = WipHuffmanTree::new_empty();
139 
140         for (symbol, code) in values {
141             tree.add(code.as_slice(), symbol)?;
142         }
143 
144         tree.into_read_tree()
145     }
146 }
147 
148 // Work-in-progress trees may have empty nodes during construction
149 // but those are not allowed in a finalized tree.
150 // If the user wants some codes to be None or an error symbol of some sort,
151 // those will need to be specified explicitly.
152 enum WipHuffmanTree<T: Clone> {
153     Empty,
154     Leaf(T),
155     Tree(Box<WipHuffmanTree<T>>, Box<WipHuffmanTree<T>>),
156 }
157 
158 impl<T: Clone> WipHuffmanTree<T> {
new_empty() -> WipHuffmanTree<T>159     fn new_empty() -> WipHuffmanTree<T> {
160         WipHuffmanTree::Empty
161     }
162 
new_leaf(value: T) -> WipHuffmanTree<T>163     fn new_leaf(value: T) -> WipHuffmanTree<T> {
164         WipHuffmanTree::Leaf(value)
165     }
166 
new_tree() -> WipHuffmanTree<T>167     fn new_tree() -> WipHuffmanTree<T> {
168         WipHuffmanTree::Tree(Box::new(Self::new_empty()), Box::new(Self::new_empty()))
169     }
170 
into_read_tree(self) -> Result<FinalHuffmanTree<T>, HuffmanTreeError>171     fn into_read_tree(self) -> Result<FinalHuffmanTree<T>, HuffmanTreeError> {
172         match self {
173             WipHuffmanTree::Empty => Err(HuffmanTreeError::MissingLeaf),
174             WipHuffmanTree::Leaf(v) => Ok(FinalHuffmanTree::Leaf(v)),
175             WipHuffmanTree::Tree(zero, one) => {
176                 let zero = zero.into_read_tree()?;
177                 let one = one.into_read_tree()?;
178                 Ok(FinalHuffmanTree::Tree(Box::new(zero), Box::new(one)))
179             }
180         }
181     }
182 
add(&mut self, code: &[u8], symbol: T) -> Result<(), HuffmanTreeError>183     fn add(&mut self, code: &[u8], symbol: T) -> Result<(), HuffmanTreeError> {
184         match self {
185             WipHuffmanTree::Empty => {
186                 if code.is_empty() {
187                     *self = WipHuffmanTree::new_leaf(symbol);
188                     Ok(())
189                 } else {
190                     *self = WipHuffmanTree::new_tree();
191                     self.add(code, symbol)
192                 }
193             }
194             WipHuffmanTree::Leaf(_) => Err(if code.is_empty() {
195                 HuffmanTreeError::DuplicateLeaf
196             } else {
197                 HuffmanTreeError::OrphanedLeaf
198             }),
199             WipHuffmanTree::Tree(ref mut zero, ref mut one) => {
200                 if code.is_empty() {
201                     Err(HuffmanTreeError::DuplicateLeaf)
202                 } else {
203                     match code[0] {
204                         0 => zero.add(&code[1..], symbol),
205                         1 => one.add(&code[1..], symbol),
206                         _ => Err(HuffmanTreeError::InvalidBit),
207                     }
208                 }
209             }
210         }
211     }
212 }
213 
214 /// An error type during Huffman tree compilation.
215 #[derive(PartialEq, Copy, Clone, Debug)]
216 pub enum HuffmanTreeError {
217     /// One of the bits in a Huffman code is not 0 or 1
218     InvalidBit,
219     /// A Huffman code in the specification has no defined symbol
220     MissingLeaf,
221     /// The same Huffman code specifies multiple symbols
222     DuplicateLeaf,
223     /// A Huffman code is the prefix of some longer code
224     OrphanedLeaf,
225 }
226 
227 impl fmt::Display for HuffmanTreeError {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result228     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
229         match *self {
230             HuffmanTreeError::InvalidBit => write!(f, "invalid bit in code"),
231             HuffmanTreeError::MissingLeaf => write!(f, "missing leaf node in specification"),
232             HuffmanTreeError::DuplicateLeaf => write!(f, "duplicate leaf node in specification"),
233             HuffmanTreeError::OrphanedLeaf => write!(f, "orphaned leaf node in specification"),
234         }
235     }
236 }
237 
238 impl std::error::Error for HuffmanTreeError {}
239 
240 /// Given a vector of symbol/code pairs, compiles a Huffman tree
241 /// for writing.
242 ///
243 /// Code must be 0 or 1 bits and are always written to the stream
244 /// from least-significant in the list to most signficant
245 /// (which makes them easier to read for humans).
246 ///
247 /// If the same symbol occurs multiple times, the first code is used.
248 /// Unlike in read trees, not all possible codes need to be
249 /// assigned a symbol.
250 ///
251 /// ## Examples
252 /// ```
253 /// use bitstream_io::huffman::compile_write_tree;
254 /// use bitstream_io::BigEndian;
255 /// assert!(compile_write_tree::<BigEndian,i32>(
256 ///     vec![(1, vec![0]),
257 ///          (2, vec![1, 0]),
258 ///          (3, vec![1, 1])]).is_ok());
259 /// ```
260 ///
261 /// ```
262 /// use std::io::Write;
263 /// use bitstream_io::{BigEndian, BitWriter, HuffmanWrite};
264 /// use bitstream_io::huffman::compile_write_tree;
265 /// let tree = compile_write_tree(
266 ///     vec![('a', vec![0]),
267 ///          ('b', vec![1, 0]),
268 ///          ('c', vec![1, 1, 0]),
269 ///          ('d', vec![1, 1, 1])]).unwrap();
270 /// let mut data = Vec::new();
271 /// {
272 ///     let mut writer = BitWriter::endian(&mut data, BigEndian);
273 ///     writer.write_huffman(&tree, 'b').unwrap();
274 ///     writer.write_huffman(&tree, 'c').unwrap();
275 ///     writer.write_huffman(&tree, 'd').unwrap();
276 /// }
277 /// assert_eq!(data, [0b10110111]);
278 /// ```
compile_write_tree<E, T>( values: Vec<(T, Vec<u8>)>, ) -> Result<WriteHuffmanTree<E, T>, HuffmanTreeError> where E: Endianness, T: Ord + Clone,279 pub fn compile_write_tree<E, T>(
280     values: Vec<(T, Vec<u8>)>,
281 ) -> Result<WriteHuffmanTree<E, T>, HuffmanTreeError>
282 where
283     E: Endianness,
284     T: Ord + Clone,
285 {
286     let mut map = BTreeMap::new();
287 
288     for (symbol, code) in values {
289         let mut encoded = Vec::new();
290         for bits in code.chunks(32) {
291             let mut acc = BitQueue::<E, u32>::new();
292             for bit in bits {
293                 match *bit {
294                     0 => acc.push(1, 0),
295                     1 => acc.push(1, 1),
296                     _ => return Err(HuffmanTreeError::InvalidBit),
297                 }
298             }
299             let len = acc.len();
300             encoded.push((len, acc.value()))
301         }
302         map.entry(symbol)
303             .or_insert_with(|| encoded.into_boxed_slice());
304     }
305 
306     Ok(WriteHuffmanTree {
307         map,
308         phantom: PhantomData,
309     })
310 }
311 
312 /// A compiled Huffman tree for use with the `write_huffman` method.
313 /// Returned by `compiled_write_tree`.
314 pub struct WriteHuffmanTree<E: Endianness, T: Ord> {
315     map: BTreeMap<T, Box<[(u32, u32)]>>,
316     phantom: PhantomData<E>,
317 }
318 
319 impl<E: Endianness, T: Ord + Clone> WriteHuffmanTree<E, T> {
320     /// Returns true if symbol is in tree.
321     #[inline]
has_symbol(&self, symbol: &T) -> bool322     pub fn has_symbol(&self, symbol: &T) -> bool {
323         self.map.contains_key(symbol)
324     }
325 
326     /// Given symbol, returns iterator of
327     /// (bits, value) pairs for writing code.
328     /// Panics if symbol is not found.
329     #[inline]
get(&self, symbol: &T) -> impl Iterator<Item = &(u32, u32)>330     pub fn get(&self, symbol: &T) -> impl Iterator<Item = &(u32, u32)> {
331         self.map[symbol].iter()
332     }
333 }
334