1 //! This modules provides an implementation of the Lempel–Ziv–Welch Compression Algorithm
2 
3 // Note: This implementation borrows heavily from the work of Julius Pettersson
4 // See http://www.cplusplus.com/articles/iL18T05o/ for his extensive explanations
5 // and a C++ implementatation
6 
7 use std::io;
8 use std::io::{Read, Write};
9 
10 use bitstream::{Bits, BitReader, BitWriter};
11 
12 const MAX_CODESIZE: u8 = 12;
13 const MAX_ENTRIES: usize = 1 << MAX_CODESIZE as usize;
14 
15 /// Alias for a LZW code point
16 type Code = u16;
17 
18 /// Decoding dictionary.
19 ///
20 /// It is not generic due to current limitations of Rust
21 /// Inspired by http://www.cplusplus.com/articles/iL18T05o/
22 #[derive(Debug)]
23 struct DecodingDict {
24     min_size: u8,
25     table: Vec<(Option<Code>, u8)>,
26     buffer: Vec<u8>,
27 }
28 
29 impl DecodingDict {
30     /// Creates a new dict
new(min_size: u8) -> DecodingDict31     fn new(min_size: u8) -> DecodingDict {
32         DecodingDict {
33             min_size: min_size,
34             table: Vec::with_capacity(512),
35             buffer: Vec::with_capacity((1 << MAX_CODESIZE as usize) - 1)
36         }
37     }
38 
39     /// Resets the dictionary
reset(&mut self)40     fn reset(&mut self) {
41         self.table.clear();
42         for i in 0..(1u16 << self.min_size as usize) {
43             self.table.push((None, i as u8));
44         }
45     }
46 
47     /// Inserts a value into the dict
48     #[inline(always)]
push(&mut self, key: Option<Code>, value: u8)49     fn push(&mut self, key: Option<Code>, value: u8) {
50         self.table.push((key, value))
51     }
52 
53     /// Reconstructs the data for the corresponding code
reconstruct(&mut self, code: Option<Code>) -> io::Result<&[u8]>54     fn reconstruct(&mut self, code: Option<Code>) -> io::Result<&[u8]> {
55         self.buffer.clear();
56         let mut code = code;
57         let mut cha;
58         // Check the first access more thoroughly since a bad code
59         // could occur if the data is malformed
60         if let Some(k) = code {
61             match self.table.get(k as usize) {
62                 Some(&(code_, cha_)) => {
63                     code = code_;
64                     cha = cha_;
65                 }
66                 None => return Err(io::Error::new(
67                     io::ErrorKind::InvalidInput,
68                     &*format!("Invalid code {:X}, expected code <= {:X}", k, self.table.len())
69                 ))
70             }
71             self.buffer.push(cha);
72         }
73         while let Some(k) = code {
74             if self.buffer.len() >= MAX_ENTRIES {
75                 return Err(io::Error::new(
76                     io::ErrorKind::InvalidInput,
77                     "Invalid code sequence. Cycle in decoding table."
78                 ))
79             }
80             //(code, cha) = self.table[k as usize];
81             // Note: This could possibly be replaced with an unchecked array access if
82             //  - value is asserted to be < self.next_code() in push
83             //  - min_size is asserted to be < MAX_CODESIZE
84             let entry = self.table[k as usize]; code = entry.0; cha = entry.1;
85             self.buffer.push(cha);
86         }
87         self.buffer.reverse();
88         Ok(&self.buffer)
89     }
90 
91     /// Returns the buffer constructed by the last reconstruction
92     #[inline(always)]
buffer(&self) -> &[u8]93     fn buffer(&self) -> &[u8] {
94         &self.buffer
95     }
96 
97     /// Number of entries in the dictionary
98     #[inline(always)]
next_code(&self) -> u1699     fn next_code(&self) -> u16 {
100         self.table.len() as u16
101     }
102 }
103 
104 macro_rules! define_decoder_struct {
105     {$(
106         $name:ident, $offset:expr, #[$doc:meta];
107     )*} => {
108 
109 $( // START struct definition
110 
111 #[$doc]
112 ///
113 /// The maximum supported code size is 16 bits. The decoder assumes two
114 /// special code word to be present in the stream:
115 ///
116 ///  * `CLEAR_CODE == 1 << min_code_size`
117 ///  * `END_CODE   == CLEAR_CODE + 1`
118 ///
119 ///Furthermore the decoder expects the stream to start with a `CLEAR_CODE`. This
120 /// corresponds to the implementation needed for en- and decoding GIF and TIFF files.
121 #[derive(Debug)]
122 pub struct $name<R: BitReader> {
123     r: R,
124     prev: Option<Code>,
125     table: DecodingDict,
126     buf: [u8; 1],
127     code_size: u8,
128     min_code_size: u8,
129     clear_code: Code,
130     end_code: Code,
131 }
132 
133 impl<R> $name<R> where R: BitReader {
134     /// Creates a new LZW decoder.
135     pub fn new(reader: R, min_code_size: u8) -> $name<R> {
136         $name {
137             r: reader,
138             prev: None,
139             table: DecodingDict::new(min_code_size),
140             buf: [0; 1],
141             code_size: min_code_size + 1,
142             min_code_size: min_code_size,
143             clear_code: 1 << min_code_size,
144             end_code: (1 << min_code_size) + 1,
145         }
146     }
147 
148     /// Tries to obtain and decode a code word from `bytes`.
149     ///
150     /// Returns the number of bytes that have been consumed from `bytes`. An empty
151     /// slice does not indicate `EOF`.
152     pub fn decode_bytes(&mut self, bytes: &[u8]) -> io::Result<(usize, &[u8])> {
153         Ok(match self.r.read_bits(bytes, self.code_size) {
154             Bits::Some(consumed, code) => {
155                 (consumed, if code == self.clear_code {
156                     self.table.reset();
157                     self.table.push(None, 0); // clear code
158                     self.table.push(None, 0); // end code
159                     self.code_size = self.min_code_size + 1;
160                     self.prev = None;
161                     &[]
162                 } else if code == self.end_code {
163                     &[]
164                 } else {
165                     let next_code = self.table.next_code();
166                     if code > next_code {
167                         return Err(io::Error::new(
168                             io::ErrorKind::InvalidInput,
169                             &*format!("Invalid code {:X}, expected code <= {:X}",
170                                       code,
171                                       next_code
172                             )
173                         ))
174                     }
175                     let prev = self.prev;
176                     let result = if prev.is_none() {
177                         self.buf = [code as u8];
178                         &self.buf[..]
179                     } else {
180                         let data = if code == next_code {
181                             let cha = try!(self.table.reconstruct(prev))[0];
182                             self.table.push(prev, cha);
183                             try!(self.table.reconstruct(Some(code)))
184                         } else if code < next_code {
185                             let cha = try!(self.table.reconstruct(Some(code)))[0];
186                             self.table.push(prev, cha);
187                             self.table.buffer()
188                         } else {
189                             // code > next_code is already tested a few lines earlier
190                             unreachable!()
191                         };
192                         data
193                     };
194                     if next_code == (1 << self.code_size as usize) - 1 - $offset
195                        && self.code_size < MAX_CODESIZE {
196                         self.code_size += 1;
197                     }
198                     self.prev = Some(code);
199                     result
200                 })
201             },
202             Bits::None(consumed) => {
203                 (consumed, &[])
204             }
205         })
206     }
207 }
208 
209 )* // END struct definition
210 
211     }
212 }
213 
214 define_decoder_struct!{
215     Decoder, 0, #[doc = "Decoder for a LZW compressed stream (this algorithm is used for GIF files)."];
216     DecoderEarlyChange, 1, #[doc = "Decoder for a LZW compressed stream using an “early change” algorithm (used in TIFF files)."];
217 }
218 
219 struct Node {
220     prefix: Option<Code>,
221     c: u8,
222     left: Option<Code>,
223     right: Option<Code>,
224 }
225 
226 impl Node {
227     #[inline(always)]
new(c: u8) -> Node228     fn new(c: u8) -> Node {
229         Node {
230             prefix: None,
231             c: c,
232             left: None,
233             right: None
234         }
235     }
236 }
237 
238 struct EncodingDict {
239     table: Vec<Node>,
240     min_size: u8,
241 
242 }
243 
244 /// Encoding dictionary based on a binary tree
245 impl EncodingDict {
new(min_size: u8) -> EncodingDict246     fn new(min_size: u8) -> EncodingDict {
247         let mut this = EncodingDict {
248             table: Vec::with_capacity(MAX_ENTRIES),
249             min_size: min_size
250         };
251         this.reset();
252         this
253     }
254 
reset(&mut self)255     fn reset(&mut self) {
256         self.table.clear();
257         for i in 0 .. (1u16 << self.min_size as usize) {
258             self.push_node(Node::new(i as u8));
259         }
260     }
261 
262     #[inline(always)]
push_node(&mut self, node: Node)263     fn push_node(&mut self, node: Node) {
264         self.table.push(node)
265     }
266 
267     #[inline(always)]
clear_code(&self) -> Code268     fn clear_code(&self) -> Code {
269         1u16 << self.min_size
270     }
271 
272     #[inline(always)]
end_code(&self) -> Code273     fn end_code(&self) -> Code {
274         self.clear_code() + 1
275     }
276 
277     // Searches for a new prefix
search_and_insert(&mut self, i: Option<Code>, c: u8) -> Option<Code>278     fn search_and_insert(&mut self, i: Option<Code>, c: u8) -> Option<Code> {
279         if let Some(i) = i.map(|v| v as usize) {
280             let table_size = self.table.len() as Code;
281             if let Some(mut j) = self.table[i].prefix {
282                 loop {
283                     let entry = &mut self.table[j as usize];
284                     if c < entry.c {
285                         if let Some(k) = entry.left {
286                             j = k
287                         } else {
288                             entry.left = Some(table_size);
289                             break
290                         }
291                     } else if c > entry.c {
292                         if let Some(k) = entry.right {
293                             j = k
294                         } else {
295                             entry.right = Some(table_size);
296                             break
297                         }
298                     } else {
299                         return Some(j)
300                     }
301                 }
302             } else {
303                 self.table[i].prefix = Some(table_size);
304             }
305             self.table.push(Node::new(c));
306             None
307         } else {
308             Some(self.search_initials(c as Code))
309         }
310     }
311 
next_code(&self) -> usize312     fn next_code(&self) -> usize {
313         self.table.len()
314     }
315 
search_initials(&self, i: Code) -> Code316     fn search_initials(&self, i: Code) -> Code {
317         self.table[i as usize].c as Code
318     }
319 }
320 
321 /// Convenience function that reads and compresses all bytes from `R`.
encode<R, W>(r: R, mut w: W, min_code_size: u8) -> io::Result<()> where R: Read, W: BitWriter322 pub fn encode<R, W>(r: R, mut w: W, min_code_size: u8) -> io::Result<()>
323 where R: Read, W: BitWriter {
324     let mut dict = EncodingDict::new(min_code_size);
325     dict.push_node(Node::new(0)); // clear code
326     dict.push_node(Node::new(0)); // end code
327     let mut code_size = min_code_size + 1;
328     let mut i = None;
329     // gif spec: first clear code
330     try!(w.write_bits(dict.clear_code(), code_size));
331     let mut r = r.bytes();
332     while let Some(Ok(c)) = r.next() {
333         let prev = i;
334         i = dict.search_and_insert(prev, c);
335         if i.is_none() {
336             if let Some(code) = prev {
337                 try!(w.write_bits(code, code_size));
338             }
339             i = Some(dict.search_initials(c as Code))
340         }
341         // There is a hit: do not write out code but continue
342         let next_code = dict.next_code();
343         if next_code > (1 << code_size as usize)
344            && code_size < MAX_CODESIZE {
345             code_size += 1;
346         }
347         if next_code > MAX_ENTRIES {
348             dict.reset();
349             dict.push_node(Node::new(0)); // clear code
350             dict.push_node(Node::new(0)); // end code
351             try!(w.write_bits(dict.clear_code(), code_size));
352             code_size = min_code_size + 1;
353         }
354 
355     }
356     if let Some(code) = i {
357         try!(w.write_bits(code, code_size));
358     }
359     try!(w.write_bits(dict.end_code(), code_size));
360     try!(w.flush());
361     Ok(())
362 }
363 
364 /// LZW encoder using the algorithm of GIF files.
365 pub struct Encoder<W: BitWriter> {
366     w: W,
367     dict: EncodingDict,
368     min_code_size: u8,
369     code_size: u8,
370     i: Option<Code>
371 }
372 
373 impl<W: BitWriter> Encoder<W> {
374     /// Creates a new LZW encoder.
375     ///
376     /// **Note**: If `min_code_size < 8` then `Self::encode_bytes` might panic when
377     /// the supplied data containts values that exceed `1 << min_code_size`.
new(mut w: W, min_code_size: u8) -> io::Result<Encoder<W>>378     pub fn new(mut w: W, min_code_size: u8) -> io::Result<Encoder<W>> {
379         let mut dict = EncodingDict::new(min_code_size);
380         dict.push_node(Node::new(0)); // clear code
381         dict.push_node(Node::new(0)); // end code
382         let code_size = min_code_size + 1;
383         try!(w.write_bits(dict.clear_code(), code_size));
384         Ok(Encoder {
385             w: w,
386             dict: dict,
387             min_code_size: min_code_size,
388             code_size: code_size,
389             i: None
390         })
391     }
392 
393     /// Compresses `bytes` and writes the result into the writer.
394     ///
395     /// ## Panics
396     ///
397     /// This function might panic if any of the input bytes exceeds `1 << min_code_size`.
398     /// This cannot happen if `min_code_size >= 8`.
encode_bytes(&mut self, bytes: &[u8]) -> io::Result<()>399     pub fn encode_bytes(&mut self, bytes: &[u8]) -> io::Result<()> {
400         let w = &mut self.w;
401         let dict = &mut self.dict;
402         let code_size = &mut self.code_size;
403         let i = &mut self.i;
404         for &c in bytes {
405             let prev = *i;
406             *i = dict.search_and_insert(prev, c);
407             if i.is_none() {
408                 if let Some(code) = prev {
409                     try!(w.write_bits(code, *code_size));
410                 }
411                 *i = Some(dict.search_initials(c as Code))
412             }
413             // There is a hit: do not write out code but continue
414             let next_code = dict.next_code();
415             if next_code > (1 << *code_size as usize)
416                && *code_size < MAX_CODESIZE {
417                 *code_size += 1;
418             }
419             if next_code > MAX_ENTRIES {
420                 dict.reset();
421                 dict.push_node(Node::new(0)); // clear code
422                 dict.push_node(Node::new(0)); // end code
423                 try!(w.write_bits(dict.clear_code(), *code_size));
424                 *code_size = self.min_code_size + 1;
425             }
426 
427         }
428         Ok(())
429     }
430 }
431 
432 impl<W: BitWriter> Drop for Encoder<W> {
433     #[cfg(feature = "raii_no_panic")]
drop(&mut self)434     fn drop(&mut self) {
435         let w = &mut self.w;
436         let code_size = &mut self.code_size;
437 
438         if let Some(code) = self.i {
439             let _ = w.write_bits(code, *code_size);
440         }
441         let _ = w.write_bits(self.dict.end_code(), *code_size);
442         let _ = w.flush();
443     }
444     #[cfg(not(feature = "raii_no_panic"))]
drop(&mut self)445     fn drop(&mut self) {
446         (|| {
447             let w = &mut self.w;
448             let code_size = &mut self.code_size;
449             if let Some(code) = self.i {
450                 try!(w.write_bits(code, *code_size));
451             }
452             try!(w.write_bits(self.dict.end_code(), *code_size));
453             w.flush()
454          })().unwrap()
455 
456     }
457 }
458 
459 #[cfg(test)]
460 #[test]
round_trip()461 fn round_trip() {
462     use {LsbWriter, LsbReader};
463 
464     let size = 8;
465     let data = b"TOBEORNOTTOBEORTOBEORNOT";
466     let mut compressed = vec![];
467     {
468         let mut enc = Encoder::new(LsbWriter::new(&mut compressed), size).unwrap();
469         enc.encode_bytes(data).unwrap();
470     }
471     println!("{:?}", compressed);
472     let mut dec = Decoder::new(LsbReader::new(), size);
473     let mut compressed = &compressed[..];
474     let mut data2 = vec![];
475     while compressed.len() > 0 {
476         let (start, bytes) = dec.decode_bytes(&compressed).unwrap();
477         compressed = &compressed[start..];
478         data2.extend(bytes.iter().map(|&i| i));
479     }
480     assert_eq!(data2, data)
481 }