1 //! This modules provides an implementation of the Lempel–Ziv–Welch Compression Algorithm
2
3 // Note: This implementation borrows heavily from the work of Julius Pettersson
4 // See http://www.cplusplus.com/articles/iL18T05o/ for his extensive explanations
5 // and a C++ implementatation
6
7 use std::io;
8 use std::io::{Read, Write};
9
10 use bitstream::{Bits, BitReader, BitWriter};
11
12 const MAX_CODESIZE: u8 = 12;
13 const MAX_ENTRIES: usize = 1 << MAX_CODESIZE as usize;
14
15 /// Alias for a LZW code point
16 type Code = u16;
17
18 /// Decoding dictionary.
19 ///
20 /// It is not generic due to current limitations of Rust
21 /// Inspired by http://www.cplusplus.com/articles/iL18T05o/
22 #[derive(Debug)]
23 struct DecodingDict {
24 min_size: u8,
25 table: Vec<(Option<Code>, u8)>,
26 buffer: Vec<u8>,
27 }
28
29 impl DecodingDict {
30 /// Creates a new dict
new(min_size: u8) -> DecodingDict31 fn new(min_size: u8) -> DecodingDict {
32 DecodingDict {
33 min_size: min_size,
34 table: Vec::with_capacity(512),
35 buffer: Vec::with_capacity((1 << MAX_CODESIZE as usize) - 1)
36 }
37 }
38
39 /// Resets the dictionary
reset(&mut self)40 fn reset(&mut self) {
41 self.table.clear();
42 for i in 0..(1u16 << self.min_size as usize) {
43 self.table.push((None, i as u8));
44 }
45 }
46
47 /// Inserts a value into the dict
48 #[inline(always)]
push(&mut self, key: Option<Code>, value: u8)49 fn push(&mut self, key: Option<Code>, value: u8) {
50 self.table.push((key, value))
51 }
52
53 /// Reconstructs the data for the corresponding code
reconstruct(&mut self, code: Option<Code>) -> io::Result<&[u8]>54 fn reconstruct(&mut self, code: Option<Code>) -> io::Result<&[u8]> {
55 self.buffer.clear();
56 let mut code = code;
57 let mut cha;
58 // Check the first access more thoroughly since a bad code
59 // could occur if the data is malformed
60 if let Some(k) = code {
61 match self.table.get(k as usize) {
62 Some(&(code_, cha_)) => {
63 code = code_;
64 cha = cha_;
65 }
66 None => return Err(io::Error::new(
67 io::ErrorKind::InvalidInput,
68 &*format!("Invalid code {:X}, expected code <= {:X}", k, self.table.len())
69 ))
70 }
71 self.buffer.push(cha);
72 }
73 while let Some(k) = code {
74 if self.buffer.len() >= MAX_ENTRIES {
75 return Err(io::Error::new(
76 io::ErrorKind::InvalidInput,
77 "Invalid code sequence. Cycle in decoding table."
78 ))
79 }
80 //(code, cha) = self.table[k as usize];
81 // Note: This could possibly be replaced with an unchecked array access if
82 // - value is asserted to be < self.next_code() in push
83 // - min_size is asserted to be < MAX_CODESIZE
84 let entry = self.table[k as usize]; code = entry.0; cha = entry.1;
85 self.buffer.push(cha);
86 }
87 self.buffer.reverse();
88 Ok(&self.buffer)
89 }
90
91 /// Returns the buffer constructed by the last reconstruction
92 #[inline(always)]
buffer(&self) -> &[u8]93 fn buffer(&self) -> &[u8] {
94 &self.buffer
95 }
96
97 /// Number of entries in the dictionary
98 #[inline(always)]
next_code(&self) -> u1699 fn next_code(&self) -> u16 {
100 self.table.len() as u16
101 }
102 }
103
104 macro_rules! define_decoder_struct {
105 {$(
106 $name:ident, $offset:expr, #[$doc:meta];
107 )*} => {
108
109 $( // START struct definition
110
111 #[$doc]
112 ///
113 /// The maximum supported code size is 16 bits. The decoder assumes two
114 /// special code word to be present in the stream:
115 ///
116 /// * `CLEAR_CODE == 1 << min_code_size`
117 /// * `END_CODE == CLEAR_CODE + 1`
118 ///
119 ///Furthermore the decoder expects the stream to start with a `CLEAR_CODE`. This
120 /// corresponds to the implementation needed for en- and decoding GIF and TIFF files.
121 #[derive(Debug)]
122 pub struct $name<R: BitReader> {
123 r: R,
124 prev: Option<Code>,
125 table: DecodingDict,
126 buf: [u8; 1],
127 code_size: u8,
128 min_code_size: u8,
129 clear_code: Code,
130 end_code: Code,
131 }
132
133 impl<R> $name<R> where R: BitReader {
134 /// Creates a new LZW decoder.
135 pub fn new(reader: R, min_code_size: u8) -> $name<R> {
136 $name {
137 r: reader,
138 prev: None,
139 table: DecodingDict::new(min_code_size),
140 buf: [0; 1],
141 code_size: min_code_size + 1,
142 min_code_size: min_code_size,
143 clear_code: 1 << min_code_size,
144 end_code: (1 << min_code_size) + 1,
145 }
146 }
147
148 /// Tries to obtain and decode a code word from `bytes`.
149 ///
150 /// Returns the number of bytes that have been consumed from `bytes`. An empty
151 /// slice does not indicate `EOF`.
152 pub fn decode_bytes(&mut self, bytes: &[u8]) -> io::Result<(usize, &[u8])> {
153 Ok(match self.r.read_bits(bytes, self.code_size) {
154 Bits::Some(consumed, code) => {
155 (consumed, if code == self.clear_code {
156 self.table.reset();
157 self.table.push(None, 0); // clear code
158 self.table.push(None, 0); // end code
159 self.code_size = self.min_code_size + 1;
160 self.prev = None;
161 &[]
162 } else if code == self.end_code {
163 &[]
164 } else {
165 let next_code = self.table.next_code();
166 if code > next_code {
167 return Err(io::Error::new(
168 io::ErrorKind::InvalidInput,
169 &*format!("Invalid code {:X}, expected code <= {:X}",
170 code,
171 next_code
172 )
173 ))
174 }
175 let prev = self.prev;
176 let result = if prev.is_none() {
177 self.buf = [code as u8];
178 &self.buf[..]
179 } else {
180 let data = if code == next_code {
181 let cha = try!(self.table.reconstruct(prev))[0];
182 self.table.push(prev, cha);
183 try!(self.table.reconstruct(Some(code)))
184 } else if code < next_code {
185 let cha = try!(self.table.reconstruct(Some(code)))[0];
186 self.table.push(prev, cha);
187 self.table.buffer()
188 } else {
189 // code > next_code is already tested a few lines earlier
190 unreachable!()
191 };
192 data
193 };
194 if next_code == (1 << self.code_size as usize) - 1 - $offset
195 && self.code_size < MAX_CODESIZE {
196 self.code_size += 1;
197 }
198 self.prev = Some(code);
199 result
200 })
201 },
202 Bits::None(consumed) => {
203 (consumed, &[])
204 }
205 })
206 }
207 }
208
209 )* // END struct definition
210
211 }
212 }
213
214 define_decoder_struct!{
215 Decoder, 0, #[doc = "Decoder for a LZW compressed stream (this algorithm is used for GIF files)."];
216 DecoderEarlyChange, 1, #[doc = "Decoder for a LZW compressed stream using an “early change” algorithm (used in TIFF files)."];
217 }
218
219 struct Node {
220 prefix: Option<Code>,
221 c: u8,
222 left: Option<Code>,
223 right: Option<Code>,
224 }
225
226 impl Node {
227 #[inline(always)]
new(c: u8) -> Node228 fn new(c: u8) -> Node {
229 Node {
230 prefix: None,
231 c: c,
232 left: None,
233 right: None
234 }
235 }
236 }
237
238 struct EncodingDict {
239 table: Vec<Node>,
240 min_size: u8,
241
242 }
243
244 /// Encoding dictionary based on a binary tree
245 impl EncodingDict {
new(min_size: u8) -> EncodingDict246 fn new(min_size: u8) -> EncodingDict {
247 let mut this = EncodingDict {
248 table: Vec::with_capacity(MAX_ENTRIES),
249 min_size: min_size
250 };
251 this.reset();
252 this
253 }
254
reset(&mut self)255 fn reset(&mut self) {
256 self.table.clear();
257 for i in 0 .. (1u16 << self.min_size as usize) {
258 self.push_node(Node::new(i as u8));
259 }
260 }
261
262 #[inline(always)]
push_node(&mut self, node: Node)263 fn push_node(&mut self, node: Node) {
264 self.table.push(node)
265 }
266
267 #[inline(always)]
clear_code(&self) -> Code268 fn clear_code(&self) -> Code {
269 1u16 << self.min_size
270 }
271
272 #[inline(always)]
end_code(&self) -> Code273 fn end_code(&self) -> Code {
274 self.clear_code() + 1
275 }
276
277 // Searches for a new prefix
search_and_insert(&mut self, i: Option<Code>, c: u8) -> Option<Code>278 fn search_and_insert(&mut self, i: Option<Code>, c: u8) -> Option<Code> {
279 if let Some(i) = i.map(|v| v as usize) {
280 let table_size = self.table.len() as Code;
281 if let Some(mut j) = self.table[i].prefix {
282 loop {
283 let entry = &mut self.table[j as usize];
284 if c < entry.c {
285 if let Some(k) = entry.left {
286 j = k
287 } else {
288 entry.left = Some(table_size);
289 break
290 }
291 } else if c > entry.c {
292 if let Some(k) = entry.right {
293 j = k
294 } else {
295 entry.right = Some(table_size);
296 break
297 }
298 } else {
299 return Some(j)
300 }
301 }
302 } else {
303 self.table[i].prefix = Some(table_size);
304 }
305 self.table.push(Node::new(c));
306 None
307 } else {
308 Some(self.search_initials(c as Code))
309 }
310 }
311
next_code(&self) -> usize312 fn next_code(&self) -> usize {
313 self.table.len()
314 }
315
search_initials(&self, i: Code) -> Code316 fn search_initials(&self, i: Code) -> Code {
317 self.table[i as usize].c as Code
318 }
319 }
320
321 /// Convenience function that reads and compresses all bytes from `R`.
encode<R, W>(r: R, mut w: W, min_code_size: u8) -> io::Result<()> where R: Read, W: BitWriter322 pub fn encode<R, W>(r: R, mut w: W, min_code_size: u8) -> io::Result<()>
323 where R: Read, W: BitWriter {
324 let mut dict = EncodingDict::new(min_code_size);
325 dict.push_node(Node::new(0)); // clear code
326 dict.push_node(Node::new(0)); // end code
327 let mut code_size = min_code_size + 1;
328 let mut i = None;
329 // gif spec: first clear code
330 try!(w.write_bits(dict.clear_code(), code_size));
331 let mut r = r.bytes();
332 while let Some(Ok(c)) = r.next() {
333 let prev = i;
334 i = dict.search_and_insert(prev, c);
335 if i.is_none() {
336 if let Some(code) = prev {
337 try!(w.write_bits(code, code_size));
338 }
339 i = Some(dict.search_initials(c as Code))
340 }
341 // There is a hit: do not write out code but continue
342 let next_code = dict.next_code();
343 if next_code > (1 << code_size as usize)
344 && code_size < MAX_CODESIZE {
345 code_size += 1;
346 }
347 if next_code > MAX_ENTRIES {
348 dict.reset();
349 dict.push_node(Node::new(0)); // clear code
350 dict.push_node(Node::new(0)); // end code
351 try!(w.write_bits(dict.clear_code(), code_size));
352 code_size = min_code_size + 1;
353 }
354
355 }
356 if let Some(code) = i {
357 try!(w.write_bits(code, code_size));
358 }
359 try!(w.write_bits(dict.end_code(), code_size));
360 try!(w.flush());
361 Ok(())
362 }
363
364 /// LZW encoder using the algorithm of GIF files.
365 pub struct Encoder<W: BitWriter> {
366 w: W,
367 dict: EncodingDict,
368 min_code_size: u8,
369 code_size: u8,
370 i: Option<Code>
371 }
372
373 impl<W: BitWriter> Encoder<W> {
374 /// Creates a new LZW encoder.
375 ///
376 /// **Note**: If `min_code_size < 8` then `Self::encode_bytes` might panic when
377 /// the supplied data containts values that exceed `1 << min_code_size`.
new(mut w: W, min_code_size: u8) -> io::Result<Encoder<W>>378 pub fn new(mut w: W, min_code_size: u8) -> io::Result<Encoder<W>> {
379 let mut dict = EncodingDict::new(min_code_size);
380 dict.push_node(Node::new(0)); // clear code
381 dict.push_node(Node::new(0)); // end code
382 let code_size = min_code_size + 1;
383 try!(w.write_bits(dict.clear_code(), code_size));
384 Ok(Encoder {
385 w: w,
386 dict: dict,
387 min_code_size: min_code_size,
388 code_size: code_size,
389 i: None
390 })
391 }
392
393 /// Compresses `bytes` and writes the result into the writer.
394 ///
395 /// ## Panics
396 ///
397 /// This function might panic if any of the input bytes exceeds `1 << min_code_size`.
398 /// This cannot happen if `min_code_size >= 8`.
encode_bytes(&mut self, bytes: &[u8]) -> io::Result<()>399 pub fn encode_bytes(&mut self, bytes: &[u8]) -> io::Result<()> {
400 let w = &mut self.w;
401 let dict = &mut self.dict;
402 let code_size = &mut self.code_size;
403 let i = &mut self.i;
404 for &c in bytes {
405 let prev = *i;
406 *i = dict.search_and_insert(prev, c);
407 if i.is_none() {
408 if let Some(code) = prev {
409 try!(w.write_bits(code, *code_size));
410 }
411 *i = Some(dict.search_initials(c as Code))
412 }
413 // There is a hit: do not write out code but continue
414 let next_code = dict.next_code();
415 if next_code > (1 << *code_size as usize)
416 && *code_size < MAX_CODESIZE {
417 *code_size += 1;
418 }
419 if next_code > MAX_ENTRIES {
420 dict.reset();
421 dict.push_node(Node::new(0)); // clear code
422 dict.push_node(Node::new(0)); // end code
423 try!(w.write_bits(dict.clear_code(), *code_size));
424 *code_size = self.min_code_size + 1;
425 }
426
427 }
428 Ok(())
429 }
430 }
431
432 impl<W: BitWriter> Drop for Encoder<W> {
433 #[cfg(feature = "raii_no_panic")]
drop(&mut self)434 fn drop(&mut self) {
435 let w = &mut self.w;
436 let code_size = &mut self.code_size;
437
438 if let Some(code) = self.i {
439 let _ = w.write_bits(code, *code_size);
440 }
441 let _ = w.write_bits(self.dict.end_code(), *code_size);
442 let _ = w.flush();
443 }
444 #[cfg(not(feature = "raii_no_panic"))]
drop(&mut self)445 fn drop(&mut self) {
446 (|| {
447 let w = &mut self.w;
448 let code_size = &mut self.code_size;
449 if let Some(code) = self.i {
450 try!(w.write_bits(code, *code_size));
451 }
452 try!(w.write_bits(self.dict.end_code(), *code_size));
453 w.flush()
454 })().unwrap()
455
456 }
457 }
458
459 #[cfg(test)]
460 #[test]
round_trip()461 fn round_trip() {
462 use {LsbWriter, LsbReader};
463
464 let size = 8;
465 let data = b"TOBEORNOTTOBEORTOBEORNOT";
466 let mut compressed = vec![];
467 {
468 let mut enc = Encoder::new(LsbWriter::new(&mut compressed), size).unwrap();
469 enc.encode_bytes(data).unwrap();
470 }
471 println!("{:?}", compressed);
472 let mut dec = Decoder::new(LsbReader::new(), size);
473 let mut compressed = &compressed[..];
474 let mut data2 = vec![];
475 while compressed.len() > 0 {
476 let (start, bytes) = dec.decode_bytes(&compressed).unwrap();
477 compressed = &compressed[start..];
478 data2.extend(bytes.iter().map(|&i| i));
479 }
480 assert_eq!(data2, data)
481 }