1 /*
2     File lz_slide.c, part of lzxcomp library
3     Copyright (C) 2002 Matthew T. Russotto
4 
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU Lesser General Public License as published by
7     the Free Software Foundation; version 2.1 only
8 
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU Lesser General Public License for more details.
13 
14     You should have received a copy of the GNU Lesser General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18 
19 /*
20  *  This file implements a Lempel-Ziv compression mechanism on top of a
21  *  sliding-window string search algorithm.  Currently hash_slide.c, which
22  *  uses a hash table method similar to that of gnuzip/zlib, is used.
23  *
24  *  Previously, a sliding-window suffix tree method was tried, but it
25  *  resulted in poor compression at the LZX layer because it did not get the
26  *  closest matching string.  SHORTSLIDE was an attempt to get around this
27  *  by maintaining a close tree and a full tree.
28  *
29  *  A binary search tree was also tried, but it was too slow, particularly
30  *  when searching for the closest match
31  *
32  *  LZ_ONEBUFFER instructs the algorithm to use the same circular buffer for
33  *  matches (lookahead) and the window itself.  This is required when
34  *  hash_slide is used.
35  *
36  *  LAZY is lazy evaluation, similar to the gnuzip/zlib method.
37  *
38  */
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <assert.h>
42 #include "hash_slide.h"
43 #include "lz_slide.h"
44 
45 #define MAX_MATCH 253
46 #define MIN_MATCH 2
47 
48 #ifdef SHORTSLIDE
49 #ifdef LZ_ONEBUFFER
50 #error Short Slide cannot be used with One Buffer
51 #endif
52 #endif
lz_init(lz_info * lzi,int wsize,int max_match,int min_match,int frame_size,get_chars_t get_chars,output_match_t output_match,output_literal_t output_literal,void * user_data)53 void lz_init(lz_info *lzi, int wsize, int max_match, int min_match,
54 	     int frame_size,
55 	     get_chars_t get_chars,
56 	     output_match_t output_match,
57 	     output_literal_t output_literal, void *user_data)
58 {
59   assert(sizeof(SYMB) == sizeof(u_char));
60 
61   lzi->wsize = wsize;
62   if (max_match > wsize)
63     lzi->max_match = wsize;
64   else
65     lzi->max_match = max_match;
66 
67 #ifdef LAZY
68   lzi->match_buf_size = lzi->max_match + 1;
69 #else
70   lzi->match_buf_size = lzi->max_match;
71 #endif
72 
73   lzi->min_match = min_match;
74 
75 
76 #ifdef LZ_ONEBUFFER
77   lzi->slide_buf_size = wsize + lzi->match_buf_size;
78   lzi->slide_buf = malloc(lzi->slide_buf_size);
79   lzi->slide_bufe = lzi->slide_buf + lzi->slide_buf_size;
80 
81   lzi->match_buf = lzi->slide_buf;
82   lzi->match_bufp = lzi->match_buf;
83   lzi->match_bufe = lzi->slide_bufe;
84 #else
85   lzi->slide_buf = malloc(wsize);
86   lzi->slide_bufp = lzi->slide_buf;
87   lzi->slide_bufe = lzi->slide_buf + wsize;
88 
89   lzi->match_buf = malloc(lzi->match_buf_size);
90   lzi->match_bufp = lzi->match_buf;
91   lzi->match_bufe = lzi->match_buf + lzi->match_buf_size;
92 #endif
93 
94 #ifdef SHORTSLIDE
95   /* it is possible to use the same slide buffer for both.  But that would use
96      MORE memory, at least unless I can fix the slide code to allocate fewer nodes
97   */
98   lzi->short_slide = malloc(2048);
99   lzi->short_slidep = lzi->short_slide;
100   lzi->short_slidee = lzi->short_slide + 2048;
101   lzi->chars_in_short_slide = 0;
102   lzi->short_front_offset = 0;
103   initslide(2048, lzi->short_slide, &lzi->short_tree);
104 #endif
105 
106   assert(lzi->slide_buf != NULL);
107   assert(lzi->match_buf != NULL);
108 
109   lzi->frame_size = frame_size;
110   lzi->chars_in_slide = lzi->chars_in_match = lzi->front_offset = 0;
111   lzi->cur_loc = 0;
112   lzi->loc_in_frame = 0;
113   lzi->get_chars = get_chars;
114   lzi->output_match = output_match;
115   lzi->output_literal = output_literal;
116   lzi->user_data = user_data;
117   inithash(lzi->slide_buf_size, lzi->slide_buf, &lzi->main_tree);
118 }
119 
lz_release(lz_info * lzi)120 void lz_release(lz_info *lzi)
121 {
122   releasehash(lzi->main_tree);
123 #ifdef SHORTSLIDE
124   releaseslide(lzi->short_tree);
125   free(lzi->short_slide);
126 #endif
127   free(lzi->slide_buf);
128 #ifndef LZ_ONEBUFFER
129   free(lzi->match_buf);
130 #endif
131 }
132 
lz_reset(lz_info * lzi)133 void lz_reset(lz_info *lzi)
134 {
135   lzi->slide_bufp = lzi->slide_buf;
136   lzi->match_bufp = lzi->match_buf;
137   lzi->chars_in_slide = lzi->chars_in_match = lzi->front_offset = 0;
138   lzi->loc_in_frame = 0;
139   resethash(lzi->main_tree);
140 
141 #ifdef SHORTSLIDE
142   lzi->short_slidep = lzi->short_slide;
143   lzi->chars_in_short_slide = 0;
144   lzi->short_front_offset = 0;
145   resetslide(lzi->short_tree);
146 #endif
147 }
148 
149 #ifdef LZSLIDE_MAIN
150 typedef struct lz_user_data
151 {
152   FILE *infile;
153   FILE *outfile;
154   int R0, R1, R2;
155 } lz_user_data;
156 
tmp_get_chars(lz_info * lzi,int n,u_char * buf)157 int tmp_get_chars(lz_info *lzi, int n, u_char *buf)
158 {
159   lz_user_data *lzud = (lz_user_data *)lzi->user_data;
160   return fread(buf, 1, n, lzud->infile);
161 }
162 
tmp_output_match(lz_info * lzi,int match_pos,int match_len)163 int tmp_output_match(lz_info *lzi, int match_pos, int match_len)
164 {
165   lz_user_data *lzud = (lz_user_data *)lzi->user_data;
166   int mod_match_loc;
167 #if 0
168   if (match_pos == lzud->R0)
169     match_pos = 0;
170   else if (match_pos == lzud->R1) {
171     lzud->R1 = lzud->R0;
172     lzud->R0 = match_pos;
173     match_pos = 1;
174   }
175   else if (match_pos == lzud->R2) {
176     lzud->R2 = lzud->R0;
177     lzud->R0 = match_pos;
178     match_pos = 2;
179   }
180   else {
181     lzud->R2 = lzud->R1;
182     lzud->R1 = lzud->R0;
183     lzud->R0 = match_pos;
184   }
185 #endif
186   mod_match_loc = match_pos + lzi->front_offset;
187   if (mod_match_loc < 0)
188     mod_match_loc += lzi->slide_buf_size;
189 
190   fprintf(lzud->outfile, "(%d, %d)(%d)\n", match_pos, match_len, mod_match_loc);
191   return 0;
192 }
193 
tmp_output_literal(lz_info * lzi,u_char ch)194 void tmp_output_literal(lz_info *lzi, u_char ch)
195 {
196   lz_user_data *lzud = (lz_user_data *)lzi->user_data;
197   fprintf(lzud->outfile, "'%c'", ch);
198 }
199 
main(int argc,char * argv[])200 int main(int argc, char *argv[])
201 {
202   int wsize = atoi(argv[1]);
203   lz_info lzi;
204   lz_user_data lzu = {stdin, stdout, 1, 1, 1};
205 
206   lz_init(&lzi, wsize, MAX_MATCH, MIN_MATCH, tmp_get_chars, tmp_output_match, tmp_output_literal,&lzu);
207   lz_compress(&lzi);
208 }
209 #endif
210 
advance_slide(lz_info * lzi,int nchars)211 static void advance_slide(lz_info *lzi, int nchars)
212 {
213   int excess = lzi->chars_in_slide + nchars - lzi->wsize;
214 
215   if (excess > 0) {
216     advancetail(lzi->main_tree, excess);
217     lzi->chars_in_slide -= excess;
218   }
219 
220 #ifdef SHORTSLIDE
221   excess = lzi->chars_in_short_slide + nchars - 2048;
222   if (excess > 0) {
223     advancetail(lzi->short_tree, excess);
224     lzi->chars_in_short_slide -= excess;
225   }
226 #endif
227 
228 
229 #ifndef LZ_ONEBUFFER
230   /* copy from match to slide */
231   for (i = 0; i < nchars; i++) {
232 #ifdef SHORTSLIDE
233     *(lzi->short_slidep++) = *(lzi->match_bufp);
234     if (lzi->short_slidep == lzi->short_slidee)
235       lzi->short_slidep = lzi->short_slide;
236 #endif
237     *(lzi->slide_bufp++) = *(lzi->match_bufp++);
238 
239     if (lzi->slide_bufp == lzi->slide_bufe)
240       lzi->slide_bufp = lzi->slide_buf;
241     if (lzi->match_bufp == lzi->match_bufe)
242       lzi->match_bufp = lzi->match_buf;
243   }
244 #else
245   /* move the match buffer to keep up with the window */
246   lzi->match_bufp += nchars;
247   if (lzi->match_bufp >= lzi->slide_bufe)
248     lzi->match_bufp -= lzi->slide_buf_size;
249 #endif
250 
251 #ifdef USE_PSBST
252 #ifndef LZ_ONEBUFFER
253 #error Need LZ_ONEBUFFER with PBST
254 #endif
255   advancefront(lzi->main_tree, nchars, lzi->max_match, lzi->chars_in_match);
256 #else
257   advancefront(lzi->main_tree, nchars);
258 #endif
259   lzi->front_offset += nchars;
260   if (lzi->front_offset >= lzi->slide_buf_size) lzi->front_offset -= lzi->slide_buf_size;
261   lzi->chars_in_slide += nchars;
262 #ifdef SHORTSLIDE
263   advancefront(lzi->short_tree, nchars);
264   lzi->short_front_offset += nchars;
265   if (lzi->short_front_offset >= 2048) lzi->short_front_offset -= 2048;
266   lzi->chars_in_short_slide += nchars;
267 #endif
268   lzi->chars_in_match -= nchars;
269 }
270 
fill_match(lz_info * lzi,int maxchars)271 static void fill_match(lz_info *lzi, int maxchars)
272 {
273   int room_in_match = lzi->match_buf_size - lzi->chars_in_match;
274   u_char *read_start;
275   int read_size;
276 
277   maxchars -= lzi->chars_in_match;
278   read_start = lzi->match_bufp + lzi->chars_in_match;
279 #ifdef LZ_ONEBUFFER
280   if (read_start >= lzi->match_bufe) read_start -= lzi->slide_buf_size;
281   /* in one buffer mode, lzi->match_bufp + lzi_chars_in_match should never cross
282      the match buffer boundary */
283 #else
284   if (read_start >= lzi->match_bufe) read_start -= lzi->match_buf_size;
285 #endif
286 
287   read_size = lzi->match_bufe - read_start;
288   if (read_size > room_in_match) read_size = room_in_match;
289   if (read_size > maxchars) read_size = maxchars;
290   read_size = lzi->get_chars(lzi, read_size, read_start);
291   maxchars -= read_size;
292   lzi->cur_loc += read_size;
293   lzi->chars_in_match += read_size;
294   room_in_match -= read_size;
295   if (room_in_match && read_size && maxchars) {
296     if (room_in_match < maxchars)
297       read_size = room_in_match;
298     else
299       read_size = maxchars;
300 
301 #ifdef LZ_ONEBUFFER
302     read_size = lzi->get_chars(lzi, read_size, lzi->slide_buf);
303 #else
304     read_size = lzi->get_chars(lzi, read_size, lzi->match_buf);
305 #endif
306     lzi->chars_in_match += read_size;
307     lzi->cur_loc += read_size;
308   }
309 }
310 
lz_stop_compressing(lz_info * lzi)311 void lz_stop_compressing(lz_info *lzi)
312 {
313   lzi->stop = 1;
314 }
315 
lz_left_to_process(lz_info * lzi)316 int lz_left_to_process(lz_info *lzi)
317 {
318   return lzi->chars_in_match;
319 }
320 
lz_compress(lz_info * lzi,int nchars)321 int lz_compress(lz_info *lzi, int nchars)
322 {
323   int match_len, match_loc, mod_match_loc;
324   int advance_chars;
325   int new_match_loc, new_match_len;
326   int old_match_loc, old_match_len;
327   u_char *new_bufp;
328   int new_chars_in_match;
329   int chars_in_match;
330   int bank_literals = 0;
331   u_char *old_slide_bufp;
332   u_char *bank;
333   short force_literals = 0;
334   short lazy_match = 0;
335 
336   fill_match(lzi, nchars);
337   lzi->stop = 0;
338   while (lzi->chars_in_match && !lzi->stop && (nchars > 0)) {
339     chars_in_match = lzi->chars_in_match;
340     if (chars_in_match > lzi->max_match)
341       chars_in_match = lzi->max_match;
342     if (lzi->frame_size && (chars_in_match > (lzi->frame_size - lzi->loc_in_frame))) {
343       chars_in_match = (lzi->frame_size - lzi->loc_in_frame);
344     }
345     if (chars_in_match > nchars) {
346       chars_in_match = nchars;
347     }
348 #ifdef LAZY
349     if (!force_literals) {
350       if (!lazy_match) {
351 	match_loc = longestmatch(lzi->main_tree, lzi->match_bufp,
352 				 chars_in_match, &match_len,
353 				 lzi->match_bufe, lzi->match_buf);
354       }
355       else {
356 	match_loc = new_match_loc;
357 	match_len = new_match_len;
358 	lazy_match = 0;
359       }
360     }
361     else {
362       match_loc = -1;
363       force_literals--;
364     }
365 #else
366     match_loc = longestmatch(lzi->main_tree, lzi->match_bufp,
367 			     chars_in_match, &match_len,
368 			     lzi->match_bufe, lzi->match_buf);
369 #endif
370 
371     if ((match_loc == -1) || (match_len < lzi->min_match)) {/* literal */
372       lzi->output_literal(lzi, *lzi->match_bufp);
373       advance_chars = 1;
374     }
375     else { /* found a match */
376 
377 #ifdef LAZY /* lazy evaluation.. based on the gzip algorithm */
378       new_bufp = lzi->match_bufp;
379       /* make sure the first character isn't matched */
380       if (lzi->chars_in_slide == lzi->wsize) {
381 	advancetail(lzi->main_tree, 1);
382 	lzi->chars_in_slide--;
383       }
384 
385       if (++new_bufp == lzi->match_bufe) new_bufp = lzi->match_buf;
386       new_chars_in_match = chars_in_match - 1;
387       if (--new_chars_in_match > (match_len + 1)) {
388 	new_match_loc = longestmatch(lzi->main_tree, new_bufp,
389 				     new_chars_in_match, &new_match_len,
390 				     lzi->match_bufe, lzi->match_buf);
391 	if ((new_match_loc != -1 ) && (new_match_len > (match_len+1))) {
392 	  /*	  force_literals = 1;  used before the bank method */
393 	  /* assumes window is big enough that the bank will never be pushed off the end */
394 	  if (!bank_literals) {
395 	    bank = lzi->match_bufp;
396 	    old_match_loc = match_loc - lzi->front_offset;
397 	    if (old_match_loc >= 0)
398 	      old_match_loc -= lzi->slide_buf_size;
399 	    old_match_len = match_len;
400 	    old_slide_bufp = lzi->slide_bufp;
401 	  }
402 	  bank_literals++;
403 	  lazy_match = 1;
404 	  advance_chars = 1;
405 	}
406       }
407       if (!lazy_match) {
408 	if ((bank_literals > 2) && ((bank_literals - old_match_loc) < lzi->wsize)) {
409 	  /* this bank stuff is pretty marginal.  The idea is a succession of lazy matches may leave multiple
410 	     literals which can be re-combined into a match.  In practice, it rarely happens and when it
411 	     does the match is usually rejected for being too short and too far away.
412 
413 	     The second condition above makes sure we haven't lost the original match
414 	  */
415 	  new_bufp = lzi->slide_bufp;
416 	  lzi->slide_bufp = old_slide_bufp;
417 	  if (old_match_len > bank_literals)
418 	    old_match_len = bank_literals;
419 	  if (lzi->output_match(lzi, old_match_loc, old_match_len) == 0) {
420 	    /*	    fprintf(stderr, "leftover match %d %d %d %d %d\n", bank_literals, old_match_len, match_len, old_match_loc, lzi->cur_loc); */
421 	    bank_literals -= old_match_len;
422 	    bank += old_match_len;
423 	    if (bank >= lzi->match_bufe) bank = bank - (lzi->match_bufe - lzi->match_buf);
424 	  }
425 	  lzi->slide_bufp = new_bufp;
426 	}
427 	while (bank_literals) {
428 	  lzi->output_literal(lzi, *bank);
429 	  if (++bank == lzi->match_bufe) bank = lzi->match_buf;
430 	  bank_literals--;
431 	}
432 #endif
433 	mod_match_loc = match_loc - lzi->front_offset;
434 	if (mod_match_loc >= 0)
435 	  mod_match_loc -= lzi->slide_buf_size;
436 
437 #ifdef SHORTSLIDE
438 	new_match_loc = longestmatch(lzi->short_tree, lzi->match_bufp,
439 				     match_len, &new_match_len,
440 				     lzi->match_bufe, lzi->match_buf);
441 	new_match_loc = new_match_loc - lzi->short_front_offset;
442 	if (new_match_loc >= 0)
443 	  new_match_loc -= 2048;
444 	if ((new_match_loc > mod_match_loc) && (new_match_len == match_len)) {
445 	  /*	fprintf(stderr, "Found closer match %d  %d  %d %d\n", lzi->cur_loc, match_len, -mod_match_loc, -new_match_loc); */
446 	mod_match_loc = new_match_loc;
447 	}
448 #endif
449 	if (lzi->output_match(lzi, mod_match_loc, match_len) == 0)
450 	  advance_chars = match_len;
451 	else /* match rejected! */ {
452 	  lzi->output_literal(lzi, *lzi->match_bufp);
453 	  advance_chars = 1;
454 	}
455 #ifdef LAZY
456       }
457 #endif
458     }
459     advance_slide(lzi, advance_chars);
460     lzi->loc_in_frame += advance_chars;
461     if (lzi->frame_size && (lzi->loc_in_frame >= lzi->frame_size))
462       lzi->loc_in_frame -= lzi->frame_size;
463     nchars -= advance_chars;
464     fill_match(lzi, nchars);
465   }
466 #ifdef DEBUG_LZ
467   assert(!bank_literals);
468 #endif
469   assert(!bank_literals);
470   assert(!(lzi->chars_in_match - lz_left_to_process(lzi)));
471   return 0;
472 }
473 
474