1 /*
2 File lz_slide.c, part of lzxcomp library
3 Copyright (C) 2002 Matthew T. Russotto
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as published by
7 the Free Software Foundation; version 2.1 only
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19 /*
20 * This file implements a Lempel-Ziv compression mechanism on top of a
21 * sliding-window string search algorithm. Currently hash_slide.c, which
22 * uses a hash table method similar to that of gnuzip/zlib, is used.
23 *
24 * Previously, a sliding-window suffix tree method was tried, but it
25 * resulted in poor compression at the LZX layer because it did not get the
26 * closest matching string. SHORTSLIDE was an attempt to get around this
27 * by maintaining a close tree and a full tree.
28 *
29 * A binary search tree was also tried, but it was too slow, particularly
30 * when searching for the closest match
31 *
32 * LZ_ONEBUFFER instructs the algorithm to use the same circular buffer for
33 * matches (lookahead) and the window itself. This is required when
34 * hash_slide is used.
35 *
36 * LAZY is lazy evaluation, similar to the gnuzip/zlib method.
37 *
38 */
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <assert.h>
42 #include "hash_slide.h"
43 #include "lz_slide.h"
44
45 #define MAX_MATCH 253
46 #define MIN_MATCH 2
47
48 #ifdef SHORTSLIDE
49 #ifdef LZ_ONEBUFFER
50 #error Short Slide cannot be used with One Buffer
51 #endif
52 #endif
lz_init(lz_info * lzi,int wsize,int max_match,int min_match,int frame_size,get_chars_t get_chars,output_match_t output_match,output_literal_t output_literal,void * user_data)53 void lz_init(lz_info *lzi, int wsize, int max_match, int min_match,
54 int frame_size,
55 get_chars_t get_chars,
56 output_match_t output_match,
57 output_literal_t output_literal, void *user_data)
58 {
59 assert(sizeof(SYMB) == sizeof(u_char));
60
61 lzi->wsize = wsize;
62 if (max_match > wsize)
63 lzi->max_match = wsize;
64 else
65 lzi->max_match = max_match;
66
67 #ifdef LAZY
68 lzi->match_buf_size = lzi->max_match + 1;
69 #else
70 lzi->match_buf_size = lzi->max_match;
71 #endif
72
73 lzi->min_match = min_match;
74
75
76 #ifdef LZ_ONEBUFFER
77 lzi->slide_buf_size = wsize + lzi->match_buf_size;
78 lzi->slide_buf = malloc(lzi->slide_buf_size);
79 lzi->slide_bufe = lzi->slide_buf + lzi->slide_buf_size;
80
81 lzi->match_buf = lzi->slide_buf;
82 lzi->match_bufp = lzi->match_buf;
83 lzi->match_bufe = lzi->slide_bufe;
84 #else
85 lzi->slide_buf = malloc(wsize);
86 lzi->slide_bufp = lzi->slide_buf;
87 lzi->slide_bufe = lzi->slide_buf + wsize;
88
89 lzi->match_buf = malloc(lzi->match_buf_size);
90 lzi->match_bufp = lzi->match_buf;
91 lzi->match_bufe = lzi->match_buf + lzi->match_buf_size;
92 #endif
93
94 #ifdef SHORTSLIDE
95 /* it is possible to use the same slide buffer for both. But that would use
96 MORE memory, at least unless I can fix the slide code to allocate fewer nodes
97 */
98 lzi->short_slide = malloc(2048);
99 lzi->short_slidep = lzi->short_slide;
100 lzi->short_slidee = lzi->short_slide + 2048;
101 lzi->chars_in_short_slide = 0;
102 lzi->short_front_offset = 0;
103 initslide(2048, lzi->short_slide, &lzi->short_tree);
104 #endif
105
106 assert(lzi->slide_buf != NULL);
107 assert(lzi->match_buf != NULL);
108
109 lzi->frame_size = frame_size;
110 lzi->chars_in_slide = lzi->chars_in_match = lzi->front_offset = 0;
111 lzi->cur_loc = 0;
112 lzi->loc_in_frame = 0;
113 lzi->get_chars = get_chars;
114 lzi->output_match = output_match;
115 lzi->output_literal = output_literal;
116 lzi->user_data = user_data;
117 inithash(lzi->slide_buf_size, lzi->slide_buf, &lzi->main_tree);
118 }
119
lz_release(lz_info * lzi)120 void lz_release(lz_info *lzi)
121 {
122 releasehash(lzi->main_tree);
123 #ifdef SHORTSLIDE
124 releaseslide(lzi->short_tree);
125 free(lzi->short_slide);
126 #endif
127 free(lzi->slide_buf);
128 #ifndef LZ_ONEBUFFER
129 free(lzi->match_buf);
130 #endif
131 }
132
lz_reset(lz_info * lzi)133 void lz_reset(lz_info *lzi)
134 {
135 lzi->slide_bufp = lzi->slide_buf;
136 lzi->match_bufp = lzi->match_buf;
137 lzi->chars_in_slide = lzi->chars_in_match = lzi->front_offset = 0;
138 lzi->loc_in_frame = 0;
139 resethash(lzi->main_tree);
140
141 #ifdef SHORTSLIDE
142 lzi->short_slidep = lzi->short_slide;
143 lzi->chars_in_short_slide = 0;
144 lzi->short_front_offset = 0;
145 resetslide(lzi->short_tree);
146 #endif
147 }
148
149 #ifdef LZSLIDE_MAIN
150 typedef struct lz_user_data
151 {
152 FILE *infile;
153 FILE *outfile;
154 int R0, R1, R2;
155 } lz_user_data;
156
tmp_get_chars(lz_info * lzi,int n,u_char * buf)157 int tmp_get_chars(lz_info *lzi, int n, u_char *buf)
158 {
159 lz_user_data *lzud = (lz_user_data *)lzi->user_data;
160 return fread(buf, 1, n, lzud->infile);
161 }
162
tmp_output_match(lz_info * lzi,int match_pos,int match_len)163 int tmp_output_match(lz_info *lzi, int match_pos, int match_len)
164 {
165 lz_user_data *lzud = (lz_user_data *)lzi->user_data;
166 int mod_match_loc;
167 #if 0
168 if (match_pos == lzud->R0)
169 match_pos = 0;
170 else if (match_pos == lzud->R1) {
171 lzud->R1 = lzud->R0;
172 lzud->R0 = match_pos;
173 match_pos = 1;
174 }
175 else if (match_pos == lzud->R2) {
176 lzud->R2 = lzud->R0;
177 lzud->R0 = match_pos;
178 match_pos = 2;
179 }
180 else {
181 lzud->R2 = lzud->R1;
182 lzud->R1 = lzud->R0;
183 lzud->R0 = match_pos;
184 }
185 #endif
186 mod_match_loc = match_pos + lzi->front_offset;
187 if (mod_match_loc < 0)
188 mod_match_loc += lzi->slide_buf_size;
189
190 fprintf(lzud->outfile, "(%d, %d)(%d)\n", match_pos, match_len, mod_match_loc);
191 return 0;
192 }
193
tmp_output_literal(lz_info * lzi,u_char ch)194 void tmp_output_literal(lz_info *lzi, u_char ch)
195 {
196 lz_user_data *lzud = (lz_user_data *)lzi->user_data;
197 fprintf(lzud->outfile, "'%c'", ch);
198 }
199
main(int argc,char * argv[])200 int main(int argc, char *argv[])
201 {
202 int wsize = atoi(argv[1]);
203 lz_info lzi;
204 lz_user_data lzu = {stdin, stdout, 1, 1, 1};
205
206 lz_init(&lzi, wsize, MAX_MATCH, MIN_MATCH, tmp_get_chars, tmp_output_match, tmp_output_literal,&lzu);
207 lz_compress(&lzi);
208 }
209 #endif
210
advance_slide(lz_info * lzi,int nchars)211 static void advance_slide(lz_info *lzi, int nchars)
212 {
213 int excess = lzi->chars_in_slide + nchars - lzi->wsize;
214
215 if (excess > 0) {
216 advancetail(lzi->main_tree, excess);
217 lzi->chars_in_slide -= excess;
218 }
219
220 #ifdef SHORTSLIDE
221 excess = lzi->chars_in_short_slide + nchars - 2048;
222 if (excess > 0) {
223 advancetail(lzi->short_tree, excess);
224 lzi->chars_in_short_slide -= excess;
225 }
226 #endif
227
228
229 #ifndef LZ_ONEBUFFER
230 /* copy from match to slide */
231 for (i = 0; i < nchars; i++) {
232 #ifdef SHORTSLIDE
233 *(lzi->short_slidep++) = *(lzi->match_bufp);
234 if (lzi->short_slidep == lzi->short_slidee)
235 lzi->short_slidep = lzi->short_slide;
236 #endif
237 *(lzi->slide_bufp++) = *(lzi->match_bufp++);
238
239 if (lzi->slide_bufp == lzi->slide_bufe)
240 lzi->slide_bufp = lzi->slide_buf;
241 if (lzi->match_bufp == lzi->match_bufe)
242 lzi->match_bufp = lzi->match_buf;
243 }
244 #else
245 /* move the match buffer to keep up with the window */
246 lzi->match_bufp += nchars;
247 if (lzi->match_bufp >= lzi->slide_bufe)
248 lzi->match_bufp -= lzi->slide_buf_size;
249 #endif
250
251 #ifdef USE_PSBST
252 #ifndef LZ_ONEBUFFER
253 #error Need LZ_ONEBUFFER with PBST
254 #endif
255 advancefront(lzi->main_tree, nchars, lzi->max_match, lzi->chars_in_match);
256 #else
257 advancefront(lzi->main_tree, nchars);
258 #endif
259 lzi->front_offset += nchars;
260 if (lzi->front_offset >= lzi->slide_buf_size) lzi->front_offset -= lzi->slide_buf_size;
261 lzi->chars_in_slide += nchars;
262 #ifdef SHORTSLIDE
263 advancefront(lzi->short_tree, nchars);
264 lzi->short_front_offset += nchars;
265 if (lzi->short_front_offset >= 2048) lzi->short_front_offset -= 2048;
266 lzi->chars_in_short_slide += nchars;
267 #endif
268 lzi->chars_in_match -= nchars;
269 }
270
fill_match(lz_info * lzi,int maxchars)271 static void fill_match(lz_info *lzi, int maxchars)
272 {
273 int room_in_match = lzi->match_buf_size - lzi->chars_in_match;
274 u_char *read_start;
275 int read_size;
276
277 maxchars -= lzi->chars_in_match;
278 read_start = lzi->match_bufp + lzi->chars_in_match;
279 #ifdef LZ_ONEBUFFER
280 if (read_start >= lzi->match_bufe) read_start -= lzi->slide_buf_size;
281 /* in one buffer mode, lzi->match_bufp + lzi_chars_in_match should never cross
282 the match buffer boundary */
283 #else
284 if (read_start >= lzi->match_bufe) read_start -= lzi->match_buf_size;
285 #endif
286
287 read_size = lzi->match_bufe - read_start;
288 if (read_size > room_in_match) read_size = room_in_match;
289 if (read_size > maxchars) read_size = maxchars;
290 read_size = lzi->get_chars(lzi, read_size, read_start);
291 maxchars -= read_size;
292 lzi->cur_loc += read_size;
293 lzi->chars_in_match += read_size;
294 room_in_match -= read_size;
295 if (room_in_match && read_size && maxchars) {
296 if (room_in_match < maxchars)
297 read_size = room_in_match;
298 else
299 read_size = maxchars;
300
301 #ifdef LZ_ONEBUFFER
302 read_size = lzi->get_chars(lzi, read_size, lzi->slide_buf);
303 #else
304 read_size = lzi->get_chars(lzi, read_size, lzi->match_buf);
305 #endif
306 lzi->chars_in_match += read_size;
307 lzi->cur_loc += read_size;
308 }
309 }
310
lz_stop_compressing(lz_info * lzi)311 void lz_stop_compressing(lz_info *lzi)
312 {
313 lzi->stop = 1;
314 }
315
lz_left_to_process(lz_info * lzi)316 int lz_left_to_process(lz_info *lzi)
317 {
318 return lzi->chars_in_match;
319 }
320
lz_compress(lz_info * lzi,int nchars)321 int lz_compress(lz_info *lzi, int nchars)
322 {
323 int match_len, match_loc, mod_match_loc;
324 int advance_chars;
325 int new_match_loc, new_match_len;
326 int old_match_loc, old_match_len;
327 u_char *new_bufp;
328 int new_chars_in_match;
329 int chars_in_match;
330 int bank_literals = 0;
331 u_char *old_slide_bufp;
332 u_char *bank;
333 short force_literals = 0;
334 short lazy_match = 0;
335
336 fill_match(lzi, nchars);
337 lzi->stop = 0;
338 while (lzi->chars_in_match && !lzi->stop && (nchars > 0)) {
339 chars_in_match = lzi->chars_in_match;
340 if (chars_in_match > lzi->max_match)
341 chars_in_match = lzi->max_match;
342 if (lzi->frame_size && (chars_in_match > (lzi->frame_size - lzi->loc_in_frame))) {
343 chars_in_match = (lzi->frame_size - lzi->loc_in_frame);
344 }
345 if (chars_in_match > nchars) {
346 chars_in_match = nchars;
347 }
348 #ifdef LAZY
349 if (!force_literals) {
350 if (!lazy_match) {
351 match_loc = longestmatch(lzi->main_tree, lzi->match_bufp,
352 chars_in_match, &match_len,
353 lzi->match_bufe, lzi->match_buf);
354 }
355 else {
356 match_loc = new_match_loc;
357 match_len = new_match_len;
358 lazy_match = 0;
359 }
360 }
361 else {
362 match_loc = -1;
363 force_literals--;
364 }
365 #else
366 match_loc = longestmatch(lzi->main_tree, lzi->match_bufp,
367 chars_in_match, &match_len,
368 lzi->match_bufe, lzi->match_buf);
369 #endif
370
371 if ((match_loc == -1) || (match_len < lzi->min_match)) {/* literal */
372 lzi->output_literal(lzi, *lzi->match_bufp);
373 advance_chars = 1;
374 }
375 else { /* found a match */
376
377 #ifdef LAZY /* lazy evaluation.. based on the gzip algorithm */
378 new_bufp = lzi->match_bufp;
379 /* make sure the first character isn't matched */
380 if (lzi->chars_in_slide == lzi->wsize) {
381 advancetail(lzi->main_tree, 1);
382 lzi->chars_in_slide--;
383 }
384
385 if (++new_bufp == lzi->match_bufe) new_bufp = lzi->match_buf;
386 new_chars_in_match = chars_in_match - 1;
387 if (--new_chars_in_match > (match_len + 1)) {
388 new_match_loc = longestmatch(lzi->main_tree, new_bufp,
389 new_chars_in_match, &new_match_len,
390 lzi->match_bufe, lzi->match_buf);
391 if ((new_match_loc != -1 ) && (new_match_len > (match_len+1))) {
392 /* force_literals = 1; used before the bank method */
393 /* assumes window is big enough that the bank will never be pushed off the end */
394 if (!bank_literals) {
395 bank = lzi->match_bufp;
396 old_match_loc = match_loc - lzi->front_offset;
397 if (old_match_loc >= 0)
398 old_match_loc -= lzi->slide_buf_size;
399 old_match_len = match_len;
400 old_slide_bufp = lzi->slide_bufp;
401 }
402 bank_literals++;
403 lazy_match = 1;
404 advance_chars = 1;
405 }
406 }
407 if (!lazy_match) {
408 if ((bank_literals > 2) && ((bank_literals - old_match_loc) < lzi->wsize)) {
409 /* this bank stuff is pretty marginal. The idea is a succession of lazy matches may leave multiple
410 literals which can be re-combined into a match. In practice, it rarely happens and when it
411 does the match is usually rejected for being too short and too far away.
412
413 The second condition above makes sure we haven't lost the original match
414 */
415 new_bufp = lzi->slide_bufp;
416 lzi->slide_bufp = old_slide_bufp;
417 if (old_match_len > bank_literals)
418 old_match_len = bank_literals;
419 if (lzi->output_match(lzi, old_match_loc, old_match_len) == 0) {
420 /* fprintf(stderr, "leftover match %d %d %d %d %d\n", bank_literals, old_match_len, match_len, old_match_loc, lzi->cur_loc); */
421 bank_literals -= old_match_len;
422 bank += old_match_len;
423 if (bank >= lzi->match_bufe) bank = bank - (lzi->match_bufe - lzi->match_buf);
424 }
425 lzi->slide_bufp = new_bufp;
426 }
427 while (bank_literals) {
428 lzi->output_literal(lzi, *bank);
429 if (++bank == lzi->match_bufe) bank = lzi->match_buf;
430 bank_literals--;
431 }
432 #endif
433 mod_match_loc = match_loc - lzi->front_offset;
434 if (mod_match_loc >= 0)
435 mod_match_loc -= lzi->slide_buf_size;
436
437 #ifdef SHORTSLIDE
438 new_match_loc = longestmatch(lzi->short_tree, lzi->match_bufp,
439 match_len, &new_match_len,
440 lzi->match_bufe, lzi->match_buf);
441 new_match_loc = new_match_loc - lzi->short_front_offset;
442 if (new_match_loc >= 0)
443 new_match_loc -= 2048;
444 if ((new_match_loc > mod_match_loc) && (new_match_len == match_len)) {
445 /* fprintf(stderr, "Found closer match %d %d %d %d\n", lzi->cur_loc, match_len, -mod_match_loc, -new_match_loc); */
446 mod_match_loc = new_match_loc;
447 }
448 #endif
449 if (lzi->output_match(lzi, mod_match_loc, match_len) == 0)
450 advance_chars = match_len;
451 else /* match rejected! */ {
452 lzi->output_literal(lzi, *lzi->match_bufp);
453 advance_chars = 1;
454 }
455 #ifdef LAZY
456 }
457 #endif
458 }
459 advance_slide(lzi, advance_chars);
460 lzi->loc_in_frame += advance_chars;
461 if (lzi->frame_size && (lzi->loc_in_frame >= lzi->frame_size))
462 lzi->loc_in_frame -= lzi->frame_size;
463 nchars -= advance_chars;
464 fill_match(lzi, nchars);
465 }
466 #ifdef DEBUG_LZ
467 assert(!bank_literals);
468 #endif
469 assert(!bank_literals);
470 assert(!(lzi->chars_in_match - lz_left_to_process(lzi)));
471 return 0;
472 }
473
474