1 /*
2  * lz77.c: common LZ77 compression code between Deflate and LZX.
3  */
4 
5 #include "halibut.h" /* only for snew, sfree etc */
6 #include "lz77.h"
7 
8 /*
9  * Modifiable parameters.
10  */
11 #define HASHMAX 2039		       /* one more than max hash value */
12 #define MAXMATCH 32		       /* how many matches we track */
13 #define HASHCHARS 3		       /* how many chars make a hash */
14 
15 /*
16  * This compressor takes a less slapdash approach than the
17  * gzip/zlib one. Rather than allowing our hash chains to fall into
18  * disuse near the far end, we keep them doubly linked so we can
19  * _find_ the far end, and then every time we add a new byte to the
20  * window (thus rolling round by one and removing the previous
21  * byte), we can carefully remove the hash chain entry.
22  */
23 
24 #define INVALID -1		       /* invalid hash _and_ invalid offset */
25 struct WindowEntry {
26     short next, prev;		       /* array indices within the window */
27     short hashval;
28 };
29 
30 struct HashEntry {
31     short first;		       /* window index of first in chain */
32 };
33 
34 struct Match {
35     int distance, len;
36 };
37 
38 struct LZ77InternalContext {
39     int winsize;
40     struct WindowEntry *win; /* [winsize] */
41     unsigned char *data; /* [winsize] */
42 
43     int winpos;
44     struct HashEntry hashtab[HASHMAX];
45     unsigned char pending[HASHCHARS];
46     int npending;
47 };
48 
lz77_hash(const unsigned char * data)49 static int lz77_hash(const unsigned char *data)
50 {
51     return (257 * data[0] + 263 * data[1] + 269 * data[2]) % HASHMAX;
52 }
53 
lz77_init(struct LZ77Context * ctx,int winsize)54 void lz77_init(struct LZ77Context *ctx, int winsize)
55 {
56     struct LZ77InternalContext *st;
57     int i;
58 
59     st = snew(struct LZ77InternalContext);
60 
61     ctx->ictx = st;
62 
63     st->winsize = winsize;
64     st->win = snewn(st->winsize, struct WindowEntry);
65     st->data = snewn(st->winsize, unsigned char);
66 
67     for (i = 0; i < st->winsize; i++)
68 	st->win[i].next = st->win[i].prev = st->win[i].hashval = INVALID;
69     for (i = 0; i < HASHMAX; i++)
70 	st->hashtab[i].first = INVALID;
71     st->winpos = 0;
72 
73     st->npending = 0;
74 }
75 
lz77_cleanup(struct LZ77Context * ctx)76 void lz77_cleanup(struct LZ77Context *ctx)
77 {
78     struct LZ77InternalContext *st = ctx->ictx;
79     sfree(st->win);
80     sfree(st->data);
81     sfree(st);
82 }
83 
lz77_advance(struct LZ77InternalContext * st,unsigned char c,int hash)84 static void lz77_advance(struct LZ77InternalContext *st,
85 			 unsigned char c, int hash)
86 {
87     int off;
88 
89     /*
90      * Remove the hash entry at winpos from the tail of its chain,
91      * or empty the chain if it's the only thing on the chain.
92      */
93     if (st->win[st->winpos].prev != INVALID) {
94 	st->win[st->win[st->winpos].prev].next = INVALID;
95     } else if (st->win[st->winpos].hashval != INVALID) {
96 	st->hashtab[st->win[st->winpos].hashval].first = INVALID;
97     }
98 
99     /*
100      * Create a new entry at winpos and add it to the head of its
101      * hash chain.
102      */
103     st->win[st->winpos].hashval = hash;
104     st->win[st->winpos].prev = INVALID;
105     off = st->win[st->winpos].next = st->hashtab[hash].first;
106     st->hashtab[hash].first = st->winpos;
107     if (off != INVALID)
108 	st->win[off].prev = st->winpos;
109     st->data[st->winpos] = c;
110 
111     /*
112      * Advance the window pointer.
113      */
114     st->winpos = (st->winpos + 1) % st->winsize;
115 }
116 
117 #define CHARAT(k) ( (k)<0 ? st->data[(st->winpos+k)%st->winsize] : data[k] )
118 
lz77_compress(struct LZ77Context * ctx,const unsigned char * data,int len,int compress)119 void lz77_compress(struct LZ77Context *ctx,
120                    const unsigned char *data, int len, int compress)
121 {
122     struct LZ77InternalContext *st = ctx->ictx;
123     int i, hash, distance, off, nmatch, matchlen, advance;
124     struct Match defermatch, matches[MAXMATCH];
125     int deferchr;
126 
127     /*
128      * Add any pending characters from last time to the window. (We
129      * might not be able to.)
130      */
131     for (i = 0; i < st->npending; i++) {
132 	unsigned char foo[HASHCHARS];
133 	int j;
134 	if (len + st->npending - i < HASHCHARS) {
135 	    /* Update the pending array. */
136 	    for (j = i; j < st->npending; j++)
137 		st->pending[j - i] = st->pending[j];
138 	    break;
139 	}
140 	for (j = 0; j < HASHCHARS; j++)
141 	    foo[j] = (i + j < st->npending ? st->pending[i + j] :
142 		      data[i + j - st->npending]);
143 	lz77_advance(st, foo[0], lz77_hash(foo));
144     }
145     st->npending -= i;
146 
147     defermatch.len = 0;
148     deferchr = '\0';
149     while (len > 0) {
150 
151 	/* Don't even look for a match, if we're not compressing. */
152 	if (compress && len >= HASHCHARS) {
153 	    /*
154 	     * Hash the next few characters.
155 	     */
156 	    hash = lz77_hash(data);
157 
158 	    /*
159 	     * Look the hash up in the corresponding hash chain and see
160 	     * what we can find.
161 	     */
162 	    nmatch = 0;
163 	    for (off = st->hashtab[hash].first;
164 		 off != INVALID; off = st->win[off].next) {
165 		/* distance = 1       if off == st->winpos-1 */
166 		/* distance = winsize if off == st->winpos   */
167 		distance = st->winsize -
168                     (off + st->winsize - st->winpos) % st->winsize;
169 		for (i = 0; i < HASHCHARS; i++)
170 		    if (CHARAT(i) != CHARAT(i - distance))
171 			break;
172 		if (i == HASHCHARS) {
173 		    matches[nmatch].distance = distance;
174 		    matches[nmatch].len = 3;
175 		    if (++nmatch >= MAXMATCH)
176 			break;
177 		}
178 	    }
179 	} else {
180 	    nmatch = 0;
181 	    hash = INVALID;
182 	}
183 
184 	if (nmatch > 0) {
185 	    /*
186 	     * We've now filled up matches[] with nmatch potential
187 	     * matches. Follow them down to find the longest. (We
188 	     * assume here that it's always worth favouring a
189 	     * longer match over a shorter one.)
190 	     */
191 	    matchlen = HASHCHARS;
192 	    while (matchlen < len) {
193 		int j;
194 		for (i = j = 0; i < nmatch; i++) {
195 		    if (CHARAT(matchlen) ==
196 			CHARAT(matchlen - matches[i].distance)) {
197 			matches[j++] = matches[i];
198 		    }
199 		}
200 		if (j == 0)
201 		    break;
202 		matchlen++;
203 		nmatch = j;
204 	    }
205 
206 	    /*
207 	     * We've now got all the longest matches. We favour the
208 	     * shorter distances, which means we go with matches[0].
209 	     * So see if we want to defer it or throw it away.
210 	     */
211 	    matches[0].len = matchlen;
212 	    if (defermatch.len > 0) {
213 		if (matches[0].len > defermatch.len + 1) {
214 		    /* We have a better match. Emit the deferred char,
215 		     * and defer this match. */
216 		    ctx->literal(ctx, (unsigned char) deferchr);
217 		    defermatch = matches[0];
218 		    deferchr = data[0];
219 		    advance = 1;
220 		} else {
221 		    /* We don't have a better match. Do the deferred one. */
222 		    ctx->match(ctx, defermatch.distance, defermatch.len);
223 		    advance = defermatch.len - 1;
224 		    defermatch.len = 0;
225 		}
226 	    } else {
227 		/* There was no deferred match. Defer this one. */
228 		defermatch = matches[0];
229 		deferchr = data[0];
230 		advance = 1;
231 	    }
232 	} else {
233 	    /*
234 	     * We found no matches. Emit the deferred match, if
235 	     * any; otherwise emit a literal.
236 	     */
237 	    if (defermatch.len > 0) {
238 		ctx->match(ctx, defermatch.distance, defermatch.len);
239 		advance = defermatch.len - 1;
240 		defermatch.len = 0;
241 	    } else {
242 		ctx->literal(ctx, data[0]);
243 		advance = 1;
244 	    }
245 	}
246 
247 	/*
248 	 * Now advance the position by `advance' characters,
249 	 * keeping the window and hash chains consistent.
250 	 */
251 	while (advance > 0) {
252 	    if (len >= HASHCHARS) {
253 		lz77_advance(st, *data, lz77_hash(data));
254 	    } else {
255 		st->pending[st->npending++] = *data;
256 	    }
257 	    data++;
258 	    len--;
259 	    advance--;
260 	}
261     }
262 }
263 
264