1 /*
2 * lz77.c: common LZ77 compression code between Deflate and LZX.
3 */
4
5 #include "halibut.h" /* only for snew, sfree etc */
6 #include "lz77.h"
7
8 /*
9 * Modifiable parameters.
10 */
11 #define HASHMAX 2039 /* one more than max hash value */
12 #define MAXMATCH 32 /* how many matches we track */
13 #define HASHCHARS 3 /* how many chars make a hash */
14
15 /*
16 * This compressor takes a less slapdash approach than the
17 * gzip/zlib one. Rather than allowing our hash chains to fall into
18 * disuse near the far end, we keep them doubly linked so we can
19 * _find_ the far end, and then every time we add a new byte to the
20 * window (thus rolling round by one and removing the previous
21 * byte), we can carefully remove the hash chain entry.
22 */
23
24 #define INVALID -1 /* invalid hash _and_ invalid offset */
25 struct WindowEntry {
26 short next, prev; /* array indices within the window */
27 short hashval;
28 };
29
30 struct HashEntry {
31 short first; /* window index of first in chain */
32 };
33
34 struct Match {
35 int distance, len;
36 };
37
38 struct LZ77InternalContext {
39 int winsize;
40 struct WindowEntry *win; /* [winsize] */
41 unsigned char *data; /* [winsize] */
42
43 int winpos;
44 struct HashEntry hashtab[HASHMAX];
45 unsigned char pending[HASHCHARS];
46 int npending;
47 };
48
lz77_hash(const unsigned char * data)49 static int lz77_hash(const unsigned char *data)
50 {
51 return (257 * data[0] + 263 * data[1] + 269 * data[2]) % HASHMAX;
52 }
53
lz77_init(struct LZ77Context * ctx,int winsize)54 void lz77_init(struct LZ77Context *ctx, int winsize)
55 {
56 struct LZ77InternalContext *st;
57 int i;
58
59 st = snew(struct LZ77InternalContext);
60
61 ctx->ictx = st;
62
63 st->winsize = winsize;
64 st->win = snewn(st->winsize, struct WindowEntry);
65 st->data = snewn(st->winsize, unsigned char);
66
67 for (i = 0; i < st->winsize; i++)
68 st->win[i].next = st->win[i].prev = st->win[i].hashval = INVALID;
69 for (i = 0; i < HASHMAX; i++)
70 st->hashtab[i].first = INVALID;
71 st->winpos = 0;
72
73 st->npending = 0;
74 }
75
lz77_cleanup(struct LZ77Context * ctx)76 void lz77_cleanup(struct LZ77Context *ctx)
77 {
78 struct LZ77InternalContext *st = ctx->ictx;
79 sfree(st->win);
80 sfree(st->data);
81 sfree(st);
82 }
83
lz77_advance(struct LZ77InternalContext * st,unsigned char c,int hash)84 static void lz77_advance(struct LZ77InternalContext *st,
85 unsigned char c, int hash)
86 {
87 int off;
88
89 /*
90 * Remove the hash entry at winpos from the tail of its chain,
91 * or empty the chain if it's the only thing on the chain.
92 */
93 if (st->win[st->winpos].prev != INVALID) {
94 st->win[st->win[st->winpos].prev].next = INVALID;
95 } else if (st->win[st->winpos].hashval != INVALID) {
96 st->hashtab[st->win[st->winpos].hashval].first = INVALID;
97 }
98
99 /*
100 * Create a new entry at winpos and add it to the head of its
101 * hash chain.
102 */
103 st->win[st->winpos].hashval = hash;
104 st->win[st->winpos].prev = INVALID;
105 off = st->win[st->winpos].next = st->hashtab[hash].first;
106 st->hashtab[hash].first = st->winpos;
107 if (off != INVALID)
108 st->win[off].prev = st->winpos;
109 st->data[st->winpos] = c;
110
111 /*
112 * Advance the window pointer.
113 */
114 st->winpos = (st->winpos + 1) % st->winsize;
115 }
116
117 #define CHARAT(k) ( (k)<0 ? st->data[(st->winpos+k)%st->winsize] : data[k] )
118
lz77_compress(struct LZ77Context * ctx,const unsigned char * data,int len,int compress)119 void lz77_compress(struct LZ77Context *ctx,
120 const unsigned char *data, int len, int compress)
121 {
122 struct LZ77InternalContext *st = ctx->ictx;
123 int i, hash, distance, off, nmatch, matchlen, advance;
124 struct Match defermatch, matches[MAXMATCH];
125 int deferchr;
126
127 /*
128 * Add any pending characters from last time to the window. (We
129 * might not be able to.)
130 */
131 for (i = 0; i < st->npending; i++) {
132 unsigned char foo[HASHCHARS];
133 int j;
134 if (len + st->npending - i < HASHCHARS) {
135 /* Update the pending array. */
136 for (j = i; j < st->npending; j++)
137 st->pending[j - i] = st->pending[j];
138 break;
139 }
140 for (j = 0; j < HASHCHARS; j++)
141 foo[j] = (i + j < st->npending ? st->pending[i + j] :
142 data[i + j - st->npending]);
143 lz77_advance(st, foo[0], lz77_hash(foo));
144 }
145 st->npending -= i;
146
147 defermatch.len = 0;
148 deferchr = '\0';
149 while (len > 0) {
150
151 /* Don't even look for a match, if we're not compressing. */
152 if (compress && len >= HASHCHARS) {
153 /*
154 * Hash the next few characters.
155 */
156 hash = lz77_hash(data);
157
158 /*
159 * Look the hash up in the corresponding hash chain and see
160 * what we can find.
161 */
162 nmatch = 0;
163 for (off = st->hashtab[hash].first;
164 off != INVALID; off = st->win[off].next) {
165 /* distance = 1 if off == st->winpos-1 */
166 /* distance = winsize if off == st->winpos */
167 distance = st->winsize -
168 (off + st->winsize - st->winpos) % st->winsize;
169 for (i = 0; i < HASHCHARS; i++)
170 if (CHARAT(i) != CHARAT(i - distance))
171 break;
172 if (i == HASHCHARS) {
173 matches[nmatch].distance = distance;
174 matches[nmatch].len = 3;
175 if (++nmatch >= MAXMATCH)
176 break;
177 }
178 }
179 } else {
180 nmatch = 0;
181 hash = INVALID;
182 }
183
184 if (nmatch > 0) {
185 /*
186 * We've now filled up matches[] with nmatch potential
187 * matches. Follow them down to find the longest. (We
188 * assume here that it's always worth favouring a
189 * longer match over a shorter one.)
190 */
191 matchlen = HASHCHARS;
192 while (matchlen < len) {
193 int j;
194 for (i = j = 0; i < nmatch; i++) {
195 if (CHARAT(matchlen) ==
196 CHARAT(matchlen - matches[i].distance)) {
197 matches[j++] = matches[i];
198 }
199 }
200 if (j == 0)
201 break;
202 matchlen++;
203 nmatch = j;
204 }
205
206 /*
207 * We've now got all the longest matches. We favour the
208 * shorter distances, which means we go with matches[0].
209 * So see if we want to defer it or throw it away.
210 */
211 matches[0].len = matchlen;
212 if (defermatch.len > 0) {
213 if (matches[0].len > defermatch.len + 1) {
214 /* We have a better match. Emit the deferred char,
215 * and defer this match. */
216 ctx->literal(ctx, (unsigned char) deferchr);
217 defermatch = matches[0];
218 deferchr = data[0];
219 advance = 1;
220 } else {
221 /* We don't have a better match. Do the deferred one. */
222 ctx->match(ctx, defermatch.distance, defermatch.len);
223 advance = defermatch.len - 1;
224 defermatch.len = 0;
225 }
226 } else {
227 /* There was no deferred match. Defer this one. */
228 defermatch = matches[0];
229 deferchr = data[0];
230 advance = 1;
231 }
232 } else {
233 /*
234 * We found no matches. Emit the deferred match, if
235 * any; otherwise emit a literal.
236 */
237 if (defermatch.len > 0) {
238 ctx->match(ctx, defermatch.distance, defermatch.len);
239 advance = defermatch.len - 1;
240 defermatch.len = 0;
241 } else {
242 ctx->literal(ctx, data[0]);
243 advance = 1;
244 }
245 }
246
247 /*
248 * Now advance the position by `advance' characters,
249 * keeping the window and hash chains consistent.
250 */
251 while (advance > 0) {
252 if (len >= HASHCHARS) {
253 lz77_advance(st, *data, lz77_hash(data));
254 } else {
255 st->pending[st->npending++] = *data;
256 }
257 data++;
258 len--;
259 advance--;
260 }
261 }
262 }
263
264