1 /*
2 * huffman.c: implementation of huffman.h.
3 */
4
5 #include <assert.h>
6
7 #include "halibut.h"
8 #include "huffman.h"
9
10 static const unsigned fibonacci[] = {
11 0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597,
12 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418,
13 317811, 514229, 832040, 1346269, 2178309, 3524578, 5702887, 9227465,
14 14930352, 24157817, 39088169, 63245986, 102334155, 165580141, 267914296,
15 433494437, 701408733, 1134903170, 1836311903, 2971215073,
16 };
17
18 /*
19 * Binary heap functions used by buildhuf(). Each one assumes the
20 * heap to be stored in an array of ints, with two ints per node
21 * (user data and key). They take in the old heap length, and
22 * return the new one.
23 */
24 #define HEAPPARENT(x) (((x)-2)/4*2)
25 #define HEAPLEFT(x) ((x)*2+2)
26 #define HEAPRIGHT(x) ((x)*2+4)
addheap(int * heap,int len,int userdata,int key)27 static int addheap(int *heap, int len, int userdata, int key)
28 {
29 int me, dad, tmp;
30
31 me = len;
32 heap[len++] = userdata;
33 heap[len++] = key;
34
35 while (me > 0) {
36 dad = HEAPPARENT(me);
37 if (heap[me+1] < heap[dad+1]) {
38 tmp = heap[me]; heap[me] = heap[dad]; heap[dad] = tmp;
39 tmp = heap[me+1]; heap[me+1] = heap[dad+1]; heap[dad+1] = tmp;
40 me = dad;
41 } else
42 break;
43 }
44
45 return len;
46 }
rmheap(int * heap,int len,int * userdata,int * key)47 static int rmheap(int *heap, int len, int *userdata, int *key)
48 {
49 int me, lc, rc, c, tmp;
50
51 len -= 2;
52 *userdata = heap[0];
53 *key = heap[1];
54 heap[0] = heap[len];
55 heap[1] = heap[len+1];
56
57 me = 0;
58
59 while (1) {
60 lc = HEAPLEFT(me);
61 rc = HEAPRIGHT(me);
62 if (lc >= len)
63 break;
64 else if (rc >= len || heap[lc+1] < heap[rc+1])
65 c = lc;
66 else
67 c = rc;
68 if (heap[me+1] > heap[c+1]) {
69 tmp = heap[me]; heap[me] = heap[c]; heap[c] = tmp;
70 tmp = heap[me+1]; heap[me+1] = heap[c+1]; heap[c+1] = tmp;
71 } else
72 break;
73 me = c;
74 }
75
76 return len;
77 }
78
79 struct hufscratch {
80 int *parent, *length, *heap;
81 };
82
83 /*
84 * The core of the Huffman algorithm: takes an input array of
85 * symbol frequencies, and produces an output array of code
86 * lengths.
87 *
88 * We cap the output code lengths to fit in an unsigned char (which is
89 * safe since our clients will impose some smaller limit on code
90 * length anyway). So if you see 255 in the output, it means '255 or
91 * more' and is a sign that whatever limit you really wanted has
92 * certainly been overflowed.
93 */
buildhuf(struct hufscratch * sc,const int * freqs,unsigned char * lengths,int nsyms)94 static void buildhuf(struct hufscratch *sc, const int *freqs,
95 unsigned char *lengths, int nsyms)
96 {
97 int heapsize;
98 int i, j, n;
99 int si, sj;
100
101 for (i = 0; i < nsyms; i++)
102 sc->parent[i] = 0;
103
104 /*
105 * Begin by building the heap.
106 */
107 heapsize = 0;
108 for (i = 0; i < nsyms; i++)
109 if (freqs[i] > 0) /* leave unused symbols out totally */
110 heapsize = addheap(sc->heap, heapsize, i, freqs[i]);
111
112 /*
113 * Now repeatedly take two elements off the heap and merge
114 * them.
115 */
116 n = nsyms;
117 while (heapsize > 2) {
118 heapsize = rmheap(sc->heap, heapsize, &i, &si);
119 heapsize = rmheap(sc->heap, heapsize, &j, &sj);
120 sc->parent[i] = n;
121 sc->parent[j] = n;
122 heapsize = addheap(sc->heap, heapsize, n, si + sj);
123 n++;
124 }
125
126 /*
127 * Now we have our tree, in the form of a link from each node
128 * to the index of its parent. Count back down the tree to
129 * determine the code lengths.
130 */
131 for (i = 0; i < 2*nsyms+1; i++)
132 sc->length[i] = 0;
133 /* The tree root has length 0 after that, which is correct. */
134 for (i = n-1; i-- ;)
135 if (sc->parent[i] > 0)
136 sc->length[i] = 1 + sc->length[sc->parent[i]];
137
138 /*
139 * And that's it. (Simple, wasn't it?) Copy the lengths into
140 * the output array and leave.
141 *
142 * Here we cap lengths to fit in unsigned char.
143 */
144 for (i = 0; i < nsyms; i++)
145 lengths[i] = (sc->length[i] > 255 ? 255 : sc->length[i]);
146 }
147
148 /*
149 * Wrapper around buildhuf() which enforces the restriction on code
150 * length.
151 */
build_huffman_tree(int * freqs,unsigned char * lengths,int nsyms,int limit)152 void build_huffman_tree(int *freqs, unsigned char *lengths,
153 int nsyms, int limit)
154 {
155 struct hufscratch hsc, *sc = &hsc;
156 int smallestfreq, totalfreq, nactivesyms;
157 int num, denom, adjust;
158 int i;
159 int maxprob;
160
161 sc->parent = snewn(2*nsyms+1, int);
162 sc->length = snewn(2*nsyms+1, int);
163 sc->heap = snewn(2*nsyms, int);
164
165 /*
166 * Nasty special case: if the frequency table has fewer than
167 * two non-zero elements, we must invent some, because we can't
168 * have fewer than one bit encoding a symbol.
169 */
170 assert(nsyms >= 2);
171 {
172 int count = 0;
173 for (i = 0; i < nsyms; i++)
174 if (freqs[i] > 0)
175 count++;
176 if (count < 2) {
177 for (i = 0; i < nsyms && count > 0; i++)
178 if (freqs[i] == 0) {
179 freqs[i] = 1;
180 count--;
181 }
182 }
183 }
184
185 /*
186 * First, try building the Huffman table the normal way. If
187 * this works, it's optimal, so we don't want to mess with it.
188 */
189 buildhuf(sc, freqs, lengths, nsyms);
190
191 for (i = 0; i < nsyms; i++)
192 if (lengths[i] > limit)
193 break;
194
195 if (i == nsyms)
196 goto cleanup; /* OK */
197
198 /*
199 * The Huffman algorithm can only ever generate a code length
200 * of N bits or more if there is a symbol whose probability is
201 * less than the reciprocal of the (N+2)th Fibonacci number
202 * (counting from F_0=0 and F_1=1), i.e. 1/2584 for N=16, or
203 * 1/55 for N=8. (This is a necessary though not sufficient
204 * condition.)
205 *
206 * Why is this? Well, consider the input symbol with the
207 * smallest probability. Let that probability be x. In order
208 * for this symbol to have a code length of at least 1, the
209 * Huffman algorithm will have to merge it with some other
210 * node; and since x is the smallest probability, the node it
211 * gets merged with must be at least x. Thus, the probability
212 * of the resulting combined node will be at least 2x. Now in
213 * order for our node to reach depth 2, this 2x-node must be
214 * merged again. But what with? We can't assume the node it
215 * merges with is at least 2x, because this one might only be
216 * the _second_ smallest remaining node. But we do know the
217 * node it merges with must be at least x, so our order-2
218 * internal node is at least 3x.
219 *
220 * How small a node can merge with _that_ to get an order-3
221 * internal node? Well, it must be at least 2x, because if it
222 * was smaller than that then it would have been one of the two
223 * smallest nodes in the previous step and been merged at that
224 * point. So at least 3x, plus at least 2x, comes to at least
225 * 5x for an order-3 node.
226 *
227 * And so it goes on: at every stage we must merge our current
228 * node with a node at least as big as the bigger of this one's
229 * two parents, and from this starting point that gives rise to
230 * the Fibonacci sequence. So we find that in order to have a
231 * node n levels deep (i.e. a maximum code length of n), the
232 * overall probability of the root of the entire tree must be
233 * at least F_{n+2} times the probability of the rarest symbol.
234 * In other words, since the overall probability is 1, it is a
235 * necessary condition for a code length of 16 or more that
236 * there must be at least one symbol with probability <=
237 * 1/F_18.
238 *
239 * (To demonstrate that a probability this big really can give
240 * rise to a code length of 16, consider the set of input
241 * frequencies { 1-epsilon, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55,
242 * 89, 144, 233, 377, 610, 987 }, for arbitrarily small
243 * epsilon.)
244 *
245 * So here buildhuf() has returned us an overlong code. So to
246 * ensure it doesn't do it again, we add a constant to all the
247 * (non-zero) symbol frequencies, causing them to become more
248 * balanced and removing the danger. We can then feed the
249 * results back to the standard buildhuf() and be
250 * assert()-level confident that the resulting code lengths
251 * contain nothing outside the permitted range.
252 */
253 assert(limit+3 < (int)lenof(fibonacci));
254 maxprob = fibonacci[limit+3];
255 totalfreq = nactivesyms = 0;
256 smallestfreq = -1;
257 for (i = 0; i < nsyms; i++) {
258 if (freqs[i] == 0)
259 continue;
260 if (smallestfreq < 0 || smallestfreq > freqs[i])
261 smallestfreq = freqs[i];
262 totalfreq += freqs[i];
263 nactivesyms++;
264 }
265 assert(smallestfreq <= totalfreq / maxprob);
266
267 /*
268 * We want to find the smallest integer `adjust' such that
269 * (totalfreq + nactivesyms * adjust) / (smallestfreq +
270 * adjust) is less than maxprob. A bit of algebra tells us
271 * that the threshold value is equal to
272 *
273 * totalfreq - maxprob * smallestfreq
274 * ----------------------------------
275 * maxprob - nactivesyms
276 *
277 * rounded up, of course. And we'll only even be trying
278 * this if
279 */
280 num = totalfreq - smallestfreq * maxprob;
281 denom = maxprob - nactivesyms;
282 adjust = (num + denom - 1) / denom;
283
284 /*
285 * Now add `adjust' to all the input symbol frequencies.
286 */
287 for (i = 0; i < nsyms; i++)
288 if (freqs[i] != 0)
289 freqs[i] += adjust;
290
291 /*
292 * Rebuild the Huffman tree...
293 */
294 buildhuf(sc, freqs, lengths, nsyms);
295
296 /*
297 * ... and this time it ought to be OK.
298 */
299 for (i = 0; i < nsyms; i++)
300 assert(lengths[i] <= limit);
301
302 cleanup:
303 /*
304 * Finally, free our scratch space.
305 */
306 sfree(sc->parent);
307 sfree(sc->length);
308 sfree(sc->heap);
309 }
310
311 #define MAXCODELEN 31 /* codes must fit in an int */
312
compute_huffman_codes(const unsigned char * lengths,int * codes,int nsyms)313 int compute_huffman_codes(const unsigned char *lengths, int *codes, int nsyms)
314 {
315 unsigned count[MAXCODELEN], startcode[MAXCODELEN], code;
316 int maxlen, i;
317
318 /* Count the codes of each length. */
319 maxlen = 0;
320 for (i = 1; i < MAXCODELEN; i++)
321 count[i] = 0;
322 for (i = 0; i < nsyms; i++) {
323 count[lengths[i]]++;
324 if (maxlen < lengths[i])
325 maxlen = lengths[i];
326 }
327
328 /* Determine the starting code for each length block. */
329 code = 0;
330 for (i = 1; i < MAXCODELEN; i++) {
331 startcode[i] = code;
332 code += count[i];
333 if (code > (1U << i))
334 maxlen = -1; /* overcommitted */
335 code <<= 1;
336 }
337 if (code < (1U << MAXCODELEN))
338 maxlen = -2; /* undercommitted */
339
340 /* Determine the code for each symbol. */
341 for (i = 0; i < nsyms; i++)
342 codes[i] = startcode[lengths[i]]++;
343
344 return maxlen;
345 }
346