1 /*
2  * huffman.c: implementation of huffman.h.
3  */
4 
5 #include <assert.h>
6 
7 #include "halibut.h"
8 #include "huffman.h"
9 
10 static const unsigned fibonacci[] = {
11     0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597,
12     2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418,
13     317811, 514229, 832040, 1346269, 2178309, 3524578, 5702887, 9227465,
14     14930352, 24157817, 39088169, 63245986, 102334155, 165580141, 267914296,
15     433494437, 701408733, 1134903170, 1836311903, 2971215073,
16 };
17 
18 /*
19  * Binary heap functions used by buildhuf(). Each one assumes the
20  * heap to be stored in an array of ints, with two ints per node
21  * (user data and key). They take in the old heap length, and
22  * return the new one.
23  */
24 #define HEAPPARENT(x) (((x)-2)/4*2)
25 #define HEAPLEFT(x) ((x)*2+2)
26 #define HEAPRIGHT(x) ((x)*2+4)
addheap(int * heap,int len,int userdata,int key)27 static int addheap(int *heap, int len, int userdata, int key)
28 {
29     int me, dad, tmp;
30 
31     me = len;
32     heap[len++] = userdata;
33     heap[len++] = key;
34 
35     while (me > 0) {
36 	dad = HEAPPARENT(me);
37 	if (heap[me+1] < heap[dad+1]) {
38 	    tmp = heap[me]; heap[me] = heap[dad]; heap[dad] = tmp;
39 	    tmp = heap[me+1]; heap[me+1] = heap[dad+1]; heap[dad+1] = tmp;
40 	    me = dad;
41 	} else
42 	    break;
43     }
44 
45     return len;
46 }
rmheap(int * heap,int len,int * userdata,int * key)47 static int rmheap(int *heap, int len, int *userdata, int *key)
48 {
49     int me, lc, rc, c, tmp;
50 
51     len -= 2;
52     *userdata = heap[0];
53     *key = heap[1];
54     heap[0] = heap[len];
55     heap[1] = heap[len+1];
56 
57     me = 0;
58 
59     while (1) {
60 	lc = HEAPLEFT(me);
61 	rc = HEAPRIGHT(me);
62 	if (lc >= len)
63 	    break;
64 	else if (rc >= len || heap[lc+1] < heap[rc+1])
65 	    c = lc;
66 	else
67 	    c = rc;
68 	if (heap[me+1] > heap[c+1]) {
69 	    tmp = heap[me]; heap[me] = heap[c]; heap[c] = tmp;
70 	    tmp = heap[me+1]; heap[me+1] = heap[c+1]; heap[c+1] = tmp;
71 	} else
72 	    break;
73 	me = c;
74     }
75 
76     return len;
77 }
78 
79 struct hufscratch {
80     int *parent, *length, *heap;
81 };
82 
83 /*
84  * The core of the Huffman algorithm: takes an input array of
85  * symbol frequencies, and produces an output array of code
86  * lengths.
87  *
88  * We cap the output code lengths to fit in an unsigned char (which is
89  * safe since our clients will impose some smaller limit on code
90  * length anyway). So if you see 255 in the output, it means '255 or
91  * more' and is a sign that whatever limit you really wanted has
92  * certainly been overflowed.
93  */
buildhuf(struct hufscratch * sc,const int * freqs,unsigned char * lengths,int nsyms)94 static void buildhuf(struct hufscratch *sc, const int *freqs,
95                      unsigned char *lengths, int nsyms)
96 {
97     int heapsize;
98     int i, j, n;
99     int si, sj;
100 
101     for (i = 0; i < nsyms; i++)
102         sc->parent[i] = 0;
103 
104     /*
105      * Begin by building the heap.
106      */
107     heapsize = 0;
108     for (i = 0; i < nsyms; i++)
109 	if (freqs[i] > 0)	       /* leave unused symbols out totally */
110 	    heapsize = addheap(sc->heap, heapsize, i, freqs[i]);
111 
112     /*
113      * Now repeatedly take two elements off the heap and merge
114      * them.
115      */
116     n = nsyms;
117     while (heapsize > 2) {
118 	heapsize = rmheap(sc->heap, heapsize, &i, &si);
119 	heapsize = rmheap(sc->heap, heapsize, &j, &sj);
120 	sc->parent[i] = n;
121 	sc->parent[j] = n;
122 	heapsize = addheap(sc->heap, heapsize, n, si + sj);
123 	n++;
124     }
125 
126     /*
127      * Now we have our tree, in the form of a link from each node
128      * to the index of its parent. Count back down the tree to
129      * determine the code lengths.
130      */
131     for (i = 0; i < 2*nsyms+1; i++)
132         sc->length[i] = 0;
133     /* The tree root has length 0 after that, which is correct. */
134     for (i = n-1; i-- ;)
135 	if (sc->parent[i] > 0)
136 	    sc->length[i] = 1 + sc->length[sc->parent[i]];
137 
138     /*
139      * And that's it. (Simple, wasn't it?) Copy the lengths into
140      * the output array and leave.
141      *
142      * Here we cap lengths to fit in unsigned char.
143      */
144     for (i = 0; i < nsyms; i++)
145 	lengths[i] = (sc->length[i] > 255 ? 255 : sc->length[i]);
146 }
147 
148 /*
149  * Wrapper around buildhuf() which enforces the restriction on code
150  * length.
151  */
build_huffman_tree(int * freqs,unsigned char * lengths,int nsyms,int limit)152 void build_huffman_tree(int *freqs, unsigned char *lengths,
153                         int nsyms, int limit)
154 {
155     struct hufscratch hsc, *sc = &hsc;
156     int smallestfreq, totalfreq, nactivesyms;
157     int num, denom, adjust;
158     int i;
159     int maxprob;
160 
161     sc->parent = snewn(2*nsyms+1, int);
162     sc->length = snewn(2*nsyms+1, int);
163     sc->heap = snewn(2*nsyms, int);
164 
165     /*
166      * Nasty special case: if the frequency table has fewer than
167      * two non-zero elements, we must invent some, because we can't
168      * have fewer than one bit encoding a symbol.
169      */
170     assert(nsyms >= 2);
171     {
172 	int count = 0;
173 	for (i = 0; i < nsyms; i++)
174 	    if (freqs[i] > 0)
175 		count++;
176 	if (count < 2) {
177 	    for (i = 0; i < nsyms && count > 0; i++)
178 		if (freqs[i] == 0) {
179 		    freqs[i] = 1;
180 		    count--;
181 		}
182 	}
183     }
184 
185     /*
186      * First, try building the Huffman table the normal way. If
187      * this works, it's optimal, so we don't want to mess with it.
188      */
189     buildhuf(sc, freqs, lengths, nsyms);
190 
191     for (i = 0; i < nsyms; i++)
192 	if (lengths[i] > limit)
193 	    break;
194 
195     if (i == nsyms)
196         goto cleanup;                  /* OK */
197 
198     /*
199      * The Huffman algorithm can only ever generate a code length
200      * of N bits or more if there is a symbol whose probability is
201      * less than the reciprocal of the (N+2)th Fibonacci number
202      * (counting from F_0=0 and F_1=1), i.e. 1/2584 for N=16, or
203      * 1/55 for N=8. (This is a necessary though not sufficient
204      * condition.)
205      *
206      * Why is this? Well, consider the input symbol with the
207      * smallest probability. Let that probability be x. In order
208      * for this symbol to have a code length of at least 1, the
209      * Huffman algorithm will have to merge it with some other
210      * node; and since x is the smallest probability, the node it
211      * gets merged with must be at least x. Thus, the probability
212      * of the resulting combined node will be at least 2x. Now in
213      * order for our node to reach depth 2, this 2x-node must be
214      * merged again. But what with? We can't assume the node it
215      * merges with is at least 2x, because this one might only be
216      * the _second_ smallest remaining node. But we do know the
217      * node it merges with must be at least x, so our order-2
218      * internal node is at least 3x.
219      *
220      * How small a node can merge with _that_ to get an order-3
221      * internal node? Well, it must be at least 2x, because if it
222      * was smaller than that then it would have been one of the two
223      * smallest nodes in the previous step and been merged at that
224      * point. So at least 3x, plus at least 2x, comes to at least
225      * 5x for an order-3 node.
226      *
227      * And so it goes on: at every stage we must merge our current
228      * node with a node at least as big as the bigger of this one's
229      * two parents, and from this starting point that gives rise to
230      * the Fibonacci sequence. So we find that in order to have a
231      * node n levels deep (i.e. a maximum code length of n), the
232      * overall probability of the root of the entire tree must be
233      * at least F_{n+2} times the probability of the rarest symbol.
234      * In other words, since the overall probability is 1, it is a
235      * necessary condition for a code length of 16 or more that
236      * there must be at least one symbol with probability <=
237      * 1/F_18.
238      *
239      * (To demonstrate that a probability this big really can give
240      * rise to a code length of 16, consider the set of input
241      * frequencies { 1-epsilon, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55,
242      * 89, 144, 233, 377, 610, 987 }, for arbitrarily small
243      * epsilon.)
244      *
245      * So here buildhuf() has returned us an overlong code. So to
246      * ensure it doesn't do it again, we add a constant to all the
247      * (non-zero) symbol frequencies, causing them to become more
248      * balanced and removing the danger. We can then feed the
249      * results back to the standard buildhuf() and be
250      * assert()-level confident that the resulting code lengths
251      * contain nothing outside the permitted range.
252      */
253     assert(limit+3 < (int)lenof(fibonacci));
254     maxprob = fibonacci[limit+3];
255     totalfreq = nactivesyms = 0;
256     smallestfreq = -1;
257     for (i = 0; i < nsyms; i++) {
258 	if (freqs[i] == 0)
259 	    continue;
260 	if (smallestfreq < 0 || smallestfreq > freqs[i])
261 	    smallestfreq = freqs[i];
262 	totalfreq += freqs[i];
263 	nactivesyms++;
264     }
265     assert(smallestfreq <= totalfreq / maxprob);
266 
267     /*
268      * We want to find the smallest integer `adjust' such that
269      * (totalfreq + nactivesyms * adjust) / (smallestfreq +
270      * adjust) is less than maxprob. A bit of algebra tells us
271      * that the threshold value is equal to
272      *
273      *   totalfreq - maxprob * smallestfreq
274      *   ----------------------------------
275      *          maxprob - nactivesyms
276      *
277      * rounded up, of course. And we'll only even be trying
278      * this if
279      */
280     num = totalfreq - smallestfreq * maxprob;
281     denom = maxprob - nactivesyms;
282     adjust = (num + denom - 1) / denom;
283 
284     /*
285      * Now add `adjust' to all the input symbol frequencies.
286      */
287     for (i = 0; i < nsyms; i++)
288 	if (freqs[i] != 0)
289 	    freqs[i] += adjust;
290 
291     /*
292      * Rebuild the Huffman tree...
293      */
294     buildhuf(sc, freqs, lengths, nsyms);
295 
296     /*
297      * ... and this time it ought to be OK.
298      */
299     for (i = 0; i < nsyms; i++)
300 	assert(lengths[i] <= limit);
301 
302   cleanup:
303     /*
304      * Finally, free our scratch space.
305      */
306     sfree(sc->parent);
307     sfree(sc->length);
308     sfree(sc->heap);
309 }
310 
311 #define MAXCODELEN 31                  /* codes must fit in an int */
312 
compute_huffman_codes(const unsigned char * lengths,int * codes,int nsyms)313 int compute_huffman_codes(const unsigned char *lengths, int *codes, int nsyms)
314 {
315     unsigned count[MAXCODELEN], startcode[MAXCODELEN], code;
316     int maxlen, i;
317 
318     /* Count the codes of each length. */
319     maxlen = 0;
320     for (i = 1; i < MAXCODELEN; i++)
321 	count[i] = 0;
322     for (i = 0; i < nsyms; i++) {
323 	count[lengths[i]]++;
324 	if (maxlen < lengths[i])
325 	    maxlen = lengths[i];
326     }
327 
328     /* Determine the starting code for each length block. */
329     code = 0;
330     for (i = 1; i < MAXCODELEN; i++) {
331 	startcode[i] = code;
332 	code += count[i];
333 	if (code > (1U << i))
334 	    maxlen = -1;	       /* overcommitted */
335 	code <<= 1;
336     }
337     if (code < (1U << MAXCODELEN))
338 	maxlen = -2;		       /* undercommitted */
339 
340     /* Determine the code for each symbol. */
341     for (i = 0; i < nsyms; i++)
342 	codes[i] = startcode[lengths[i]]++;
343 
344     return maxlen;
345 }
346