1 /* Huffman coding, especially for digitized alphabets.
2 *
3 * Contents:
4 * 1. The ESL_HUFFMAN object
5 * 2. Huffman encoding
6 * 3. Huffman decoding
7 * 4. Debugging, development
8 * 5. Internal function, components of creating huffman codes
9 * 6. Example driver
10 *
11 * Useful emacs gdb tricks for displaying bit field v:
12 * p /t v (no leading zeros, beware!)
13 * x &v
14 */
15 #include "esl_config.h"
16
17 #include <stdio.h>
18
19 #include "easel.h"
20 #include "esl_quicksort.h"
21
22 #include "esl_huffman.h"
23
24
25 /* Declarations of stuff in internal functions/structures section */
26 struct hufftree_s {
27 float val; // Sum of frequencies of all leaves under this node
28 int depth; // Depth of node
29 int left; // index of left child in array of tree nodes (0..N-2; 0 is the root)
30 int right; // "" for right child
31 };
32 static int sort_floats_decreasing(const void *data, int e1, int e2);
33 static int sort_canonical (const void *data, int e1, int e2);
34 static int huffman_tree (ESL_HUFFMAN *hc, struct hufftree_s *htree, const float *fq);
35 static int huffman_codelengths (ESL_HUFFMAN *hc, struct hufftree_s *htree, const float *fq);
36 static int huffman_canonize (ESL_HUFFMAN *hc);
37 static int huffman_decoding_table(ESL_HUFFMAN *hc);
38 static void dump_uint32(FILE *fp, uint32_t v, int L);
39
40 static void huffman_pack(uint32_t *X, int *ip, int *ap, uint32_t code, int L);
41 static void huffman_unpack(const ESL_HUFFMAN *hc, uint32_t *vp, const uint32_t *X, int n, int *ip, int *ap, char *ret_x, int *ret_L);
42
43
44
45 /*****************************************************************
46 * 1. The ESL_HUFFMAN object
47 *****************************************************************/
48
49 /* Function: esl_huffman_Build()
50 * Synopsis: Build a new Huffman code.
51 * Incept: SRE, Thu Nov 12 11:08:09 2015
52 *
53 * Purpose: Build a canonical Huffman code for observed symbol
54 * frequencies <fq[0..K]> for <K> possible symbols.
55 * Frequencies can be counts, or normalized probabilities;
56 * all that matters is their relative magnitude (and that
57 * they're $\geq 0$).
58 *
59 * If you're encoding an Easel digital alphabet, you want
60 * <K = abc->Kp>, inclusive of ambiguity codes, gaps,
61 * missing data, and rare digital codes.
62 *
63 * If you're encoding 7-bit ASCII text, you want K=128, and
64 * the symbols codes are ASCII codes.
65 *
66 * If you're encoding MTF-encoded ASCII text, again you
67 * want K=128 and the "symbol" codes are 0..127 offsets in
68 * the move-to-front encoding.
69 *
70 * If you're encoding an arbitrary symbol table -- a table
71 * of gap lengths, perhaps? -- <K> can be anything.
72 *
73 * Unobserved symbols (with <fq[] = 0>) will not be encoded;
74 * they get a code length of 0, and a code of 0.
75 *
76 * Args: fq - symbol frequencies 0..K-1; sum to 1
77 * K - size of fq (encoded alphabet size)
78 * ret_hc - RETURN: new huffman code object
79 *
80 * Returns: <eslOK> on success, and <*ret_hc> points to the new
81 * <ESL_HUFFMAN> object.
82 *
83 * Throws: <eslEMEM> on allocation error.
84 *
85 * <eslERANGE> if the encoding requires a code length
86 * that exceeds <eslHUFFMAN_MAXCODE>, and won't fit in
87 * a <uint32_t>.
88 */
89 int
esl_huffman_Build(const float * fq,int K,ESL_HUFFMAN ** ret_hc)90 esl_huffman_Build(const float *fq, int K, ESL_HUFFMAN **ret_hc)
91 {
92 ESL_HUFFMAN *hc = NULL;
93 struct hufftree_s *htree = NULL; // only need tree temporarily, during code construction.
94 int i,r;
95 int status;
96
97 ESL_DASSERT1(( fq ));
98 ESL_DASSERT1(( K > 0 ));
99
100 ESL_ALLOC(hc, sizeof(ESL_HUFFMAN));
101 hc->len = NULL;
102 hc->code = NULL;
103 hc->sorted_at = NULL;
104 hc->dt_len = NULL;
105 hc->dt_lcode = NULL;
106 hc->dt_rank = NULL;
107
108 hc->K = K;
109 hc->Ku = 0;
110 hc->D = 0;
111 hc->Lmax = 0;
112
113 ESL_ALLOC(hc->len, sizeof(int) * hc->K);
114 ESL_ALLOC(hc->code, sizeof(uint32_t) * hc->K);
115 ESL_ALLOC(hc->sorted_at, sizeof(int) * hc->K);
116
117 for (i = 0; i < hc->K; i++) hc->len[i] = 0;
118 for (i = 0; i < hc->K; i++) hc->code[i] = 0;
119
120 /* Sort the symbol frequencies, largest to smallest */
121 esl_quicksort(fq, hc->K, sort_floats_decreasing, hc->sorted_at);
122
123 /* Figure out how many are nonzero: that's hc->Ku */
124 for (r = hc->K-1; r >= 0; r--)
125 if (fq[hc->sorted_at[r]] > 0.) break;
126 hc->Ku = r+1;
127
128 ESL_ALLOC(htree, sizeof(struct hufftree_s) * (ESL_MAX(1, hc->Ku-1))); // Ku=1 is ok; avoid zero malloc.
129 if ( (status = huffman_tree (hc, htree, fq)) != eslOK) goto ERROR;
130 if ( (status = huffman_codelengths(hc, htree, fq)) != eslOK) goto ERROR; // can fail eslERANGE on maxlen > 32
131 if ( (status = huffman_canonize (hc)) != eslOK) goto ERROR;
132
133
134 ESL_ALLOC(hc->dt_len, sizeof(int) * hc->D);
135 ESL_ALLOC(hc->dt_lcode, sizeof(uint32_t) * hc->D);
136 ESL_ALLOC(hc->dt_rank, sizeof(int) * hc->D);
137 if ( (status = huffman_decoding_table(hc)) != eslOK) goto ERROR;
138
139 free(htree);
140 *ret_hc = hc;
141 return eslOK;
142
143 ERROR:
144 free(htree);
145 esl_huffman_Destroy(hc);
146 *ret_hc = NULL;
147 return status;
148 }
149
150
151
152 /* Function: esl_huffman_Destroy()
153 * Synopsis: Free an <ESL_HUFFMAN> code.
154 * Incept: SRE, Thu Nov 12 11:07:39 2015
155 */
156 void
esl_huffman_Destroy(ESL_HUFFMAN * hc)157 esl_huffman_Destroy(ESL_HUFFMAN *hc)
158 {
159 if (hc) {
160 free(hc->len);
161 free(hc->code);
162 free(hc->sorted_at);
163 free(hc->dt_len);
164 free(hc->dt_lcode);
165 free(hc->dt_rank);
166 free(hc);
167 }
168 }
169
170
171
172 /*****************************************************************
173 * 2. Encoding
174 *****************************************************************/
175
176
177 /* Function: esl_huffman_Encode()
178 * Synopsis: Encode a string.
179 * Incept: SRE, Thu Jun 2 09:27:43 2016 [Hamilton]
180 *
181 * Purpose: Use Huffman code <hc> to encode the plaintext input <T> of
182 * length <n>. The encoded result <X> consists of <nb> bits,
183 * stored in an array of <nX> <uint32_t>'s; this result is
184 * returned through the pointers <*ret_X>, <*ret_nX>, and
185 * <*ret_nb>.
186 *
187 * The encoded array <X> is allocated here, and must be
188 * free'd by the caller.
189 *
190 * Args: hc - Huffman code to use for encoding
191 * T - plaintext input to encode, [0..n-1]; does not need to be NUL-terminated.
192 * n - length of T
193 * ret_X - RETURN: encoded bit array
194 * ret_nb - RETURN: length of X in bits (nX = nb / 32, rounded up)
195 *
196 * Returns: <eslOK> on success.
197 *
198 * Throws: <eslEMEM> on allocation failure. Now <*ret_X = NULL> and <*ret_nb = 0>.
199 */
200 int
esl_huffman_Encode(const ESL_HUFFMAN * hc,const char * T,int n,uint32_t ** ret_X,int * ret_nb)201 esl_huffman_Encode(const ESL_HUFFMAN *hc, const char *T, int n, uint32_t **ret_X, int *ret_nb)
202 {
203 uint32_t *X = NULL;
204 int xalloc = ESL_MAX(16, (n+15)/16); // current allocation for X, in uint32_t's
205 int pos = 0; // current position in X's uint32_t array
206 int nb;
207 int i;
208 int status;
209
210 ESL_DASSERT1(( hc != NULL ));
211 ESL_DASSERT1(( T != NULL ));
212 ESL_DASSERT1(( n > 0 ));
213
214 ESL_ALLOC(X, sizeof(uint32_t) * xalloc);
215
216 X[0] = 0;
217 nb = 0;
218 for (i = 0; i < n; i++)
219 {
220 huffman_pack(X, &pos, &nb, hc->code[(int) T[i]], hc->len[(int) T[i]]);
221
222 if (pos+1 == xalloc) {
223 xalloc *= 2;
224 ESL_REALLOC(X, sizeof(uint32_t) * xalloc);
225 }
226 }
227
228 *ret_X = X; // X consists of <pos+1> uint32_t's
229 *ret_nb = 32*pos + nb; // ... we return exact # of bits.
230 return eslOK;
231
232 ERROR:
233 *ret_X = NULL;
234 *ret_nb = 0;
235 return status;
236 }
237
238
239
240
241 /*****************************************************************
242 * 3. Decoding
243 *****************************************************************/
244
245
246 /* Function: esl_huffman_Decode()
247 * Synopsis: Decode a bit stream.
248 * Incept: SRE, Thu Jun 2 09:52:46 2016 [Hamilton, Act I]
249 *
250 * Purpose: Use Huffman code <hc> to decode a bit stream <X> of length
251 * <n> integers and <nb> bits. The result is a plaintext
252 * string <T> of length <nT> characters. Return this result
253 * through <*ret_T> and <*ret_nT>.
254 *
255 * The decoded plaintext <T> is allocated here, and must be
256 * free'd by the caller.
257 *
258 * <T> is NUL-terminated, just in case that's useful --
259 * though the caller isn't necessarily going to treat <T>
260 * as a string. (It could be using "symbols" 0..127, which
261 * would include <\0> as a valid symbol.)
262 *
263 * Args: hc - Huffman code to use to decode <X>
264 * X - bit stream to decode
265 * nb - length of <X> in BITS (nX = nb/32, rounded up)
266 * ret_T - RETURN: decoded plaintext string, \0-terminated
267 * ret_n - RETURN: length of <T> in chars
268 *
269 * Returns: <eslOK> on success; <*ret_T> and <*ret_nT> hold the result.
270 *
271 * Throws: <eslEMEM> on allocation failure. Now <*ret_T> is <NULL> and
272 * <*ret_nT> is 0.
273 *
274 * Xref:
275 */
276 int
esl_huffman_Decode(const ESL_HUFFMAN * hc,const uint32_t * X,int nb,char ** ret_T,int * ret_n)277 esl_huffman_Decode(const ESL_HUFFMAN *hc, const uint32_t *X, int nb, char **ret_T, int *ret_n)
278 {
279 char *T = NULL;
280 int allocT; // current allocation for T
281 uint32_t v = X[0]; // current (full) 32 bits we're going to decode in this step
282 int i = 1; // index of X[i] we will first pull *new* bits from, after decoding v
283 int nX = (nb+31)/32; // length of X in uint32_t's: nb/32 rounded up.
284 int a = (nX > 1 ? 32 : 0);
285 int pos = 0;
286 int L; // length of code we just decoded, in bits
287 int status;
288
289 allocT = nX * 4; // an initial guess: 4 bytes per X, maybe 4x compression
290 ESL_ALLOC(T, sizeof(char) * allocT);
291
292 while (nb > 0)
293 {
294 huffman_unpack(hc, &v, X, nX, &i, &a, &(T[pos]), &L);
295 nb -= L;
296
297 if (++pos == allocT) {
298 allocT *= 2;
299 ESL_REALLOC(T, sizeof(char) * allocT);
300 }
301 }
302
303 /* We know we have space for the \0, from how we reallocated. */
304 T[pos] = '\0';
305 *ret_T = T;
306 *ret_n = pos;
307 return eslOK;
308
309 ERROR:
310 *ret_T = NULL;
311 *ret_n = 0;
312 return status;
313 }
314
315
316
317
318
319 /*****************************************************************
320 * 4. Debugging, development
321 *****************************************************************/
322
323 /* Function: esl_huffman_Dump()
324 * Synopsis: Dump info on a huffman code structure.
325 * Incept: SRE, Sat Jun 4 07:38:15 2016
326 *
327 * Purpose: Dump the internals of object <hc> to output stream <fp>.
328 */
329 int
esl_huffman_Dump(FILE * fp,ESL_HUFFMAN * hc)330 esl_huffman_Dump(FILE *fp, ESL_HUFFMAN *hc)
331 {
332 int r,x;
333 int d,L;
334
335 /* Encoding table: <letter index> <code length> <binary encoding> */
336 fprintf(fp, "Encoding table:\n");
337 for (r = 0; r < hc->Ku; r++)
338 {
339 x = hc->sorted_at[r];
340 fprintf(fp, "%3d %2d ", x, hc->len[x]);
341 dump_uint32(fp, hc->code[x], hc->len[x]);
342 fprintf(fp, "\n");
343 }
344 fputc('\n', fp);
345
346
347 /* Decoding table (if set) */
348 if (hc->dt_len)
349 {
350 fprintf(fp, "Decoding table:\n");
351 for (d = 0; d < hc->D; d++)
352 {
353 L = hc->dt_len[d];
354 fprintf(fp, "L=%2d r=%3d (%3d) ", L, hc->dt_rank[d], hc->sorted_at[hc->dt_rank[d]]);
355 dump_uint32(fp, hc->dt_lcode[d], eslHUFFMAN_MAXCODE);
356 fputc('\n', fp);
357 }
358 }
359
360 return eslOK;
361 }
362
363
364
365 /*****************************************************************
366 * 5. Internal functions and structures
367 *****************************************************************/
368
369 /* sort_floats_decreasing()
370 * Sorting function for esl_quicksort(), putting
371 * symbol frequencies in decreasing order.
372 */
373 static int
sort_floats_decreasing(const void * data,int e1,int e2)374 sort_floats_decreasing(const void *data, int e1, int e2)
375 {
376 float *fq = (float *) data;
377 if (fq[e1] > fq[e2]) return -1;
378 if (fq[e1] < fq[e2]) return 1;
379 return 0;
380 }
381
382 /* sort_canonical()
383 * Sorting function for esl_quicksort(), putting symbols into
384 * canonical Huffman order: primarily by ascending code length,
385 * secondarily by ascending symbol code.
386 */
387 static int
sort_canonical(const void * data,int e1,int e2)388 sort_canonical(const void *data, int e1, int e2)
389 {
390 ESL_HUFFMAN *hc = (ESL_HUFFMAN *) data;
391 int L1 = hc->len[e1];
392 int L2 = hc->len[e2];
393
394 if (L2 == 0) return -1; // len=0 means symbol isn't encoded at all, doesn't occur
395 else if (L1 == 0) return 1;
396 else if (L1 < L2) return -1;
397 else if (L1 > L2) return 1;
398 else if (e1 < e2) return -1;
399 else if (e1 > e2) return 1;
400 else return 0;
401 }
402
403 /* Build the Huffman tree, joining nodes/leaves of smallest frequency.
404 * This takes advantage of having the fq[] array sorted, and the fact
405 * that the internal node values also come out sorted... i.e. we don't
406 * have to re-sort, we can always find the smallest leaves/nodes by
407 * looking at the last ones.
408 *
409 * For the Ku=1 edge case, there's no tree, and this no-ops.
410 *
411 * Input:
412 * hc->sorted_at[] lists symbol indices from largest to smallest freq.
413 * hc->Ku is the number of syms w/ nonzero freq; tree has Ku-1 nodes
414 * htree blank, allocated for at least Ku-1 nodes
415 *
416 * Output:
417 * htree's left, right, val fields are filled.
418 *
419 * Returns:
420 * <eslOK> on success.
421 */
422 static int
huffman_tree(ESL_HUFFMAN * hc,struct hufftree_s * htree,const float * fq)423 huffman_tree(ESL_HUFFMAN *hc, struct hufftree_s *htree, const float *fq)
424 {
425 int r = hc->Ku-1; // r = smallest leaf symbol that hasn't been included in tree yet; r+1 = # of leaves left
426 int k = hc->Ku-2; // k = smallest internal node not used as a child yet; k-j = # nodes not used as child yet
427 int j;
428
429 for (j = hc->Ku-2; j >= 0; j--) // j = index of next node we add; we add one per iteration
430 {
431 /* Should we join two leaves?
432 * If we have no internal nodes yet (because we're just starting),
433 * or the two smallest frequencies are <= the smallest unjoined node's value
434 */
435 if ( (j == hc->Ku-2) || (r >= 1 && fq[hc->sorted_at[r]] <= htree[k].val))
436 {
437 htree[j].right = -hc->sorted_at[r]; // leaves are signified by negative indices in tree
438 htree[j].left = -hc->sorted_at[r-1];
439 htree[j].val = fq[hc->sorted_at[r]] + fq[hc->sorted_at[r-1]];
440 r -= 2;
441 }
442
443 /* Or should we join two nodes?
444 * If we have no leaves left,
445 * or (we do have two nodes) and both are smaller than smallest unjoined leaf's value
446 */
447 else if (r == -1 || (k-j >= 2 && htree[k-1].val < fq[hc->sorted_at[r]]))
448 {
449 htree[j].right = k;
450 htree[j].left = k-1;
451 htree[j].val = htree[k].val + htree[k-1].val;
452 k -= 2;
453 }
454
455 /* Otherwise, we join smallest node and smallest leaf. */
456 else
457 {
458 htree[j].right = -hc->sorted_at[r];
459 htree[j].left = k;
460 htree[j].val = fq[hc->sorted_at[r]] + htree[k].val;
461 r--;
462 k--;
463 }
464 }
465 return eslOK;
466 }
467
468
469 /* Calculate code lengths, equal to the depth of each node.
470 * Traverse the tree, calculating depth of each node, starting with
471 * depth 0 for root 0. We don't need a stack for this traversal,
472 * tree is already indexed in traversal order.
473 *
474 * For the Ku=1 edge case, there's no tree; for a single encoded
475 * symbol we set hc->len[0] = 1, hc->Lmax = 1
476 *
477 * Input:
478 * hc->Ku is the number of syms w/ nonzero freqs; tree has Ku-1 nodes.
479 * htree[0..Ku-2] is the constructed Huffman tree, with right/left/val set.
480 * htree[].len has been initialized to 0 for all symbols 0..K
481 *
482 * Output:
483 * htree's depth field is set.
484 * hc->len is set for all encoded symbols (left at 0 for unused symbols)
485 * hc->Lmax is set
486 *
487 * Return:
488 * <eslOK> on success
489 * <eslERANGE> if max code length > eslHUFFMAN_MAXCODE and won't fit in uint32_t
490 */
491 static int
huffman_codelengths(ESL_HUFFMAN * hc,struct hufftree_s * htree,const float * fq)492 huffman_codelengths(ESL_HUFFMAN *hc, struct hufftree_s *htree, const float *fq)
493 {
494 int i;
495
496 if (hc->Ku == 1)
497 {
498 hc->len[ hc->sorted_at[0] ] = 1;
499 hc->Lmax = 1;
500 return eslOK;
501 }
502
503 htree[0].depth = 0;
504 for (i = 0; i < hc->Ku-1; i++)
505 {
506 if (htree[i].right <= 0) hc->len[-htree[i].right] = htree[i].depth + 1;
507 else htree[htree[i].right].depth = htree[i].depth + 1;
508
509 if (htree[i].left <= 0) hc->len[-htree[i].left] = htree[i].depth + 1;
510 else htree[htree[i].left].depth = htree[i].depth + 1;
511 }
512
513 hc->Lmax = 0;
514 for (i = 0; i < hc->K; i++)
515 hc->Lmax = ESL_MAX(hc->len[i], hc->Lmax);
516
517 return (hc->Lmax > eslHUFFMAN_MAXCODE ? eslERANGE : eslOK);
518 }
519
520
521 /* huffman_canonize()
522 * Given code lengths, now we calculate the canonical Huffman encoding.
523 *
524 * Input:
525 * hc->len[] code lengths are set for all K (0 for unused symbols)
526 * hc->code[] have been initialized to 0 for all K
527 *
528 * Output:
529 * hc->code[] have been set for all used symbols.
530 * hc->D number of different code lengths is set
531 *
532 * Returns:
533 * <eslOK> on success.
534 */
535 static int
huffman_canonize(ESL_HUFFMAN * hc)536 huffman_canonize(ESL_HUFFMAN *hc)
537 {
538 int r;
539
540 /* Sort symbols according to 1) code length; 2) order in digital alphabet (i.e. symbol code itself)
541 * Reuse/reset <sorted_at>.
542 * You can't just sort the encoded Ku; you have to sort all K, because
543 * quicksort expects indices to be contiguous (0..K-1).
544 */
545 esl_quicksort(hc, hc->K, sort_canonical, hc->sorted_at);
546
547 /* Assign codes. (All K have been initialized to zero already.) */
548 for (r = 1; r < hc->Ku; r++)
549 hc->code[hc->sorted_at[r]] =
550 (hc->code[hc->sorted_at[r-1]] + 1) << (hc->len[hc->sorted_at[r]] - hc->len[hc->sorted_at[r-1]]);
551
552 /* Set D, the number of different code lengths */
553 hc->D = 1;
554 for (r = 1; r < hc->Ku; r++)
555 if (hc->len[hc->sorted_at[r]] > hc->len[hc->sorted_at[r-1]]) hc->D++;
556
557 return eslOK;
558 }
559
560
561 /* huffman_decoding_table()
562 * Given a canonical Huffman code; build the table that lets us
563 * efficiently decode it.
564 *
565 * Input:
566 * hc->K is set: total # of symbols (inclusive of unused ones)
567 * hc->Ku is set: total # of encoded/used symbols
568 * hc->code is set: canonical Huffman codes for symbols 0..K-1
569 * hc->len is set: code lengths for symbols 0..K-1
570 * hc->sorted_at is set: canonical Huffman sort order
571 * hc->Lmax is set: maximum code length
572 * hc->D is set: # of different code lengths
573 *
574 * hc->dt_len is allocated for hc->D, but otherwise uninitialized
575 * hc->dt_lcode is allocated for hc->D, but otherwise uninitialized
576 * hc->dt_rank is allocated for hc->D, but otherwise uninitialized
577 *
578 * Output:
579 * hc->dt_len is set: lengths of each used code length 0..D-1
580 * hc->dt_lcode is set: left-flushed first code for each code length [d]
581 * hc->dt_rank is set: rank r for 1st code for each used code length [d]
582 */
583 static int
huffman_decoding_table(ESL_HUFFMAN * hc)584 huffman_decoding_table(ESL_HUFFMAN *hc)
585 {
586 int r;
587 int D = 0;
588
589 hc->dt_len[0] = hc->len[hc->sorted_at[0]];
590 hc->dt_lcode[0] = hc->code[hc->sorted_at[0]] << (eslHUFFMAN_MAXCODE - hc->len[hc->sorted_at[0]]);
591 hc->dt_rank[0] = 0;
592 for (r = 1; r < hc->Ku; r++)
593 if (hc->len[hc->sorted_at[r]] > hc->len[hc->sorted_at[r-1]])
594 {
595 D++;
596 hc->dt_len[D] = hc->len[hc->sorted_at[r]];
597 hc->dt_lcode[D] = hc->code[hc->sorted_at[r]] << (eslHUFFMAN_MAXCODE - hc->len[hc->sorted_at[r]]);
598 hc->dt_rank[D] = r;
599 }
600 ESL_DASSERT1(( hc->D == D+1 ));
601 return eslOK;
602 }
603
604
605 static void
dump_uint32(FILE * fp,uint32_t v,int L)606 dump_uint32(FILE *fp, uint32_t v, int L)
607 {
608 uint32_t mask;
609 int i;
610
611 for (mask = 1 << (L-1), i = L; i >= 1; i--, mask = mask >> 1)
612 putc( ((v & mask) ? '1' : '0'), fp);
613 }
614
615
616
617 /* huffman_pack()
618 *
619 * <X[i]> is the current uint32_t unit in the encoded buffer <X>. It
620 * has <a> bits in it so far, maximally left-shifted; therefore (32-a)
621 * bits are available.
622 *
623 * <code> is the next Huffman code to pack into the buffer, of length
624 * <L>, and it's right flush.
625 *
626 * a=10 used (32-a)=20 free
627 * |xxxxxxxxxx|......................| X[i]
628 * |........................|yyyyyyyy| code, L=8
629 * |----- w -----|
630 * w = 32-(a+L)
631 *
632 * If L < 32-a, then we just shift by w and pack it into X[i]. Else,
633 * we shift the other way (by -w), pack what we can into X[i], and
634 * leave the remainder in X[i+1].
635 *
636 * We update <i> and <a> for <X> accordingly... so we pass them by
637 * reference in <ip> and <ap>.
638 */
639 static void
huffman_pack(uint32_t * X,int * ip,int * ap,uint32_t code,int L)640 huffman_pack(uint32_t *X, int *ip, int *ap, uint32_t code, int L)
641 {
642 int w = 32 - (*ap+L);
643
644 if (w > 0) // code can pack into X[i]'s available space.
645 {
646 X[*ip] = X[*ip] | (code << w);
647 *ap += L;
648 }
649 else if (w < 0) // code packs partly in X[i], remainder in X[i+1].
650 {
651 X[*ip] = X[*ip] | (code >> (-w));
652 (*ip)++;
653 X[*ip] = code << (32+w);
654 (*ap) = -w;
655 }
656 else // code packs exactly; w=0, no leftshift needed, OR it as is.
657 {
658 X[*ip] = X[*ip] | code;
659 *ip += 1;
660 *ap = 0;
661 X[*ip] = 0; // don't forget to initialize X[i+1]!
662 }
663 }
664
665
666
667 /* huffman_unpack()
668 * *vp : ptr to v; v = next 32 bits
669 * *X : encoded input
670 * n : length of input (in uint32_t)
671 * *ip : current position in <X>
672 * *ap : number of bits left in X[*ip]
673 *
674 * If we have to buffer X (say, if we're reading it from
675 * a long input) we'll have to redesign. Right now we assume
676 * it's just an array.
677 */
678 static void
huffman_unpack(const ESL_HUFFMAN * hc,uint32_t * vp,const uint32_t * X,int n,int * ip,int * ap,char * ret_x,int * ret_L)679 huffman_unpack(const ESL_HUFFMAN *hc, uint32_t *vp, const uint32_t *X, int n, int *ip, int *ap, char *ret_x, int *ret_L)
680 {
681 int L,D;
682 int idx;
683 uint32_t w;
684
685 for (D = 0; D < hc->D-1; D++)
686 if ((*vp) < hc->dt_lcode[D+1]) break;
687 L = hc->dt_len[D];
688 /* L is now the next code's length (prefix of v) */
689
690 /* Decode, by taking advantage of lexicographic sort/numerical order of canonical code, within each L */
691 idx = hc->dt_rank[D] + ( ((*vp) - hc->dt_lcode[D]) >> (eslHUFFMAN_MAXCODE-L) );
692
693 /* Now refill v, as much as we can, from bits in X[i] and X[i+1], and update i, a */
694 *vp = ( (*vp) << L); // Remove L bits from *vp by leftshifting it.
695
696 if (*ip < n) { // Take either L or all *ap bits from X[i], if it exists.
697 w = X[*ip] << (32-(*ap)); // Shift off the bits we already used in X[i]. w is now X[i], left-flushed.
698 *vp |= (w >> (32-L)); // Right-shift w into position, leaving it with leading 0's where *vp already has bits.
699 *ap -= L; // We used up to L bits from X[i]
700
701 // if *ap is still >0, we have bits left to use in X[i]. Otherwise:
702 if (*ap == 0) // If we exactly finished off X[i]:
703 {
704 (*ip)++; // then advance in X[].
705 *ap = 32;
706 }
707 else if (*ap < 0) // If we finished off X[i] but still need some bits
708 {
709 (*ip)++; // then go on to X[i+1] and 32 fresh bits.
710 if (*ip < n) // If it exists...
711 { // (...no, I don't like all these branches either...)
712 *ap += 32; // then we're going to leave it w/ <*ap> bits
713 *vp |= (X[*ip] >> *ap); // after taking the bits we need to fill v
714 }
715 else
716 {
717 *ap = 0; // If X[i+1] doesn't exist, leave *ip = n and *ap = 0; out of data in X (though not necessarily in v)
718 }
719 }
720 }
721
722 *ret_x = (char) hc->sorted_at[idx];
723 *ret_L = L;
724 }
725
726
727 /*****************************************************************
728 * 6. Unit tests
729 *****************************************************************/
730 #ifdef eslHUFFMAN_TESTDRIVE
731 #include "esl_random.h"
732 #include "esl_randomseq.h"
733 #include "esl_vectorops.h"
734
735 #include <string.h>
736
737
738 static void
utest_kryptos(ESL_RANDOMNESS * rng)739 utest_kryptos(ESL_RANDOMNESS *rng)
740 {
741 char msg[] = "kryptos utest failed";
742 ESL_HUFFMAN *hc = NULL;
743 char T[] = "BETWEEN SUBTLE SHADING AND THE ABSENCE OF LIGHT LIES THE NUANCE OF IQLUSION";
744 int n = strlen(T);
745 uint32_t *X = NULL;
746 int nb;
747 char *T2 = NULL;
748 int n2;
749 float fq[128];
750 int K = 128;
751 int i;
752 int status;
753
754 /* Any half-assed frequency distribution will do for this, over [ A-Z] */
755 for (i = 0; i < 128; i++) fq[i] = 0.;
756 for (i = 'A'; i <= 'Z'; i++) fq[i] = esl_random(rng);
757 fq[' '] = esl_random(rng);
758 esl_vec_FNorm(fq, 128);
759
760 if (( status = esl_huffman_Build (fq, K, &hc) ) != eslOK) esl_fatal(msg);
761 if (( status = esl_huffman_Encode(hc, T, n, &X, &nb)) != eslOK) esl_fatal(msg);
762 if (( status = esl_huffman_Decode(hc, X, nb, &T2, &n2)) != eslOK) esl_fatal(msg);
763
764 //esl_huffman_Dump(stdout, hc);
765 //printf("%s\n", T);
766 //printf("%s\n", T2);
767
768 if (n2 != n) esl_fatal(msg);
769 if (strcmp(T, T2) != 0) esl_fatal(msg);
770
771 free(X);
772 free(T2);
773 esl_huffman_Destroy(hc);
774 }
775
776 /* utest_uniletter()
777 * Tests an edge case of a text consisting of a single letter, Ku=1.
778 * (Ku=1 cases get tested occasionally by utest_backandforth() too.)
779 */
780 static void
utest_uniletter(void)781 utest_uniletter(void)
782 {
783 char msg[] = "uniletter utest failed";
784 char T[] = "AAAAAAAAAA";
785 int n = strlen(T);
786 int K = 128;
787 float fq[128];
788 ESL_HUFFMAN *hc = NULL;
789 uint32_t *X = NULL;
790 int nb;
791 char *T2 = NULL;
792 int n2;
793 int i;
794 int status;
795
796 for (i = 0; i < 128; i++) fq[i] = 0.;
797 fq['A'] = (float) n;
798
799 if (( status = esl_huffman_Build (fq, K, &hc) ) != eslOK) esl_fatal(msg);
800 if (( status = esl_huffman_Encode(hc, T, n, &X, &nb)) != eslOK) esl_fatal(msg);
801 if (( status = esl_huffman_Decode(hc, X, nb, &T2, &n2)) != eslOK) esl_fatal(msg);
802
803 if (n2 != n) esl_fatal(msg);
804 if (strcmp(T, T2) != 0) esl_fatal(msg);
805
806 free(X);
807 free(T2);
808 esl_huffman_Destroy(hc);
809 }
810
811
812 /* utest_backandforth()
813 * Encode and decode a random text string, and test
814 * that it decodes to the original.
815 */
816 static void
utest_backandforth(ESL_RANDOMNESS * rng)817 utest_backandforth(ESL_RANDOMNESS *rng)
818 {
819 char msg[] = "back and forth utest failed";
820 ESL_HUFFMAN *hc = NULL;
821 double *fq0 = NULL;
822 float *fq = NULL;
823 int K; // alphabet size: randomly chosen from 1..128
824 char *T = NULL; // random plaintext
825 int n; // randomly chosen length of plaintext T
826 uint32_t *X = NULL; // Huffman-coded bit stream
827 int nb; // length of X in bits
828 char *T2 = NULL; // decoded plaintext
829 int n2; // length of T2 in chars
830 int i;
831 int status;
832
833 /* Sample a zero-peppered frequency distribution <fq> for a randomly
834 * selected alphabet size <K>.
835 */
836 K = 1 + esl_rnd_Roll(rng, 128); // Choose a random alphabet size from 1 to 128
837 if (( fq0 = malloc(sizeof(double) * K)) == NULL) esl_fatal(msg); // esl_random works in doubles
838 if (( fq = malloc(sizeof(float) * K)) == NULL) esl_fatal(msg); // esl_huffman works in floats
839 esl_rnd_Dirichlet(rng, NULL, K, fq0); // Sample a uniform random probability vector
840 for (i = 0; i < K; i++) // Pepper it with exact 0's while converting to float
841 fq[i] = ( esl_rnd_Roll(rng, 4) == 0 ? 0. : (float) fq0[i] );
842 esl_vec_FNorm(fq, K); // and renormalize. (edge case: if fq was all 0, now it's uniform.)
843
844 /* Sample a random plaintext array <T>, of randomly selected length <n>.
845 * We're using codes 0..K-1 -- T is not a string, it's an array -- don't \0 it.
846 */
847 n = 1 + esl_rnd_Roll(rng, 10);
848 if (( T = malloc(sizeof(char) * (n+1))) == NULL) esl_fatal(msg);
849 for (i = 0; i < n; i++) T[i] = esl_rnd_FChoose(rng,fq,K);
850
851 if (( status = esl_huffman_Build (fq, K, &hc) ) != eslOK) esl_fatal(msg);
852 if (( status = esl_huffman_Encode(hc, T, n, &X, &nb)) != eslOK) esl_fatal(msg);
853 if (( status = esl_huffman_Decode(hc, X, nb, &T2, &n2)) != eslOK) esl_fatal(msg);
854
855 //esl_huffman_Dump(stdout, hc);
856
857 if ( n2 != n) esl_fatal(msg);
858 if ( memcmp(T, T2, n) != 0) esl_fatal(msg);
859
860 free(T2);
861 free(X);
862 free(fq0);
863 free(fq);
864 free(T);
865 esl_huffman_Destroy(hc);
866 }
867
868
869
870
871 #endif /*eslHUFFMAN_TESTDRIVE*/
872
873
874
875
876 /*****************************************************************
877 * 7. Test driver
878 *****************************************************************/
879 #ifdef eslHUFFMAN_TESTDRIVE
880 #include "esl_config.h"
881
882 #include <stdio.h>
883
884 #include "easel.h"
885 #include "esl_getopts.h"
886 #include "esl_huffman.h"
887 #include "esl_random.h"
888
889 static ESL_OPTIONS options[] = {
890 /* name type default env range togs reqs incomp help docgrp */
891 {"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
892 {"-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0},
893 { 0,0,0,0,0,0,0,0,0,0},
894 };
895 static char usage[] = "[-options]";
896 static char banner[] = "test driver for huffman module";
897
898 int
main(int argc,char ** argv)899 main(int argc, char **argv)
900 {
901 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
902 ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
903
904 fprintf(stderr, "## %s\n", argv[0]);
905 fprintf(stderr, "# rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
906
907 utest_kryptos (rng);
908 utest_uniletter ( );
909 utest_backandforth(rng);
910
911 fprintf(stderr, "# status = ok\n");
912
913 esl_getopts_Destroy(go);
914 esl_randomness_Destroy(rng);
915 return eslOK;
916 }
917 #endif /*eslHUFFMAN_TESTDRIVE*/
918
919
920
921
922 /*****************************************************************
923 * 8. Examples
924 *****************************************************************/
925 #ifdef eslHUFFMAN_EXAMPLE2
926
927 /* esl_huffman_example2 <fqfile>
928 *
929 * The input <fqfile> consists of N lines with
930 * two whitespace-delimited fields:
931 * <label> <frequency>
932 *
933 * Huffman code the frequency distribution, and output the resulting
934 * encoding.
935 */
936
937
938 #include "easel.h"
939 #include "esl_buffer.h"
940 #include "esl_huffman.h"
941 #include "esl_mem.h"
942 #include "esl_vectorops.h"
943
944 int
main(int argc,char ** argv)945 main(int argc, char **argv)
946 {
947 ESL_HUFFMAN *hc = NULL;
948 ESL_BUFFER *bf = NULL;
949 esl_pos_t n;
950 char *p;
951 char *tok;
952 esl_pos_t toklen;
953 int kalloc = 16;
954 char **label = malloc(sizeof(char *) * kalloc);
955 float *fq = malloc(sizeof(float) * kalloc);
956 int K = 0;
957 float meanL = 0.;
958 int i;
959 int status;
960
961 status = esl_buffer_Open(argv[1], NULL, &bf);
962 if (status == eslENOTFOUND) esl_fatal("open failed: %s", bf->errmsg);
963 else if (status == eslFAIL) esl_fatal("gzip -dc failed: %s", bf->errmsg);
964 else if (status != eslOK) esl_fatal("open failed with error code %d", status);
965
966 while (( status = esl_buffer_GetLine(bf, &p, &n)) == eslOK)
967 {
968 if ( esl_memtok(&p, &n, " \t\n", &tok, &toklen) != eslOK) continue;
969 if ( esl_memstrdup(tok, toklen, &(label[K])) != eslOK) continue;
970 if ( esl_mem_strtof(p, n, NULL, &(fq[K])) != eslOK) continue;
971
972 if (++K == kalloc) {
973 kalloc *= 2;
974 label = realloc(label, sizeof(char *) * kalloc);
975 fq = realloc(fq, sizeof(float) * kalloc);
976 }
977 }
978 esl_vec_FNorm(fq, K);
979
980 if (( status = esl_huffman_Build(fq, K, &hc)) != eslOK) esl_fatal("failed to build huffman code");
981
982 for (i = 0; i < K; i++)
983 {
984 printf("%-10s %2d ", label[i], hc->len[i]);
985 dump_uint32(stdout, hc->code[i], hc->len[i]);
986 printf("\n");
987 }
988
989 for (i = 0; i < K; i++)
990 meanL += (float) hc->len[i] * fq[i];
991 printf("\nMean code length = %.2f bits\n", meanL);
992
993 for (i = 0; i < K; i++) free(label[i]);
994 free(label);
995 free(fq);
996 esl_huffman_Destroy(hc);
997 esl_buffer_Close(bf);
998 return 0;
999 }
1000 #endif /*eslHUFFMAN_EXAMPLE2*/
1001
1002
1003
1004
1005 #ifdef eslHUFFMAN_EXAMPLE
1006
1007 #include "easel.h"
1008 #include "esl_huffman.h"
1009
1010 #include <stdio.h>
1011 #include <string.h>
1012
1013 /* Given an open <fp> for reading;
1014 * input text from it and return it as a single string.
1015 * Optionally return the number of characters in <opt_n>.
1016 * Convert all \n to spaces.
1017 */
1018 static char *
read_text(FILE * fp,int * opt_n)1019 read_text(FILE *fp, int *opt_n)
1020 {
1021 int maxlinelen = 4096;
1022 char *text = malloc(sizeof(char) * maxlinelen);
1023 int n = 0;
1024 char *p;
1025
1026 while (fgets(text+n, maxlinelen-1, fp) != NULL)
1027 {
1028 for (p = text+n; *p != '\0'; p++)
1029 if (*p == '\n') *p = ' ';
1030 n += strlen(text+n);
1031 text = realloc(text, sizeof(char) * (n+maxlinelen));
1032 }
1033
1034 if (opt_n) *opt_n = n;
1035 return text;
1036 }
1037
1038 int
main(int argc,char ** argv)1039 main(int argc, char **argv)
1040 {
1041 FILE *fp = fopen(argv[1], "r");
1042 int n;
1043 char *T = read_text(fp, &n);
1044 uint32_t *X = NULL;
1045 float fq[128];
1046 int c,i;
1047 int nb;
1048 ESL_HUFFMAN *hc = NULL;
1049 char *newT = NULL;
1050 int nT;
1051
1052 /* You provide a frequency table for your digital alphabet 0..K-1.
1053 * It's fine for there to be 0-frequency characters, even many of them;
1054 * they will not be encoded, and cost nothing.
1055 * Here, our digital alphabet is 7-bit ASCII text, 0..127, K=128.
1056 */
1057 for (c = 0; c < 128; c++) fq[c] = 0.;
1058 for (i = 0; i < n; i++) fq[(int) T[i]] += 1.;
1059
1060 /* There does have to be at least one character to encode, of course. */
1061 ESL_DASSERT1(( n > 0 ));
1062
1063 esl_huffman_Build(fq, 128, &hc);
1064
1065 esl_huffman_Dump(stdout, hc);
1066
1067 esl_huffman_Encode(hc, T, n, &X, &nb);
1068
1069 printf("\nOriginal: %d bytes\n", n);
1070 printf("Compressed: %d bytes (%d bits)\n", 4*(nb+31)/32, nb);
1071
1072 /* Dump the compresstext, up to 30 words of it */
1073 printf("\nCompressed text:\n");
1074 for (i = 0; i < ESL_MIN(30, (nb+31)/32); i++) {
1075 dump_uint32(stdout, X[i], 32);
1076 fputc('\n', stdout);
1077 }
1078
1079 esl_huffman_Decode(hc, X, nb, &newT, &nT);
1080
1081 /* Show the decoded plaintext, up to 100 chars of it */
1082 printf("\nDecoded text:\n");
1083 for (i = 0; i < ESL_MIN(100, nT); i++)
1084 fputc(newT[i], stdout);
1085 fputc('\n', stdout);
1086
1087 esl_huffman_Destroy(hc);
1088 free(T);
1089 fclose(fp);
1090 return 0;
1091 }
1092 #endif /*eslHUFFMAN_EXAMPLE*/
1093
1094