1 /* Huffman coding, especially for digitized alphabets.
2  *
3  * Contents:
4  *   1. The ESL_HUFFMAN object
5  *   2. Huffman encoding
6  *   3. Huffman decoding
7  *   4. Debugging, development
8  *   5. Internal function, components of creating huffman codes
9  *   6. Example driver
10  *
11  * Useful emacs gdb tricks for displaying bit field v:
12  *   p /t v     (no leading zeros, beware!)
13  *   x &v
14  */
15 #include "esl_config.h"
16 
17 #include <stdio.h>
18 
19 #include "easel.h"
20 #include "esl_quicksort.h"
21 
22 #include "esl_huffman.h"
23 
24 
25 /* Declarations of stuff in internal functions/structures section  */
26 struct hufftree_s {
27   float val;    // Sum of frequencies of all leaves under this node
28   int   depth;  // Depth of node
29   int   left;   // index of left child in array of tree nodes (0..N-2; 0 is the root)
30   int   right;  //  "" for right child
31 };
32 static int  sort_floats_decreasing(const void *data, int e1, int e2);
33 static int  sort_canonical        (const void *data, int e1, int e2);
34 static int  huffman_tree          (ESL_HUFFMAN *hc, struct hufftree_s *htree, const float *fq);
35 static int  huffman_codelengths   (ESL_HUFFMAN *hc, struct hufftree_s *htree, const float *fq);
36 static int  huffman_canonize      (ESL_HUFFMAN *hc);
37 static int  huffman_decoding_table(ESL_HUFFMAN *hc);
38 static void dump_uint32(FILE *fp, uint32_t v, int L);
39 
40 static void huffman_pack(uint32_t *X, int *ip, int *ap, uint32_t code, int L);
41 static void huffman_unpack(const ESL_HUFFMAN *hc, uint32_t *vp, const uint32_t *X, int n, int *ip, int *ap, char *ret_x, int *ret_L);
42 
43 
44 
45 /*****************************************************************
46  * 1. The ESL_HUFFMAN object
47  *****************************************************************/
48 
49 /* Function:  esl_huffman_Build()
50  * Synopsis:  Build a new Huffman code.
51  * Incept:    SRE, Thu Nov 12 11:08:09 2015
52  *
53  * Purpose:   Build a canonical Huffman code for observed symbol
54  *            frequencies <fq[0..K]> for <K> possible symbols.
55  *            Frequencies can be counts, or normalized probabilities;
56  *            all that matters is their relative magnitude (and that
57  *            they're $\geq 0$).
58  *
59  *            If you're encoding an Easel digital alphabet, you want
60  *            <K = abc->Kp>, inclusive of ambiguity codes, gaps,
61  *            missing data, and rare digital codes.
62  *
63  *            If you're encoding 7-bit ASCII text, you want K=128, and
64  *            the symbols codes are ASCII codes.
65  *
66  *            If you're encoding MTF-encoded ASCII text, again you
67  *            want K=128 and the "symbol" codes are 0..127 offsets in
68  *            the move-to-front encoding.
69  *
70  *            If you're encoding an arbitrary symbol table -- a table
71  *            of gap lengths, perhaps? -- <K> can be anything.
72  *
73  *            Unobserved symbols (with <fq[] = 0>) will not be encoded;
74  *            they get a code length of 0, and a code of 0.
75  *
76  * Args:      fq     - symbol frequencies 0..K-1; sum to 1
77  *            K      - size of fq (encoded alphabet size)
78  *            ret_hc - RETURN: new huffman code object
79  *
80  * Returns:   <eslOK> on success, and <*ret_hc> points to the new
81  *            <ESL_HUFFMAN> object.
82  *
83  * Throws:    <eslEMEM> on allocation error.
84  *
85  *            <eslERANGE> if the encoding requires a code length
86  *            that exceeds <eslHUFFMAN_MAXCODE>, and won't fit in
87  *            a <uint32_t>.
88  */
89 int
esl_huffman_Build(const float * fq,int K,ESL_HUFFMAN ** ret_hc)90 esl_huffman_Build(const float *fq, int K, ESL_HUFFMAN **ret_hc)
91 {
92   ESL_HUFFMAN       *hc    = NULL;
93   struct hufftree_s *htree = NULL;  // only need tree temporarily, during code construction.
94   int                i,r;
95   int                status;
96 
97   ESL_DASSERT1(( fq    ));
98   ESL_DASSERT1(( K > 0 ));
99 
100   ESL_ALLOC(hc, sizeof(ESL_HUFFMAN));
101   hc->len       = NULL;
102   hc->code      = NULL;
103   hc->sorted_at = NULL;
104   hc->dt_len    = NULL;
105   hc->dt_lcode  = NULL;
106   hc->dt_rank   = NULL;
107 
108   hc->K         = K;
109   hc->Ku        = 0;
110   hc->D         = 0;
111   hc->Lmax      = 0;
112 
113   ESL_ALLOC(hc->len,       sizeof(int)      * hc->K);
114   ESL_ALLOC(hc->code,      sizeof(uint32_t) * hc->K);
115   ESL_ALLOC(hc->sorted_at, sizeof(int)      * hc->K);
116 
117   for (i = 0; i < hc->K; i++) hc->len[i]  = 0;
118   for (i = 0; i < hc->K; i++) hc->code[i] = 0;
119 
120   /* Sort the symbol frequencies, largest to smallest */
121   esl_quicksort(fq, hc->K, sort_floats_decreasing, hc->sorted_at);
122 
123   /* Figure out how many are nonzero: that's hc->Ku */
124   for (r = hc->K-1; r >= 0; r--)
125     if (fq[hc->sorted_at[r]] > 0.) break;
126   hc->Ku = r+1;
127 
128   ESL_ALLOC(htree,         sizeof(struct hufftree_s) * (ESL_MAX(1, hc->Ku-1)));  // Ku=1 is ok; avoid zero malloc.
129   if ( (status = huffman_tree       (hc, htree, fq)) != eslOK) goto ERROR;
130   if ( (status = huffman_codelengths(hc, htree, fq)) != eslOK) goto ERROR;       // can fail eslERANGE on maxlen > 32
131   if ( (status = huffman_canonize   (hc))            != eslOK) goto ERROR;
132 
133 
134   ESL_ALLOC(hc->dt_len,   sizeof(int)      * hc->D);
135   ESL_ALLOC(hc->dt_lcode, sizeof(uint32_t) * hc->D);
136   ESL_ALLOC(hc->dt_rank,  sizeof(int)      * hc->D);
137   if ( (status = huffman_decoding_table(hc))         != eslOK) goto ERROR;
138 
139   free(htree);
140   *ret_hc = hc;
141   return eslOK;
142 
143  ERROR:
144   free(htree);
145   esl_huffman_Destroy(hc);
146   *ret_hc = NULL;
147   return status;
148 }
149 
150 
151 
152 /* Function:  esl_huffman_Destroy()
153  * Synopsis:  Free an <ESL_HUFFMAN> code.
154  * Incept:    SRE, Thu Nov 12 11:07:39 2015
155  */
156 void
esl_huffman_Destroy(ESL_HUFFMAN * hc)157 esl_huffman_Destroy(ESL_HUFFMAN *hc)
158 {
159   if (hc) {
160     free(hc->len);
161     free(hc->code);
162     free(hc->sorted_at);
163     free(hc->dt_len);
164     free(hc->dt_lcode);
165     free(hc->dt_rank);
166     free(hc);
167   }
168 }
169 
170 
171 
172 /*****************************************************************
173  * 2. Encoding
174  *****************************************************************/
175 
176 
177 /* Function:  esl_huffman_Encode()
178  * Synopsis:  Encode a string.
179  * Incept:    SRE, Thu Jun  2 09:27:43 2016 [Hamilton]
180  *
181  * Purpose:   Use Huffman code <hc> to encode the plaintext input <T> of
182  *            length <n>. The encoded result <X> consists of <nb> bits,
183  *            stored in an array of <nX> <uint32_t>'s; this result is
184  *            returned through the pointers <*ret_X>, <*ret_nX>, and
185  *            <*ret_nb>.
186  *
187  *            The encoded array <X> is allocated here, and must be
188  *            free'd by the caller.
189  *
190  * Args:      hc     - Huffman code to use for encoding
191  *            T      - plaintext input to encode, [0..n-1]; does not need to be NUL-terminated.
192  *            n      - length of T
193  *            ret_X  - RETURN: encoded bit array
194  *            ret_nb - RETURN: length of X in bits  (nX = nb / 32, rounded up)
195  *
196  * Returns:   <eslOK> on success.
197  *
198  * Throws:    <eslEMEM> on allocation failure. Now <*ret_X = NULL> and <*ret_nb = 0>.
199  */
200 int
esl_huffman_Encode(const ESL_HUFFMAN * hc,const char * T,int n,uint32_t ** ret_X,int * ret_nb)201 esl_huffman_Encode(const ESL_HUFFMAN *hc, const char *T, int n, uint32_t **ret_X, int *ret_nb)
202 {
203   uint32_t *X      = NULL;
204   int       xalloc = ESL_MAX(16, (n+15)/16);   // current allocation for X, in uint32_t's
205   int       pos    = 0;                        // current position in X's uint32_t array
206   int       nb;
207   int       i;
208   int       status;
209 
210   ESL_DASSERT1(( hc != NULL ));
211   ESL_DASSERT1(( T  != NULL ));
212   ESL_DASSERT1(( n > 0      ));
213 
214   ESL_ALLOC(X, sizeof(uint32_t) * xalloc);
215 
216   X[0] = 0;
217   nb     = 0;
218   for (i = 0; i < n; i++)
219     {
220       huffman_pack(X, &pos, &nb, hc->code[(int) T[i]], hc->len[(int) T[i]]);
221 
222       if (pos+1 == xalloc) {
223 	xalloc *= 2;
224 	ESL_REALLOC(X, sizeof(uint32_t) * xalloc);
225       }
226     }
227 
228   *ret_X  = X;            // X consists of <pos+1> uint32_t's
229   *ret_nb = 32*pos + nb;  //  ... we return exact # of bits.
230   return eslOK;
231 
232  ERROR:
233   *ret_X  = NULL;
234   *ret_nb = 0;
235   return status;
236 }
237 
238 
239 
240 
241 /*****************************************************************
242  * 3. Decoding
243  *****************************************************************/
244 
245 
246 /* Function:  esl_huffman_Decode()
247  * Synopsis:  Decode a bit stream.
248  * Incept:    SRE, Thu Jun  2 09:52:46 2016 [Hamilton, Act I]
249  *
250  * Purpose:   Use Huffman code <hc> to decode a bit stream <X> of length
251  *            <n> integers and <nb> bits. The result is a plaintext
252  *            string <T> of length <nT> characters. Return this result
253  *            through <*ret_T> and <*ret_nT>.
254  *
255  *            The decoded plaintext <T> is allocated here, and must be
256  *            free'd by the caller.
257  *
258  *            <T> is NUL-terminated, just in case that's useful --
259  *            though the caller isn't necessarily going to treat <T>
260  *            as a string. (It could be using "symbols" 0..127, which
261  *            would include <\0> as a valid symbol.)
262  *
263  * Args:      hc     - Huffman code to use to decode <X>
264  *            X      - bit stream to decode
265  *            nb     - length of <X> in BITS (nX = nb/32, rounded up)
266  *            ret_T  - RETURN: decoded plaintext string, \0-terminated
267  *            ret_n  - RETURN: length of <T> in chars
268  *
269  * Returns:   <eslOK> on success; <*ret_T> and <*ret_nT> hold the result.
270  *
271  * Throws:    <eslEMEM> on allocation failure. Now <*ret_T> is <NULL> and
272  *            <*ret_nT> is 0.
273  *
274  * Xref:
275  */
276 int
esl_huffman_Decode(const ESL_HUFFMAN * hc,const uint32_t * X,int nb,char ** ret_T,int * ret_n)277 esl_huffman_Decode(const ESL_HUFFMAN *hc, const uint32_t *X, int nb, char **ret_T, int *ret_n)
278 {
279   char    *T   = NULL;
280   int      allocT;                   // current allocation for T
281   uint32_t v   = X[0];               // current (full) 32 bits we're going to decode in this step
282   int      i   = 1;                  // index of X[i] we will first pull *new* bits from, after decoding v
283   int      nX  = (nb+31)/32;         // length of X in uint32_t's: nb/32 rounded up.
284   int      a   = (nX > 1 ? 32 : 0);
285   int      pos = 0;
286   int      L;                        // length of code we just decoded, in bits
287   int      status;
288 
289   allocT = nX * 4;                  // an initial guess: 4 bytes per X, maybe 4x compression
290   ESL_ALLOC(T, sizeof(char) * allocT);
291 
292   while (nb > 0)
293     {
294       huffman_unpack(hc, &v, X, nX, &i, &a, &(T[pos]), &L);
295       nb -= L;
296 
297       if (++pos == allocT) {
298 	allocT *= 2;
299 	ESL_REALLOC(T, sizeof(char) * allocT);
300       }
301     }
302 
303   /* We know we have space for the \0, from how we reallocated. */
304   T[pos] = '\0';
305   *ret_T  = T;
306   *ret_n  = pos;
307   return eslOK;
308 
309  ERROR:
310   *ret_T  = NULL;
311   *ret_n  = 0;
312   return status;
313 }
314 
315 
316 
317 
318 
319 /*****************************************************************
320  * 4. Debugging, development
321  *****************************************************************/
322 
323 /* Function:  esl_huffman_Dump()
324  * Synopsis:  Dump info on a huffman code structure.
325  * Incept:    SRE, Sat Jun  4 07:38:15 2016
326  *
327  * Purpose:   Dump the internals of object <hc> to output stream <fp>.
328  */
329 int
esl_huffman_Dump(FILE * fp,ESL_HUFFMAN * hc)330 esl_huffman_Dump(FILE *fp, ESL_HUFFMAN *hc)
331 {
332   int r,x;
333   int d,L;
334 
335   /* Encoding table: <letter index> <code length> <binary encoding> */
336   fprintf(fp, "Encoding table:\n");
337   for (r = 0; r < hc->Ku; r++)
338     {
339       x = hc->sorted_at[r];
340       fprintf(fp, "%3d %2d ", x, hc->len[x]);
341       dump_uint32(fp, hc->code[x], hc->len[x]);
342       fprintf(fp, "\n");
343     }
344   fputc('\n', fp);
345 
346 
347   /* Decoding table (if set) */
348   if (hc->dt_len)
349     {
350       fprintf(fp, "Decoding table:\n");
351       for (d = 0; d < hc->D; d++)
352         {
353           L = hc->dt_len[d];
354           fprintf(fp, "L=%2d  r=%3d (%3d) ", L, hc->dt_rank[d], hc->sorted_at[hc->dt_rank[d]]);
355           dump_uint32(fp, hc->dt_lcode[d], eslHUFFMAN_MAXCODE);
356           fputc('\n', fp);
357         }
358     }
359 
360   return eslOK;
361 }
362 
363 
364 
365 /*****************************************************************
366  * 5. Internal functions and structures
367  *****************************************************************/
368 
369 /* sort_floats_decreasing()
370  * Sorting function for esl_quicksort(), putting
371  * symbol frequencies in decreasing order.
372  */
373 static int
sort_floats_decreasing(const void * data,int e1,int e2)374 sort_floats_decreasing(const void *data, int e1, int e2)
375 {
376   float *fq = (float *) data;
377   if (fq[e1] > fq[e2]) return -1;
378   if (fq[e1] < fq[e2]) return 1;
379   return 0;
380 }
381 
382 /* sort_canonical()
383  * Sorting function for esl_quicksort(), putting symbols into
384  * canonical Huffman order: primarily by ascending code length,
385  * secondarily by ascending symbol code.
386  */
387 static int
sort_canonical(const void * data,int e1,int e2)388 sort_canonical(const void *data, int e1, int e2)
389 {
390   ESL_HUFFMAN *hc = (ESL_HUFFMAN *) data;
391   int          L1 = hc->len[e1];
392   int          L2 = hc->len[e2];
393 
394   if      (L2 == 0) return -1;   // len=0 means symbol isn't encoded at all, doesn't occur
395   else if (L1 == 0) return 1;
396   else if (L1 < L2) return -1;
397   else if (L1 > L2) return 1;
398   else if (e1 < e2) return -1;
399   else if (e1 > e2) return 1;
400   else              return 0;
401 }
402 
403 /* Build the Huffman tree, joining nodes/leaves of smallest frequency.
404  * This takes advantage of having the fq[] array sorted, and the fact
405  * that the internal node values also come out sorted... i.e. we don't
406  * have to re-sort, we can always find the smallest leaves/nodes by
407  * looking at the last ones.
408  *
409  * For the Ku=1 edge case, there's no tree, and this no-ops.
410  *
411  * Input:
412  *   hc->sorted_at[] lists symbol indices from largest to smallest freq.
413  *   hc->Ku          is the number of syms w/ nonzero freq; tree has Ku-1 nodes
414  *   htree           blank, allocated for at least Ku-1 nodes
415  *
416  * Output:
417  *   htree's left, right, val fields are filled.
418  *
419  * Returns:
420  *   <eslOK> on success.
421  */
422 static int
huffman_tree(ESL_HUFFMAN * hc,struct hufftree_s * htree,const float * fq)423 huffman_tree(ESL_HUFFMAN *hc, struct hufftree_s *htree, const float *fq)
424 {
425   int r = hc->Ku-1;   // r = smallest leaf symbol that hasn't been included in tree yet; r+1 = # of leaves left
426   int k = hc->Ku-2;   // k = smallest internal node not used as a child yet; k-j = # nodes not used as child yet
427   int j;
428 
429   for (j = hc->Ku-2; j >= 0; j--)  // j = index of next node we add; we add one per iteration
430     {
431       /* Should we join two leaves?
432        *   If we have no internal nodes yet (because we're just starting),
433        *   or the two smallest frequencies are <= the smallest unjoined node's value
434        */
435       if ( (j == hc->Ku-2) ||  (r >= 1 && fq[hc->sorted_at[r]] <= htree[k].val))
436 	{
437 	  htree[j].right = -hc->sorted_at[r];    // leaves are signified by negative indices in tree
438 	  htree[j].left  = -hc->sorted_at[r-1];
439 	  htree[j].val   = fq[hc->sorted_at[r]] + fq[hc->sorted_at[r-1]];
440 	  r -= 2;
441 	}
442 
443       /* Or should we join two nodes?
444        *  If we have no leaves left,
445        *  or (we do have two nodes) and both are smaller than smallest unjoined leaf's value
446        */
447        else if (r == -1  || (k-j >= 2 && htree[k-1].val < fq[hc->sorted_at[r]]))
448 	 {
449 	   htree[j].right = k;
450 	   htree[j].left  = k-1;
451 	   htree[j].val   = htree[k].val + htree[k-1].val;
452 	   k -= 2;
453 	 }
454 
455       /* Otherwise, we join smallest node and smallest leaf. */
456        else
457 	 {
458 	   htree[j].right = -hc->sorted_at[r];
459 	   htree[j].left  = k;
460 	   htree[j].val   = fq[hc->sorted_at[r]] + htree[k].val;
461 	   r--;
462 	   k--;
463 	 }
464     }
465   return eslOK;
466 }
467 
468 
469 /* Calculate code lengths, equal to the depth of each node.
470  * Traverse the tree, calculating depth of each node, starting with
471  * depth 0 for root 0. We don't need a stack for this traversal,
472  * tree is already indexed in traversal order.
473  *
474  * For the Ku=1 edge case, there's no tree; for a single encoded
475  * symbol we set hc->len[0] = 1, hc->Lmax = 1
476  *
477  * Input:
478  *   hc->Ku          is the number of syms w/ nonzero freqs; tree has Ku-1 nodes.
479  *   htree[0..Ku-2]  is the constructed Huffman tree, with right/left/val set.
480  *   htree[].len     has been initialized to 0 for all symbols 0..K
481  *
482  * Output:
483  *   htree's depth field is set.
484  *   hc->len is set for all encoded symbols (left at 0 for unused symbols)
485  *   hc->Lmax is set
486  *
487  * Return:
488  *   <eslOK> on success
489  *   <eslERANGE> if max code length > eslHUFFMAN_MAXCODE and won't fit in uint32_t
490  */
491 static int
huffman_codelengths(ESL_HUFFMAN * hc,struct hufftree_s * htree,const float * fq)492 huffman_codelengths(ESL_HUFFMAN *hc, struct hufftree_s *htree, const float *fq)
493 {
494   int i;
495 
496   if (hc->Ku == 1)
497     {
498       hc->len[ hc->sorted_at[0] ] = 1;
499       hc->Lmax   = 1;
500       return eslOK;
501     }
502 
503   htree[0].depth = 0;
504   for (i = 0; i < hc->Ku-1; i++)
505     {
506       if (htree[i].right <= 0) hc->len[-htree[i].right]    = htree[i].depth + 1;
507       else                     htree[htree[i].right].depth = htree[i].depth + 1;
508 
509       if (htree[i].left <= 0)  hc->len[-htree[i].left]     = htree[i].depth + 1;
510       else                     htree[htree[i].left].depth  = htree[i].depth + 1;
511     }
512 
513   hc->Lmax = 0;
514   for (i = 0; i < hc->K; i++)
515     hc->Lmax = ESL_MAX(hc->len[i], hc->Lmax);
516 
517   return (hc->Lmax > eslHUFFMAN_MAXCODE ? eslERANGE : eslOK);
518 }
519 
520 
521 /* huffman_canonize()
522  * Given code lengths, now we calculate the canonical Huffman encoding.
523  *
524  * Input:
525  *   hc->len[]  code lengths are set for all K (0 for unused symbols)
526  *   hc->code[] have been initialized to 0 for all K
527  *
528  * Output:
529  *   hc->code[] have been set for all used symbols.
530  *   hc->D      number of different code lengths is set
531  *
532  * Returns:
533  *  <eslOK> on success.
534  */
535 static int
huffman_canonize(ESL_HUFFMAN * hc)536 huffman_canonize(ESL_HUFFMAN *hc)
537 {
538   int r;
539 
540   /* Sort symbols according to 1) code length; 2) order in digital alphabet (i.e. symbol code itself)
541    * Reuse/reset <sorted_at>.
542    * You can't just sort the encoded Ku; you have to sort all K, because
543    * quicksort expects indices to be contiguous (0..K-1).
544    */
545   esl_quicksort(hc, hc->K, sort_canonical, hc->sorted_at);
546 
547   /* Assign codes. (All K have been initialized to zero already.) */
548   for (r = 1; r < hc->Ku; r++)
549     hc->code[hc->sorted_at[r]] =
550       (hc->code[hc->sorted_at[r-1]] + 1) << (hc->len[hc->sorted_at[r]] - hc->len[hc->sorted_at[r-1]]);
551 
552   /* Set D, the number of different code lengths */
553   hc->D = 1;
554   for (r = 1; r < hc->Ku; r++)
555     if (hc->len[hc->sorted_at[r]] > hc->len[hc->sorted_at[r-1]]) hc->D++;
556 
557   return eslOK;
558 }
559 
560 
561 /* huffman_decoding_table()
562  * Given a canonical Huffman code; build the table that lets us
563  * efficiently decode it.
564  *
565  * Input:
566  *   hc->K         is set: total # of symbols (inclusive of unused ones)
567  *   hc->Ku        is set: total # of encoded/used symbols
568  *   hc->code      is set: canonical Huffman codes for symbols 0..K-1
569  *   hc->len       is set: code lengths for symbols 0..K-1
570  *   hc->sorted_at is set: canonical Huffman sort order
571  *   hc->Lmax      is set: maximum code length
572  *   hc->D         is set: # of different code lengths
573  *
574  *   hc->dt_len    is allocated for hc->D, but otherwise uninitialized
575  *   hc->dt_lcode  is allocated for hc->D, but otherwise uninitialized
576  *   hc->dt_rank   is allocated for hc->D, but otherwise uninitialized
577  *
578  * Output:
579  *   hc->dt_len    is set: lengths of each used code length 0..D-1
580  *   hc->dt_lcode  is set: left-flushed first code for each code length [d]
581  *   hc->dt_rank   is set: rank r for 1st code for each used code length [d]
582  */
583 static int
huffman_decoding_table(ESL_HUFFMAN * hc)584 huffman_decoding_table(ESL_HUFFMAN *hc)
585 {
586   int r;
587   int D = 0;
588 
589   hc->dt_len[0]   = hc->len[hc->sorted_at[0]];
590   hc->dt_lcode[0] = hc->code[hc->sorted_at[0]] << (eslHUFFMAN_MAXCODE - hc->len[hc->sorted_at[0]]);
591   hc->dt_rank[0]  = 0;
592   for (r = 1; r < hc->Ku; r++)
593     if (hc->len[hc->sorted_at[r]] > hc->len[hc->sorted_at[r-1]])
594       {
595 	D++;
596 	hc->dt_len[D]   = hc->len[hc->sorted_at[r]];
597 	hc->dt_lcode[D] = hc->code[hc->sorted_at[r]] << (eslHUFFMAN_MAXCODE - hc->len[hc->sorted_at[r]]);
598 	hc->dt_rank[D]  = r;
599       }
600   ESL_DASSERT1(( hc->D == D+1 ));
601   return eslOK;
602 }
603 
604 
605 static void
dump_uint32(FILE * fp,uint32_t v,int L)606 dump_uint32(FILE *fp, uint32_t v, int L)
607 {
608   uint32_t mask;
609   int      i;
610 
611   for (mask = 1 << (L-1), i = L; i >= 1; i--, mask = mask >> 1)
612     putc( ((v & mask) ? '1' : '0'), fp);
613 }
614 
615 
616 
617 /* huffman_pack()
618  *
619  * <X[i]> is the current uint32_t unit in the encoded buffer <X>. It
620  * has <a> bits in it so far, maximally left-shifted; therefore (32-a)
621  * bits are available.
622  *
623  * <code> is the next Huffman code to pack into the buffer, of length
624  * <L>, and it's right flush.
625  *
626  *     a=10 used      (32-a)=20 free
627  *    |xxxxxxxxxx|......................| X[i]
628  *    |........................|yyyyyyyy| code, L=8
629  *               |----- w -----|
630  *                w = 32-(a+L)
631  *
632  * If L < 32-a, then we just shift by w and pack it into X[i]. Else,
633  * we shift the other way (by -w), pack what we can into X[i], and
634  * leave the remainder in X[i+1].
635  *
636  * We update <i> and <a> for <X> accordingly... so we pass them by
637  * reference in <ip> and <ap>.
638  */
639 static void
huffman_pack(uint32_t * X,int * ip,int * ap,uint32_t code,int L)640 huffman_pack(uint32_t *X, int *ip, int *ap, uint32_t code, int L)
641 {
642   int w = 32 - (*ap+L);
643 
644   if (w > 0)      // code can pack into X[i]'s available space.
645     {
646       X[*ip] = X[*ip] | (code << w);
647       *ap += L;
648     }
649   else if (w < 0) // code packs partly in X[i], remainder in X[i+1].
650     {
651       X[*ip] = X[*ip] | (code >> (-w));
652       (*ip)++;
653       X[*ip] = code << (32+w);
654       (*ap) = -w;
655     }
656   else           // code packs exactly; w=0, no leftshift needed, OR it as is.
657     {
658       X[*ip] = X[*ip] | code;
659       *ip += 1;
660       *ap =  0;
661       X[*ip] = 0;  // don't forget to initialize X[i+1]!
662     }
663 }
664 
665 
666 
667 /* huffman_unpack()
668  * *vp  : ptr to v; v = next 32 bits
669  * *X   : encoded input
670  *  n   : length of input (in uint32_t)
671  * *ip  : current position in <X>
672  * *ap  : number of bits left in X[*ip]
673  *
674  * If we have to buffer X (say, if we're reading it from
675  * a long input) we'll have to redesign. Right now we assume
676  * it's just an array.
677  */
678 static void
huffman_unpack(const ESL_HUFFMAN * hc,uint32_t * vp,const uint32_t * X,int n,int * ip,int * ap,char * ret_x,int * ret_L)679 huffman_unpack(const ESL_HUFFMAN *hc, uint32_t *vp, const uint32_t *X, int n, int *ip, int *ap, char *ret_x, int *ret_L)
680 {
681   int      L,D;
682   int      idx;
683   uint32_t w;
684 
685   for (D = 0; D < hc->D-1; D++)
686     if ((*vp) < hc->dt_lcode[D+1]) break;
687   L = hc->dt_len[D];
688   /* L is now the next code's length (prefix of v) */
689 
690   /* Decode, by taking advantage of lexicographic sort/numerical order of canonical code, within each L */
691   idx     = hc->dt_rank[D] +  ( ((*vp) - hc->dt_lcode[D]) >> (eslHUFFMAN_MAXCODE-L) );
692 
693   /* Now refill v, as much as we can, from bits in X[i] and X[i+1], and update i, a */
694   *vp  = ( (*vp) << L);                // Remove L bits from *vp by leftshifting it.
695 
696   if (*ip < n) {                      // Take either L or all *ap bits from X[i], if it exists.
697     w    = X[*ip] << (32-(*ap));      // Shift off the bits we already used in X[i]. w is now X[i], left-flushed.
698     *vp |= (w >> (32-L));             // Right-shift w into position, leaving it with leading 0's where *vp already has bits.
699     *ap -= L;                         // We used up to L bits from X[i]
700 
701     // if *ap is still >0, we have bits left to use in X[i]. Otherwise:
702     if (*ap == 0)                       // If we exactly finished off X[i]:
703       {
704 	(*ip)++;                        // then advance in X[].
705 	*ap = 32;
706       }
707     else if (*ap < 0)                   // If we finished off X[i] but still need some bits
708       {
709 	(*ip)++;                        //   then go on to X[i+1] and 32 fresh bits.
710 	if (*ip < n)                    //   If it exists...
711 	  {                             //       (...no, I don't like all these branches either...)
712 	    *ap += 32;                  //     then we're going to leave it w/ <*ap> bits
713 	    *vp |= (X[*ip] >> *ap);     //     after taking the bits we need to fill v
714 	  }
715 	else
716 	  {
717 	    *ap = 0;                      //   If X[i+1] doesn't exist, leave *ip = n and *ap = 0; out of data in X (though not necessarily in v)
718 	  }
719       }
720   }
721 
722   *ret_x  = (char) hc->sorted_at[idx];
723   *ret_L  = L;
724 }
725 
726 
727 /*****************************************************************
728  * 6. Unit tests
729  *****************************************************************/
730 #ifdef eslHUFFMAN_TESTDRIVE
731 #include "esl_random.h"
732 #include "esl_randomseq.h"
733 #include "esl_vectorops.h"
734 
735 #include <string.h>
736 
737 
738 static void
utest_kryptos(ESL_RANDOMNESS * rng)739 utest_kryptos(ESL_RANDOMNESS *rng)
740 {
741   char         msg[]   = "kryptos utest failed";
742   ESL_HUFFMAN *hc      = NULL;
743   char         T[]     = "BETWEEN SUBTLE SHADING AND THE ABSENCE OF LIGHT LIES THE NUANCE OF IQLUSION";
744   int          n       = strlen(T);
745   uint32_t    *X       = NULL;
746   int          nb;
747   char        *T2      = NULL;
748   int          n2;
749   float        fq[128];
750   int          K       = 128;
751   int          i;
752   int          status;
753 
754   /* Any half-assed frequency distribution will do for this, over [ A-Z] */
755   for (i = 0;   i <  128; i++) fq[i] = 0.;
756   for (i = 'A'; i <= 'Z'; i++) fq[i] = esl_random(rng);
757   fq[' '] = esl_random(rng);
758   esl_vec_FNorm(fq, 128);
759 
760   if (( status = esl_huffman_Build (fq, K, &hc) )         != eslOK) esl_fatal(msg);
761   if (( status = esl_huffman_Encode(hc, T, n, &X, &nb))   != eslOK) esl_fatal(msg);
762   if (( status = esl_huffman_Decode(hc, X, nb, &T2, &n2)) != eslOK) esl_fatal(msg);
763 
764   //esl_huffman_Dump(stdout, hc);
765   //printf("%s\n", T);
766   //printf("%s\n", T2);
767 
768   if (n2 != n)            esl_fatal(msg);
769   if (strcmp(T, T2) != 0) esl_fatal(msg);
770 
771   free(X);
772   free(T2);
773   esl_huffman_Destroy(hc);
774 }
775 
776 /* utest_uniletter()
777  * Tests an edge case of a text consisting of a single letter, Ku=1.
778  * (Ku=1 cases get tested occasionally by utest_backandforth() too.)
779  */
780 static void
utest_uniletter(void)781 utest_uniletter(void)
782 {
783   char   msg[]    = "uniletter utest failed";
784   char   T[]      = "AAAAAAAAAA";
785   int    n        = strlen(T);
786   int    K        = 128;
787   float  fq[128];
788   ESL_HUFFMAN *hc = NULL;
789   uint32_t    *X  = NULL;
790   int          nb;
791   char        *T2 = NULL;
792   int          n2;
793   int          i;
794   int          status;
795 
796   for (i = 0; i < 128; i++) fq[i] = 0.;
797   fq['A'] = (float) n;
798 
799   if (( status = esl_huffman_Build (fq, K, &hc) )         != eslOK) esl_fatal(msg);
800   if (( status = esl_huffman_Encode(hc, T, n, &X, &nb))   != eslOK) esl_fatal(msg);
801   if (( status = esl_huffman_Decode(hc, X, nb, &T2, &n2)) != eslOK) esl_fatal(msg);
802 
803   if (n2 != n)            esl_fatal(msg);
804   if (strcmp(T, T2) != 0) esl_fatal(msg);
805 
806   free(X);
807   free(T2);
808   esl_huffman_Destroy(hc);
809 }
810 
811 
812 /* utest_backandforth()
813  * Encode and decode a random text string, and test
814  * that it decodes to the original.
815  */
816 static void
utest_backandforth(ESL_RANDOMNESS * rng)817 utest_backandforth(ESL_RANDOMNESS *rng)
818 {
819   char         msg[] = "back and forth utest failed";
820   ESL_HUFFMAN *hc    = NULL;
821   double      *fq0   = NULL;
822   float       *fq    = NULL;
823   int          K;             // alphabet size: randomly chosen from 1..128
824   char        *T     = NULL;  // random plaintext
825   int          n;             // randomly chosen length of plaintext T
826   uint32_t    *X     = NULL;  // Huffman-coded bit stream
827   int          nb;	      // length of X in bits
828   char        *T2    = NULL;  // decoded plaintext
829   int          n2;            // length of T2 in chars
830   int          i;
831   int          status;
832 
833   /* Sample a zero-peppered frequency distribution <fq> for a randomly
834    * selected alphabet size <K>.
835    */
836   K  = 1 + esl_rnd_Roll(rng, 128);                  // Choose a random alphabet size from 1 to 128
837   if (( fq0 = malloc(sizeof(double) * K)) == NULL) esl_fatal(msg);  // esl_random works in doubles
838   if (( fq  = malloc(sizeof(float)  * K)) == NULL) esl_fatal(msg);  // esl_huffman works in floats
839   esl_rnd_Dirichlet(rng, NULL, K, fq0);             // Sample a uniform random probability vector
840   for (i = 0; i < K; i++)                           // Pepper it with exact 0's while converting to float
841     fq[i] =  ( esl_rnd_Roll(rng, 4) == 0 ? 0. : (float) fq0[i] );
842   esl_vec_FNorm(fq, K);                             // and renormalize. (edge case: if fq was all 0, now it's uniform.)
843 
844   /* Sample a random plaintext array <T>, of randomly selected length <n>.
845    * We're using codes 0..K-1 -- T is not a string, it's an array -- don't \0 it.
846    */
847   n = 1 + esl_rnd_Roll(rng, 10);
848   if (( T = malloc(sizeof(char) * (n+1))) == NULL) esl_fatal(msg);
849   for (i = 0; i < n; i++) T[i] = esl_rnd_FChoose(rng,fq,K);
850 
851   if (( status = esl_huffman_Build (fq, K, &hc) )         != eslOK) esl_fatal(msg);
852   if (( status = esl_huffman_Encode(hc, T, n, &X, &nb))   != eslOK) esl_fatal(msg);
853   if (( status = esl_huffman_Decode(hc, X, nb, &T2, &n2)) != eslOK) esl_fatal(msg);
854 
855   //esl_huffman_Dump(stdout, hc);
856 
857   if ( n2 != n)               esl_fatal(msg);
858   if ( memcmp(T, T2, n) != 0) esl_fatal(msg);
859 
860   free(T2);
861   free(X);
862   free(fq0);
863   free(fq);
864   free(T);
865   esl_huffman_Destroy(hc);
866 }
867 
868 
869 
870 
871 #endif /*eslHUFFMAN_TESTDRIVE*/
872 
873 
874 
875 
876 /*****************************************************************
877  * 7. Test driver
878  *****************************************************************/
879 #ifdef eslHUFFMAN_TESTDRIVE
880 #include "esl_config.h"
881 
882 #include <stdio.h>
883 
884 #include "easel.h"
885 #include "esl_getopts.h"
886 #include "esl_huffman.h"
887 #include "esl_random.h"
888 
889 static ESL_OPTIONS options[] = {
890    /* name  type         default  env   range togs  reqs  incomp  help                docgrp */
891   {"-h",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage",             0},
892   {"-s",  eslARG_INT,       "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>",   0},
893   { 0,0,0,0,0,0,0,0,0,0},
894 };
895 static char usage[]  = "[-options]";
896 static char banner[] = "test driver for huffman module";
897 
898 int
main(int argc,char ** argv)899 main(int argc, char **argv)
900 {
901   ESL_GETOPTS    *go   = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
902   ESL_RANDOMNESS *rng  = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
903 
904   fprintf(stderr, "## %s\n", argv[0]);
905   fprintf(stderr, "#  rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
906 
907   utest_kryptos     (rng);
908   utest_uniletter   (   );
909   utest_backandforth(rng);
910 
911   fprintf(stderr, "#  status = ok\n");
912 
913   esl_getopts_Destroy(go);
914   esl_randomness_Destroy(rng);
915   return eslOK;
916 }
917 #endif /*eslHUFFMAN_TESTDRIVE*/
918 
919 
920 
921 
922 /*****************************************************************
923  * 8. Examples
924  *****************************************************************/
925 #ifdef eslHUFFMAN_EXAMPLE2
926 
927 /* esl_huffman_example2 <fqfile>
928  *
929  * The input <fqfile> consists of N lines with
930  * two whitespace-delimited fields:
931  *    <label>  <frequency>
932  *
933  * Huffman code the frequency distribution, and output the resulting
934  * encoding.
935  */
936 
937 
938 #include "easel.h"
939 #include "esl_buffer.h"
940 #include "esl_huffman.h"
941 #include "esl_mem.h"
942 #include "esl_vectorops.h"
943 
944 int
main(int argc,char ** argv)945 main(int argc, char **argv)
946 {
947   ESL_HUFFMAN *hc     = NULL;
948   ESL_BUFFER  *bf     = NULL;
949   esl_pos_t    n;
950   char        *p;
951   char        *tok;
952   esl_pos_t    toklen;
953   int          kalloc = 16;
954   char       **label  = malloc(sizeof(char *) * kalloc);
955   float       *fq     = malloc(sizeof(float)  * kalloc);
956   int          K      = 0;
957   float        meanL  = 0.;
958   int          i;
959   int          status;
960 
961   status = esl_buffer_Open(argv[1], NULL, &bf);
962   if      (status == eslENOTFOUND) esl_fatal("open failed: %s",   bf->errmsg);
963   else if (status == eslFAIL)      esl_fatal("gzip -dc failed: %s", bf->errmsg);
964   else if (status != eslOK)        esl_fatal("open failed with error code %d", status);
965 
966   while (( status = esl_buffer_GetLine(bf, &p, &n)) == eslOK)
967     {
968       if ( esl_memtok(&p, &n, " \t\n", &tok, &toklen) != eslOK) continue;
969       if ( esl_memstrdup(tok, toklen, &(label[K]))    != eslOK) continue;
970       if ( esl_mem_strtof(p, n, NULL, &(fq[K]))       != eslOK) continue;
971 
972       if (++K == kalloc) {
973 	kalloc *= 2;
974 	label = realloc(label, sizeof(char *) * kalloc);
975 	fq    = realloc(fq,    sizeof(float)  * kalloc);
976       }
977     }
978   esl_vec_FNorm(fq, K);
979 
980   if (( status = esl_huffman_Build(fq, K, &hc)) != eslOK) esl_fatal("failed to build huffman code");
981 
982   for (i = 0; i < K; i++)
983     {
984       printf("%-10s %2d ", label[i], hc->len[i]);
985       dump_uint32(stdout, hc->code[i], hc->len[i]);
986       printf("\n");
987     }
988 
989   for (i = 0; i < K; i++)
990     meanL += (float) hc->len[i] * fq[i];
991   printf("\nMean code length = %.2f bits\n", meanL);
992 
993   for (i = 0; i < K; i++) free(label[i]);
994   free(label);
995   free(fq);
996   esl_huffman_Destroy(hc);
997   esl_buffer_Close(bf);
998   return 0;
999 }
1000 #endif /*eslHUFFMAN_EXAMPLE2*/
1001 
1002 
1003 
1004 
1005 #ifdef eslHUFFMAN_EXAMPLE
1006 
1007 #include "easel.h"
1008 #include "esl_huffman.h"
1009 
1010 #include <stdio.h>
1011 #include <string.h>
1012 
1013 /* Given an open <fp> for reading;
1014  * input text from it and return it as a single string.
1015  * Optionally return the number of characters in <opt_n>.
1016  * Convert all \n to spaces.
1017  */
1018 static char *
read_text(FILE * fp,int * opt_n)1019 read_text(FILE *fp, int *opt_n)
1020 {
1021   int   maxlinelen = 4096;
1022   char *text       = malloc(sizeof(char) * maxlinelen);
1023   int   n          = 0;
1024   char *p;
1025 
1026   while (fgets(text+n, maxlinelen-1, fp) != NULL)
1027     {
1028       for (p = text+n; *p != '\0'; p++)
1029 	if (*p == '\n') *p = ' ';
1030       n   += strlen(text+n);
1031       text = realloc(text, sizeof(char) * (n+maxlinelen));
1032     }
1033 
1034   if (opt_n) *opt_n = n;
1035   return text;
1036 }
1037 
1038 int
main(int argc,char ** argv)1039 main(int argc, char **argv)
1040 {
1041   FILE     *fp = fopen(argv[1], "r");
1042   int       n;
1043   char     *T  = read_text(fp, &n);
1044   uint32_t *X  = NULL;
1045   float     fq[128];
1046   int       c,i;
1047   int       nb;
1048   ESL_HUFFMAN *hc   = NULL;
1049   char        *newT = NULL;
1050   int          nT;
1051 
1052   /* You provide a frequency table for your digital alphabet 0..K-1.
1053    * It's fine for there to be 0-frequency characters, even many of them;
1054    * they will not be encoded, and cost nothing.
1055    * Here, our digital alphabet is 7-bit ASCII text, 0..127, K=128.
1056    */
1057   for (c = 0; c < 128; c++) fq[c]           = 0.;
1058   for (i = 0; i < n;   i++) fq[(int) T[i]] += 1.;
1059 
1060   /* There does have to be at least one character to encode, of course. */
1061   ESL_DASSERT1(( n > 0 ));
1062 
1063   esl_huffman_Build(fq, 128, &hc);
1064 
1065   esl_huffman_Dump(stdout, hc);
1066 
1067   esl_huffman_Encode(hc, T, n, &X, &nb);
1068 
1069   printf("\nOriginal:   %d bytes\n", n);
1070   printf("Compressed: %d bytes (%d bits)\n", 4*(nb+31)/32, nb);
1071 
1072   /* Dump the compresstext, up to 30 words of it */
1073   printf("\nCompressed text:\n");
1074   for (i = 0; i < ESL_MIN(30, (nb+31)/32); i++) {
1075     dump_uint32(stdout, X[i], 32);
1076     fputc('\n', stdout);
1077   }
1078 
1079   esl_huffman_Decode(hc, X, nb, &newT, &nT);
1080 
1081   /* Show the decoded plaintext, up to 100 chars of it */
1082   printf("\nDecoded text:\n");
1083   for (i = 0; i < ESL_MIN(100, nT); i++)
1084     fputc(newT[i], stdout);
1085   fputc('\n', stdout);
1086 
1087   esl_huffman_Destroy(hc);
1088   free(T);
1089   fclose(fp);
1090   return 0;
1091 }
1092 #endif /*eslHUFFMAN_EXAMPLE*/
1093 
1094