1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2009 Carnegie Mellon University.  All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 
38 #include <string.h>
39 
40 #include "sphinxbase/huff_code.h"
41 #include "sphinxbase/ckd_alloc.h"
42 #include "sphinxbase/hash_table.h"
43 #include "sphinxbase/byteorder.h"
44 #include "sphinxbase/heap.h"
45 #include "sphinxbase/pio.h"
46 #include "sphinxbase/err.h"
47 
48 typedef struct huff_node_s {
49     int nbits;
50     struct huff_node_s *l;
51     union {
52         int32 ival;
53         char *sval;
54         struct huff_node_s *r;
55     } r;
56 } huff_node_t;
57 
58 typedef struct huff_codeword_s {
59     union {
60         int32 ival;
61         char *sval;
62     } r;
63     uint32 nbits, codeword;
64 } huff_codeword_t;
65 
66 enum {
67     HUFF_CODE_INT,
68     HUFF_CODE_STR
69 };
70 
71 struct huff_code_s {
72     int16 refcount;
73     uint8 maxbits;
74     uint8 type;
75     uint32 *firstcode;
76     uint32 *numl;
77     huff_codeword_t **syms;
78     hash_table_t *codewords;
79     FILE *fh;
80     bit_encode_t *be;
81     int boff;
82 };
83 
84 static huff_node_t *
huff_node_new_int(int32 val)85 huff_node_new_int(int32 val)
86 {
87     huff_node_t *hn = ckd_calloc(1, sizeof(*hn));
88     hn->r.ival = val;
89     return hn;
90 }
91 
92 static huff_node_t *
huff_node_new_str(char const * val)93 huff_node_new_str(char const *val)
94 {
95     huff_node_t *hn = ckd_calloc(1, sizeof(*hn));
96     hn->r.sval = ckd_salloc(val);
97     return hn;
98 }
99 
100 static huff_node_t *
huff_node_new_parent(huff_node_t * l,huff_node_t * r)101 huff_node_new_parent(huff_node_t *l, huff_node_t *r)
102 {
103     huff_node_t *hn = ckd_calloc(1, sizeof(*hn));
104     hn->l = l;
105     hn->r.r = r;
106     /* Propagate maximum bit length. */
107     if (r->nbits > l->nbits)
108         hn->nbits = r->nbits + 1;
109     else
110         hn->nbits = l->nbits + 1;
111     return hn;
112 }
113 
114 static void
huff_node_free_int(huff_node_t * root)115 huff_node_free_int(huff_node_t *root)
116 {
117     if (root->l) {
118         huff_node_free_int(root->l);
119         huff_node_free_int(root->r.r);
120     }
121     ckd_free(root);
122 }
123 
124 static void
huff_node_free_str(huff_node_t * root,int freestr)125 huff_node_free_str(huff_node_t *root, int freestr)
126 {
127     if (root->l) {
128         huff_node_free_str(root->l, freestr);
129         huff_node_free_str(root->r.r, freestr);
130     }
131     else {
132         if (freestr)
133             ckd_free(root->r.sval);
134     }
135     ckd_free(root);
136 }
137 
138 static huff_node_t *
huff_code_build_tree(heap_t * q)139 huff_code_build_tree(heap_t *q)
140 {
141     huff_node_t *root = NULL;
142     int32 rf;
143 
144     while (heap_size(q) > 1) {
145         huff_node_t *l, *r, *p;
146         int32 lf, rf;
147 
148         heap_pop(q, (void *)&l, &lf);
149         heap_pop(q, (void *)&r, &rf);
150         p = huff_node_new_parent(l, r);
151         heap_insert(q, p, lf + rf);
152     }
153     heap_pop(q, (void **)&root, &rf);
154     return root;
155 }
156 
157 static void
huff_code_canonicalize(huff_code_t * hc,huff_node_t * root)158 huff_code_canonicalize(huff_code_t *hc, huff_node_t *root)
159 {
160     glist_t agenda;
161     uint32 *nextcode;
162     int i, ncw;
163 
164     hc->firstcode = ckd_calloc(hc->maxbits+1, sizeof(*hc->firstcode));
165     hc->syms = ckd_calloc(hc->maxbits+1, sizeof(*hc->syms));
166     hc->numl = ckd_calloc(hc->maxbits+1, sizeof(*nextcode));
167     nextcode = ckd_calloc(hc->maxbits+1, sizeof(*nextcode));
168 
169     /* Traverse the tree, annotating it with the actual bit
170      * lengths, and histogramming them in numl. */
171     root->nbits = 0;
172     ncw = 0;
173     agenda = glist_add_ptr(NULL, root);
174     while (agenda) {
175         huff_node_t *node = gnode_ptr(agenda);
176         agenda = gnode_free(agenda, NULL);
177         if (node->l) {
178             node->l->nbits = node->nbits + 1;
179             agenda = glist_add_ptr(agenda, node->l);
180             node->r.r->nbits = node->nbits + 1;
181             agenda = glist_add_ptr(agenda, node->r.r);
182         }
183         else {
184             hc->numl[node->nbits]++;
185             ncw++;
186         }
187     }
188     /* Create starting codes and symbol tables for each bit length. */
189     hc->syms[hc->maxbits] = ckd_calloc(hc->numl[hc->maxbits], sizeof(**hc->syms));
190     for (i = hc->maxbits - 1; i > 0; --i) {
191         hc->firstcode[i] = (hc->firstcode[i+1] + hc->numl[i+1]) / 2;
192         hc->syms[i] = ckd_calloc(hc->numl[i], sizeof(**hc->syms));
193     }
194     memcpy(nextcode, hc->firstcode, (hc->maxbits + 1) * sizeof(*nextcode));
195     /* Traverse the tree again to produce the codebook itself. */
196     hc->codewords = hash_table_new(ncw, HASH_CASE_YES);
197     agenda = glist_add_ptr(NULL, root);
198     while (agenda) {
199         huff_node_t *node = gnode_ptr(agenda);
200         agenda = gnode_free(agenda, NULL);
201         if (node->l) {
202             agenda = glist_add_ptr(agenda, node->l);
203             agenda = glist_add_ptr(agenda, node->r.r);
204         }
205         else {
206             /* Initialize codebook entry, which also retains symbol pointer. */
207             huff_codeword_t *cw;
208             uint32 codeword = nextcode[node->nbits] & ((1 << node->nbits) - 1);
209             cw = hc->syms[node->nbits] + (codeword - hc->firstcode[node->nbits]);
210             cw->nbits = node->nbits;
211             cw->r.sval = node->r.sval; /* Will copy ints too... */
212             cw->codeword = codeword;
213             if (hc->type == HUFF_CODE_INT) {
214                 hash_table_enter_bkey(hc->codewords,
215                                       (char const *)&cw->r.ival,
216                                       sizeof(cw->r.ival),
217                                       (void *)cw);
218             }
219             else {
220                 hash_table_enter(hc->codewords, cw->r.sval, (void *)cw);
221             }
222             ++nextcode[node->nbits];
223         }
224     }
225     ckd_free(nextcode);
226 }
227 
228 huff_code_t *
huff_code_build_int(int32 const * values,int32 const * frequencies,int nvals)229 huff_code_build_int(int32 const *values, int32 const *frequencies, int nvals)
230 {
231     huff_code_t *hc;
232     huff_node_t *root;
233     heap_t *q;
234     int i;
235 
236     hc = ckd_calloc(1, sizeof(*hc));
237     hc->refcount = 1;
238     hc->type = HUFF_CODE_INT;
239 
240     /* Initialize the heap with nodes for each symbol. */
241     q = heap_new();
242     for (i = 0; i < nvals; ++i) {
243         heap_insert(q,
244                     huff_node_new_int(values[i]),
245                     frequencies[i]);
246     }
247 
248     /* Now build the tree, which gives us codeword lengths. */
249     root = huff_code_build_tree(q);
250     heap_destroy(q);
251     if (root == NULL || root->nbits > 32) {
252         E_ERROR("Huffman trees currently limited to 32 bits\n");
253         huff_node_free_int(root);
254         huff_code_free(hc);
255         return NULL;
256     }
257 
258     /* Build a canonical codebook. */
259     hc->maxbits = root->nbits;
260     huff_code_canonicalize(hc, root);
261 
262     /* Tree no longer needed. */
263     huff_node_free_int(root);
264 
265     return hc;
266 }
267 
268 huff_code_t *
huff_code_build_str(char * const * values,int32 const * frequencies,int nvals)269 huff_code_build_str(char * const *values, int32 const *frequencies, int nvals)
270 {
271     huff_code_t *hc;
272     huff_node_t *root;
273     heap_t *q;
274     int i;
275 
276     hc = ckd_calloc(1, sizeof(*hc));
277     hc->refcount = 1;
278     hc->type = HUFF_CODE_STR;
279 
280     /* Initialize the heap with nodes for each symbol. */
281     q = heap_new();
282     for (i = 0; i < nvals; ++i) {
283         heap_insert(q,
284                     huff_node_new_str(values[i]),
285                     frequencies[i]);
286     }
287 
288     /* Now build the tree, which gives us codeword lengths. */
289     root = huff_code_build_tree(q);
290     heap_destroy(q);
291     if (root == NULL || root->nbits > 32) {
292         E_ERROR("Huffman trees currently limited to 32 bits\n");
293         huff_node_free_str(root, TRUE);
294         huff_code_free(hc);
295         return NULL;
296     }
297 
298     /* Build a canonical codebook. */
299     hc->maxbits = root->nbits;
300     huff_code_canonicalize(hc, root);
301 
302     /* Tree no longer needed (note we retain pointers to its strings). */
303     huff_node_free_str(root, FALSE);
304 
305     return hc;
306 }
307 
308 huff_code_t *
huff_code_read(FILE * infh)309 huff_code_read(FILE *infh)
310 {
311     huff_code_t *hc;
312     int i, j;
313 
314     hc = ckd_calloc(1, sizeof(*hc));
315     hc->refcount = 1;
316 
317     hc->maxbits = fgetc(infh);
318     hc->type = fgetc(infh);
319 
320     /* Two bytes of padding. */
321     fgetc(infh);
322     fgetc(infh);
323 
324     /* Allocate stuff. */
325     hc->firstcode = ckd_calloc(hc->maxbits + 1, sizeof(*hc->firstcode));
326     hc->numl = ckd_calloc(hc->maxbits + 1, sizeof(*hc->numl));
327     hc->syms = ckd_calloc(hc->maxbits + 1, sizeof(*hc->syms));
328 
329     /* Read the symbol tables. */
330     hc->codewords = hash_table_new(hc->maxbits, HASH_CASE_YES);
331     for (i = 1; i <= hc->maxbits; ++i) {
332         if (fread(&hc->firstcode[i], 4, 1, infh) != 1)
333             goto error_out;
334         SWAP_BE_32(&hc->firstcode[i]);
335         if (fread(&hc->numl[i], 4, 1, infh) != 1)
336             goto error_out;
337         SWAP_BE_32(&hc->numl[i]);
338         hc->syms[i] = ckd_calloc(hc->numl[i], sizeof(**hc->syms));
339         for (j = 0; j < hc->numl[i]; ++j) {
340             huff_codeword_t *cw = &hc->syms[i][j];
341             cw->nbits = i;
342             cw->codeword = hc->firstcode[i] + j;
343             if (hc->type == HUFF_CODE_INT) {
344                 if (fread(&cw->r.ival, 4, 1, infh) != 1)
345                     goto error_out;
346                 SWAP_BE_32(&cw->r.ival);
347                 hash_table_enter_bkey(hc->codewords,
348                                       (char const *)&cw->r.ival,
349                                       sizeof(cw->r.ival),
350                                       (void *)cw);
351             }
352             else {
353                 size_t len;
354                 cw->r.sval = fread_line(infh, &len);
355                 cw->r.sval[len-1] = '\0';
356                 hash_table_enter(hc->codewords, cw->r.sval, (void *)cw);
357             }
358         }
359     }
360 
361     return hc;
362 error_out:
363     huff_code_free(hc);
364     return NULL;
365 }
366 
367 int
huff_code_write(huff_code_t * hc,FILE * outfh)368 huff_code_write(huff_code_t *hc, FILE *outfh)
369 {
370     int i, j;
371 
372     /* Maximum codeword length */
373     fputc(hc->maxbits, outfh);
374     /* Symbol type */
375     fputc(hc->type, outfh);
376     /* Two extra bytes (for future use and alignment) */
377     fputc(0, outfh);
378     fputc(0, outfh);
379     /* For each codeword length: */
380     for (i = 1; i <= hc->maxbits; ++i) {
381         uint32 val;
382 
383         /* Starting code, number of codes. */
384         val = hc->firstcode[i];
385         /* Canonically big-endian (like the data itself) */
386         SWAP_BE_32(&val);
387         fwrite(&val, 4, 1, outfh);
388         val = hc->numl[i];
389         SWAP_BE_32(&val);
390         fwrite(&val, 4, 1, outfh);
391 
392         /* Symbols for each code (FIXME: Should compress these too) */
393         for (j = 0; j < hc->numl[i]; ++j) {
394             if (hc->type == HUFF_CODE_INT) {
395                 int32 val = hc->syms[i][j].r.ival;
396                 SWAP_BE_32(&val);
397                 fwrite(&val, 4, 1, outfh);
398             }
399             else {
400                 /* Write them all separated by newlines, so that
401                  * fgets() will read them for us. */
402                 fprintf(outfh, "%s\n", hc->syms[i][j].r.sval);
403             }
404         }
405     }
406     return 0;
407 }
408 
409 int
huff_code_dump_codebits(FILE * dumpfh,uint32 nbits,uint32 codeword)410 huff_code_dump_codebits(FILE *dumpfh, uint32 nbits, uint32 codeword)
411 {
412     uint32 i;
413 
414     for (i = 0; i < nbits; ++i)
415         fputc((codeword & (1<<(nbits-i-1))) ? '1' : '0', dumpfh);
416     return 0;
417 }
418 
419 int
huff_code_dump(huff_code_t * hc,FILE * dumpfh)420 huff_code_dump(huff_code_t *hc, FILE *dumpfh)
421 {
422     int i, j;
423 
424     /* Print out all codewords. */
425     fprintf(dumpfh, "Maximum codeword length: %d\n", hc->maxbits);
426     fprintf(dumpfh, "Symbols are %s\n", (hc->type == HUFF_CODE_STR) ? "strings" : "ints");
427     fprintf(dumpfh, "Codewords:\n");
428     for (i = 1; i <= hc->maxbits; ++i) {
429         for (j = 0; j < hc->numl[i]; ++j) {
430             if (hc->type == HUFF_CODE_STR)
431                 fprintf(dumpfh, "%-30s", hc->syms[i][j].r.sval);
432             else
433                 fprintf(dumpfh, "%-30d", hc->syms[i][j].r.ival);
434             huff_code_dump_codebits(dumpfh, hc->syms[i][j].nbits,
435                                     hc->syms[i][j].codeword);
436             fprintf(dumpfh, "\n");
437         }
438     }
439     return 0;
440 }
441 
442 huff_code_t *
huff_code_retain(huff_code_t * hc)443 huff_code_retain(huff_code_t *hc)
444 {
445     ++hc->refcount;
446     return hc;
447 }
448 
449 int
huff_code_free(huff_code_t * hc)450 huff_code_free(huff_code_t *hc)
451 {
452     int i;
453 
454     if (hc == NULL)
455         return 0;
456     if (--hc->refcount > 0)
457         return hc->refcount;
458     for (i = 0; i <= hc->maxbits; ++i) {
459         int j;
460         for (j = 0; j < hc->numl[i]; ++j) {
461             if (hc->type == HUFF_CODE_STR)
462                 ckd_free(hc->syms[i][j].r.sval);
463         }
464         ckd_free(hc->syms[i]);
465     }
466     ckd_free(hc->firstcode);
467     ckd_free(hc->numl);
468     ckd_free(hc->syms);
469     hash_table_free(hc->codewords);
470     ckd_free(hc);
471     return 0;
472 }
473 
474 FILE *
huff_code_attach(huff_code_t * hc,FILE * fh,char const * mode)475 huff_code_attach(huff_code_t *hc, FILE *fh, char const *mode)
476 {
477     FILE *oldfh = huff_code_detach(hc);
478 
479     hc->fh = fh;
480     if (mode[0] == 'w')
481         hc->be = bit_encode_attach(hc->fh);
482     return oldfh;
483 }
484 
485 FILE *
huff_code_detach(huff_code_t * hc)486 huff_code_detach(huff_code_t *hc)
487 {
488     FILE *oldfh = hc->fh;
489 
490     if (hc->be) {
491         bit_encode_flush(hc->be);
492         bit_encode_free(hc->be);
493         hc->be = NULL;
494     }
495     hc->fh = NULL;
496     return oldfh;
497 }
498 
499 int
huff_code_encode_int(huff_code_t * hc,int32 sym,uint32 * outcw)500 huff_code_encode_int(huff_code_t *hc, int32 sym, uint32 *outcw)
501 {
502     huff_codeword_t *cw;
503 
504     if (hash_table_lookup_bkey(hc->codewords,
505                                (char const *)&sym,
506                                sizeof(sym),
507                                (void **)&cw) < 0)
508         return 0;
509     if (hc->be)
510         bit_encode_write_cw(hc->be, cw->codeword, cw->nbits);
511     if (outcw) *outcw = cw->codeword;
512     return cw->nbits;
513 }
514 
515 int
huff_code_encode_str(huff_code_t * hc,char const * sym,uint32 * outcw)516 huff_code_encode_str(huff_code_t *hc, char const *sym, uint32 *outcw)
517 {
518     huff_codeword_t *cw;
519 
520     if (hash_table_lookup(hc->codewords,
521                           sym,
522                           (void **)&cw) < 0)
523         return 0;
524     if (hc->be)
525         bit_encode_write_cw(hc->be, cw->codeword, cw->nbits);
526     if (outcw) *outcw = cw->codeword;
527     return cw->nbits;
528 }
529 
530 static huff_codeword_t *
huff_code_decode_data(huff_code_t * hc,char const ** inout_data,size_t * inout_data_len,int * inout_offset)531 huff_code_decode_data(huff_code_t *hc, char const **inout_data,
532                       size_t *inout_data_len, int *inout_offset)
533 {
534     char const *data = *inout_data;
535     char const *end = data + *inout_data_len;
536     int offset = *inout_offset;
537     uint32 cw;
538     int cwlen;
539     int byte;
540 
541     if (data == end)
542         return NULL;
543     byte = *data++;
544     cw = !!(byte & (1 << (7-offset++)));
545     cwlen = 1;
546     /* printf("%.*x ", cwlen, cw); */
547     while (cwlen <= hc->maxbits && cw < hc->firstcode[cwlen]) {
548         ++cwlen;
549         cw <<= 1;
550         if (offset > 7) {
551             if (data == end)
552                 return NULL;
553             byte = *data++;
554             offset = 0;
555         }
556         cw |= !!(byte & (1 << (7-offset++)));
557         /* printf("%.*x ", cwlen, cw); */
558     }
559     if (cwlen > hc->maxbits) /* FAIL: invalid data */
560         return NULL;
561 
562     /* Put the last byte back if there are bits left over. */
563     if (offset < 8)
564         --data;
565     else
566         offset = 0;
567 
568     /* printf("%.*x\n", cwlen, cw); */
569     *inout_data_len = end - data;
570     *inout_data = data;
571     *inout_offset = offset;
572     return hc->syms[cwlen] + (cw - hc->firstcode[cwlen]);
573 }
574 
575 static huff_codeword_t *
huff_code_decode_fh(huff_code_t * hc)576 huff_code_decode_fh(huff_code_t *hc)
577 {
578     uint32 cw;
579     int cwlen;
580     int byte;
581 
582     if ((byte = fgetc(hc->fh)) == EOF)
583         return NULL;
584     cw = !!(byte & (1 << (7-hc->boff++)));
585     cwlen = 1;
586     /* printf("%.*x ", cwlen, cw); */
587     while (cwlen <= hc->maxbits && cw < hc->firstcode[cwlen]) {
588         ++cwlen;
589         cw <<= 1;
590         if (hc->boff > 7) {
591             if ((byte = fgetc(hc->fh)) == EOF)
592                 return NULL;
593             hc->boff = 0;
594         }
595         cw |= !!(byte & (1 << (7-hc->boff++)));
596         /* printf("%.*x ", cwlen, cw); */
597     }
598     if (cwlen > hc->maxbits) /* FAIL: invalid data */
599         return NULL;
600 
601     /* Put the last byte back if there are bits left over. */
602     if (hc->boff < 8)
603         ungetc(byte, hc->fh);
604     else
605         hc->boff = 0;
606 
607     /* printf("%.*x\n", cwlen, cw); */
608     return hc->syms[cwlen] + (cw - hc->firstcode[cwlen]);
609 }
610 
611 int
huff_code_decode_int(huff_code_t * hc,int * outval,char const ** inout_data,size_t * inout_data_len,int * inout_offset)612 huff_code_decode_int(huff_code_t *hc, int *outval,
613                      char const **inout_data,
614                      size_t *inout_data_len, int *inout_offset)
615 {
616     huff_codeword_t *cw;
617 
618     if (inout_data)
619         cw = huff_code_decode_data(hc, inout_data, inout_data_len, inout_offset);
620     else if (hc->fh)
621         cw = huff_code_decode_fh(hc);
622     else
623         return -1;
624 
625     if (cw == NULL)
626         return -1;
627     if (outval)
628         *outval = cw->r.ival;
629 
630     return 0;
631 }
632 
633 char const *
huff_code_decode_str(huff_code_t * hc,char const ** inout_data,size_t * inout_data_len,int * inout_offset)634 huff_code_decode_str(huff_code_t *hc,
635                      char const **inout_data,
636                      size_t *inout_data_len, int *inout_offset)
637 {
638     huff_codeword_t *cw;
639 
640     if (inout_data)
641         cw = huff_code_decode_data(hc, inout_data, inout_data_len, inout_offset);
642     else if (hc->fh)
643         cw = huff_code_decode_fh(hc);
644     else
645         return NULL;
646 
647     if (cw == NULL)
648         return NULL;
649 
650     return cw->r.sval;
651 }
652