1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3 * Copyright (c) 2009 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
38 #include <string.h>
39
40 #include "sphinxbase/huff_code.h"
41 #include "sphinxbase/ckd_alloc.h"
42 #include "sphinxbase/hash_table.h"
43 #include "sphinxbase/byteorder.h"
44 #include "sphinxbase/heap.h"
45 #include "sphinxbase/pio.h"
46 #include "sphinxbase/err.h"
47
48 typedef struct huff_node_s {
49 int nbits;
50 struct huff_node_s *l;
51 union {
52 int32 ival;
53 char *sval;
54 struct huff_node_s *r;
55 } r;
56 } huff_node_t;
57
58 typedef struct huff_codeword_s {
59 union {
60 int32 ival;
61 char *sval;
62 } r;
63 uint32 nbits, codeword;
64 } huff_codeword_t;
65
66 enum {
67 HUFF_CODE_INT,
68 HUFF_CODE_STR
69 };
70
71 struct huff_code_s {
72 int16 refcount;
73 uint8 maxbits;
74 uint8 type;
75 uint32 *firstcode;
76 uint32 *numl;
77 huff_codeword_t **syms;
78 hash_table_t *codewords;
79 FILE *fh;
80 bit_encode_t *be;
81 int boff;
82 };
83
84 static huff_node_t *
huff_node_new_int(int32 val)85 huff_node_new_int(int32 val)
86 {
87 huff_node_t *hn = ckd_calloc(1, sizeof(*hn));
88 hn->r.ival = val;
89 return hn;
90 }
91
92 static huff_node_t *
huff_node_new_str(char const * val)93 huff_node_new_str(char const *val)
94 {
95 huff_node_t *hn = ckd_calloc(1, sizeof(*hn));
96 hn->r.sval = ckd_salloc(val);
97 return hn;
98 }
99
100 static huff_node_t *
huff_node_new_parent(huff_node_t * l,huff_node_t * r)101 huff_node_new_parent(huff_node_t *l, huff_node_t *r)
102 {
103 huff_node_t *hn = ckd_calloc(1, sizeof(*hn));
104 hn->l = l;
105 hn->r.r = r;
106 /* Propagate maximum bit length. */
107 if (r->nbits > l->nbits)
108 hn->nbits = r->nbits + 1;
109 else
110 hn->nbits = l->nbits + 1;
111 return hn;
112 }
113
114 static void
huff_node_free_int(huff_node_t * root)115 huff_node_free_int(huff_node_t *root)
116 {
117 if (root->l) {
118 huff_node_free_int(root->l);
119 huff_node_free_int(root->r.r);
120 }
121 ckd_free(root);
122 }
123
124 static void
huff_node_free_str(huff_node_t * root,int freestr)125 huff_node_free_str(huff_node_t *root, int freestr)
126 {
127 if (root->l) {
128 huff_node_free_str(root->l, freestr);
129 huff_node_free_str(root->r.r, freestr);
130 }
131 else {
132 if (freestr)
133 ckd_free(root->r.sval);
134 }
135 ckd_free(root);
136 }
137
138 static huff_node_t *
huff_code_build_tree(heap_t * q)139 huff_code_build_tree(heap_t *q)
140 {
141 huff_node_t *root = NULL;
142 int32 rf;
143
144 while (heap_size(q) > 1) {
145 huff_node_t *l, *r, *p;
146 int32 lf, rf;
147
148 heap_pop(q, (void *)&l, &lf);
149 heap_pop(q, (void *)&r, &rf);
150 p = huff_node_new_parent(l, r);
151 heap_insert(q, p, lf + rf);
152 }
153 heap_pop(q, (void **)&root, &rf);
154 return root;
155 }
156
157 static void
huff_code_canonicalize(huff_code_t * hc,huff_node_t * root)158 huff_code_canonicalize(huff_code_t *hc, huff_node_t *root)
159 {
160 glist_t agenda;
161 uint32 *nextcode;
162 int i, ncw;
163
164 hc->firstcode = ckd_calloc(hc->maxbits+1, sizeof(*hc->firstcode));
165 hc->syms = ckd_calloc(hc->maxbits+1, sizeof(*hc->syms));
166 hc->numl = ckd_calloc(hc->maxbits+1, sizeof(*nextcode));
167 nextcode = ckd_calloc(hc->maxbits+1, sizeof(*nextcode));
168
169 /* Traverse the tree, annotating it with the actual bit
170 * lengths, and histogramming them in numl. */
171 root->nbits = 0;
172 ncw = 0;
173 agenda = glist_add_ptr(NULL, root);
174 while (agenda) {
175 huff_node_t *node = gnode_ptr(agenda);
176 agenda = gnode_free(agenda, NULL);
177 if (node->l) {
178 node->l->nbits = node->nbits + 1;
179 agenda = glist_add_ptr(agenda, node->l);
180 node->r.r->nbits = node->nbits + 1;
181 agenda = glist_add_ptr(agenda, node->r.r);
182 }
183 else {
184 hc->numl[node->nbits]++;
185 ncw++;
186 }
187 }
188 /* Create starting codes and symbol tables for each bit length. */
189 hc->syms[hc->maxbits] = ckd_calloc(hc->numl[hc->maxbits], sizeof(**hc->syms));
190 for (i = hc->maxbits - 1; i > 0; --i) {
191 hc->firstcode[i] = (hc->firstcode[i+1] + hc->numl[i+1]) / 2;
192 hc->syms[i] = ckd_calloc(hc->numl[i], sizeof(**hc->syms));
193 }
194 memcpy(nextcode, hc->firstcode, (hc->maxbits + 1) * sizeof(*nextcode));
195 /* Traverse the tree again to produce the codebook itself. */
196 hc->codewords = hash_table_new(ncw, HASH_CASE_YES);
197 agenda = glist_add_ptr(NULL, root);
198 while (agenda) {
199 huff_node_t *node = gnode_ptr(agenda);
200 agenda = gnode_free(agenda, NULL);
201 if (node->l) {
202 agenda = glist_add_ptr(agenda, node->l);
203 agenda = glist_add_ptr(agenda, node->r.r);
204 }
205 else {
206 /* Initialize codebook entry, which also retains symbol pointer. */
207 huff_codeword_t *cw;
208 uint32 codeword = nextcode[node->nbits] & ((1 << node->nbits) - 1);
209 cw = hc->syms[node->nbits] + (codeword - hc->firstcode[node->nbits]);
210 cw->nbits = node->nbits;
211 cw->r.sval = node->r.sval; /* Will copy ints too... */
212 cw->codeword = codeword;
213 if (hc->type == HUFF_CODE_INT) {
214 hash_table_enter_bkey(hc->codewords,
215 (char const *)&cw->r.ival,
216 sizeof(cw->r.ival),
217 (void *)cw);
218 }
219 else {
220 hash_table_enter(hc->codewords, cw->r.sval, (void *)cw);
221 }
222 ++nextcode[node->nbits];
223 }
224 }
225 ckd_free(nextcode);
226 }
227
228 huff_code_t *
huff_code_build_int(int32 const * values,int32 const * frequencies,int nvals)229 huff_code_build_int(int32 const *values, int32 const *frequencies, int nvals)
230 {
231 huff_code_t *hc;
232 huff_node_t *root;
233 heap_t *q;
234 int i;
235
236 hc = ckd_calloc(1, sizeof(*hc));
237 hc->refcount = 1;
238 hc->type = HUFF_CODE_INT;
239
240 /* Initialize the heap with nodes for each symbol. */
241 q = heap_new();
242 for (i = 0; i < nvals; ++i) {
243 heap_insert(q,
244 huff_node_new_int(values[i]),
245 frequencies[i]);
246 }
247
248 /* Now build the tree, which gives us codeword lengths. */
249 root = huff_code_build_tree(q);
250 heap_destroy(q);
251 if (root == NULL || root->nbits > 32) {
252 E_ERROR("Huffman trees currently limited to 32 bits\n");
253 huff_node_free_int(root);
254 huff_code_free(hc);
255 return NULL;
256 }
257
258 /* Build a canonical codebook. */
259 hc->maxbits = root->nbits;
260 huff_code_canonicalize(hc, root);
261
262 /* Tree no longer needed. */
263 huff_node_free_int(root);
264
265 return hc;
266 }
267
268 huff_code_t *
huff_code_build_str(char * const * values,int32 const * frequencies,int nvals)269 huff_code_build_str(char * const *values, int32 const *frequencies, int nvals)
270 {
271 huff_code_t *hc;
272 huff_node_t *root;
273 heap_t *q;
274 int i;
275
276 hc = ckd_calloc(1, sizeof(*hc));
277 hc->refcount = 1;
278 hc->type = HUFF_CODE_STR;
279
280 /* Initialize the heap with nodes for each symbol. */
281 q = heap_new();
282 for (i = 0; i < nvals; ++i) {
283 heap_insert(q,
284 huff_node_new_str(values[i]),
285 frequencies[i]);
286 }
287
288 /* Now build the tree, which gives us codeword lengths. */
289 root = huff_code_build_tree(q);
290 heap_destroy(q);
291 if (root == NULL || root->nbits > 32) {
292 E_ERROR("Huffman trees currently limited to 32 bits\n");
293 huff_node_free_str(root, TRUE);
294 huff_code_free(hc);
295 return NULL;
296 }
297
298 /* Build a canonical codebook. */
299 hc->maxbits = root->nbits;
300 huff_code_canonicalize(hc, root);
301
302 /* Tree no longer needed (note we retain pointers to its strings). */
303 huff_node_free_str(root, FALSE);
304
305 return hc;
306 }
307
308 huff_code_t *
huff_code_read(FILE * infh)309 huff_code_read(FILE *infh)
310 {
311 huff_code_t *hc;
312 int i, j;
313
314 hc = ckd_calloc(1, sizeof(*hc));
315 hc->refcount = 1;
316
317 hc->maxbits = fgetc(infh);
318 hc->type = fgetc(infh);
319
320 /* Two bytes of padding. */
321 fgetc(infh);
322 fgetc(infh);
323
324 /* Allocate stuff. */
325 hc->firstcode = ckd_calloc(hc->maxbits + 1, sizeof(*hc->firstcode));
326 hc->numl = ckd_calloc(hc->maxbits + 1, sizeof(*hc->numl));
327 hc->syms = ckd_calloc(hc->maxbits + 1, sizeof(*hc->syms));
328
329 /* Read the symbol tables. */
330 hc->codewords = hash_table_new(hc->maxbits, HASH_CASE_YES);
331 for (i = 1; i <= hc->maxbits; ++i) {
332 if (fread(&hc->firstcode[i], 4, 1, infh) != 1)
333 goto error_out;
334 SWAP_BE_32(&hc->firstcode[i]);
335 if (fread(&hc->numl[i], 4, 1, infh) != 1)
336 goto error_out;
337 SWAP_BE_32(&hc->numl[i]);
338 hc->syms[i] = ckd_calloc(hc->numl[i], sizeof(**hc->syms));
339 for (j = 0; j < hc->numl[i]; ++j) {
340 huff_codeword_t *cw = &hc->syms[i][j];
341 cw->nbits = i;
342 cw->codeword = hc->firstcode[i] + j;
343 if (hc->type == HUFF_CODE_INT) {
344 if (fread(&cw->r.ival, 4, 1, infh) != 1)
345 goto error_out;
346 SWAP_BE_32(&cw->r.ival);
347 hash_table_enter_bkey(hc->codewords,
348 (char const *)&cw->r.ival,
349 sizeof(cw->r.ival),
350 (void *)cw);
351 }
352 else {
353 size_t len;
354 cw->r.sval = fread_line(infh, &len);
355 cw->r.sval[len-1] = '\0';
356 hash_table_enter(hc->codewords, cw->r.sval, (void *)cw);
357 }
358 }
359 }
360
361 return hc;
362 error_out:
363 huff_code_free(hc);
364 return NULL;
365 }
366
367 int
huff_code_write(huff_code_t * hc,FILE * outfh)368 huff_code_write(huff_code_t *hc, FILE *outfh)
369 {
370 int i, j;
371
372 /* Maximum codeword length */
373 fputc(hc->maxbits, outfh);
374 /* Symbol type */
375 fputc(hc->type, outfh);
376 /* Two extra bytes (for future use and alignment) */
377 fputc(0, outfh);
378 fputc(0, outfh);
379 /* For each codeword length: */
380 for (i = 1; i <= hc->maxbits; ++i) {
381 uint32 val;
382
383 /* Starting code, number of codes. */
384 val = hc->firstcode[i];
385 /* Canonically big-endian (like the data itself) */
386 SWAP_BE_32(&val);
387 fwrite(&val, 4, 1, outfh);
388 val = hc->numl[i];
389 SWAP_BE_32(&val);
390 fwrite(&val, 4, 1, outfh);
391
392 /* Symbols for each code (FIXME: Should compress these too) */
393 for (j = 0; j < hc->numl[i]; ++j) {
394 if (hc->type == HUFF_CODE_INT) {
395 int32 val = hc->syms[i][j].r.ival;
396 SWAP_BE_32(&val);
397 fwrite(&val, 4, 1, outfh);
398 }
399 else {
400 /* Write them all separated by newlines, so that
401 * fgets() will read them for us. */
402 fprintf(outfh, "%s\n", hc->syms[i][j].r.sval);
403 }
404 }
405 }
406 return 0;
407 }
408
409 int
huff_code_dump_codebits(FILE * dumpfh,uint32 nbits,uint32 codeword)410 huff_code_dump_codebits(FILE *dumpfh, uint32 nbits, uint32 codeword)
411 {
412 uint32 i;
413
414 for (i = 0; i < nbits; ++i)
415 fputc((codeword & (1<<(nbits-i-1))) ? '1' : '0', dumpfh);
416 return 0;
417 }
418
419 int
huff_code_dump(huff_code_t * hc,FILE * dumpfh)420 huff_code_dump(huff_code_t *hc, FILE *dumpfh)
421 {
422 int i, j;
423
424 /* Print out all codewords. */
425 fprintf(dumpfh, "Maximum codeword length: %d\n", hc->maxbits);
426 fprintf(dumpfh, "Symbols are %s\n", (hc->type == HUFF_CODE_STR) ? "strings" : "ints");
427 fprintf(dumpfh, "Codewords:\n");
428 for (i = 1; i <= hc->maxbits; ++i) {
429 for (j = 0; j < hc->numl[i]; ++j) {
430 if (hc->type == HUFF_CODE_STR)
431 fprintf(dumpfh, "%-30s", hc->syms[i][j].r.sval);
432 else
433 fprintf(dumpfh, "%-30d", hc->syms[i][j].r.ival);
434 huff_code_dump_codebits(dumpfh, hc->syms[i][j].nbits,
435 hc->syms[i][j].codeword);
436 fprintf(dumpfh, "\n");
437 }
438 }
439 return 0;
440 }
441
442 huff_code_t *
huff_code_retain(huff_code_t * hc)443 huff_code_retain(huff_code_t *hc)
444 {
445 ++hc->refcount;
446 return hc;
447 }
448
449 int
huff_code_free(huff_code_t * hc)450 huff_code_free(huff_code_t *hc)
451 {
452 int i;
453
454 if (hc == NULL)
455 return 0;
456 if (--hc->refcount > 0)
457 return hc->refcount;
458 for (i = 0; i <= hc->maxbits; ++i) {
459 int j;
460 for (j = 0; j < hc->numl[i]; ++j) {
461 if (hc->type == HUFF_CODE_STR)
462 ckd_free(hc->syms[i][j].r.sval);
463 }
464 ckd_free(hc->syms[i]);
465 }
466 ckd_free(hc->firstcode);
467 ckd_free(hc->numl);
468 ckd_free(hc->syms);
469 hash_table_free(hc->codewords);
470 ckd_free(hc);
471 return 0;
472 }
473
474 FILE *
huff_code_attach(huff_code_t * hc,FILE * fh,char const * mode)475 huff_code_attach(huff_code_t *hc, FILE *fh, char const *mode)
476 {
477 FILE *oldfh = huff_code_detach(hc);
478
479 hc->fh = fh;
480 if (mode[0] == 'w')
481 hc->be = bit_encode_attach(hc->fh);
482 return oldfh;
483 }
484
485 FILE *
huff_code_detach(huff_code_t * hc)486 huff_code_detach(huff_code_t *hc)
487 {
488 FILE *oldfh = hc->fh;
489
490 if (hc->be) {
491 bit_encode_flush(hc->be);
492 bit_encode_free(hc->be);
493 hc->be = NULL;
494 }
495 hc->fh = NULL;
496 return oldfh;
497 }
498
499 int
huff_code_encode_int(huff_code_t * hc,int32 sym,uint32 * outcw)500 huff_code_encode_int(huff_code_t *hc, int32 sym, uint32 *outcw)
501 {
502 huff_codeword_t *cw;
503
504 if (hash_table_lookup_bkey(hc->codewords,
505 (char const *)&sym,
506 sizeof(sym),
507 (void **)&cw) < 0)
508 return 0;
509 if (hc->be)
510 bit_encode_write_cw(hc->be, cw->codeword, cw->nbits);
511 if (outcw) *outcw = cw->codeword;
512 return cw->nbits;
513 }
514
515 int
huff_code_encode_str(huff_code_t * hc,char const * sym,uint32 * outcw)516 huff_code_encode_str(huff_code_t *hc, char const *sym, uint32 *outcw)
517 {
518 huff_codeword_t *cw;
519
520 if (hash_table_lookup(hc->codewords,
521 sym,
522 (void **)&cw) < 0)
523 return 0;
524 if (hc->be)
525 bit_encode_write_cw(hc->be, cw->codeword, cw->nbits);
526 if (outcw) *outcw = cw->codeword;
527 return cw->nbits;
528 }
529
530 static huff_codeword_t *
huff_code_decode_data(huff_code_t * hc,char const ** inout_data,size_t * inout_data_len,int * inout_offset)531 huff_code_decode_data(huff_code_t *hc, char const **inout_data,
532 size_t *inout_data_len, int *inout_offset)
533 {
534 char const *data = *inout_data;
535 char const *end = data + *inout_data_len;
536 int offset = *inout_offset;
537 uint32 cw;
538 int cwlen;
539 int byte;
540
541 if (data == end)
542 return NULL;
543 byte = *data++;
544 cw = !!(byte & (1 << (7-offset++)));
545 cwlen = 1;
546 /* printf("%.*x ", cwlen, cw); */
547 while (cwlen <= hc->maxbits && cw < hc->firstcode[cwlen]) {
548 ++cwlen;
549 cw <<= 1;
550 if (offset > 7) {
551 if (data == end)
552 return NULL;
553 byte = *data++;
554 offset = 0;
555 }
556 cw |= !!(byte & (1 << (7-offset++)));
557 /* printf("%.*x ", cwlen, cw); */
558 }
559 if (cwlen > hc->maxbits) /* FAIL: invalid data */
560 return NULL;
561
562 /* Put the last byte back if there are bits left over. */
563 if (offset < 8)
564 --data;
565 else
566 offset = 0;
567
568 /* printf("%.*x\n", cwlen, cw); */
569 *inout_data_len = end - data;
570 *inout_data = data;
571 *inout_offset = offset;
572 return hc->syms[cwlen] + (cw - hc->firstcode[cwlen]);
573 }
574
575 static huff_codeword_t *
huff_code_decode_fh(huff_code_t * hc)576 huff_code_decode_fh(huff_code_t *hc)
577 {
578 uint32 cw;
579 int cwlen;
580 int byte;
581
582 if ((byte = fgetc(hc->fh)) == EOF)
583 return NULL;
584 cw = !!(byte & (1 << (7-hc->boff++)));
585 cwlen = 1;
586 /* printf("%.*x ", cwlen, cw); */
587 while (cwlen <= hc->maxbits && cw < hc->firstcode[cwlen]) {
588 ++cwlen;
589 cw <<= 1;
590 if (hc->boff > 7) {
591 if ((byte = fgetc(hc->fh)) == EOF)
592 return NULL;
593 hc->boff = 0;
594 }
595 cw |= !!(byte & (1 << (7-hc->boff++)));
596 /* printf("%.*x ", cwlen, cw); */
597 }
598 if (cwlen > hc->maxbits) /* FAIL: invalid data */
599 return NULL;
600
601 /* Put the last byte back if there are bits left over. */
602 if (hc->boff < 8)
603 ungetc(byte, hc->fh);
604 else
605 hc->boff = 0;
606
607 /* printf("%.*x\n", cwlen, cw); */
608 return hc->syms[cwlen] + (cw - hc->firstcode[cwlen]);
609 }
610
611 int
huff_code_decode_int(huff_code_t * hc,int * outval,char const ** inout_data,size_t * inout_data_len,int * inout_offset)612 huff_code_decode_int(huff_code_t *hc, int *outval,
613 char const **inout_data,
614 size_t *inout_data_len, int *inout_offset)
615 {
616 huff_codeword_t *cw;
617
618 if (inout_data)
619 cw = huff_code_decode_data(hc, inout_data, inout_data_len, inout_offset);
620 else if (hc->fh)
621 cw = huff_code_decode_fh(hc);
622 else
623 return -1;
624
625 if (cw == NULL)
626 return -1;
627 if (outval)
628 *outval = cw->r.ival;
629
630 return 0;
631 }
632
633 char const *
huff_code_decode_str(huff_code_t * hc,char const ** inout_data,size_t * inout_data_len,int * inout_offset)634 huff_code_decode_str(huff_code_t *hc,
635 char const **inout_data,
636 size_t *inout_data_len, int *inout_offset)
637 {
638 huff_codeword_t *cw;
639
640 if (inout_data)
641 cw = huff_code_decode_data(hc, inout_data, inout_data_len, inout_offset);
642 else if (hc->fh)
643 cw = huff_code_decode_fh(hc);
644 else
645 return NULL;
646
647 if (cw == NULL)
648 return NULL;
649
650 return cw->r.sval;
651 }
652