1 /*
2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*
32  * FIXME: add checking of cram_external_type to return NULL on unsupported
33  * {codec,type} tuples.
34  */
35 
36 #include <config.h>
37 
38 #include <stdlib.h>
39 #include <string.h>
40 #include <assert.h>
41 #include <limits.h>
42 #include <stdint.h>
43 #include <errno.h>
44 
45 #include "cram/cram.h"
46 
47 /*
48  * ---------------------------------------------------------------------------
49  * Block bit-level I/O functions.
50  * All defined static here to promote easy inlining by the compiler.
51  */
52 
53 #if 0
54 /* Get a single bit, MSB first */
55 static signed int get_bit_MSB(cram_block *block) {
56     unsigned int val;
57 
58     if (block->byte > block->alloc)
59         return -1;
60 
61     val = block->data[block->byte] >> block->bit;
62     if (--block->bit == -1) {
63         block->bit = 7;
64         block->byte++;
65         //printf("(%02X)", block->data[block->byte]);
66     }
67 
68     //printf("-B%d-", val&1);
69 
70     return val & 1;
71 }
72 #endif
73 
74 /*
75  * Count number of successive 0 and 1 bits
76  */
get_one_bits_MSB(cram_block * block)77 static int get_one_bits_MSB(cram_block *block) {
78     int n = 0, b;
79     if (block->byte >= block->uncomp_size)
80         return -1;
81     do {
82         b = block->data[block->byte] >> block->bit;
83         if (--block->bit == -1) {
84             block->bit = 7;
85             block->byte++;
86             if (block->byte == block->uncomp_size && (b&1))
87                 return -1;
88         }
89         n++;
90     } while (b&1);
91 
92     return n-1;
93 }
94 
get_zero_bits_MSB(cram_block * block)95 static int get_zero_bits_MSB(cram_block *block) {
96     int n = 0, b;
97     if (block->byte >= block->uncomp_size)
98         return -1;
99     do {
100         b = block->data[block->byte] >> block->bit;
101         if (--block->bit == -1) {
102             block->bit = 7;
103             block->byte++;
104             if (block->byte == block->uncomp_size && !(b&1))
105                 return -1;
106         }
107         n++;
108     } while (!(b&1));
109 
110     return n-1;
111 }
112 
113 #if 0
114 /* Stores a single bit */
115 static void store_bit_MSB(cram_block *block, unsigned int bit) {
116     if (block->byte >= block->alloc) {
117         block->alloc = block->alloc ? block->alloc*2 : 1024;
118         block->data = realloc(block->data, block->alloc);
119     }
120 
121     if (bit)
122         block->data[block->byte] |= (1 << block->bit);
123 
124     if (--block->bit == -1) {
125         block->bit = 7;
126         block->byte++;
127         block->data[block->byte] = 0;
128     }
129 }
130 #endif
131 
132 #if 0
133 /* Rounds to the next whole byte boundary first */
134 static void store_bytes_MSB(cram_block *block, char *bytes, int len) {
135     if (block->bit != 7) {
136         block->bit = 7;
137         block->byte++;
138     }
139 
140     while (block->byte + len >= block->alloc) {
141         block->alloc = block->alloc ? block->alloc*2 : 1024;
142         block->data = realloc(block->data, block->alloc);
143     }
144 
145     memcpy(&block->data[block->byte], bytes, len);
146     block->byte += len;
147 }
148 #endif
149 
150 /* Local optimised copy for inlining */
get_bits_MSB(cram_block * block,int nbits)151 static inline unsigned int get_bits_MSB(cram_block *block, int nbits) {
152     unsigned int val = 0;
153     int i;
154 
155 #if 0
156     // Fits within the current byte */
157     if (nbits <= block->bit+1) {
158         val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
159         if ((block->bit -= nbits) == -1) {
160             block->bit = 7;
161             block->byte++;
162         }
163         return val;
164     }
165 
166     // partial first byte
167     val = block->data[block->byte] & ((1<<(block->bit+1))-1);
168     nbits -= block->bit+1;
169     block->bit = 7;
170     block->byte++;
171 
172     // whole middle bytes
173     while (nbits >= 8) {
174         val = (val << 8) | block->data[block->byte++];
175         nbits -= 8;
176     }
177 
178     val <<= nbits;
179     val |= (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
180     block->bit -= nbits;
181     return val;
182 #endif
183 
184 #if 0
185     /* Inefficient implementation! */
186     //printf("{");
187     for (i = 0; i < nbits; i++)
188         //val = (val << 1) | get_bit_MSB(block);
189         GET_BIT_MSB(block, val);
190 #endif
191 
192 #if 1
193     /* Combination of 1st two methods */
194     if (nbits <= block->bit+1) {
195         val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
196         if ((block->bit -= nbits) == -1) {
197             block->bit = 7;
198             block->byte++;
199         }
200         return val;
201     }
202 
203     switch(nbits) {
204 //  case 15: GET_BIT_MSB(block, val);
205 //  case 14: GET_BIT_MSB(block, val);
206 //  case 13: GET_BIT_MSB(block, val);
207 //  case 12: GET_BIT_MSB(block, val);
208 //  case 11: GET_BIT_MSB(block, val);
209 //  case 10: GET_BIT_MSB(block, val);
210 //  case  9: GET_BIT_MSB(block, val);
211     case  8: GET_BIT_MSB(block, val);
212     case  7: GET_BIT_MSB(block, val);
213     case  6: GET_BIT_MSB(block, val);
214     case  5: GET_BIT_MSB(block, val);
215     case  4: GET_BIT_MSB(block, val);
216     case  3: GET_BIT_MSB(block, val);
217     case  2: GET_BIT_MSB(block, val);
218     case  1: GET_BIT_MSB(block, val);
219         break;
220 
221     default:
222         for (i = 0; i < nbits; i++)
223             //val = (val << 1) | get_bit_MSB(block);
224             GET_BIT_MSB(block, val);
225     }
226 #endif
227 
228     //printf("=0x%x}", val);
229 
230     return val;
231 }
232 
233 /*
234  * Can store up to 24-bits worth of data encoded in an integer value
235  * Possibly we'd want to have a less optimal store_bits function when dealing
236  * with nbits > 24, but for now we assume the codes generated are never
237  * that big. (Given this is only possible with 121392 or more
238  * characters with exactly the correct frequency distribution we check
239  * for it elsewhere.)
240  */
store_bits_MSB(cram_block * block,unsigned int val,int nbits)241 static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) {
242     //fprintf(stderr, " store_bits: %02x %d\n", val, nbits);
243 
244     /*
245      * Use slow mode until we tweak the huffman generator to never generate
246      * codes longer than 24-bits.
247      */
248     unsigned int mask;
249 
250     if (block->byte+4 >= block->alloc) {
251         if (block->byte) {
252             block->alloc *= 2;
253             block->data = realloc(block->data, block->alloc + 4);
254             if (!block->data)
255                 return -1;
256         } else {
257             block->alloc = 1024;
258             block->data = realloc(block->data, block->alloc + 4);
259             if (!block->data)
260                 return -1;
261             block->data[0] = 0; // initialise first byte of buffer
262         }
263     }
264 
265     /* fits in current bit-field */
266     if (nbits <= block->bit+1) {
267         block->data[block->byte] |= (val << (block->bit+1-nbits));
268         if ((block->bit-=nbits) == -1) {
269             block->bit = 7;
270             block->byte++;
271             block->data[block->byte] = 0;
272         }
273         return 0;
274     }
275 
276     block->data[block->byte] |= (val >> (nbits -= block->bit+1));
277     block->bit = 7;
278     block->byte++;
279     block->data[block->byte] = 0;
280 
281     mask = 1<<(nbits-1);
282     do {
283         if (val & mask)
284             block->data[block->byte] |= (1 << block->bit);
285         if (--block->bit == -1) {
286             block->bit = 7;
287             block->byte++;
288             block->data[block->byte] = 0;
289         }
290         mask >>= 1;
291     } while(--nbits);
292 
293     return 0;
294 }
295 
296 /*
297  * Returns the next 'size' bytes from a block, or NULL if insufficient
298  * data left.This is just a pointer into the block data and not an
299  * allocated object, so do not free the result.
300  */
cram_extract_block(cram_block * b,int size)301 static char *cram_extract_block(cram_block *b, int size) {
302     char *cp = (char *)b->data + b->idx;
303     b->idx += size;
304     if (b->idx > b->uncomp_size)
305         return NULL;
306 
307     return cp;
308 }
309 
310 /*
311  * ---------------------------------------------------------------------------
312  * EXTERNAL
313  */
cram_external_decode_int(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)314 int cram_external_decode_int(cram_slice *slice, cram_codec *c,
315                              cram_block *in, char *out, int *out_size) {
316     int l;
317     char *cp;
318     cram_block *b;
319 
320     /* Find the external block */
321     b = cram_get_block_by_id(slice, c->external.content_id);
322     if (!b)
323         return *out_size?-1:0;
324 
325     cp = (char *)b->data + b->idx;
326     // E_INT and E_LONG are guaranteed single item queries
327     l = safe_itf8_get(cp, (char *)b->data + b->uncomp_size, (int32_t *)out);
328     b->idx += l;
329     *out_size = 1;
330 
331     return l > 0 ? 0 : -1;
332 }
333 
cram_external_decode_char(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)334 int cram_external_decode_char(cram_slice *slice, cram_codec *c,
335                               cram_block *in, char *out,
336                               int *out_size) {
337     char *cp;
338     cram_block *b;
339 
340     /* Find the external block */
341     b = cram_get_block_by_id(slice, c->external.content_id);
342     if (!b)
343         return *out_size?-1:0;
344 
345     cp = cram_extract_block(b, *out_size);
346     if (!cp)
347         return -1;
348 
349     if (out)
350         memcpy(out, cp, *out_size);
351     return 0;
352 }
353 
cram_external_decode_block(cram_slice * slice,cram_codec * c,cram_block * in,char * out_,int * out_size)354 static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
355                                       cram_block *in, char *out_,
356                                       int *out_size) {
357     char *cp;
358     cram_block *out = (cram_block *)out_;
359     cram_block *b = NULL;
360 
361     /* Find the external block */
362     b = cram_get_block_by_id(slice, c->external.content_id);
363     if (!b)
364         return *out_size?-1:0;
365 
366     cp = cram_extract_block(b, *out_size);
367     if (!cp)
368         return -1;
369 
370     BLOCK_APPEND(out, cp, *out_size);
371     return 0;
372 }
373 
cram_external_decode_free(cram_codec * c)374 void cram_external_decode_free(cram_codec *c) {
375     if (c)
376         free(c);
377 }
378 
cram_external_decode_init(char * data,int size,enum cram_external_type option,int version)379 cram_codec *cram_external_decode_init(char *data, int size,
380                                       enum cram_external_type option,
381                                       int version) {
382     cram_codec *c = NULL;
383     char *cp = data;
384 
385     if (size < 1)
386         goto malformed;
387 
388     if (!(c = malloc(sizeof(*c))))
389         return NULL;
390 
391     c->codec  = E_EXTERNAL;
392     if (option == E_INT || option == E_LONG)
393         c->decode = cram_external_decode_int;
394     else if (option == E_BYTE_ARRAY || option == E_BYTE)
395         c->decode = cram_external_decode_char;
396     else
397         c->decode = cram_external_decode_block;
398     c->free   = cram_external_decode_free;
399 
400     cp += safe_itf8_get(cp, data + size, &c->external.content_id);
401 
402     if (cp - data != size)
403         goto malformed;
404 
405     c->external.type = option;
406 
407     return c;
408 
409  malformed:
410     hts_log_error("Malformed external header stream");
411     free(c);
412     return NULL;
413 }
414 
cram_external_encode_int(cram_slice * slice,cram_codec * c,char * in,int in_size)415 int cram_external_encode_int(cram_slice *slice, cram_codec *c,
416                              char *in, int in_size) {
417     uint32_t *i32 = (uint32_t *)in;
418 
419     itf8_put_blk(c->out, *i32);
420     return 0;
421 }
422 
cram_external_encode_char(cram_slice * slice,cram_codec * c,char * in,int in_size)423 int cram_external_encode_char(cram_slice *slice, cram_codec *c,
424                               char *in, int in_size) {
425     BLOCK_APPEND(c->out, in, in_size);
426     return 0;
427 }
428 
cram_external_encode_free(cram_codec * c)429 void cram_external_encode_free(cram_codec *c) {
430     if (!c)
431         return;
432     free(c);
433 }
434 
cram_external_encode_store(cram_codec * c,cram_block * b,char * prefix,int version)435 int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix,
436                                int version) {
437     char tmp[99], *tp = tmp;
438     int len = 0;
439 
440     if (prefix) {
441         size_t l = strlen(prefix);
442         BLOCK_APPEND(b, prefix, l);
443         len += l;
444     }
445 
446     tp += itf8_put(tp, c->e_external.content_id);
447     len += itf8_put_blk(b, c->codec);
448     len += itf8_put_blk(b, tp-tmp);
449     BLOCK_APPEND(b, tmp, tp-tmp);
450     len += tp-tmp;
451 
452     return len;
453 }
454 
cram_external_encode_init(cram_stats * st,enum cram_external_type option,void * dat,int version)455 cram_codec *cram_external_encode_init(cram_stats *st,
456                                       enum cram_external_type option,
457                                       void *dat,
458                                       int version) {
459     cram_codec *c;
460 
461     c = malloc(sizeof(*c));
462     if (!c)
463         return NULL;
464     c->codec = E_EXTERNAL;
465     c->free = cram_external_encode_free;
466     if (option == E_INT || option == E_LONG)
467         c->encode = cram_external_encode_int;
468     else if (option == E_BYTE_ARRAY || option == E_BYTE)
469         c->encode = cram_external_encode_char;
470     else
471         abort();
472     c->store = cram_external_encode_store;
473 
474     c->e_external.content_id = (size_t)dat;
475 
476     return c;
477 }
478 
479 /*
480  * ---------------------------------------------------------------------------
481  * BETA
482  */
cram_beta_decode_int(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)483 int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
484     int32_t *out_i = (int32_t *)out;
485     int i, n = *out_size;
486 
487     if (c->beta.nbits) {
488         if (cram_not_enough_bits(in, c->beta.nbits * n))
489             return -1;
490 
491         for (i = 0; i < n; i++)
492             out_i[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset;
493     } else {
494         for (i = 0; i < n; i++)
495             out_i[i] = -c->beta.offset;
496     }
497 
498     return 0;
499 }
500 
cram_beta_decode_char(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)501 int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
502     int i, n = *out_size;
503 
504 
505     if (c->beta.nbits) {
506         if (cram_not_enough_bits(in, c->beta.nbits * n))
507             return -1;
508 
509         if (out)
510             for (i = 0; i < n; i++)
511                 out[i] = get_bits_MSB(in, c->beta.nbits) - c->beta.offset;
512         else
513             for (i = 0; i < n; i++)
514                 get_bits_MSB(in, c->beta.nbits);
515     } else {
516         if (out)
517             for (i = 0; i < n; i++)
518                 out[i] = -c->beta.offset;
519     }
520 
521     return 0;
522 }
523 
cram_beta_decode_free(cram_codec * c)524 void cram_beta_decode_free(cram_codec *c) {
525     if (c)
526         free(c);
527 }
528 
cram_beta_decode_init(char * data,int size,enum cram_external_type option,int version)529 cram_codec *cram_beta_decode_init(char *data, int size,
530                                   enum cram_external_type option,
531                                   int version) {
532     cram_codec *c;
533     char *cp = data;
534 
535     if (!(c = malloc(sizeof(*c))))
536         return NULL;
537 
538     c->codec  = E_BETA;
539     if (option == E_INT || option == E_LONG)
540         c->decode = cram_beta_decode_int;
541     else if (option == E_BYTE_ARRAY || option == E_BYTE)
542         c->decode = cram_beta_decode_char;
543     else {
544         hts_log_error("BYTE_ARRAYs not supported by this codec");
545         return NULL;
546     }
547     c->free   = cram_beta_decode_free;
548 
549     c->beta.nbits = -1;
550     cp += safe_itf8_get(cp, data + size, &c->beta.offset);
551     if (cp < data + size) // Ensure test below works
552         cp += safe_itf8_get(cp, data + size, &c->beta.nbits);
553 
554     if (cp - data != size
555         || c->beta.nbits < 0 || c->beta.nbits > 8 * sizeof(int)) {
556         hts_log_error("Malformed beta header stream");
557         free(c);
558         return NULL;
559     }
560 
561     return c;
562 }
563 
cram_beta_encode_store(cram_codec * c,cram_block * b,char * prefix,int version)564 int cram_beta_encode_store(cram_codec *c, cram_block *b,
565                            char *prefix, int version) {
566     int len = 0;
567 
568     if (prefix) {
569         size_t l = strlen(prefix);
570         BLOCK_APPEND(b, prefix, l);
571         len += l;
572     }
573 
574     len += itf8_put_blk(b, c->codec);
575     len += itf8_put_blk(b, itf8_size(c->e_beta.offset)
576                         + itf8_size(c->e_beta.nbits)); // codec length
577     len += itf8_put_blk(b, c->e_beta.offset);
578     len += itf8_put_blk(b, c->e_beta.nbits);
579 
580     return len;
581 }
582 
cram_beta_encode_int(cram_slice * slice,cram_codec * c,char * in,int in_size)583 int cram_beta_encode_int(cram_slice *slice, cram_codec *c,
584                          char *in, int in_size) {
585     int *syms = (int *)in;
586     int i, r = 0;
587 
588     for (i = 0; i < in_size; i++)
589         r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset,
590                             c->e_beta.nbits);
591 
592     return r;
593 }
594 
cram_beta_encode_char(cram_slice * slice,cram_codec * c,char * in,int in_size)595 int cram_beta_encode_char(cram_slice *slice, cram_codec *c,
596                           char *in, int in_size) {
597     unsigned char *syms = (unsigned char *)in;
598     int i, r = 0;
599 
600     for (i = 0; i < in_size; i++)
601         r |= store_bits_MSB(c->out, syms[i] + c->e_beta.offset,
602                             c->e_beta.nbits);
603 
604     return r;
605 }
606 
cram_beta_encode_free(cram_codec * c)607 void cram_beta_encode_free(cram_codec *c) {
608     if (c) free(c);
609 }
610 
cram_beta_encode_init(cram_stats * st,enum cram_external_type option,void * dat,int version)611 cram_codec *cram_beta_encode_init(cram_stats *st,
612                                   enum cram_external_type option,
613                                   void *dat,
614                                   int version) {
615     cram_codec *c;
616     int min_val, max_val, len = 0;
617     int64_t range;
618 
619     c = malloc(sizeof(*c));
620     if (!c)
621         return NULL;
622     c->codec  = E_BETA;
623     c->free   = cram_beta_encode_free;
624     if (option == E_INT)
625         c->encode = cram_beta_encode_int;
626     else
627         c->encode = cram_beta_encode_char;
628     c->store  = cram_beta_encode_store;
629 
630     if (dat) {
631         min_val = ((int *)dat)[0];
632         max_val = ((int *)dat)[1];
633     } else {
634         min_val = INT_MAX;
635         max_val = INT_MIN;
636         int i;
637         for (i = 0; i < MAX_STAT_VAL; i++) {
638             if (!st->freqs[i])
639                 continue;
640             if (min_val > i)
641                 min_val = i;
642             max_val = i;
643         }
644         if (st->h) {
645             khint_t k;
646 
647             for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
648                 if (!kh_exist(st->h, k))
649                     continue;
650 
651                 i = kh_key(st->h, k);
652                 if (min_val > i)
653                     min_val = i;
654                 if (max_val < i)
655                     max_val = i;
656             }
657         }
658     }
659 
660     assert(max_val >= min_val);
661     c->e_beta.offset = -min_val;
662     range = (int64_t) max_val - min_val;
663     while (range) {
664         len++;
665         range >>= 1;
666     }
667     c->e_beta.nbits = len;
668 
669     return c;
670 }
671 
672 /*
673  * ---------------------------------------------------------------------------
674  * SUBEXP
675  */
cram_subexp_decode(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)676 int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
677     int32_t *out_i = (int32_t *)out;
678     int n, count;
679     int k = c->subexp.k;
680 
681     for (count = 0, n = *out_size; count < n; count++) {
682         int i = 0, tail;
683         int val;
684 
685         /* Get number of 1s */
686         //while (get_bit_MSB(in) == 1) i++;
687         i = get_one_bits_MSB(in);
688         if (i < 0 || cram_not_enough_bits(in, i > 0 ? i + k - 1 : k))
689             return -1;
690         /*
691          * Val is
692          * i > 0:  2^(k+i-1) + k+i-1 bits
693          * i = 0:  k bits
694          */
695         if (i) {
696             tail = i + k-1;
697             val = 0;
698             while (tail) {
699                 //val = val<<1; val |= get_bit_MSB(in);
700                 GET_BIT_MSB(in, val);
701                 tail--;
702             }
703             val += 1 << (i + k-1);
704         } else {
705             tail = k;
706             val = 0;
707             while (tail) {
708                 //val = val<<1; val |= get_bit_MSB(in);
709                 GET_BIT_MSB(in, val);
710                 tail--;
711             }
712         }
713 
714         out_i[count] = val - c->subexp.offset;
715     }
716 
717     return 0;
718 }
719 
cram_subexp_decode_free(cram_codec * c)720 void cram_subexp_decode_free(cram_codec *c) {
721     if (c)
722         free(c);
723 }
724 
cram_subexp_decode_init(char * data,int size,enum cram_external_type option,int version)725 cram_codec *cram_subexp_decode_init(char *data, int size,
726                                     enum cram_external_type option,
727                                     int version) {
728     cram_codec *c;
729     char *cp = data;
730 
731     if (option != E_INT) {
732         hts_log_error("This codec only supports INT encodings");
733         return NULL;
734     }
735 
736     if (!(c = malloc(sizeof(*c))))
737         return NULL;
738 
739     c->codec  = E_SUBEXP;
740     c->decode = cram_subexp_decode;
741     c->free   = cram_subexp_decode_free;
742     c->subexp.k = -1;
743 
744     cp += safe_itf8_get(cp, data + size, &c->subexp.offset);
745     cp += safe_itf8_get(cp, data + size, &c->subexp.k);
746 
747     if (cp - data != size || c->subexp.k < 0) {
748         hts_log_error("Malformed subexp header stream");
749         free(c);
750         return NULL;
751     }
752 
753     return c;
754 }
755 
756 /*
757  * ---------------------------------------------------------------------------
758  * GAMMA
759  */
cram_gamma_decode(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)760 int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
761     int32_t *out_i = (int32_t *)out;
762     int i, n;
763 
764     for (i = 0, n = *out_size; i < n; i++) {
765         int nz = 0;
766         int val;
767         //while (get_bit_MSB(in) == 0) nz++;
768         nz = get_zero_bits_MSB(in);
769         if (cram_not_enough_bits(in, nz))
770             return -1;
771         val = 1;
772         while (nz > 0) {
773             //val <<= 1; val |= get_bit_MSB(in);
774             GET_BIT_MSB(in, val);
775             nz--;
776         }
777 
778         out_i[i] = val - c->gamma.offset;
779     }
780 
781     return 0;
782 }
783 
cram_gamma_decode_free(cram_codec * c)784 void cram_gamma_decode_free(cram_codec *c) {
785     if (c)
786         free(c);
787 }
788 
cram_gamma_decode_init(char * data,int size,enum cram_external_type option,int version)789 cram_codec *cram_gamma_decode_init(char *data, int size,
790                                    enum cram_external_type option,
791                                    int version) {
792     cram_codec *c = NULL;
793     char *cp = data;
794 
795     if (option != E_INT) {
796         hts_log_error("This codec only supports INT encodings");
797         return NULL;
798     }
799 
800     if (size < 1)
801         goto malformed;
802 
803     if (!(c = malloc(sizeof(*c))))
804         return NULL;
805 
806     c->codec  = E_GAMMA;
807     c->decode = cram_gamma_decode;
808     c->free   = cram_gamma_decode_free;
809 
810     cp += safe_itf8_get(cp, data + size, &c->gamma.offset);
811 
812     if (cp - data != size)
813         goto malformed;
814 
815     return c;
816 
817  malformed:
818     hts_log_error("Malformed gamma header stream");
819     free(c);
820     return NULL;
821 }
822 
823 /*
824  * ---------------------------------------------------------------------------
825  * HUFFMAN
826  */
827 
code_sort(const void * vp1,const void * vp2)828 static int code_sort(const void *vp1, const void *vp2) {
829     const cram_huffman_code *c1 = (const cram_huffman_code *)vp1;
830     const cram_huffman_code *c2 = (const cram_huffman_code *)vp2;
831 
832     if (c1->len != c2->len)
833         return c1->len - c2->len;
834     else
835         return c1->symbol < c2->symbol ? -1 : (c1->symbol > c2->symbol ? 1 : 0);
836 }
837 
cram_huffman_decode_free(cram_codec * c)838 void cram_huffman_decode_free(cram_codec *c) {
839     if (!c)
840         return;
841 
842     if (c->huffman.codes)
843         free(c->huffman.codes);
844     free(c);
845 }
846 
cram_huffman_decode_null(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)847 int cram_huffman_decode_null(cram_slice *slice, cram_codec *c,
848                              cram_block *in, char *out, int *out_size) {
849     return -1;
850 }
851 
cram_huffman_decode_char0(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)852 int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c,
853                               cram_block *in, char *out, int *out_size) {
854     int i, n;
855 
856     if (!out)
857         return 0;
858 
859     /* Special case of 0 length codes */
860     for (i = 0, n = *out_size; i < n; i++) {
861         out[i] = c->huffman.codes[0].symbol;
862     }
863     return 0;
864 }
865 
cram_huffman_decode_char(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)866 int cram_huffman_decode_char(cram_slice *slice, cram_codec *c,
867                              cram_block *in, char *out, int *out_size) {
868     int i, n, ncodes = c->huffman.ncodes;
869     const cram_huffman_code * const codes = c->huffman.codes;
870 
871     for (i = 0, n = *out_size; i < n; i++) {
872         int idx = 0;
873         int val = 0, len = 0, last_len = 0;
874 
875         for (;;) {
876             int dlen = codes[idx].len - last_len;
877             if (cram_not_enough_bits(in, dlen))
878                 return -1;
879 
880             //val <<= dlen;
881             //val  |= get_bits_MSB(in, dlen);
882             //last_len = (len += dlen);
883 
884             last_len = (len += dlen);
885             for (; dlen; dlen--) GET_BIT_MSB(in, val);
886 
887             idx = val - codes[idx].p;
888             if (idx >= ncodes || idx < 0)
889                 return -1;
890 
891             if (codes[idx].code == val && codes[idx].len == len) {
892                 if (out) out[i] = codes[idx].symbol;
893                 break;
894             }
895         }
896     }
897 
898     return 0;
899 }
900 
cram_huffman_decode_int0(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)901 int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c,
902                              cram_block *in, char *out, int *out_size) {
903     int32_t *out_i = (int32_t *)out;
904     int i, n;
905     const cram_huffman_code * const codes = c->huffman.codes;
906 
907     /* Special case of 0 length codes */
908     for (i = 0, n = *out_size; i < n; i++) {
909         out_i[i] = codes[0].symbol;
910     }
911     return 0;
912 }
913 
cram_huffman_decode_int(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)914 int cram_huffman_decode_int(cram_slice *slice, cram_codec *c,
915                             cram_block *in, char *out, int *out_size) {
916     int32_t *out_i = (int32_t *)out;
917     int i, n, ncodes = c->huffman.ncodes;
918     const cram_huffman_code * const codes = c->huffman.codes;
919 
920     for (i = 0, n = *out_size; i < n; i++) {
921         int idx = 0;
922         int val = 0, len = 0, last_len = 0;
923 
924         // Now one bit at a time for remaining checks
925         for (;;) {
926             int dlen = codes[idx].len - last_len;
927             if (cram_not_enough_bits(in, dlen))
928                 return -1;
929 
930             //val <<= dlen;
931             //val  |= get_bits_MSB(in, dlen);
932             //last_len = (len += dlen);
933 
934             last_len = (len += dlen);
935             for (; dlen; dlen--) GET_BIT_MSB(in, val);
936 
937             idx = val - codes[idx].p;
938             if (idx >= ncodes || idx < 0)
939                 return -1;
940 
941             if (codes[idx].code == val && codes[idx].len == len) {
942                 out_i[i] = codes[idx].symbol;
943                 break;
944             }
945         }
946     }
947 
948     return 0;
949 }
950 
951 /*
952  * Initialises a huffman decoder from an encoding data stream.
953  */
cram_huffman_decode_init(char * data,int size,enum cram_external_type option,int version)954 cram_codec *cram_huffman_decode_init(char *data, int size,
955                                      enum cram_external_type option,
956                                      int version) {
957     int32_t ncodes = 0, i, j;
958     char *cp = data, *data_end = &data[size];
959     cram_codec *h;
960     cram_huffman_code *codes = NULL;
961     int32_t val, last_len, max_len = 0;
962     uint32_t max_val; // needs one more bit than val
963     const int max_code_bits = sizeof(val) * 8 - 1;
964     int l;
965 
966     if (option == E_BYTE_ARRAY_BLOCK) {
967         hts_log_error("BYTE_ARRAYs not supported by this codec");
968         return NULL;
969     }
970 
971     cp += safe_itf8_get(cp, data_end, &ncodes);
972     if (ncodes < 0) {
973         hts_log_error("Invalid number of symbols in huffman stream");
974         return NULL;
975     }
976     if (ncodes >= SIZE_MAX / sizeof(*codes)) {
977         errno = ENOMEM;
978         return NULL;
979     }
980 
981     h = calloc(1, sizeof(*h));
982     if (!h)
983         return NULL;
984 
985     h->codec  = E_HUFFMAN;
986     h->free   = cram_huffman_decode_free;
987 
988     h->huffman.ncodes = ncodes;
989     if (ncodes) {
990         codes = h->huffman.codes = malloc(ncodes * sizeof(*codes));
991         if (!codes) {
992             free(h);
993             return NULL;
994         }
995     } else {
996         codes = h->huffman.codes = NULL;
997     }
998 
999     /* Read symbols and bit-lengths */
1000     for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) {
1001         l = safe_itf8_get(cp, data_end, &codes[i].symbol);
1002     }
1003 
1004     if (l < 1)
1005         goto malformed;
1006 
1007     cp += safe_itf8_get(cp, data_end, &i);
1008     if (i != ncodes)
1009         goto malformed;
1010 
1011     if (ncodes == 0) {
1012         /* NULL huffman stream.  Ensure it returns an error if
1013            anything tries to use it. */
1014         h->decode = cram_huffman_decode_null;
1015         return h;
1016     }
1017 
1018     for (i = 0, l = 1; i < ncodes; i++, cp += l) {
1019         l = safe_itf8_get(cp, data_end, &codes[i].len);
1020         if (l < 1)
1021             break;
1022         if (max_len < codes[i].len)
1023             max_len = codes[i].len;
1024     }
1025     if (l < 1 || cp - data != size || max_len >= ncodes)
1026         goto malformed;
1027 
1028     /* 31 is max. bits available in val */
1029     if (max_len > max_code_bits) {
1030         hts_log_error("Huffman code length (%d) is greater "
1031                       "than maximum supported (%d)", max_len, max_code_bits);
1032         free(h);
1033         free(codes);
1034         return NULL;
1035     }
1036 
1037     /* Sort by bit length and then by symbol value */
1038     qsort(codes, ncodes, sizeof(*codes), code_sort);
1039 
1040     /* Assign canonical codes */
1041     val = -1, last_len = 0, max_val = 0;
1042     for (i = 0; i < ncodes; i++) {
1043         val++;
1044         if (val > max_val)
1045             goto malformed;
1046 
1047         if (codes[i].len > last_len) {
1048             val <<= (codes[i].len - last_len);
1049             last_len = codes[i].len;
1050             max_val = (1U << codes[i].len) - 1;
1051         }
1052         codes[i].code = val;
1053     }
1054 
1055     /*
1056      * Compute the next starting point, offset by the i'th value.
1057      * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then
1058      * codes[10..13].p = 30 - 10.
1059      */
1060     last_len = 0;
1061     for (i = j = 0; i < ncodes; i++) {
1062         if (codes[i].len > last_len) {
1063             j = codes[i].code - i;
1064             last_len = codes[i].len;
1065         }
1066         codes[i].p = j;
1067     }
1068 
1069     // puts("==HUFF LEN==");
1070     // for (i = 0; i <= last_len+1; i++) {
1071     //     printf("len %d=%d prefix %d\n", i, h->huffman.lengths[i], h->huffman.prefix[i]);
1072     // }
1073     // puts("===HUFFMAN CODES===");
1074     // for (i = 0; i < ncodes; i++) {
1075     //     int j;
1076     //     printf("%d: %d %d %d ", i, codes[i].symbol, codes[i].len, codes[i].code);
1077     //     j = codes[i].len;
1078     //     while (j) {
1079     //         putchar(codes[i].code & (1 << --j) ? '1' : '0');
1080     //     }
1081     //     printf(" %d\n", codes[i].code);
1082     // }
1083 
1084     if (option == E_BYTE || option == E_BYTE_ARRAY) {
1085         if (h->huffman.codes[0].len == 0)
1086             h->decode = cram_huffman_decode_char0;
1087         else
1088             h->decode = cram_huffman_decode_char;
1089     } else if (option == E_BYTE_ARRAY_BLOCK) {
1090         abort();
1091     } else {
1092         if (h->huffman.codes[0].len == 0)
1093             h->decode = cram_huffman_decode_int0;
1094         else
1095             h->decode = cram_huffman_decode_int;
1096     }
1097 
1098     return (cram_codec *)h;
1099 
1100  malformed:
1101     hts_log_error("Malformed huffman header stream");
1102     free(codes);
1103     free(h);
1104     return NULL;
1105 }
1106 
cram_huffman_encode_char0(cram_slice * slice,cram_codec * c,char * in,int in_size)1107 int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c,
1108                               char *in, int in_size) {
1109     return 0;
1110 }
1111 
cram_huffman_encode_char(cram_slice * slice,cram_codec * c,char * in,int in_size)1112 int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
1113                              char *in, int in_size) {
1114     int i, code, len, r = 0;
1115     unsigned char *syms = (unsigned char *)in;
1116 
1117     while (in_size--) {
1118         int sym = *syms++;
1119         if (sym >= -1 && sym < MAX_HUFF) {
1120             i = c->e_huffman.val2code[sym+1];
1121             assert(c->e_huffman.codes[i].symbol == sym);
1122             code = c->e_huffman.codes[i].code;
1123             len  = c->e_huffman.codes[i].len;
1124         } else {
1125             /* Slow - use a lookup table for when sym < MAX_HUFF? */
1126             for (i = 0; i < c->e_huffman.nvals; i++) {
1127                 if (c->e_huffman.codes[i].symbol == sym)
1128                     break;
1129             }
1130             if (i == c->e_huffman.nvals)
1131                 return -1;
1132 
1133             code = c->e_huffman.codes[i].code;
1134             len  = c->e_huffman.codes[i].len;
1135         }
1136 
1137         r |= store_bits_MSB(c->out, code, len);
1138     }
1139 
1140     return r;
1141 }
1142 
cram_huffman_encode_int0(cram_slice * slice,cram_codec * c,char * in,int in_size)1143 int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c,
1144                              char *in, int in_size) {
1145     return 0;
1146 }
1147 
cram_huffman_encode_int(cram_slice * slice,cram_codec * c,char * in,int in_size)1148 int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
1149                             char *in, int in_size) {
1150     int i, code, len, r = 0;
1151     int *syms = (int *)in;
1152 
1153     while (in_size--) {
1154         int sym = *syms++;
1155 
1156         if (sym >= -1 && sym < MAX_HUFF) {
1157             i = c->e_huffman.val2code[sym+1];
1158             assert(c->e_huffman.codes[i].symbol == sym);
1159             code = c->e_huffman.codes[i].code;
1160             len  = c->e_huffman.codes[i].len;
1161         } else {
1162             /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
1163             for (i = 0; i < c->e_huffman.nvals; i++) {
1164                 if (c->e_huffman.codes[i].symbol == sym)
1165                     break;
1166             }
1167             if (i == c->e_huffman.nvals)
1168                 return -1;
1169 
1170             code = c->e_huffman.codes[i].code;
1171             len  = c->e_huffman.codes[i].len;
1172         }
1173 
1174         r |= store_bits_MSB(c->out, code, len);
1175     }
1176 
1177     return r;
1178 }
1179 
cram_huffman_encode_free(cram_codec * c)1180 void cram_huffman_encode_free(cram_codec *c) {
1181     if (!c)
1182         return;
1183 
1184     if (c->e_huffman.codes)
1185         free(c->e_huffman.codes);
1186     free(c);
1187 }
1188 
1189 /*
1190  * Encodes a huffman tree.
1191  * Returns number of bytes written.
1192  */
cram_huffman_encode_store(cram_codec * c,cram_block * b,char * prefix,int version)1193 int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix,
1194                               int version) {
1195     int i, len = 0;
1196     cram_huffman_code *codes = c->e_huffman.codes;
1197     /*
1198      * Up to code length 127 means 2.5e+26 bytes of data required (worst
1199      * case huffman tree needs symbols with freqs matching the Fibonacci
1200      * series). So guaranteed 1 byte per code.
1201      *
1202      * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8).
1203      *
1204      * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory
1205      */
1206     char *tmp = malloc(6*c->e_huffman.nvals+16);
1207     char *tp = tmp;
1208 
1209     if (!tmp)
1210         return -1;
1211 
1212     if (prefix) {
1213         size_t l = strlen(prefix);
1214         BLOCK_APPEND(b, prefix, l);
1215         len += l;
1216     }
1217 
1218     tp += itf8_put(tp, c->e_huffman.nvals);
1219     for (i = 0; i < c->e_huffman.nvals; i++) {
1220         tp += itf8_put(tp, codes[i].symbol);
1221     }
1222 
1223     tp += itf8_put(tp, c->e_huffman.nvals);
1224     for (i = 0; i < c->e_huffman.nvals; i++) {
1225         tp += itf8_put(tp, codes[i].len);
1226     }
1227 
1228     len += itf8_put_blk(b, c->codec);
1229     len += itf8_put_blk(b, tp-tmp);
1230     BLOCK_APPEND(b, tmp, tp-tmp);
1231     len += tp-tmp;
1232 
1233     free(tmp);
1234 
1235     return len;
1236 }
1237 
cram_huffman_encode_init(cram_stats * st,enum cram_external_type option,void * dat,int version)1238 cram_codec *cram_huffman_encode_init(cram_stats *st,
1239                                      enum cram_external_type option,
1240                                      void *dat,
1241                                      int version) {
1242     int *vals = NULL, *freqs = NULL, vals_alloc = 0, *lens, code, len;
1243     int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k;
1244     cram_codec *c;
1245     cram_huffman_code *codes;
1246 
1247     c = malloc(sizeof(*c));
1248     if (!c)
1249         return NULL;
1250     c->codec = E_HUFFMAN;
1251 
1252     /* Count number of unique symbols */
1253     for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
1254         if (!st->freqs[i])
1255             continue;
1256         if (nvals >= vals_alloc) {
1257             vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
1258             vals  = realloc(vals,  vals_alloc * sizeof(int));
1259             freqs = realloc(freqs, vals_alloc * sizeof(int));
1260             if (!vals || !freqs) {
1261                 if (vals)  free(vals);
1262                 if (freqs) free(freqs);
1263                 free(c);
1264                 return NULL;
1265             }
1266         }
1267         vals[nvals] = i;
1268         freqs[nvals] = st->freqs[i];
1269         assert(st->freqs[i] > 0);
1270         ntot += freqs[nvals];
1271         if (max_val < i) max_val = i;
1272         if (min_val > i) min_val = i;
1273         nvals++;
1274     }
1275     if (st->h) {
1276         khint_t k;
1277 
1278         for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
1279             if (!kh_exist(st->h, k))
1280                 continue;
1281             if (nvals >= vals_alloc) {
1282                 vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
1283                 vals  = realloc(vals,  vals_alloc * sizeof(int));
1284                 freqs = realloc(freqs, vals_alloc * sizeof(int));
1285                 if (!vals || !freqs)
1286                     return NULL;
1287             }
1288             vals[nvals]= kh_key(st->h, k);
1289             freqs[nvals] = kh_val(st->h, k);
1290             assert(freqs[nvals] > 0);
1291             ntot += freqs[nvals];
1292             if (max_val < i) max_val = i;
1293             if (min_val > i) min_val = i;
1294             nvals++;
1295         }
1296     }
1297 
1298     assert(nvals > 0);
1299 
1300     freqs = realloc(freqs, 2*nvals*sizeof(*freqs));
1301     lens = calloc(2*nvals, sizeof(*lens));
1302     if (!lens || !freqs)
1303         return NULL;
1304 
1305     /* Inefficient, use pointers to form chain so we can insert and maintain
1306      * a sorted list? This is currently O(nvals^2) complexity.
1307      */
1308     for (;;) {
1309         int low1 = INT_MAX, low2 = INT_MAX;
1310         int ind1 = 0, ind2 = 0;
1311         for (i = 0; i < nvals; i++) {
1312             if (freqs[i] < 0)
1313                 continue;
1314             if (low1 > freqs[i])
1315                 low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
1316             else if (low2 > freqs[i])
1317                 low2 = freqs[i], ind2 = i;
1318         }
1319         if (low2 == INT_MAX)
1320             break;
1321 
1322         freqs[nvals] = low1 + low2;
1323         lens[ind1] = nvals;
1324         lens[ind2] = nvals;
1325         freqs[ind1] *= -1;
1326         freqs[ind2] *= -1;
1327         nvals++;
1328     }
1329     nvals = nvals/2+1;
1330 
1331     /* Assign lengths */
1332     for (i = 0; i < nvals; i++) {
1333         int code_len = 0;
1334         for (k = lens[i]; k; k = lens[k])
1335             code_len++;
1336         lens[i] = code_len;
1337         freqs[i] *= -1;
1338         //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], lens[i]);
1339     }
1340 
1341 
1342     /* Sort, need in a struct */
1343     if (!(codes = malloc(nvals * sizeof(*codes))))
1344         return NULL;
1345     for (i = 0; i < nvals; i++) {
1346         codes[i].symbol = vals[i];
1347         codes[i].len = lens[i];
1348     }
1349     qsort(codes, nvals, sizeof(*codes), code_sort);
1350 
1351     /*
1352      * Generate canonical codes from lengths.
1353      * Sort by length.
1354      * Start with 0.
1355      * Every new code of same length is +1.
1356      * Every new code of new length is +1 then <<1 per extra length.
1357      *
1358      * /\
1359      * a/\
1360      * /\/\
1361      * bcd/\
1362      *    ef
1363      *
1364      * a 1  0
1365      * b 3  4 (0+1)<<2
1366      * c 3  5
1367      * d 3  6
1368      * e 4  14  (6+1)<<1
1369      * f 5  15
1370      */
1371     code = 0; len = codes[0].len;
1372     for (i = 0; i < nvals; i++) {
1373         while (len != codes[i].len) {
1374             code<<=1;
1375             len++;
1376         }
1377         codes[i].code = code++;
1378 
1379         if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF)
1380             c->e_huffman.val2code[codes[i].symbol+1] = i;
1381 
1382         //fprintf(stderr, "sym %d, code %d, len %d\n",
1383         //      codes[i].symbol, codes[i].code, codes[i].len);
1384     }
1385 
1386     free(lens);
1387     free(vals);
1388     free(freqs);
1389 
1390     c->e_huffman.codes = codes;
1391     c->e_huffman.nvals = nvals;
1392 
1393     c->free = cram_huffman_encode_free;
1394     if (option == E_BYTE || option == E_BYTE_ARRAY) {
1395         if (c->e_huffman.codes[0].len == 0)
1396             c->encode = cram_huffman_encode_char0;
1397         else
1398             c->encode = cram_huffman_encode_char;
1399     } else {
1400         if (c->e_huffman.codes[0].len == 0)
1401             c->encode = cram_huffman_encode_int0;
1402         else
1403             c->encode = cram_huffman_encode_int;
1404     }
1405     c->store = cram_huffman_encode_store;
1406 
1407     return c;
1408 }
1409 
1410 /*
1411  * ---------------------------------------------------------------------------
1412  * BYTE_ARRAY_LEN
1413  */
cram_byte_array_len_decode(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)1414 int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c,
1415                                cram_block *in, char *out,
1416                                int *out_size) {
1417     /* Fetch length */
1418     int32_t len = 0, one = 1;
1419     int r;
1420 
1421     r = c->byte_array_len.len_codec->decode(slice, c->byte_array_len.len_codec,
1422                                             in, (char *)&len, &one);
1423     //printf("ByteArray Len=%d\n", len);
1424 
1425     if (!r && c->byte_array_len.val_codec && len >= 0) {
1426         r = c->byte_array_len.val_codec->decode(slice,
1427                                                 c->byte_array_len.val_codec,
1428                                                 in, out, &len);
1429     } else {
1430         return -1;
1431     }
1432 
1433     *out_size = len;
1434 
1435     return r;
1436 }
1437 
cram_byte_array_len_decode_free(cram_codec * c)1438 void cram_byte_array_len_decode_free(cram_codec *c) {
1439     if (!c) return;
1440 
1441     if (c->byte_array_len.len_codec)
1442         c->byte_array_len.len_codec->free(c->byte_array_len.len_codec);
1443 
1444     if (c->byte_array_len.val_codec)
1445         c->byte_array_len.val_codec->free(c->byte_array_len.val_codec);
1446 
1447     free(c);
1448 }
1449 
cram_byte_array_len_decode_init(char * data,int size,enum cram_external_type option,int version)1450 cram_codec *cram_byte_array_len_decode_init(char *data, int size,
1451                                             enum cram_external_type option,
1452                                             int version) {
1453     cram_codec *c;
1454     char *cp   = data;
1455     char *endp = data + size;
1456     int32_t encoding = 0;
1457     int32_t sub_size = -1;
1458 
1459     if (!(c = malloc(sizeof(*c))))
1460         return NULL;
1461 
1462     c->codec  = E_BYTE_ARRAY_LEN;
1463     c->decode = cram_byte_array_len_decode;
1464     c->free   = cram_byte_array_len_decode_free;
1465 
1466     cp += safe_itf8_get(cp, endp, &encoding);
1467     cp += safe_itf8_get(cp, endp, &sub_size);
1468     if (sub_size < 0 || endp - cp < sub_size)
1469         goto malformed;
1470     c->byte_array_len.len_codec = cram_decoder_init(encoding, cp, sub_size,
1471                                                     E_INT, version);
1472     if (c->byte_array_len.len_codec == NULL)
1473         goto no_codec;
1474     cp += sub_size;
1475 
1476     sub_size = -1;
1477     cp += safe_itf8_get(cp, endp, &encoding);
1478     cp += safe_itf8_get(cp, endp, &sub_size);
1479     if (sub_size < 0 || endp - cp < sub_size)
1480         goto malformed;
1481     c->byte_array_len.val_codec = cram_decoder_init(encoding, cp, sub_size,
1482                                                     option, version);
1483     if (c->byte_array_len.val_codec == NULL)
1484         goto no_codec;
1485     cp += sub_size;
1486 
1487     if (cp - data != size)
1488         goto malformed;
1489 
1490     return c;
1491 
1492  malformed:
1493     hts_log_error("Malformed byte_array_len header stream");
1494  no_codec:
1495     free(c);
1496     return NULL;
1497 }
1498 
cram_byte_array_len_encode(cram_slice * slice,cram_codec * c,char * in,int in_size)1499 int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c,
1500                                char *in, int in_size) {
1501     int32_t i32 = in_size;
1502     int r = 0;
1503 
1504     r |= c->e_byte_array_len.len_codec->encode(slice,
1505                                                c->e_byte_array_len.len_codec,
1506                                                (char *)&i32, 1);
1507     r |= c->e_byte_array_len.val_codec->encode(slice,
1508                                                c->e_byte_array_len.val_codec,
1509                                                in, in_size);
1510     return r;
1511 }
1512 
cram_byte_array_len_encode_free(cram_codec * c)1513 void cram_byte_array_len_encode_free(cram_codec *c) {
1514     if (!c)
1515         return;
1516 
1517     if (c->e_byte_array_len.len_codec)
1518         c->e_byte_array_len.len_codec->free(c->e_byte_array_len.len_codec);
1519 
1520     if (c->e_byte_array_len.val_codec)
1521         c->e_byte_array_len.val_codec->free(c->e_byte_array_len.val_codec);
1522 
1523     free(c);
1524 }
1525 
cram_byte_array_len_encode_store(cram_codec * c,cram_block * b,char * prefix,int version)1526 int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
1527                                      char *prefix, int version) {
1528     int len = 0, len2, len3;
1529     cram_codec *tc;
1530     cram_block *b_len, *b_val;
1531 
1532     if (prefix) {
1533         size_t l = strlen(prefix);
1534         BLOCK_APPEND(b, prefix, l);
1535         len += l;
1536     }
1537 
1538     tc = c->e_byte_array_len.len_codec;
1539     b_len = cram_new_block(0, 0);
1540     len2 = tc->store(tc, b_len, NULL, version);
1541 
1542     tc = c->e_byte_array_len.val_codec;
1543     b_val = cram_new_block(0, 0);
1544     len3 = tc->store(tc, b_val, NULL, version);
1545 
1546     len += itf8_put_blk(b, c->codec);
1547     len += itf8_put_blk(b, len2+len3);
1548     BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
1549     BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val));
1550 
1551     cram_free_block(b_len);
1552     cram_free_block(b_val);
1553 
1554     return len + len2 + len3;
1555 }
1556 
cram_byte_array_len_encode_init(cram_stats * st,enum cram_external_type option,void * dat,int version)1557 cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
1558                                             enum cram_external_type option,
1559                                             void *dat,
1560                                             int version) {
1561     cram_codec *c;
1562     cram_byte_array_len_encoder *e = (cram_byte_array_len_encoder *)dat;
1563 
1564     c = malloc(sizeof(*c));
1565     if (!c)
1566         return NULL;
1567     c->codec = E_BYTE_ARRAY_LEN;
1568     c->free = cram_byte_array_len_encode_free;
1569     c->encode = cram_byte_array_len_encode;
1570     c->store = cram_byte_array_len_encode_store;
1571 
1572     c->e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding,
1573                                                       st, E_INT,
1574                                                       e->len_dat,
1575                                                       version);
1576     c->e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding,
1577                                                       NULL, E_BYTE_ARRAY,
1578                                                       e->val_dat,
1579                                                       version);
1580 
1581     return c;
1582 }
1583 
1584 /*
1585  * ---------------------------------------------------------------------------
1586  * BYTE_ARRAY_STOP
1587  */
cram_byte_array_stop_decode_char(cram_slice * slice,cram_codec * c,cram_block * in,char * out,int * out_size)1588 static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
1589                                             cram_block *in, char *out,
1590                                             int *out_size) {
1591     char *cp, ch;
1592     cram_block *b = NULL;
1593 
1594     b = cram_get_block_by_id(slice, c->byte_array_stop.content_id);
1595     if (!b)
1596         return *out_size?-1:0;
1597 
1598     if (b->idx >= b->uncomp_size)
1599         return -1;
1600 
1601     cp = (char *)b->data + b->idx;
1602     if (out) {
1603         while ((ch = *cp) != (char)c->byte_array_stop.stop) {
1604             if (cp - (char *)b->data >= b->uncomp_size)
1605                 return -1;
1606             *out++ = ch;
1607             cp++;
1608         }
1609     } else {
1610         // Consume input, but produce no output
1611         while ((ch = *cp) != (char)c->byte_array_stop.stop) {
1612             if (cp - (char *)b->data >= b->uncomp_size)
1613                 return -1;
1614             cp++;
1615         }
1616     }
1617 
1618     *out_size = cp - (char *)(b->data + b->idx);
1619     b->idx = cp - (char *)b->data + 1;
1620 
1621     return 0;
1622 }
1623 
cram_byte_array_stop_decode_block(cram_slice * slice,cram_codec * c,cram_block * in,char * out_,int * out_size)1624 int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
1625                                       cram_block *in, char *out_,
1626                                       int *out_size) {
1627     cram_block *b;
1628     cram_block *out = (cram_block *)out_;
1629     char *cp, *out_cp, *cp_end;
1630     char stop;
1631 
1632     b = cram_get_block_by_id(slice, c->byte_array_stop.content_id);
1633     if (!b)
1634         return *out_size?-1:0;
1635 
1636     if (b->idx >= b->uncomp_size)
1637         return -1;
1638     cp = (char *)b->data + b->idx;
1639     cp_end = (char *)b->data + b->uncomp_size;
1640     out_cp = (char *)BLOCK_END(out);
1641 
1642     stop = c->byte_array_stop.stop;
1643     if (cp_end - cp < out->alloc - out->byte) {
1644         while (cp != cp_end && *cp != stop)
1645             *out_cp++ = *cp++;
1646         BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out);
1647     } else {
1648         char *cp_start;
1649         for (cp_start = cp; cp != cp_end && *cp != stop; cp++)
1650             ;
1651         BLOCK_APPEND(out, cp_start, cp - cp_start);
1652         BLOCK_GROW(out, cp - cp_start);
1653     }
1654 
1655     *out_size = cp - (char *)(b->data + b->idx);
1656     b->idx = cp - (char *)b->data + 1;
1657 
1658     return 0;
1659 }
1660 
cram_byte_array_stop_decode_free(cram_codec * c)1661 void cram_byte_array_stop_decode_free(cram_codec *c) {
1662     if (!c) return;
1663 
1664     free(c);
1665 }
1666 
cram_byte_array_stop_decode_init(char * data,int size,enum cram_external_type option,int version)1667 cram_codec *cram_byte_array_stop_decode_init(char *data, int size,
1668                                              enum cram_external_type option,
1669                                              int version) {
1670     cram_codec *c = NULL;
1671     unsigned char *cp = (unsigned char *)data;
1672 
1673     if (size < (CRAM_MAJOR_VERS(version) == 1 ? 5 : 2))
1674         goto malformed;
1675 
1676     if (!(c = malloc(sizeof(*c))))
1677         return NULL;
1678 
1679     c->codec  = E_BYTE_ARRAY_STOP;
1680     switch (option) {
1681     case E_BYTE_ARRAY_BLOCK:
1682         c->decode = cram_byte_array_stop_decode_block;
1683         break;
1684     case E_BYTE_ARRAY:
1685         c->decode = cram_byte_array_stop_decode_char;
1686         break;
1687     default:
1688         hts_log_error("The byte_array_stop codec only supports BYTE_ARRAYs");
1689         free(c);
1690         return NULL;
1691     }
1692     c->free   = cram_byte_array_stop_decode_free;
1693 
1694     c->byte_array_stop.stop = *cp++;
1695     if (CRAM_MAJOR_VERS(version) == 1) {
1696         c->byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16)
1697             + (cp[3]<<24);
1698         cp += 4;
1699     } else {
1700         cp += safe_itf8_get((char *) cp, data + size,
1701                             &c->byte_array_stop.content_id);
1702     }
1703 
1704     if ((char *)cp - data != size)
1705         goto malformed;
1706 
1707     return c;
1708 
1709  malformed:
1710     hts_log_error("Malformed byte_array_stop header stream");
1711     free(c);
1712     return NULL;
1713 }
1714 
cram_byte_array_stop_encode(cram_slice * slice,cram_codec * c,char * in,int in_size)1715 int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c,
1716                                 char *in, int in_size) {
1717     BLOCK_APPEND(c->out, in, in_size);
1718     BLOCK_APPEND_CHAR(c->out, c->e_byte_array_stop.stop);
1719     return 0;
1720 }
1721 
cram_byte_array_stop_encode_free(cram_codec * c)1722 void cram_byte_array_stop_encode_free(cram_codec *c) {
1723     if (!c)
1724         return;
1725     free(c);
1726 }
1727 
cram_byte_array_stop_encode_store(cram_codec * c,cram_block * b,char * prefix,int version)1728 int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b,
1729                                       char *prefix, int version) {
1730     int len = 0;
1731     char buf[20], *cp = buf;
1732 
1733     if (prefix) {
1734         size_t l = strlen(prefix);
1735         BLOCK_APPEND(b, prefix, l);
1736         len += l;
1737     }
1738 
1739     cp += itf8_put(cp, c->codec);
1740 
1741     if (CRAM_MAJOR_VERS(version) == 1) {
1742         cp += itf8_put(cp, 5);
1743         *cp++ = c->e_byte_array_stop.stop;
1744         *cp++ = (c->e_byte_array_stop.content_id >>  0) & 0xff;
1745         *cp++ = (c->e_byte_array_stop.content_id >>  8) & 0xff;
1746         *cp++ = (c->e_byte_array_stop.content_id >> 16) & 0xff;
1747         *cp++ = (c->e_byte_array_stop.content_id >> 24) & 0xff;
1748     } else {
1749         cp += itf8_put(cp, 1 + itf8_size(c->e_byte_array_stop.content_id));
1750         *cp++ = c->e_byte_array_stop.stop;
1751         cp += itf8_put(cp, c->e_byte_array_stop.content_id);
1752     }
1753 
1754     BLOCK_APPEND(b, buf, cp-buf);
1755     len += cp-buf;
1756 
1757     return len;
1758 }
1759 
cram_byte_array_stop_encode_init(cram_stats * st,enum cram_external_type option,void * dat,int version)1760 cram_codec *cram_byte_array_stop_encode_init(cram_stats *st,
1761                                              enum cram_external_type option,
1762                                              void *dat,
1763                                              int version) {
1764     cram_codec *c;
1765 
1766     c = malloc(sizeof(*c));
1767     if (!c)
1768         return NULL;
1769     c->codec = E_BYTE_ARRAY_STOP;
1770     c->free = cram_byte_array_stop_encode_free;
1771     c->encode = cram_byte_array_stop_encode;
1772     c->store = cram_byte_array_stop_encode_store;
1773 
1774     c->e_byte_array_stop.stop = ((int *)dat)[0];
1775     c->e_byte_array_stop.content_id = ((int *)dat)[1];
1776 
1777     return c;
1778 }
1779 
1780 /*
1781  * ---------------------------------------------------------------------------
1782  */
1783 
cram_encoding2str(enum cram_encoding t)1784 const char *cram_encoding2str(enum cram_encoding t) {
1785     switch (t) {
1786     case E_NULL:            return "NULL";
1787     case E_EXTERNAL:        return "EXTERNAL";
1788     case E_GOLOMB:          return "GOLOMB";
1789     case E_HUFFMAN:         return "HUFFMAN";
1790     case E_BYTE_ARRAY_LEN:  return "BYTE_ARRAY_LEN";
1791     case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP";
1792     case E_BETA:            return "BETA";
1793     case E_SUBEXP:          return "SUBEXP";
1794     case E_GOLOMB_RICE:     return "GOLOMB_RICE";
1795     case E_GAMMA:           return "GAMMA";
1796     case E_NUM_CODECS:
1797     default:                return "?";
1798     }
1799 }
1800 
1801 static cram_codec *(*decode_init[])(char *data,
1802                                     int size,
1803                                     enum cram_external_type option,
1804                                     int version) = {
1805     NULL,
1806     cram_external_decode_init,
1807     NULL,
1808     cram_huffman_decode_init,
1809     cram_byte_array_len_decode_init,
1810     cram_byte_array_stop_decode_init,
1811     cram_beta_decode_init,
1812     cram_subexp_decode_init,
1813     NULL,
1814     cram_gamma_decode_init,
1815 };
1816 
cram_decoder_init(enum cram_encoding codec,char * data,int size,enum cram_external_type option,int version)1817 cram_codec *cram_decoder_init(enum cram_encoding codec,
1818                               char *data, int size,
1819                               enum cram_external_type option,
1820                               int version) {
1821     if (codec >= E_NULL && codec < E_NUM_CODECS && decode_init[codec]) {
1822         return decode_init[codec](data, size, option, version);
1823     } else {
1824         hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
1825         return NULL;
1826     }
1827 }
1828 
1829 static cram_codec *(*encode_init[])(cram_stats *stx,
1830                                     enum cram_external_type option,
1831                                     void *opt,
1832                                     int version) = {
1833     NULL,
1834     cram_external_encode_init,
1835     NULL,
1836     cram_huffman_encode_init,
1837     cram_byte_array_len_encode_init,
1838     cram_byte_array_stop_encode_init,
1839     cram_beta_encode_init,
1840     NULL, //cram_subexp_encode_init,
1841     NULL,
1842     NULL, //cram_gamma_encode_init,
1843 };
1844 
cram_encoder_init(enum cram_encoding codec,cram_stats * st,enum cram_external_type option,void * dat,int version)1845 cram_codec *cram_encoder_init(enum cram_encoding codec,
1846                               cram_stats *st,
1847                               enum cram_external_type option,
1848                               void *dat,
1849                               int version) {
1850     if (st && !st->nvals)
1851         return NULL;
1852 
1853     if (encode_init[codec]) {
1854         cram_codec *r;
1855         if ((r = encode_init[codec](st, option, dat, version)))
1856             r->out = NULL;
1857         return r;
1858     } else {
1859         hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
1860         abort();
1861     }
1862 }
1863 
1864 /*
1865  * Returns the content_id used by this codec, also in id2 if byte_array_len.
1866  * Returns -1 for the CORE block and -2 for unneeded.
1867  * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs.
1868  */
cram_codec_to_id(cram_codec * c,int * id2)1869 int cram_codec_to_id(cram_codec *c, int *id2) {
1870     int bnum1, bnum2 = -2;
1871 
1872     switch (c->codec) {
1873     case E_HUFFMAN:
1874         bnum1 = c->huffman.ncodes == 1 ? -2 : -1;
1875         break;
1876     case E_GOLOMB:
1877     case E_BETA:
1878     case E_SUBEXP:
1879     case E_GOLOMB_RICE:
1880     case E_GAMMA:
1881         bnum1 = -1;
1882         break;
1883     case E_EXTERNAL:
1884         bnum1 = c->external.content_id;
1885         break;
1886     case E_BYTE_ARRAY_LEN:
1887         bnum1 = cram_codec_to_id(c->byte_array_len.len_codec, NULL);
1888         bnum2 = cram_codec_to_id(c->byte_array_len.val_codec, NULL);
1889         break;
1890     case E_BYTE_ARRAY_STOP:
1891         bnum1 = c->byte_array_stop.content_id;
1892         break;
1893     case E_NULL:
1894         bnum1 = -2;
1895         break;
1896     default:
1897         hts_log_error("Unknown codec type %d", c->codec);
1898         bnum1 = -1;
1899     }
1900 
1901     if (id2)
1902         *id2 = bnum2;
1903     return bnum1;
1904 }
1905 
1906 
1907 /*
1908  * cram_codec structures are specialised for decoding or encoding.
1909  * Unfortunately this makes turning a decoder into an encoder (such as
1910  * when transcoding files) problematic.
1911  *
1912  * This function converts a cram decoder codec into an encoder version
1913  * in-place (ie it modifiers the codec itself).
1914  *
1915  * Returns 0 on success;
1916  *        -1 on failure.
1917  */
cram_codec_decoder2encoder(cram_fd * fd,cram_codec * c)1918 int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) {
1919     int j;
1920 
1921     switch (c->codec) {
1922     case E_EXTERNAL:
1923         // shares struct with decode
1924         c->free = cram_external_encode_free;
1925         c->store = cram_external_encode_store;
1926         if (c->decode == cram_external_decode_int)
1927             c->encode = cram_external_encode_int;
1928         else if (c->decode == cram_external_decode_char)
1929             c->encode = cram_external_encode_char;
1930         else
1931             return -1;
1932         break;
1933 
1934     case E_HUFFMAN: {
1935         // New structure, so switch.
1936         // FIXME: we huffman and e_huffman structs amended, we could
1937         // unify this.
1938         cram_codec *t = malloc(sizeof(*t));
1939         t->codec = E_HUFFMAN;
1940         t->free = cram_huffman_encode_free;
1941         t->store = cram_huffman_encode_store;
1942         t->e_huffman.codes = c->huffman.codes;
1943         t->e_huffman.nvals = c->huffman.ncodes;
1944         for (j = 0; j < t->e_huffman.nvals; j++) {
1945             int32_t sym = t->e_huffman.codes[j].symbol;
1946             if (sym >= -1 && sym < MAX_HUFF)
1947                 t->e_huffman.val2code[sym+1] = j;
1948         }
1949 
1950         if (c->decode == cram_huffman_decode_char0)
1951             t->encode = cram_huffman_encode_char0;
1952         else if (c->decode == cram_huffman_decode_char)
1953             t->encode = cram_huffman_encode_char;
1954         else if (c->decode == cram_huffman_decode_int0)
1955             t->encode = cram_huffman_encode_int0;
1956         else if (c->decode == cram_huffman_decode_int)
1957             t->encode = cram_huffman_encode_int;
1958         else {
1959             free(t);
1960             return -1;
1961         }
1962         *c = *t;
1963         free(t);
1964         break;
1965     }
1966 
1967     case E_BETA:
1968         // shares struct with decode
1969         c->free = cram_beta_encode_free;
1970         c->store = cram_beta_encode_store;
1971         if (c->decode == cram_beta_decode_int)
1972             c->encode = cram_beta_encode_int;
1973         else if (c->decode == cram_beta_decode_char)
1974             c->encode = cram_beta_encode_char;
1975         else
1976             return -1;
1977         break;
1978 
1979     case E_BYTE_ARRAY_LEN: {
1980         cram_codec *t = malloc(sizeof(*t));
1981         t->codec = E_BYTE_ARRAY_LEN;
1982         t->free   = cram_byte_array_len_encode_free;
1983         t->store  = cram_byte_array_len_encode_store;
1984         t->encode = cram_byte_array_len_encode;
1985         t->e_byte_array_len.len_codec = c->byte_array_len.len_codec;
1986         t->e_byte_array_len.val_codec = c->byte_array_len.val_codec;
1987         if (cram_codec_decoder2encoder(fd, t->e_byte_array_len.len_codec) == -1 ||
1988             cram_codec_decoder2encoder(fd, t->e_byte_array_len.val_codec) == -1) {
1989             t->free(t);
1990             return -1;
1991         }
1992 
1993         // {len,val}_{encoding,dat} are undefined, but unused.
1994         // Leaving them unset here means we can test that assertion.
1995         *c = *t;
1996         free(t);
1997         break;
1998     }
1999 
2000     case E_BYTE_ARRAY_STOP:
2001         // shares struct with decode
2002         c->free   = cram_byte_array_stop_encode_free;
2003         c->store  = cram_byte_array_stop_encode_store;
2004         c->encode = cram_byte_array_stop_encode;
2005         break;
2006 
2007     default:
2008         return -1;
2009     }
2010 
2011     return 0;
2012 }
2013